Revert Extraction
===

Goal: Extract revisions from the stub-history files.

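These files contain the SHA-1 checksum of each revision's text, which I use to identify reverts: a revision whose checksum matches that of an earlier revision of the same page restores that earlier state.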

Every _revision_ is one of:
 - A revert revision
 - A reverted revision
 - A regular revision (and revert target)
 - A regular revision (and non-revert target)
 
For now, I think I'll just save all reverts....

...and perhaps subsequently all revisions? We'll see.

Target daterange: Jan 01, 2018 - Jan 01, 2020

In [1]:
import mwapi
import mwxml
import mwxml.utilities
import mwcli
import mwreverts
import oresapi
import mwparserfromhell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
import os
from tqdm import tqdm
import bz2
import gzip
import json
import re
import hashlib
from datetime import datetime
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [4]:
# IPython shell capture: the output lines of the `!` command are assigned to the variable.
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line, i.e. the top-level directory of this git repository.
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [5]:
# raw_data_dir: directory holding the downloaded Wikipedia dump files this notebook reads.
raw_data_dir = "/export/scratch2/wiki_data"
# derived_data_dir: <repo root>/data/derived, the parent of this notebook's working directory.
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [19]:
# working_dir: where the revert ndjson files and the revision-id CSVs are written.
working_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
# exist_ok=True makes re-running this cell harmless when the directory already exists.
os.makedirs(working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/stub-history-reverts'

In [6]:
# Load the multistream index of the articles dump into memory, one list entry per line.
article_index_path = os.path.join(raw_data_dir, "enwiki-20200101-pages-articles-multistream-index.txt")
# Read inside a context manager so the file handle is closed deterministically, and decode
# explicitly as UTF-8 so page titles do not depend on the machine's locale encoding.
with open(article_index_path, encoding="utf-8") as index_file:
    article_index = index_file.readlines()
len(article_index)

19881980

In [7]:
# Build a page_id -> page_title lookup from the index lines.
# Each line is colon-delimited: the second field is the page id and everything after it is
# the title (the first field is presumably the byte offset of the page's stream; unused here).
page_title_dict = {}
for line in tqdm(article_index):
    tokens = line.strip().split(":")
    page_id = int(tokens[1])
    # Titles may themselves contain colons (e.g. "Star Wars: Episode I"), so the remaining
    # fields are re-joined with ":" rather than concatenated, which would drop those colons.
    page_title = ":".join(tokens[2:])
    page_title_dict[page_id] = page_title
len(page_title_dict)

100%|██████████| 19881980/19881980 [00:33<00:00, 595704.25it/s]


19881980

In [8]:
# Directory containing the gzip-compressed stub-meta-history XML files of the 2020-01-01 dump.
stub_history_dir = os.path.join(raw_data_dir, "enwiki-20200101-stub-meta-history-gz")
# Fail early if the dump directory is missing, before any processing starts.
assert os.path.exists(stub_history_dir)

In [22]:
# Full paths of every gzip-compressed XML file in the stub-history directory
# (in whatever order os.listdir returns them).
paths = [
    os.path.join(stub_history_dir, entry)
    for entry in os.listdir(stub_history_dir)
    if entry.endswith(".xml.gz")
]
len(paths)

27

In [52]:
# Bounds of the target date range as integer epoch seconds.
# Both dates are naive datetimes, so .timestamp() interprets them in the local
# timezone of the machine running the notebook.
start_date, end_date = (datetime.fromisoformat(day) for day in ('2018-01-01', '2020-01-01'))
start_timestamp, end_timestamp = (int(bound.timestamp()) for bound in (start_date, end_date))
start_timestamp, end_timestamp

(1514786400, 1577858400)

In [15]:
# https://github.com/mediawiki-utilities/python-mwxml/blob/master/mwxml/utilities/dump2revdocs.py
def dump2revdocs(dump, verbose=False):
    """
    Yield the JSON document of every revision on namespace-0, non-redirect pages.

    :param dump: iterable of pages; each page exposes ``namespace`` and ``redirect``
        and iterates over revisions that provide ``to_json()``.
    :param verbose: accepted for signature compatibility; not used by this function.
    :returns: generator of whatever ``revision.to_json()`` returns, in dump order.
    """
    for page in dump:
        # skip anything outside the main namespace, and skip redirect pages
        if page.namespace != 0 or page.redirect is not None:
            continue
        yield from (revision.to_json() for revision in page)

In [81]:
def process_dump(dump, ndjson_filepath):
    """
    Detect reverts page by page and yield one labelled row per in-range revision.

    Only pages in namespace 0 that are not redirects are processed. Every revision
    of such a page is fed, in iteration order, to a per-page mwreverts.Detector using
    the revision's sha1. Each revert whose reverting revision has a timestamp within
    [start_timestamp, end_timestamp] is written to ndjson_filepath as one JSON
    document per line.

    :param dump: iterable of page objects (the caller passes an mwxml.Dump); each page
        provides ``id``, ``namespace`` and ``redirect`` and iterates over its revisions.
    :param ndjson_filepath: path of the reverts file; it is opened in 'w' mode, so an
        existing file at that path is overwritten.
    :returns: generator of
        (page_id, rev_id, rev_timestamp, is_revert_target, is_reverted, is_reverting)
        tuples, the last three being 0/1 flags, for every revision whose timestamp is
        within the target range. A page's rows are yielded only after all of that
        page's revisions have been read, because later reverts decide the flags.
    """
    with open(ndjson_filepath, 'w') as outfile:
        for page in dump:
            if page.namespace != 0 or page.redirect is not None:
                continue
            page_id = page.id

            # only in-range revisions are ever emitted, so only those are buffered
            rev_tups = []
            is_revert_target_set = set()
            is_reverted_set = set()
            is_reverting_set = set()

            # we use a new detector for each page
            detector = mwreverts.Detector(radius=15)
            for revision in page:
                # convert each revision to json and extract the relevant info from it
                rev_doc = revision.to_json()
                rev_id = rev_doc['id']
                # NOTE(review): the "...Z" string is parsed into a naive datetime, so
                # .timestamp() interprets it in the machine's local timezone. The range
                # bounds are derived from naive datetimes the same way, so the comparison
                # is consistent, but the emitted values are not true UTC epoch seconds.
                rev_timestamp = int(datetime.strptime(rev_doc['timestamp'], "%Y-%m-%dT%H:%M:%SZ").timestamp())
                in_target_range = start_timestamp <= rev_timestamp <= end_timestamp
                if in_target_range:
                    rev_tups.append((page_id, rev_id, rev_timestamp))

                # every revision goes through the detector, in range or not, so that
                # earlier revisions can still be recognised as revert targets;
                # revisions without a sha1 get mwreverts' placeholder checksum
                checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
                revert = detector.process(checksum, rev_doc)

                # we only consider reverts in the target timerange
                if revert and in_target_range:
                    revert_json = revert.to_json()

                    # keep track of which revision ids are reverts/reverting/reverted-to-targets
                    is_reverting_set.add(revert_json['reverting']['id'])
                    is_revert_target_set.add(revert_json['reverted_to']['id'])
                    is_reverted_set.update(rev['id'] for rev in revert_json['reverteds'])

                    # serialise with json.dumps so each line is valid JSON;
                    # str() of a dict is a Python repr that JSON parsers reject
                    outfile.write(json.dumps(revert_json) + "\n")

            # having processed for reverts, we output all in-range revisions along with
            # their types back to the central process
            for page_id, rev_id, rev_timestamp in rev_tups:
                yield (
                    page_id,
                    rev_id,
                    rev_timestamp,
                    int(rev_id in is_revert_target_set),
                    int(rev_id in is_reverted_set),
                    int(rev_id in is_reverting_set),
                )

def process_stub_history_filepath(path):
    """
    Stream the labelled revision rows produced by process_dump for one dump file.

    The reverts found along the way are written to a file in working_dir named after
    the input: the input's basename with its last extension removed, followed by
    "-reverts.ndjson".

    :param path: string path to a Gzip-ed Wikipedia XML file. Designed to be called
        with stub history files.
    :returns: generator of the tuples yielded by process_dump.
    """
    stem, _last_ext = os.path.splitext(os.path.basename(path))
    reverts_filepath = os.path.join(working_dir, stem + "-reverts.ndjson")
    # decode as text, replacing undecodable bytes instead of raising
    with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as infile:
        yield from process_dump(mwxml.Dump.from_file(infile), reverts_filepath)

In [None]:
# Demo: run the extraction on only the first stub-history file, writing one CSV line
# per labelled revision, and print the elapsed wall-clock time.
start = datetime.now()
with open(os.path.join(working_dir, 'rev_ids_single_file.csv'), 'w') as outfile:
    for row in process_stub_history_filepath(paths[0]):
        # columns: page_id, rev_id, rev_timestamp, is_revert_target, is_reverted, is_reverting
        outfile.write(",".join(map(str, row)) + "\n")
print(f"{datetime.now() - start}")

### Process all files in parallel

In [79]:
# Hand every stub-history file to para, with one mapper per file, and collect all
# labelled revision rows into a single CSV; prints the elapsed wall-clock time.
start = datetime.now()
with open(os.path.join(working_dir, 'rev_ids.csv'), 'w') as outfile:
    labelled_rows = para.map(process_stub_history_filepath, paths, mappers=len(paths))
    for row in labelled_rows:
        # columns: page_id, rev_id, rev_timestamp, is_revert_target, is_reverted, is_reverting
        outfile.write(",".join(map(str, row)) + "\n")
print(f"{datetime.now() - start}")

13:13:01.393968


Initial runtime was 13 hours, 13 minutes, which is quite reasonable.  The total storage required for the revision ids file and the reverts json was 13GB, which is also quite reasonable.

In total, we identified 5,992,682 reverts in namespace-0 non-redirect enwiki pages between Jan 01, 2018 and Jan 01, 2020.

We identified 77,287,697 total revisions on the same set of pages.

In [80]:
# Fraction of the identified revisions that are reverts (reverts / total revisions): about 7.8%
5992682 / 77287697

0.07753733430561399

The output file is then sorted numerically by its first column (page_id) using `sort` from the shell:

```
sort -k1 -n -t, rev_ids.csv > rev_ids_sorted.csv
```