Sample Text Retrieval
===

Retrieve revision text, and diffs against the previous revision, for the sampled revisions via the English Wikipedia API.

In [1]:
import mwapi
import mwxml
import mwxml.utilities
import mwcli
import mwreverts
import oresapi
import mwparserfromhell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import bz2
import gzip
import hashlib
import json
import os
import re
import subprocess
from collections import Counter
from datetime import datetime
from itertools import groupby

import nltk
import para
import scipy.stats
from tqdm import tqdm

In [4]:
# Resolve the repository root; every data path below is built from it.
# subprocess is used instead of the IPython `!git ...` escape because a failing
# `!` command does not raise, so whatever it printed would silently be used as
# the root path. check_output raises CalledProcessError on a non-zero exit.
git_root_dir = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"], text=True
).strip()
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [5]:
# Location of the raw wiki data.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
raw_data_dir = "/export/scratch2/wiki_data"
# Derived artifacts live under <repo root>/data/derived.
derived_data_dir = os.path.join(os.path.join(git_root_dir, "data"), "derived")
(raw_data_dir, derived_data_dir)

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [6]:
# Directory for this notebook's outputs; created on first run and left
# untouched if it already exists.
working_dir = os.path.join(derived_data_dir, 'revision_sample')
os.makedirs(name=working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/revision_sample'

In [7]:
# Analysis period as datetimes and as integer Unix timestamps.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in
# the machine's local timezone (the recorded output corresponds to 06:00 UTC).
# Confirm this matches how rev_timestamp values were produced.
start_date, end_date = (
    datetime.fromisoformat(day) for day in ('2018-01-01', '2020-01-01')
)
start_timestamp, end_timestamp = (
    int(moment.timestamp()) for moment in (start_date, end_date)
)
(start_timestamp, end_timestamp)

(1514786400, 1577858400)

In [8]:
# Window the revision sample was drawn from.
# The end date previously read 2019-01-01 while the end timestamp was computed
# from end_date (2020-01-01), so the two disagreed. The loaded sample contains
# eligible revisions with rev_timestamp in late 2019, so the effective window
# ends at end_date; the timestamp keeps its previous value and is now derived
# from sample_end_date so the pair cannot drift apart again.
# NOTE(review): confirm against the sampling notebook that the window is meant
# to end at end_date rather than 2019-01-01.
sample_start_timestamp = start_timestamp
sample_end_date = end_date
sample_end_timestamp = int(sample_end_date.timestamp())

### Load the sample

In [9]:
# read in the sample dataframe
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample1_filepath = os.path.join(revision_sample_dir, 'sample1_1M.pkl')
rev_df = pd.read_pickle(sample1_filepath)
len(rev_df)

1000000

In [10]:
# Preview the first rows of the sample to check its columns.
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
29999548,9516095,846835190,1529576038,0,0,0,True,846835009,850539656,1529575914,1531769146,[],-1,-1
12474631,876872,923997118,1572595746,0,0,0,True,923211396,924618182,1572162918,1572930319,[],-1,-1
62536703,53465104,867368228,1541425108,0,0,0,True,867366487,867663184,1541423455,1541590304,[],-1,-1
73902987,60096152,888337768,1552936050,0,0,0,True,888337457,888338754,1552935890,1552936561,[],-1,-1
43843955,28063274,917010365,1569117229,0,0,0,True,915190631,-1,1568247118,-1,[],-1,-1


### Retrieve revision texts

In [11]:
# Revision ids to fetch text for. Despite the name this is the rev_id column
# itself (a pandas Series that keeps rev_df's index), not a Python list.
rev_id_list = rev_df['rev_id']
len(rev_id_list)

1000000

In [15]:
# mwapi builds the endpoint as host + api_path (api_path defaults to
# "/w/api.php"), so `host` carries only the scheme and hostname. Passing the
# full endpoint here would request ".../w/api.php/w/api.php".
session = mwapi.Session(
    "https://en.wikipedia.org",
    user_agent="levon003@umn.edu - revision text retrieval",
)

In [16]:
# Smoke-test the session: fetch basic page info (including URLs) for one page id.
params = {
    'action': 'query',
    'format': 'json',
    'pageids': str(43825897),
    'prop': 'info',
    'inprop': 'url'
}
# Session.get takes the API parameters as keyword arguments, so unpack the dict
# rather than passing it positionally.
session.get(**params)

{'batchcomplete': '',
 'query': {'pages': {'43825897': {'pageid': 43825897,
    'ns': 0,
    'title': 'Bethel Church, Mansfield Woodhouse',
    'contentmodel': 'wikitext',
    'pagelanguage': 'en',
    'pagelanguagehtmlcode': 'en',
    'pagelanguagedir': 'ltr',
    'touched': '2020-02-09T10:07:54Z',
    'lastrevid': 870460930,
    'length': 20559,
    'fullurl': 'https://en.wikipedia.org/wiki/Bethel_Church,_Mansfield_Woodhouse',
    'editurl': 'https://en.wikipedia.org/w/index.php?title=Bethel_Church,_Mansfield_Woodhouse&action=edit',
    'canonicalurl': 'https://en.wikipedia.org/wiki/Bethel_Church,_Mansfield_Woodhouse'}}}}

In [57]:
# Fetch timestamp, user, comment and main-slot content for the first batch of
# sampled revisions. The API accepts at most 50 revids per request for regular
# clients, hence the batch size.
params = {
    'action': 'query',
    'format': 'json',
    'formatversion': '2',
    # .iloc makes the positional slice explicit: rev_id_list is a Series whose
    # index is not 0..n-1.
    'revids': "|".join(str(rev_id) for rev_id in rev_id_list.iloc[:50]),
    'prop': 'revisions',
    'rvprop': 'timestamp|user|comment|content',
    'rvslots': 'main',
}
# Session.get takes the API parameters as keyword arguments, so unpack the dict
# rather than passing it positionally.
result = session.get(**params)

In [58]:
# Top-level keys of the revisions response.
result.keys()

dict_keys(['batchcomplete', 'query'])

In [59]:
# Inspect the response's 'batchcomplete' flag.
result['batchcomplete']

True

In [60]:
# Keys of the first page entry; revisions are nested under their page.
result['query']['pages'][0].keys()

dict_keys(['pageid', 'ns', 'title', 'revisions'])

In [61]:
# Keys of the first revision listed for the first page.
result['query']['pages'][0]['revisions'][0].keys()

dict_keys(['user', 'anon', 'timestamp', 'slots', 'comment'])

In [62]:
# Keys of that revision's 'main' slot, where the content is stored.
result['query']['pages'][0]['revisions'][0]['slots']['main'].keys()

dict_keys(['contentmodel', 'contentformat', 'content'])

In [63]:
# Content of the main slot for the first page's first revision.
# NOTE(review): the first page in the response does not appear to be the page
# of the first requested rev id, so match results on revision id rather than
# on position — confirm before batching.
result['query']['pages'][0]['revisions'][0]['slots']['main']['content']

'{{distinguish|text=the [[Qing dynasty]], the last dynasty of Imperial China}}\n{{redirect|Qin Empire||Qin Empire (disambiguation)}}\n{{pp-pc1}}\n{{Use dmy dates|date=November 2013}}\n{{Use British English|date=August 2017}}\n{{Infobox former country\n|native_name = {{big|{{nobold|{{lang|zh|秦}}}}}}\n|conventional_long_name = Qin\n|common_name = Qin dynasty\n|continent = Asia\n|region = East Asia\n|era = [[Imperial era of Chinese history|Imperial]]\n|status = Empire\n|status_text =\n|government_type = [[Absolute monarchy]]\n|p1 = Zhou dynasty\n|p2 = Qin (state)\n|s1 = Eighteen Kingdoms\n|s2 = Han dynasty\n|s3 = Nanyue\n|event_start = {{nowrap|[[Qin\'s wars of unification|Unification of China]]}}\n|year_start = 221 BC\n|event1 = {{nowrap|Death of [[Qin Shi Huang]]}}\n|date_event1 = 210 BC\n|event2 = |date_event2 = |event3 = |date_event3 = |event4 = |date_event4 = |event5 = |date_event5 = \n|event_end = {{nowrap|Surrender to [[Liu Bang]]}}\n|year_end = 206 BC\n|image_map2 = Qin Dynasty.pn

In [42]:
# Take the first sampled revision as a worked example; the row comes back as a
# pandas Series keyed by column name.
tup = rev_df.iloc[0]
tup

page_id                       9516095
rev_id                      846835190
rev_timestamp              1529576038
is_revert_target                    0
is_reverted                         0
is_reverting                        0
is_sample_eligible               True
prev_rev_id                 846835009
next_rev_id                 850539656
prev_rev_timestamp         1529575914
next_rev_timestamp         1531769146
reverted_rev_ids                   []
reverting_rev_id                   -1
reverting_rev_timestamp            -1
Name: 29999548, dtype: object

### Comparing a revision with its previous revision

In [54]:
# Diff the example revision against the revision that preceded it on the page.
# NOTE(review): next_rev_id uses -1 as a "no such revision" marker in the sample
# preview; presumably prev_rev_id does too — confirm and guard before running
# this over the whole sample.
params = {
    'action': 'compare',
    'format': 'json',
    'formatversion': '2',
    # Cast to plain int so the request parameters do not depend on pandas/numpy
    # scalar types.
    'fromrev': int(tup.prev_rev_id),
    'torev': int(tup.rev_id),
}
# Session.get takes the API parameters as keyword arguments, so unpack the dict
# rather than passing it positionally.
session.get(**params)

{'compare': {'fromid': 9516095,
  'fromrevid': 846835009,
  'fromns': 0,
  'fromtitle': 'Weihai Dashuibo Airport',
  'toid': 9516095,
  'torevid': 846835190,
  'tons': 0,
  'totitle': 'Weihai Dashuibo Airport',
  'body': '<tr>\n  <td colspan="2" class="diff-lineno">Line 47:</td>\n  <td colspan="2" class="diff-lineno">Line 47:</td>\n</tr>\n<tr>\n  <td class="diff-marker">&#160;</td>\n  <td class="diff-context"><div>| [[Chengdu Airlines]] | [[Changchun Longjia International Airport|Changchun]]</div></td>\n  <td class="diff-marker">&#160;</td>\n  <td class="diff-context"><div>| [[Chengdu Airlines]] | [[Changchun Longjia International Airport|Changchun]]</div></td>\n</tr>\n<tr>\n  <td class="diff-marker">&#160;</td>\n  <td class="diff-context"><div>| [[China Airlines]] | [[Taiwan Taoyuan International Airport|Taipei–Taoyuan]]</div></td>\n  <td class="diff-marker">&#160;</td>\n  <td class="diff-context"><div>| [[China Airlines]] | [[Taiwan Taoyuan International Airport|Taipei–Taoyuan]]</div

In [51]:
from IPython.display import HTML, display

In [55]:
# Render the diff inline. `params` must still hold the compare request built
# above. The response body consists of bare <tr> rows, so wrap it in a <table>
# element; without one the browser discards the row/cell structure.
diff_rows_html = session.get(**params)['compare']['body']
display(HTML('<table class="diff">' + diff_rows_html + '</table>'))