Revision Sampling
===

Sample revisions that should be investigated further.

Notably, the identified revisions will have:
 - Text of the identified revisions retrieved
 - ORES scores of the identified revisions retrieved
 
#### Sampling details:

What we're really doing is sampling revision _pairs_.

We sample a non-revert revision and also the prior revision. Then, we check to see if this revision was reverted.

For reverted revisions, we merge in reverts data to establish the time to revert.

For now, we're sampling only in 2018 (so that we have all of 2019 to observe reverts)

In [1]:
import mwapi
import mwxml
import mwxml.utilities
import mwcli
import mwreverts
import oresapi
import mwparserfromhell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import os
from tqdm import tqdm
import bz2
import gzip
import json
import re
import hashlib
from datetime import datetime
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [4]:
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [5]:
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [6]:
working_dir = os.path.join(derived_data_dir, 'revision_sample')
os.makedirs(working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/revision_sample'

In [7]:
start_date = datetime.fromisoformat('2018-01-01')
start_timestamp = int(start_date.timestamp())
end_date = datetime.fromisoformat('2020-01-01')
end_timestamp = int(end_date.timestamp())
start_timestamp, end_timestamp

(1514786400, 1577858400)

In [8]:
sample_start_timestamp = start_timestamp
sample_end_date = datetime.fromisoformat('2019-01-01')
sample_end_timestamp = int(sample_end_date.timestamp())

### Load in all revisions

Which excludes revisions outside of 2018-2020 and only for pages in ns0 and non-redirects.

In [9]:
start = datetime.now()
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
rev_ids_filepath = os.path.join(stub_history_reverts_dir, 'rev_ids_sorted.csv')
df = pd.read_csv(rev_ids_filepath,
                 header=None, 
                 names=['page_id', 'rev_id', 'rev_timestamp', 'is_revert_target', 'is_reverted', 'is_reverting']
)
print(f"{datetime.now() - start}")
len(df)

0:00:41.786537


77287697

In [10]:
df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting
0,12,818611292,1515101356,0,0,0
1,12,818613649,1515102279,0,0,0
2,12,818624114,1515106953,1,0,0
3,12,820024812,1515798752,0,1,0
4,12,820025687,1515799060,0,0,1


In [11]:
# number of unique pages in this sample
page_ids_set = set(df.page_id)
len(page_ids_set)

5338582

In [12]:
# Read the revert info
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
revert_df_filepath = os.path.join(stub_history_reverts_dir, 'revert_df.pkl')
revert_df = pd.read_pickle(revert_df_filepath)
len(revert_df)

5992682

In [13]:
revert_df.head(3)

Unnamed: 0,page_id,reverted_count,reverting_rev_id,reverting_timestamp,reverting_user_text,reverting_user_id,reverted_to_rev_id,reverted_to_timestamp,reverted_to_user_text,reverted_to_user_id,reverted_rev_ids,reverted_timestamps,reverted_user_texts,reverted_user_ids,reverting_user_is_anonymous,reverted_to_user_is_anonymous,reverted_users_is_anonymous
0,18754764,1,902766646,1561105801,Malcolmxl5,4076676.0,874884880,1545481829,Hmains,508734.0,[900328973],[1559706521],[2601:199:417F:8EED:A0B0:A6B5:3457:A9B4],[None],False,False,[True]
1,18754831,2,818980415,1515284429,BrownHairedGirl,754619.0,743241620,1475970276,Emir of Wikipedia,28856560.0,"[757539852, 775847398]","[1483188655, 1492452285]","[Ser Amantio di Nicolao, Emir of Wikipedia]","[753665, 28856560]",False,False,"[False, False]"
2,18754831,1,932235094,1577204938,UA3,25923702.0,918313790,1569654385,Monkbot,20483999.0,[932233860],[1577204033],[Qowa],[37692801],False,False,[False]


In [14]:
start = datetime.now()
page_df = df.groupby('page_id').agg({
    'rev_id': len,
    'is_reverted': np.sum,
    'is_reverting': np.sum
}).rename(columns={
    'rev_id': 'revision_count',
    'is_reverted': 'reverted_count',
    'is_reverting': 'revert_count'
})
print(f"{datetime.now() - start}")
len(page_df)

0:00:36.421846


5338582

In [15]:
page_df.sample(3)

Unnamed: 0_level_0,revision_count,reverted_count,revert_count
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
56553320,7,0,0
51009457,5,0,0
42727563,5,0,0


In [16]:
np.quantile(page_df.revision_count, 0.9), \
np.sum(page_df.revision_count>=28) / len(page_df)

(28.0, 0.10042273397692496)

In [17]:
eligible_page_ids = set(page_df[page_df.revision_count>=28].index)
len(eligible_page_ids)

536115

In [18]:
eligible_for_sampling = []
curr_page_id = -1
for tup in tqdm(df.itertuples(), total=len(df)):
    if tup.page_id != curr_page_id:
        curr_page_id = tup.page_id
        # can never sample the FIRST post-2018 revision
        eligible_for_sampling.append(False)
    else:
        is_eligible = tup.rev_timestamp <= sample_end_timestamp and tup.page_id in eligible_page_ids
        eligible_for_sampling.append(is_eligible)

100%|██████████| 77287697/77287697 [02:15<00:00, 568886.66it/s]


In [19]:
df['is_sample_eligible'] = eligible_for_sampling
# after filtering, only 29.6% are actually eligible...
np.sum(df.is_sample_eligible) / len(df)

0.2964584777315851

In [20]:
np.sum(df.is_sample_eligible)

22912593

In [21]:
df = df.reset_index().rename(columns={'index': 'full_index'})

## Sample version 1

Uses the same code as Sample version 2; to see original version go look at an old revision of this notebook.

The original sample was 60.4% eligible due to the bug described in the cell below.

In [None]:
# Version 1 of the sample included a bug in which sample_end_timestamp was set to 2020-01-01
raise ValueError("Recreate sample1 from code above, changing eligibility requirements.")
sample_end_date = datetime.fromisoformat('2020-01-01')
sample_end_timestamp = int(sample_end_date.timestamp())

## Sample version 2

In [23]:
sample = df[df.is_sample_eligible].sample(n=1000000, random_state=45).copy(deep=True)
len(sample)

1000000

In [24]:
# 14.7% of the sample are reverted
np.sum(sample.is_reverted) / len(sample)

0.147456

In [26]:
# 10.7% of the sample are reverts
np.sum(sample.is_reverting) / len(sample)

0.106627

In [27]:
# 6.2% of pages with 1+ revision are included in the sample
len(set(sample.page_id)), len(set(sample.page_id)) / len(page_ids_set)

(332921, 0.06236131616972447)

In [28]:
matched_sample = df.loc[sample.index - 1] #df[df.full_index.isin(sample.full_index.to_numpy() - 1)]
assert len(matched_sample) == len(sample)
assert np.all(sample.page_id.reset_index(drop=True) == matched_sample.page_id.reset_index(drop=True))

In [29]:
sample = sample.assign(prev_rev_id=matched_sample.rev_id.tolist())
sample.head(3)

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id
61378268,61378268,52232058,831517149,1521608836,0,0,0,True,831516744
15696379,15696379,1501758,841712258,1526588824,0,0,0,True,841712075
1452860,1452860,25239,844335926,1528115396,0,0,0,True,844335334


In [30]:
matched_sample = df.loc[sample.index + 1]
assert len(matched_sample) == len(sample)
sample['next_rev_id'] = -1
idx = np.array(sample.page_id.tolist()) == np.array(matched_sample.page_id.tolist())
print(f"{np.sum(idx)} / {len(sample)} sampled revisions have 1+ subsequent revision in 2018 or 2019.")
sample.loc[idx, 'next_rev_id'] = matched_sample[idx].rev_id.tolist()

999850 / 1000000 sampled revisions have 1+ subsequent revision in 2018 or 2019.


In [31]:
# get the timestamp of the previous and next revisions
rev_id_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in tqdm(df.itertuples(), total=len(df))}
sample['prev_rev_timestamp'] = sample.prev_rev_id.map(lambda prev_rev_id: rev_id_timestamp_dict[prev_rev_id])
sample['next_rev_timestamp'] = sample.next_rev_id.map(lambda next_rev_id: rev_id_timestamp_dict[next_rev_id] if next_rev_id != -1 else -1)

100%|██████████| 77287697/77287697 [02:42<00:00, 474877.87it/s]


In [32]:
sample.head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp
61378268,61378268,52232058,831517149,1521608836,0,0,0,True,831516744,831528824,1521608698,1521614309
15696379,15696379,1501758,841712258,1526588824,0,0,0,True,841712075,841712410,1526588726,1526588897
1452860,1452860,25239,844335926,1528115396,0,0,0,True,844335334,844343345,1528114910,1528120731
56885037,56885037,46695824,846301278,1529286250,0,0,0,True,845230648,846302881,1528645143,1529287006
16350465,16350465,1664084,838201915,1524687382,0,0,0,True,838077319,838203213,1524617329,1524687976


In [33]:
sample_reverting_rev_ids = set(sample[sample.is_reverting==1].rev_id)
reverting_rev_id_to_reverted_ids_dict = {
    row.reverting_rev_id: row.reverted_rev_ids
    for row in tqdm(revert_df.itertuples(), total=len(revert_df))
    if row.reverting_rev_id in sample_reverting_rev_ids
}

100%|██████████| 5992682/5992682 [00:21<00:00, 273317.15it/s]


In [34]:
# for reverting revisions in the sample, merge in the list of reverted_rev_ids
# using the dictionary computed in the cell above
reverted_rev_ids_list = []
for tup in sample.itertuples(): 
    if tup.is_reverting == 1:
        reverted_rev_ids = reverting_rev_id_to_reverted_ids_dict[tup.rev_id]
        reverted_rev_ids_list.append(reverted_rev_ids)
    else:
        reverted_rev_ids_list.append([])
sample['reverted_rev_ids'] = reverted_rev_ids_list

In [35]:
reverted_to_reverting_rev_id_dict = {}
for tup in tqdm(revert_df.itertuples(), total=len(revert_df)):
    for rev_id in tup.reverted_rev_ids:
        reverted_to_reverting_rev_id_dict[rev_id] = tup.reverting_rev_id

100%|██████████| 5992682/5992682 [00:26<00:00, 230184.25it/s]


In [36]:
# grab the reverting id for reverted revisions
sample['reverting_rev_id'] = -1
sample.loc[sample.is_reverted==1,'reverting_rev_id'] = [
    reverted_to_reverting_rev_id_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].rev_id
]

In [37]:
# merge in the time of the reverting revision
sample['reverting_rev_timestamp'] = -1
reverting_rev_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in df[df.is_reverting==1].itertuples()}
sample.loc[sample.is_reverted==1,'reverting_rev_timestamp'] = [
    reverting_rev_timestamp_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].reverting_rev_id
]

In [38]:
sample[sample.is_reverted==1].head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
68891325,68891325,57413215,866893479,1541160459,0,1,0,True,866893418,866967935,1541160394,1541204058,[],867171652,1541322909
13792197,13792197,1093068,835843368,1523434577,0,1,0,True,830579779,835843386,1521155767,1523434590,[],835843386,1523434590
8733950,8733950,401673,865563402,1540422024,0,1,0,True,864948432,865563810,1540074074,1540422182,[],865564999,1540422665
5751463,5751463,187739,825845673,1518744424,0,1,0,True,825379935,825845681,1518508873,1518744427,[],825845681,1518744427
50493501,50493501,38042480,858876974,1536578862,0,1,0,True,858694708,858877030,1536473591,1536578922,[],858877030,1536578922


In [39]:
sample = sample.drop(columns='full_index')
sample.head(1)

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
61378268,52232058,831517149,1521608836,0,0,0,True,831516744,831528824,1521608698,1521614309,[],-1,-1


In [40]:
# save the sample
sample1_filepath = os.path.join(working_dir, 'sample2_1M.pkl')
sample.to_pickle(sample1_filepath)
print("Finished.")

Finished.


In [41]:
# read in the sample dataframe
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample1_filepath = os.path.join(revision_sample_dir, 'sample2_1M.pkl')
rev_df = pd.read_pickle(sample1_filepath)
len(rev_df)

1000000

In [42]:
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
61378268,52232058,831517149,1521608836,0,0,0,True,831516744,831528824,1521608698,1521614309,[],-1,-1
15696379,1501758,841712258,1526588824,0,0,0,True,841712075,841712410,1526588726,1526588897,[],-1,-1
1452860,25239,844335926,1528115396,0,0,0,True,844335334,844343345,1528114910,1528120731,[],-1,-1
56885037,46695824,846301278,1529286250,0,0,0,True,845230648,846302881,1528645143,1529287006,[],-1,-1
16350465,1664084,838201915,1524687382,0,0,0,True,838077319,838203213,1524617329,1524687976,[],-1,-1
