Revision Sampling
===

Sample revisions that should be investigated further.

Notably, the identified revisions will have:
 - Text of the identified revisions retrieved
 - ORES scores of the identified revisions retrieved
 
#### Sampling details:

What we're really doing is sampling revision _pairs_.

We sample a non-revert revision and also the prior revision. Then, we check to see if this revision was reverted.

For reverted revisions, we merge in reverts data to establish the time to revert.

For now, we're sampling only in 2018 (so that we have all of 2019 to observe reverts)

In [1]:
import mwapi
import mwxml
import mwxml.utilities
import mwcli
import mwreverts
import oresapi
import mwparserfromhell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import os
from tqdm import tqdm
import bz2
import gzip
import json
import re
import hashlib
from datetime import datetime
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [4]:
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [5]:
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [6]:
working_dir = os.path.join(derived_data_dir, 'revision_sample')
os.makedirs(working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/revision_sample'

In [7]:
start_date = datetime.fromisoformat('2018-01-01')
start_timestamp = int(start_date.timestamp())
end_date = datetime.fromisoformat('2020-01-01')
end_timestamp = int(end_date.timestamp())
start_timestamp, end_timestamp

(1514786400, 1577858400)

In [15]:
sample_start_timestamp = start_timestamp
sample_end_date = datetime.fromisoformat('2019-01-01')
sample_end_timestamp = int(end_date.timestamp())

### Load in all revisions

Which excludes revisions outside of 2018-2020 and only for pages in ns0 and non-redirects.

In [10]:
start = datetime.now()
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
rev_ids_filepath = os.path.join(stub_history_reverts_dir, 'rev_ids_sorted.csv')
df = pd.read_csv(rev_ids_filepath,
                 header=None, 
                 names=['page_id', 'rev_id', 'rev_timestamp', 'is_revert_target', 'is_reverted', 'is_reverting']
)
print(f"{datetime.now() - start}")
len(df)

0:00:59.704629


77287697

In [11]:
df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting
0,12,818611292,1515101356,0,0,0
1,12,818613649,1515102279,0,0,0
2,12,818624114,1515106953,1,0,0
3,12,820024812,1515798752,0,1,0
4,12,820025687,1515799060,0,0,1


In [12]:
# number of unique pages in this sample
page_ids_set = set(df.page_id)
len(page_ids_set)

5338582

In [13]:
# Read the revert info
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
revert_df_filepath = os.path.join(stub_history_reverts_dir, 'revert_df.pkl')
revert_df = pd.read_pickle(revert_df_filepath)
len(revert_df)

5992682

In [109]:
revert_df.head(3)

Unnamed: 0,page_id,reverted_count,reverting_rev_id,reverting_timestamp,reverting_user_text,reverting_user_id,reverted_to_rev_id,reverted_to_timestamp,reverted_to_user_text,reverted_to_user_id,reverted_rev_ids,reverted_timestamps,reverted_user_texts,reverted_user_ids,reverting_user_is_anonymous,reverted_to_user_is_anonymous,reverted_users_is_anonymous
0,18754764,1,902766646,1561105801,Malcolmxl5,4076676.0,874884880,1545481829,Hmains,508734.0,[900328973],[1559706521],[2601:199:417F:8EED:A0B0:A6B5:3457:A9B4],[None],False,False,[True]
1,18754831,2,818980415,1515284429,BrownHairedGirl,754619.0,743241620,1475970276,Emir of Wikipedia,28856560.0,"[757539852, 775847398]","[1483188655, 1492452285]","[Ser Amantio di Nicolao, Emir of Wikipedia]","[753665, 28856560]",False,False,"[False, False]"
2,18754831,1,932235094,1577204938,UA3,25923702.0,918313790,1569654385,Monkbot,20483999.0,[932233860],[1577204033],[Qowa],[37692801],False,False,[False]


In [78]:
page_df = df.groupby('page_id').agg({
    'rev_id': len,
    'is_reverted': np.sum,
    'is_reverting': np.sum
}).rename(columns={
    'rev_id': 'revision_count',
    'is_reverted': 'reverted_count',
    'is_reverting': 'revert_count'
})
len(page_df)

5338582

In [79]:
page_df.sample(3)

Unnamed: 0_level_0,revision_count,reverted_count,revert_count
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12074593,12,2,2
13261080,70,5,3
25008724,39,8,6


In [93]:
np.quantile(page_df.revision_count, 0.9), \
np.sum(page_df.revision_count>=28) / len(page_df)

(28.0, 0.10042273397692496)

In [95]:
eligible_page_ids = set(page_df[page_df.revision_count>=28].index)
len(eligible_page_ids)

536115

In [96]:
eligible_for_sampling = []
curr_page_id = -1
for tup in tqdm(df.itertuples(), total=len(df)):
    if tup.page_id != curr_page_id:
        curr_page_id = tup.page_id
        # can never sample the FIRST post-2018 revision
        eligible_for_sampling.append(False)
    else:
        is_eligible = tup.rev_timestamp <= sample_end_timestamp and tup.page_id in eligible_page_ids
        eligible_for_sampling.append(is_eligible)

100%|██████████| 77287697/77287697 [02:31<00:00, 508588.80it/s]


In [97]:
df['is_sample_eligible'] = eligible_for_sampling
# after filtering, only 60.4% are actually eligible...
np.sum(df.is_sample_eligible) / len(df)

0.6044495801188124

In [108]:
np.sum(df.is_sample_eligible)

46716516

In [163]:
df = df.reset_index().rename(columns={'index': 'full_index'})

## Sample version 1

In [164]:
sample = df[df.is_sample_eligible].sample(n=1000000, random_state=45).copy(deep=True)
len(sample)

1000000

In [165]:
# 14.4% of the sample are reverted
np.sum(sample.is_reverted) / len(sample)

0.144318

In [166]:
# 10.2% of the sample are reverts
np.sum(sample.is_reverting) / len(sample)

0.101835

In [167]:
# 6.9% of pages with 1+ revision are included in the sample
len(set(sample.page_id)), len(set(sample.page_id)) / len(page_ids_set)

(369906, 0.06928918578004421)

In [185]:
matched_sample = df.loc[sample.index - 1] #df[df.full_index.isin(sample.full_index.to_numpy() - 1)]
assert len(matched_sample) == len(sample)
assert np.all(sample.page_id.reset_index(drop=True) == matched_sample.page_id.reset_index(drop=True))

In [186]:
sample = sample.assign(prev_rev_id=matched_sample.rev_id.tolist())
sample.head(3)

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id
29999548,29999548,9516095,846835190,1529576038,0,0,0,True,846835009
12474631,12474631,876872,923997118,1572595746,0,0,0,True,923211396
62536703,62536703,53465104,867368228,1541425108,0,0,0,True,867366487


In [192]:
matched_sample = df.loc[sample.index + 1]
assert len(matched_sample) == len(sample)
sample['next_rev_id'] = -1
idx = np.array(sample.page_id.tolist()) == np.array(matched_sample.page_id.tolist())
print(f"{np.sum(idx)} / {len(sample)} sampled revisions have no subsequent revision in 2018 or 2019.")
sample.loc[idx, 'next_rev_id'] = matched_sample[idx].rev_id.tolist()

988434 / 1000000 sampled revisions have no subsequent revision in 2018 or 2019.


In [193]:
# get the timestamp of the previous and next revisions
rev_id_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in tqdm(df.itertuples(), total=len(df))}
sample['prev_rev_timestamp'] = sample.prev_rev_id.map(lambda prev_rev_id: rev_id_timestamp_dict[prev_rev_id])
sample['next_rev_timestamp'] = sample.next_rev_id.map(lambda next_rev_id: rev_id_timestamp_dict[next_rev_id] if next_rev_id != -1 else -1)

100%|██████████| 77287697/77287697 [02:43<00:00, 473425.10it/s]


In [194]:
sample.head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp
29999548,29999548,9516095,846835190,1529576038,0,0,0,True,846835009,850539656,1529575914,1531769146
12474631,12474631,876872,923997118,1572595746,0,0,0,True,923211396,924618182,1572162918,1572930319
62536703,62536703,53465104,867368228,1541425108,0,0,0,True,867366487,867663184,1541423455,1541590304
73902987,73902987,60096152,888337768,1552936050,0,0,0,True,888337457,888338754,1552935890,1552936561
43843955,43843955,28063274,917010365,1569117229,0,0,0,True,915190631,-1,1568247118,-1


In [195]:
sample_reverting_rev_ids = set(sample[sample.is_reverting==1].rev_id)
reverting_rev_id_to_reverted_ids_dict = {
    row.reverting_rev_id: row.reverted_rev_ids
    for row in tqdm(revert_df.itertuples(), total=len(revert_df))
    if row.reverting_rev_id in sample_reverting_rev_ids
}

100%|██████████| 5992682/5992682 [00:24<00:00, 245734.65it/s]


In [196]:
# for reverting revisions in the sample, merge in the list of reverted_rev_ids
# using the dictionary computed in the cell above
reverted_rev_ids_list = []
for tup in sample.itertuples(): 
    if tup.is_reverting == 1:
        reverted_rev_ids = reverting_rev_id_to_reverted_ids_dict[tup.rev_id]
        reverted_rev_ids_list.append(reverted_rev_ids)
    else:
        reverted_rev_ids_list.append([])
sample['reverted_rev_ids'] = reverted_rev_ids_list

In [197]:
reverted_to_reverting_rev_id_dict = {}
for tup in tqdm(revert_df.itertuples(), total=len(revert_df)):
    for rev_id in tup.reverted_rev_ids:
        reverted_to_reverting_rev_id_dict[rev_id] = tup.reverting_rev_id

100%|██████████| 5992682/5992682 [00:27<00:00, 214421.91it/s]


In [198]:
# grab the reverting id for reverted revisions
sample['reverting_rev_id'] = -1
sample.loc[sample.is_reverted==1,'reverting_rev_id'] = [
    reverted_to_reverting_rev_id_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].rev_id
]

In [199]:
# merge in the time of the reverting revision
sample['reverting_rev_timestamp'] = -1
reverting_rev_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in df[df.is_reverting==1].itertuples()}
sample.loc[sample.is_reverted==1,'reverting_rev_timestamp'] = [
    reverting_rev_timestamp_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].reverting_rev_id
]

In [200]:
sample[sample.is_reverted==1].head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
42666032,42666032,26317963,920647883,1570775915,0,1,0,True,917814987,920648257,1569447154,1570776068,[],920648257,1570776068
55000305,55000305,43829579,862458099,1538680042,1,1,1,True,862423229,862463700,1538659358,1538682835,"[862396649, 862396884, 862397156, 862405790, 8...",862463700,1538682835
65485187,65485187,55897429,866634222,1541017336,1,1,0,True,866633465,866634565,1541016993,1541017499,[],866634605,1541017520
10627255,10627255,594582,862308858,1538596392,0,1,0,True,859059391,862314073,1536688570,1538599081,[],862314712,1538599407
2427556,2427556,45400,932799798,1577541584,0,1,0,True,932756057,932799909,1577517415,1577541639,[],932803739,1577544188


In [201]:
sample = sample.drop(columns='full_index')
sample.head(1)

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
29999548,9516095,846835190,1529576038,0,0,0,True,846835009,850539656,1529575914,1531769146,[],-1,-1


In [202]:
# save the sample
sample1_filepath = os.path.join(working_dir, 'sample1_1M.pkl')
sample.to_pickle(sample1_filepath)
print("Finished.")

Finished.


In [203]:
# read in the sample dataframe
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample1_filepath = os.path.join(revision_sample_dir, 'sample1_1M.pkl')
rev_df = pd.read_pickle(sample1_filepath)
len(rev_df)

1000000

In [204]:
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
29999548,9516095,846835190,1529576038,0,0,0,True,846835009,850539656,1529575914,1531769146,[],-1,-1
12474631,876872,923997118,1572595746,0,0,0,True,923211396,924618182,1572162918,1572930319,[],-1,-1
62536703,53465104,867368228,1541425108,0,0,0,True,867366487,867663184,1541423455,1541590304,[],-1,-1
73902987,60096152,888337768,1552936050,0,0,0,True,888337457,888338754,1552935890,1552936561,[],-1,-1
43843955,28063274,917010365,1569117229,0,0,0,True,915190631,-1,1568247118,-1,[],-1,-1
