Sample 3 Revision Sampling
===

Includes all but the first 2018 revision in namespace 0.

We're sampling only in 2018 (so that we have all of 2019 to observe reverts)

In [1]:
import mwapi
import mwxml
import mwxml.utilities
import mwcli
import mwreverts
import oresapi
import mwparserfromhell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import os
from tqdm import tqdm
import bz2
import gzip
import json
import re
import hashlib
from datetime import datetime
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [4]:
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [5]:
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [6]:
working_dir = os.path.join(derived_data_dir, 'revision_sample')
os.makedirs(working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/revision_sample'

In [7]:
# NOTE TIMESTAMP BUG
# These incorrectly use CT, rather than UTC, as the boundaries for the data collection period
# This is completely fine, but it's probably not what most analysts would think
start_date = datetime.fromisoformat('2018-01-01')
start_timestamp = int(start_date.timestamp())
end_date = datetime.fromisoformat('2020-01-01')
end_timestamp = int(end_date.timestamp())
start_timestamp, end_timestamp

(1514786400, 1577858400)

In [8]:
sample_start_timestamp = start_timestamp
sample_end_date = datetime.fromisoformat('2019-01-01')
sample_end_timestamp = int(sample_end_date.timestamp())

### Load in all revisions

Which excludes revisions outside of 2018-2020 and only for pages in ns0 and non-redirects.

In [9]:
start = datetime.now()
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
rev_ids_filepath = os.path.join(stub_history_reverts_dir, 'rev_ids_sorted.csv')
df = pd.read_csv(rev_ids_filepath,
                 header=None, 
                 names=['page_id', 'rev_id', 'rev_timestamp', 'is_revert_target', 'is_reverted', 'is_reverting']
)
print(f"{datetime.now() - start}")
len(df)

0:00:34.623254


77287697

In [10]:
df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting
0,12,818611292,1515101356,0,0,0
1,12,818613649,1515102279,0,0,0
2,12,818624114,1515106953,1,0,0
3,12,820024812,1515798752,0,1,0
4,12,820025687,1515799060,0,0,1


In [11]:
# number of unique pages in this sample
page_ids_set = set(df.page_id)
len(page_ids_set)

5338582

In [12]:
# Read the revert info
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
revert_df_filepath = os.path.join(stub_history_reverts_dir, 'revert_df.pkl')
revert_df = pd.read_pickle(revert_df_filepath)
len(revert_df)

5992682

In [13]:
revert_df.head(3)

Unnamed: 0,page_id,reverted_count,reverting_rev_id,reverting_timestamp,reverting_user_text,reverting_user_id,reverted_to_rev_id,reverted_to_timestamp,reverted_to_user_text,reverted_to_user_id,reverted_rev_ids,reverted_timestamps,reverted_user_texts,reverted_user_ids,reverting_user_is_anonymous,reverted_to_user_is_anonymous,reverted_users_is_anonymous
0,18754764,1,902766646,1561105801,Malcolmxl5,4076676.0,874884880,1545481829,Hmains,508734.0,[900328973],[1559706521],[2601:199:417F:8EED:A0B0:A6B5:3457:A9B4],[None],False,False,[True]
1,18754831,2,818980415,1515284429,BrownHairedGirl,754619.0,743241620,1475970276,Emir of Wikipedia,28856560.0,"[757539852, 775847398]","[1483188655, 1492452285]","[Ser Amantio di Nicolao, Emir of Wikipedia]","[753665, 28856560]",False,False,"[False, False]"
2,18754831,1,932235094,1577204938,UA3,25923702.0,918313790,1569654385,Monkbot,20483999.0,[932233860],[1577204033],[Qowa],[37692801],False,False,[False]


In [14]:
start = datetime.now()
page_df = df.groupby('page_id').agg({
    'rev_id': len,
    'is_reverted': np.sum,
    'is_reverting': np.sum
}).rename(columns={
    'rev_id': 'revision_count',
    'is_reverted': 'reverted_count',
    'is_reverting': 'revert_count'
})
print(f"{datetime.now() - start}")
len(page_df)

0:00:36.341802


5338582

In [15]:
page_df.sample(3)

Unnamed: 0_level_0,revision_count,reverted_count,revert_count
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1311775,1,0,0
3442760,3,0,0
1192891,12,0,0


In [23]:
eligible_for_sampling = []
curr_page_id = -1
first_page_omitted_in_2018_count = 0
for tup in tqdm(df.itertuples(), total=len(df)):
    if tup.page_id != curr_page_id:
        curr_page_id = tup.page_id
        # can never sample the FIRST post-2018 revision
        eligible_for_sampling.append(False)
        # keep track of the number of revisions that are omitted entirely because they are the first
        # (should be <= the number of unique pages)
        if tup.rev_timestamp <= sample_end_timestamp:
            first_page_omitted_in_2018_count += 1
    else:
        is_eligible = tup.rev_timestamp <= sample_end_timestamp
        eligible_for_sampling.append(is_eligible)
first_page_omitted_in_2018_count

100%|██████████| 77287697/77287697 [02:40<00:00, 480639.23it/s]


4351583

In [17]:
df['is_sample_eligible'] = eligible_for_sampling
# after filtering, only 43.9% are actually eligible...
np.sum(df.is_sample_eligible), np.sum(df.is_sample_eligible) / len(df)

(33964442, 0.43945470389679225)

In [18]:
df = df.reset_index().rename(columns={'index': 'full_index'})

## Sample version 3

Includes all revisions that meet the sampling criteria.

In [19]:
sample = df[df.is_sample_eligible]
len(sample)

33964442

In [20]:
# 11.9% of the sample are reverted
np.sum(sample.is_reverted), np.sum(sample.is_reverted) / len(sample)

(4026444, 0.11854880465870749)

In [21]:
# 9.0% of the sample are reverts
np.sum(sample.is_reverting), np.sum(sample.is_reverting) / len(sample)

(3043796, 0.08961713547362268)

In [22]:
# 6.1% of pages with 1+ revision are included in the sample
len(set(sample.page_id)), len(set(sample.page_id)) / len(page_ids_set)

(3279818, 0.6143612667183908)

In [32]:
matched_sample = df.loc[sample.index - 1]
assert len(matched_sample) == len(sample)
assert np.all(sample.page_id.reset_index(drop=True) == matched_sample.page_id.reset_index(drop=True))

In [33]:
sample = sample.assign(prev_rev_id=matched_sample.rev_id.tolist())
sample.head(3)

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id
1,1,12,818613649,1515102279,0,0,0,True,818611292
2,2,12,818624114,1515106953,1,0,0,True,818613649
3,3,12,820024812,1515798752,0,1,0,True,818624114


In [34]:
matched_sample = df.loc[sample.index + 1]
assert len(matched_sample) == len(sample)
sample['next_rev_id'] = -1
idx = np.array(sample.page_id.tolist()) == np.array(matched_sample.page_id.tolist())
print(f"{np.sum(idx)} / {len(sample)} sampled revisions have 1+ subsequent revision in 2018 or 2019.")
sample.loc[idx, 'next_rev_id'] = matched_sample[idx].rev_id.tolist()

33424856 / 33964442 sampled revisions have 1+ subsequent revision in 2018 or 2019.


In [35]:
# get the timestamp of the previous and next revisions
rev_id_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in tqdm(df.itertuples(), total=len(df))}
sample['prev_rev_timestamp'] = sample.prev_rev_id.map(lambda prev_rev_id: rev_id_timestamp_dict[prev_rev_id])
sample['next_rev_timestamp'] = sample.next_rev_id.map(lambda next_rev_id: rev_id_timestamp_dict[next_rev_id] if next_rev_id != -1 else -1)

100%|██████████| 77287697/77287697 [02:43<00:00, 473897.81it/s]


In [36]:
no_next_rev_count = np.sum(sample.next_rev_timestamp == -1)
print(f"{no_next_rev_count} sample revisions ({no_next_rev_count / len(sample)*100:.2f}%) have no next revision in the collected data range.")

539586 sample revisions (1.59%) have no next revision in the collected data range.


In [37]:
sample.head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp
1,1,12,818613649,1515102279,0,0,0,True,818611292,818624114,1515101356,1515106953
2,2,12,818624114,1515106953,1,0,0,True,818613649,820024812,1515102279,1515798752
3,3,12,820024812,1515798752,0,1,0,True,818624114,820025687,1515106953,1515799060
4,4,12,820025687,1515799060,0,0,1,True,820024812,820703495,1515798752,1516095884
5,5,12,820703495,1516095884,0,0,0,True,820025687,821673418,1515799060,1516597634


In [38]:
sample_reverting_rev_ids = set(sample[sample.is_reverting==1].rev_id)
reverting_rev_id_to_reverted_ids_dict = {
    row.reverting_rev_id: row.reverted_rev_ids
    for row in tqdm(revert_df.itertuples(), total=len(revert_df))
    if row.reverting_rev_id in sample_reverting_rev_ids
}

100%|██████████| 5992682/5992682 [00:24<00:00, 246557.92it/s]


In [39]:
# for reverting revisions in the sample, merge in the list of reverted_rev_ids
# using the dictionary computed in the cell above
reverted_rev_ids_list = []
for tup in sample.itertuples(): 
    if tup.is_reverting == 1:
        reverted_rev_ids = reverting_rev_id_to_reverted_ids_dict[tup.rev_id]
        reverted_rev_ids_list.append(reverted_rev_ids)
    else:
        reverted_rev_ids_list.append([])
sample['reverted_rev_ids'] = reverted_rev_ids_list

In [40]:
reverted_to_reverting_rev_id_dict = {}
for tup in tqdm(revert_df.itertuples(), total=len(revert_df)):
    for rev_id in tup.reverted_rev_ids:
        reverted_to_reverting_rev_id_dict[rev_id] = tup.reverting_rev_id

100%|██████████| 5992682/5992682 [00:26<00:00, 223455.47it/s]


In [41]:
# grab the reverting id for reverted revisions
sample['reverting_rev_id'] = -1
sample.loc[sample.is_reverted==1,'reverting_rev_id'] = [
    reverted_to_reverting_rev_id_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].rev_id
]

In [42]:
# merge in the time of the reverting revision
sample['reverting_rev_timestamp'] = -1
reverting_rev_timestamp_dict = {tup.rev_id: tup.rev_timestamp for tup in df[df.is_reverting==1].itertuples()}
sample.loc[sample.is_reverted==1,'reverting_rev_timestamp'] = [
    reverting_rev_timestamp_dict[rev_id] for rev_id in sample.loc[sample.is_reverted==1].reverting_rev_id
]

In [43]:
sample[sample.is_reverted==1].head()

Unnamed: 0,full_index,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
3,3,12,820024812,1515798752,0,1,0,True,818624114,820025687,1515106953,1515799060,[],820025687,1515799060
10,10,12,823350009,1517446536,0,1,0,True,822837753,823354824,1517192137,1517448629,[],823354824,1517448629
12,12,12,824519722,1518058732,0,1,0,True,823354824,824519733,1517448629,1518058739,[],824519733,1518058739
15,15,12,825641740,1518642861,0,1,0,True,824867074,825644123,1518243924,1518643933,[],825644123,1518643933
21,21,12,827021220,1519311846,0,1,0,True,826532523,827022627,1519084253,1519312376,[],827025661,1519314642


In [44]:
sample = sample.drop(columns='full_index')
sample.head(1)

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
1,12,818613649,1515102279,0,0,0,True,818611292,818624114,1515101356,1515106953,[],-1,-1


In [45]:
# save the sample
sample_filepath = os.path.join(working_dir, 'sample3_all.pkl')
sample.to_pickle(sample_filepath)
print("Finished.")

Finished.


In [46]:
# read in the sample dataframe
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample_filepath = os.path.join(revision_sample_dir, 'sample3_all.pkl')
rev_df = pd.read_pickle(sample_filepath)
len(rev_df)

33964442

In [47]:
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
1,12,818613649,1515102279,0,0,0,True,818611292,818624114,1515101356,1515106953,[],-1,-1
2,12,818624114,1515106953,1,0,0,True,818613649,820024812,1515102279,1515798752,[],-1,-1
3,12,820024812,1515798752,0,1,0,True,818624114,820025687,1515106953,1515799060,[],820025687,1515799060
4,12,820025687,1515799060,0,0,1,True,820024812,820703495,1515798752,1516095884,[820024812],-1,-1
5,12,820703495,1516095884,0,0,0,True,820025687,821673418,1515799060,1516597634,[],-1,-1
