Sample 2 Manual Annotation Sampling
===



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [2]:
import os
from tqdm import tqdm
import bz2
import gzip
import json
import re
import hashlib
from datetime import datetime
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [3]:
# Locate the repository root so that data paths can be built relative to it.
# The IPython `!` capture returns the command's output as a list of lines;
# the first (only) line of `git rev-parse --show-toplevel` is the root path.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [4]:
# raw_data_dir is a hard-coded absolute (machine-specific) path;
# derived_data_dir is <git root>/data/derived inside the repository checkout.
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [5]:
# Path to the 'stub-history-all-revisions' directory under the derived data directory.
stub_history_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
stub_history_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/stub-history-all-revisions'

In [6]:
# Output directory for the CSV samples written by this notebook;
# it is created here if it does not already exist.
working_dir = os.path.join(derived_data_dir, 'sample2-manual-annotation-samples')
os.makedirs(working_dir, exist_ok=True)
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/sample2-manual-annotation-samples'

In [7]:
# Bounds of the analysis window, as datetimes and as integer epoch seconds.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone; the epoch values therefore differ between machines
# with different timezone settings.
start_date, end_date = (
    datetime.fromisoformat(day) for day in ('2014-04-01', '2020-01-01')
)
start_timestamp, end_timestamp = (
    int(bound.timestamp()) for bound in (start_date, end_date)
)
start_timestamp, end_timestamp

(1396328400, 1577858400)

### Load sample 2


In [8]:
# read in the sample dataframe (a pickled pandas object under <derived>/revision_sample)
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample2_filepath = os.path.join(revision_sample_dir, 'sample2_1M.pkl')
rev_df = pd.read_pickle(sample2_filepath)
len(rev_df)

1000000

In [9]:
# Load the ORES scores for sample 2.
# The CSV is read without a header row, so the column names are supplied here.
# Note that `sample2_filepath` is rebound to the scores file from this point on.
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample2_filepath = os.path.join(revision_sample_dir, 'sample2_ores_scores.csv')
ores_df = pd.read_csv(
    sample2_filepath,
    header=None,
    names=[
        'rev_id',
        'damaging_prob',
        'damaging_pred',
        'goodfaith_prob',
        'goodfaith_pred',
    ],
)
len(ores_df)

1000000

In [10]:
# Attach the ORES scores to the sampled revisions by rev_id.
# The inner join keeps only revisions that appear in both frames.
rev_df = rev_df.merge(ores_df, on='rev_id', how='inner')
len(rev_df)

1000000

In [11]:
# Preview the first rows of the merged frame (revision metadata plus ORES score columns).
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp,damaging_prob,damaging_pred,goodfaith_prob,goodfaith_pred
0,52232058,831517149,1521608836,0,0,0,True,831516744,831528824,1521608698,1521614309,[],-1,-1,0.340084,False,0.746885,True
1,1501758,841712258,1526588824,0,0,0,True,841712075,841712410,1526588726,1526588897,[],-1,-1,0.021781,False,0.984908,True
2,25239,844335926,1528115396,0,0,0,True,844335334,844343345,1528114910,1528120731,[],-1,-1,0.068573,False,0.941672,True
3,46695824,846301278,1529286250,0,0,0,True,845230648,846302881,1528645143,1529287006,[],-1,-1,0.021797,False,0.978803,True
4,1664084,838201915,1524687382,0,0,0,True,838077319,838203213,1524617329,1524687976,[],-1,-1,0.029711,False,0.988456,True


## Sample for Bruce Liu

This export is a response to Haiyi's email on Wed, Apr 8, 12:08 PM:

"Would you please share the revision dataset you generated (the revision id, the ORES score, and the community response - whether the revision was reverted or not) with Bruce?"

In [14]:
# Write out the full sample as a CSV: one row per revision with its page id,
# revision id, timestamp, revert flags, and ORES damaging/goodfaith probabilities.
sample_subset_filepath = os.path.join(working_dir, "sample2_2018_bruceliu.csv")
with open(sample_subset_filepath, 'w') as outfile:
    outfile.write("page_id,rev_id,rev_timestamp,is_reverted,is_reverting,damaging_prob,goodfaith_prob\n")
    # Rows are formatted by hand; no diff URL is built here because this export has no URL column.
    for t in tqdm(rev_df.itertuples(), total=len(rev_df)):
        line = f"{t.page_id},{t.rev_id},{t.rev_timestamp},{t.is_reverted},{t.is_reverting},{t.damaging_prob},{t.goodfaith_prob}\n"
        outfile.write(line)

100%|██████████| 1000000/1000000 [00:08<00:00, 124728.00it/s]


### Samples from expected corners

In [12]:
# Write out a random sample of likelygood reverted revisions:
# revisions with damaging_prob <= likelygood_threshold that were nevertheless reverted.
n = 100

# damaging_prob cutoffs that define the score bands
# NOTE(review): presumably taken from the ORES damaging-model filter thresholds -- confirm the source
likelygood_threshold = 0.329
verylikelybad_threshold = 0.919
likelybad_threshold = 0.641

sample_subset_filepath = os.path.join(working_dir, f"sample2_likelygood_reverted_random{n}.csv")

# Filter and sample BEFORE opening the output file, so that a failure here
# (e.g. fewer than n matching rows) cannot truncate a previously written CSV.
subset = rev_df[(rev_df.damaging_prob <= likelygood_threshold) & (rev_df.is_reverted == 1)]
print(f"{len(subset)} likelygood reverted revisions")
subset = subset.sample(n=n, random_state=2)  # fixed seed keeps the sample reproducible

# UTC calendar date of each revision (rev_timestamp is epoch seconds), computed in one
# vectorized pass; this replaces datetime.utcfromtimestamp, deprecated since Python 3.12.
rev_dates = pd.to_datetime(subset.rev_timestamp, unit='s').dt.strftime("%Y-%m-%d")

with open(sample_subset_filepath, 'w') as outfile:
    outfile.write("page_id,rev_id,rev_timestamp,rev_date,is_reverted,is_reverting,damaging_prob,diff_url\n")
    for t, rev_date in zip(subset.itertuples(), rev_dates):
        # link that opens the revision's diff on English Wikipedia
        url = f"https://en.wikipedia.org/wiki/?diff={t.rev_id}"
        line = f"{t.page_id},{t.rev_id},{t.rev_timestamp},{rev_date},{t.is_reverted},{t.is_reverting},{t.damaging_prob},{url}\n"
        outfile.write(line)

67971 likelygood reverted revisions


In [15]:
# Write out a random sample of verylikelybad NON-reverted revisions:
# revisions with damaging_prob >= verylikelybad_threshold that were not reverted.
n = 100

# damaging_prob cutoffs that define the score bands
# NOTE(review): presumably taken from the ORES damaging-model filter thresholds -- confirm the source
likelygood_threshold = 0.329
verylikelybad_threshold = 0.919
likelybad_threshold = 0.641

sample_subset_filepath = os.path.join(working_dir, f"sample2_verylikelybad_nonreverted_random{n}.csv")

# Filter and sample BEFORE opening the output file, so that a failure here
# (e.g. fewer than n matching rows) cannot truncate a previously written CSV.
subset = rev_df[(rev_df.damaging_prob >= verylikelybad_threshold) & (rev_df.is_reverted == 0)]
print(f"{len(subset)} verylikelybad nonreverted revisions")
subset = subset.sample(n=n, random_state=2)  # fixed seed keeps the sample reproducible

# UTC calendar date of each revision (rev_timestamp is epoch seconds), computed in one
# vectorized pass; this replaces datetime.utcfromtimestamp, deprecated since Python 3.12.
rev_dates = pd.to_datetime(subset.rev_timestamp, unit='s').dt.strftime("%Y-%m-%d")

with open(sample_subset_filepath, 'w') as outfile:
    outfile.write("page_id,rev_id,rev_timestamp,rev_date,is_reverted,is_reverting,damaging_prob,diff_url\n")
    for t, rev_date in zip(subset.itertuples(), rev_dates):
        # link that opens the revision's diff on English Wikipedia
        url = f"https://en.wikipedia.org/wiki/?diff={t.rev_id}"
        line = f"{t.page_id},{t.rev_id},{t.rev_timestamp},{rev_date},{t.is_reverted},{t.is_reverting},{t.damaging_prob},{url}\n"
        outfile.write(line)

303 verylikelybad nonreverted revisions
