In [1]:
import bz2
import difflib
import gzip
import hashlib
import json
import math
import os
import re
import sqlite3
from collections import Counter
from datetime import datetime
from datetime import timezone
from itertools import groupby

import numpy as np
import pandas as pd
import scipy.stats
import sklearn
import sklearn.calibration
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
from joblib import dump, load
from sklearn.model_selection import cross_val_score
from tqdm import tqdm


In [2]:
# Root locations for the raw inputs and for everything derived from them
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join('/export/scratch2/levon003/repos/wiki-ores-feedback', "data", "derived")
stub_history_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
working_dir = os.path.join(derived_data_dir, 'audit')
labeled_revs_dir = os.path.join(derived_data_dir, 'labeled-revs')
sample3_features_dir = os.path.join(labeled_revs_dir, 'sample3-features')

# ### Data loading and cleaning
# Sample dataframe, stored as a pickle under the revision sample directory
sample3_filepath = os.path.join(revision_sample_dir, 'sample3_all.pkl')
load_started = datetime.now()
rev_df = pd.read_pickle(sample3_filepath)
print(f"Sample 3 data loaded in {datetime.now() - load_started}.")

# Feature table (2020-08-01 tsv file); its first row supplies the column names
sample3_damaging_filepath = os.path.join(sample3_features_dir, 'sample3.damaging.2020-08-01T05:40:00Z.tsv')
load_started = datetime.now()
features_df = pd.read_csv(sample3_damaging_filepath, sep='\t', header=0)
print(f"Features data loaded in {datetime.now() - load_started}.")

# The 'damaging' column is auto-generated and of no use here, so discard it
features_df = features_df.drop(columns=['damaging'])

# Revision ids that correspond to the feature data (file is read without a header row)
revid_filepath = os.path.join(sample3_features_dir, 'rev_id_2020-08-01T05:40:00Z.txt')
rev_id_list = pd.read_csv(revid_filepath, header=None)

Sample 3 data loaded in 0:00:19.056884.
Features data loaded in 0:01:30.916610.


In [3]:
# The rev_id file and the feature table are matched purely by row position
# further down, so differing lengths would mislabel revisions. Fail early and
# report both counts so the mismatch is easy to diagnose.
assert len(rev_id_list) == len(features_df), (
    f"rev_id_list has {len(rev_id_list)} rows but features_df has {len(features_df)} rows"
)

In [4]:
# Load the revert table.
# It carries additional data beyond what is available in `rev_df`.
load_started = datetime.now()
stub_history_reverts_dir = os.path.join(derived_data_dir, 'stub-history-reverts')
revert_df_filepath = os.path.join(stub_history_reverts_dir, 'revert_df.pkl')
revert_df = pd.read_pickle(revert_df_filepath)
print(f"Loaded revert data in {datetime.now() - load_started}.")

# What `revert_df` adds over `rev_df` is, above all, the username info, which is
# what makes self-reverts identifiable.
# `revert_df` has one line per **revert** revision, whereas `rev_df` has one line per revision.

# A revert is a self-revert when the reverting user also appears among the reverted users
is_self_revert_list = [
    revert.reverting_user_text in revert.reverted_user_texts
    for revert in tqdm(revert_df.itertuples(), total=len(revert_df))
]
revert_df['is_self_revert'] = is_self_revert_list

# Outcome: a variant of `rev_df.is_reverted`.
# A revision counts as reverted only if the revert was not a self-revert
# and came no later than one week after the revision.
threshold = 60 * 60 * 24 * 7  # presumably seconds, matching the timestamp unit -- confirm
rs = revert_df[~revert_df.is_self_revert]
reverted_rev_ids = set()
for revert in tqdm(rs.itertuples(), total=len(rs)):
    # each revert lists the ids and timestamps of the revisions it undid, in parallel
    reverted_rev_ids.update(
        reverted_id
        for reverted_id, reverted_ts in zip(revert.reverted_rev_ids, revert.reverted_timestamps)
        if revert.reverting_timestamp - reverted_ts <= threshold
    )

  1%|          | 31976/5992682 [00:00<00:18, 319754.21it/s]

Loaded revert data in 0:00:32.754740.


100%|██████████| 5992682/5992682 [00:21<00:00, 282831.12it/s]
100%|██████████| 5331414/5331414 [00:27<00:00, 191148.87it/s]


In [16]:
# Preview the first rows of the rev_id table. It was read with header=None,
# so its single column is labelled 0.
rev_id_list.head()

Unnamed: 0,0
0,857219031
1,857219035
2,857219034
3,857219036
4,857219037


In [17]:
# #### Create the actual outcome variable and add it to the features dataframe
# `features_df` contains only the features, not the revision ids. We create a binary
# outcome list based on the order of the revision ids as they were read from the
# rev_id file (and stored in `rev_id_list`); entry i is True when the i-th rev_id is
# in `reverted_rev_ids`.
# NOTE(review): this relies on row i of `rev_id_list` describing row i of `features_df` -- confirm.

# Vectorized membership test instead of a Python-level loop over every rev_id;
# `.tolist()` keeps `is_reverted` a plain list of bools.
is_reverted = rev_id_list.iloc[:, 0].isin(reverted_rev_ids).tolist()
len(rev_id_list)

11738345

In [18]:
# Attach the outcome as a new column of the feature table. `is_reverted` is a
# plain list, so values are assigned by position.
# NOTE(review): presumably row i of `features_df` matches row i of `rev_id_list` -- confirm.
features_df['is_reverted'] = is_reverted

In [None]:
################################################################
# Score every row of the feature table with the previously saved model.

# scale X vars
# The outcome column is removed by name rather than by position: if `is_reverted`
# is missing this raises a KeyError instead of silently dropping the last real
# feature, and if it is not the last column the label cannot leak into X.
# NOTE(review): `scale` standardizes with the mean/std of *this* data; if the model
# was fit on features scaled with different statistics, the inputs will not be
# comparable -- confirm against the training notebook.
X_test = sklearn.preprocessing.scale(features_df.drop(columns='is_reverted'))

# load model from file (path is relative to the notebook's working directory)
md = load('GB3.joblib')

# predict on new data; column 1 of predict_proba is kept as the score
s = datetime.now()
y_pred_test_calib = md.predict_proba(X_test)[:,1]

print(f"Prediction completed in {datetime.now() - s}.")

In [None]:
# Put the predicted scores next to the outcome labels, one row per scored
# revision, and show the first few rows as a sanity check.
pred_results = pd.DataFrame({
    'test_calib': y_pred_test_calib,
    'test_label': features_df['is_reverted'].values,
})
print(pred_results.head())

# Persist the table as a pickle and echo where it was written
results_filepath = os.path.join('/export/scratch2/wastv004/wiki-ores-feedback/', 'prediction_2020-08-01.pkl')
pred_results.to_pickle(results_filepath)
print(results_filepath)