Text Online Vectorization
===

 - Identify vocab from word and document count files
 - Construct appropriate text vectorizers (and TF-IDF transformers if desired)
 - Stream ldjson file with tokenized texts in batches, appending each batch as a sparse matrix to the result
 - Save the result periodically, ultimately resulting in a large sparse matrix

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [2]:
import os
from tqdm import tqdm
import bz2
import sqlite3
import difflib
import gzip
import json
import base64
import pickle
import re
import hashlib
from datetime import datetime
from datetime import timezone
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter, defaultdict
import multiprocessing as mp

In [3]:
import deltas
from deltas.tokenizers import wikitext_split
from deltas import segment_matcher

In [4]:
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.calibration
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import scipy.sparse

In [5]:
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [6]:
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [7]:
stub_history_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
stub_history_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/stub-history-all-revisions'

In [8]:
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
working_dir = os.path.join(derived_data_dir, 'audit')
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/audit'

### Read sample data

In [9]:
# read in the sample dataframe
s = datetime.now()
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample3_filepath = os.path.join(revision_sample_dir, 'sample3_all.pkl')
rev_df = pd.read_pickle(sample3_filepath)
print(f"Sample 3 data loaded in {datetime.now() - s}.")
len(rev_df)

Sample 3 data loaded in 0:00:35.499775.


33964442

In [10]:
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
1,12,818613649,1515102279,0,0,0,True,818611292,818624114,1515101356,1515106953,[],-1,-1
2,12,818624114,1515106953,1,0,0,True,818613649,820024812,1515102279,1515798752,[],-1,-1
3,12,820024812,1515798752,0,1,0,True,818624114,820025687,1515106953,1515799060,[],820025687,1515799060
4,12,820025687,1515799060,0,0,1,True,820024812,820703495,1515798752,1516095884,[820024812],-1,-1
5,12,820703495,1516095884,0,0,0,True,820025687,821673418,1515799060,1516597634,[],-1,-1


### Load texts into memory

In [11]:
audit_dir = os.path.join(derived_data_dir, 'audit')
text_db_filepath = os.path.join(audit_dir, 'text_2020-07-23T13:08:38Z.sqlite')

In [12]:
def get_db(db_filename):
    db = sqlite3.connect(
            db_filename,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
    db.row_factory = sqlite3.Row
    return db

def get_existing_rev_ids(db_filepath):
    rev_ids = set()
    try:
        db = get_db(db_filepath)
        cursor = db.execute("SELECT rev_id FROM revisionText")
        for result in cursor:
            rev_id = result['rev_id']
            rev_ids.add(rev_id)
    finally:
        db.close()
    return rev_ids

In [13]:
#text_dict_list = []
rev_id_content_dict = {}
rev_id_comment_dict = {}
try:
    db = get_db(text_db_filepath)
    cursor = db.execute("SELECT rev_id, content, comment FROM revisionText")
    for result in tqdm(cursor, total=1106018):
        rev_id = result['rev_id']
        rev_id_content_dict[rev_id] = result['content']
        rev_id_comment_dict[rev_id] = result['comment']
        #comment = result['comment']
        #content = result['content']
        #text_dict_list.append({
        #    'rev_id': rev_id,
        #    'content': content,
        #    'comment': comment
        #})
finally:
    db.close()
len(rev_id_content_dict)

100%|██████████| 1106018/1106018 [02:25<00:00, 7577.31it/s]


1106018

In [13]:
rev_ids_with_text = get_existing_rev_ids(text_db_filepath)
len(rev_ids_with_text)

1106018

In [14]:
#text_df = pd.DataFrame(text_dict_list)
#print(len(text_df))
#text_df.head()

### Add text availability to sample3 revision data

Either join in a dataframe with the text data or just record which entries have text available.

In [15]:
#df = pd.merge(rev_df, text_df, how='left', on='rev_id')
df = rev_df

In [16]:
#df['has_text'] = ~df.content.isna()
df['has_text'] = df.rev_id.map(lambda rev_id: rev_id in rev_id_content_dict)
np.sum(df.has_text), np.sum(df.has_text) / len(df)

(1106018, 0.032563997371133024)

In [17]:
rev_ids_with_text = set(df[df.has_text].rev_id)
df['prev_rev_has_text'] = df.prev_rev_id.map(lambda rev_id: rev_id in rev_ids_with_text)

In [18]:
np.sum(df.prev_rev_has_text), np.sum(df.prev_rev_has_text) / len(df)

(996951, 0.029352786069619517)

In [19]:
np.sum((df.prev_rev_has_text)&(df.has_text))

689050

### Mess around with creating some features

In [20]:
sdf = df[(df.prev_rev_has_text)&(df.has_text)]
len(sdf)

689050

In [21]:
prev_rev_id = sdf.iloc[0].prev_rev_id
curr_rev_id = sdf.iloc[0].rev_id
prev_content = rev_id_content_dict[prev_rev_id]
curr_content = rev_id_content_dict[curr_rev_id]
len(prev_content), len(curr_content)

(154493, 154373)

In [22]:
prev_tokens = wikitext_split.tokenize(prev_content)
curr_tokens = wikitext_split.tokenize(curr_content)

In [28]:
print(list(segment_matcher.diff(prev_tokens, curr_tokens)))

[Equal(name='equal', a1=0, a2=34701, b1=0, b2=34701), Delete(name='delete', a1=34701, a2=34712, b1=34701, b2=34701), Equal(name='equal', a1=34712, a2=34716, b1=34701, b2=34705), Delete(name='delete', a1=34716, a2=34717, b1=34705, b2=34705), Insert(name='insert', a1=34717, a2=34717, b1=34705, b2=34706), Equal(name='equal', a1=34717, a2=34718, b1=34706, b2=34707), Delete(name='delete', a1=34718, a2=34719, b1=34707, b2=34707), Equal(name='equal', a1=34719, a2=34727, b1=34707, b2=34715), Delete(name='delete', a1=34727, a2=34730, b1=34715, b2=34715), Equal(name='equal', a1=34730, a2=34731, b1=34715, b2=34716), Delete(name='delete', a1=34731, a2=34758, b1=34716, b2=34716), Equal(name='equal', a1=34758, a2=51796, b1=34716, b2=51754)]


In [41]:
all_removed_tokens = []
all_inserted_tokens = []
for segment in segment_matcher.diff(prev_tokens, curr_tokens):
    if segment.name == 'equal':
        continue
    elif segment.name == 'delete':
        removed_tokens = prev_tokens[segment.a1:segment.a2]
        #print(' '.join(removed_tokens))
        all_removed_tokens.extend(removed_tokens)
    elif segment.name == 'insert':
        inserted_tokens = curr_tokens[segment.b1:segment.b2]
        #print(' '.join(inserted_tokens))
        all_inserted_tokens.extend(inserted_tokens)
    else:
        raise ValueError('I do not think substitutitions are implemented...')

[Token('According', type='word'),
 Token(' ', type='whitespace'),
 Token('to', type='word'),
 Token(' ', type='whitespace'),
 Token('Cochrane', type='word'),
 Token(' ', type='whitespace'),
 Token('review', type='word'),
 Token(' ', type='whitespace'),
 Token('2018', type='number'),
 Token(',', type='comma'),
 Token(' ', type='whitespace'),
 Token('that', type='word'),
 Token('"', type='etc'),
 Token('(', type='paren_open'),
 Token('EIBI', type='word'),
 Token(')', type='paren_close'),
 Token(' ', type='whitespace'),
 Token('an', type='word'),
 Token(' ', type='whitespace'),
 Token('effective', type='word'),
 Token(' ', type='whitespace'),
 Token('treatment', type='word'),
 Token(' ', type='whitespace'),
 Token('for', type='word'),
 Token(' ', type='whitespace'),
 Token('some', type='word'),
 Token(' ', type='whitespace'),
 Token('children', type='word'),
 Token(' ', type='whitespace'),
 Token('with', type='word'),
 Token(' ', type='whitespace'),
 Token('ASD', type='word'),
 Token('"',

In [39]:
diff = Counter(curr_tokens)
curr_counter = Counter(curr_tokens)
prev_counter = Counter(prev_tokens)
diff.subtract(prev_counter)
len(diff), len(curr_counter), len(prev_counter)

(5439, 5439, 5439)

In [52]:
for token, count in diff.items():
    if count != 0:
        print(f"{repr(token):>40}\t{count}")

           Token(' ', type='whitespace')	-18
                  Token('"', type='etc')	-2
           Token('(', type='paren_open')	-1
          Token(')', type='paren_close')	-1
            Token('2018', type='number')	-1
                Token('or', type='word')	-1
                Token('is', type='word')	-1
              Token('with', type='word')	-1
                Token(',', type='comma')	-1
         Token('treatment', type='word')	-1
              Token('some', type='word')	-1
          Token('children', type='word')	-1
               Token('ASD', type='word')	-1
            Token('review', type='word')	-1
                Token('to', type='word')	-1
              Token('that', type='word')	-1
                Token('an', type='word')	-1
        Token('considered', type='word')	-1
              Token('very', type='word')	-1
               Token('low', type='word')	-1
         Token('effective', type='word')	-1
         Token('According', type='word')	-1
              Token('EIBI', typ

In [55]:
rev_id_tokens_dict = {}
c = 0
MAX_TEXTS = 10000
for row in tqdm(sdf.itertuples(), total=len(sdf)):
    prev_rev_id = row.prev_rev_id
    curr_rev_id = row.rev_id
    if prev_rev_id not in rev_id_tokens_dict:
        prev_content = rev_id_content_dict[prev_rev_id]
        rev_id_tokens_dict[prev_rev_id] = wikitext_split.tokenize(prev_content)
        c += 1
    if curr_rev_id not in rev_id_tokens_dict:
        curr_content = rev_id_content_dict[curr_rev_id]
        rev_id_tokens_dict[curr_rev_id] = wikitext_split.tokenize(curr_content)
        c += 1
    if c >= MAX_TEXTS:
        break
len(rev_id_tokens_dict)

  1%|          | 8262/689050 [08:29<11:39:33, 16.22it/s] 


10001

In [56]:
word_counts = Counter()
for rev_id, tokens in tqdm(rev_id_tokens_dict.items(), total=len(rev_id_tokens_dict)):
    word_counts.update(tokens)
len(word_counts)

100%|██████████| 10001/10001 [00:45<00:00, 219.15it/s]


608805

In [57]:
word_counts.most_common(20)

[(Token(' ', type='whitespace'), 96745876),
 (Token('|', type='bar'), 12301610),
 (Token('=', type='equals'), 7189727),
 (Token(',', type='comma'), 6356344),
 (Token('.', type='period'), 5404204),
 (Token('the', type='word'), 4853522),
 (Token(']]', type='dbrack_close'), 4727430),
 (Token('[[', type='dbrack_open'), 4727351),
 (Token('of', type='word'), 3405190),
 (Token('\n', type='whitespace'), 3273284),
 (Token('-', type='etc'), 2802844),
 (Token('and', type='word'), 2526688),
 (Token('}}', type='dcurly_close'), 1947002),
 (Token('{{', type='dcurly_open'), 1945417),
 (Token('in', type='word'), 1931983),
 (Token("''", type='italic'), 1592096),
 (Token('to', type='word'), 1584792),
 (Token('"', type='etc'), 1481191),
 (Token(')', type='paren_close'), 1379058),
 (Token('(', type='paren_open'), 1377848)]

In [69]:
len([1 for v in word_counts.values() if v >= 100])

50895

In [71]:
labeled_rev_ids = set()
for row in tqdm(sdf.itertuples(), total=len(sdf)):
    prev_rev_id = row.prev_rev_id
    curr_rev_id = row.rev_id
    if prev_rev_id in rev_id_tokens_dict and curr_rev_id in rev_id_tokens_dict:
        labeled_rev_ids.add(curr_rev_id)
len(labeled_rev_ids)

100%|██████████| 689050/689050 [00:02<00:00, 263890.11it/s]


8263

In [73]:
prev_rev_id_dict = {row.rev_id: row.prev_rev_id for row in sdf.itertuples()}

In [72]:
n_features = len([1 for v in word_counts.values() if v >= 100])
n_features

50895

In [74]:
token_index_dict = {tup[0]: i for i, tup in enumerate(word_counts.most_common(n_features))}
len(token_index_dict)

50895

In [77]:
X = np.zeros((len(labeled_rev_ids),n_features))
for row, curr_rev_id in tqdm(enumerate(labeled_rev_ids), total=len(labeled_rev_ids)):
    prev_rev_id = prev_rev_id_dict[rev_id]
    prev_tokens = rev_id_tokens_dict[prev_rev_id]
    curr_tokens = rev_id_tokens_dict[curr_rev_id]
    diff = Counter(curr_tokens)
    prev_counter = Counter(prev_tokens)
    diff.subtract(prev_counter)
    
    for token, count in diff.items():
        if count != 0 and word_counts[token] >= 100:
            X[row,token_index_dict[token]] = count
    X[row,:] /= max(len(curr_tokens), len(prev_tokens))

100%|██████████| 8263/8263 [01:28<00:00, 93.58it/s] 


In [78]:
X.shape

(8263, 50895)

In [79]:
is_reverted_dict = {row.rev_id: row.is_reverted == 1 for row in sdf.itertuples()}
y = np.array([is_reverted_dict[rev_id] for rev_id in labeled_rev_ids])
y.shape

(8263,)

In [100]:
np.sum(y), np.sum(y) / len(y)

(1599, 0.19351325184557666)

In [82]:
np.sum(X == 0)

382788226

In [83]:
# 91% of entries are 0
382788226 / (8263 * 50895)

0.9102185867525333

In [84]:
clf = sklearn.linear_model.LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs'
)

In [85]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.20, random_state=500)

In [86]:
s = datetime.now()
print(clf)

    
# train the model
md = clf.fit(X_train, y_train)

print(f"{datetime.now() - s}")

LogisticRegression()
0:00:05.927926


In [87]:
# predict with the model
y_pred_test = md.predict(X_test)
y_pred_test_proba = md.predict_proba(X_test)[:,1]

In [88]:
np.sum(y_test == y_pred_test) / len(y_test)

0.8039927404718693

In [89]:
roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_test_proba)
roc_auc

0.5690635769956061

In [92]:
# construct the vocabulary on all of the text documents
# this should only include TRAINING documents, not TESTING documents
s = datetime.now()

def dummy(doc):
    return doc

count_vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=40000
    )

count_vectorizer.fit(rev_id_tokens_dict.values())
print(f"{datetime.now() - s}")

# this is the size of the vocabulary
len(count_vectorizer.vocabulary_)



40000

In [96]:
X_docs = []
for curr_rev_id in tqdm(labeled_rev_ids):
    X_docs.append(rev_id_tokens_dict[curr_rev_id])
X = count_vectorizer.transform(X_docs)

100%|██████████| 8263/8263 [00:00<00:00, 474916.88it/s]


In [97]:
s = datetime.now()
tfidf = TfidfTransformer()
tfidf.fit(X)
print(f"{datetime.now() - s}")

0:00:00.231109


In [99]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.20, random_state=500)
s = datetime.now()

clf = sklearn.linear_model.LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000
)
    
# train the model
md = clf.fit(X_train, y_train)

print(f"{datetime.now() - s}")

0:02:03.295813


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [101]:
y_pred_test = md.predict(X_test)
y_pred_test_proba = md.predict_proba(X_test)[:,1]

pct_predicted_reverted = np.sum(y_pred_test) / len(y_pred_test)
test_acc = np.sum(y_test == y_pred_test) / len(y_test)
roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_test_proba)
pct_predicted_reverted, test_acc, roc_auc

(0.09618874773139746, 0.7900786448880823, 0.7026598017631376)

In [103]:
X[0,:]

<1x40000 sparse matrix of type '<class 'numpy.int64'>'
	with 2918 stored elements in Compressed Sparse Row format>

### Experimenting with diff features

In [17]:
diff_list = []
diff_json_filepath = os.path.join(audit_dir, 'diff_2020-07-23T13:08:38Z.ldjson')
skip_count = 0
with open(diff_json_filepath, 'r') as infile:
    for line in tqdm(infile, total=len(rev_ids_with_text)):
        if np.random.random() >= 0.55:
            skip_count += 1
            continue
        diff = json.loads(line)
        diff_list.append(diff)
        if len(diff_list) >= 50000:  # optional early-stopping condition to reduce concurrently loaded data size
            break
len(diff_list), skip_count

  8%|▊         | 90499/1106018 [03:28<39:00, 433.94it/s]  


(50000, 40500)

In [16]:
rev_id_is_reverted_dict = {row.rev_id: row.is_reverted for row in tqdm(rev_df[rev_df.rev_id.isin(rev_ids_with_text)].itertuples())}

1106018it [00:03, 290860.39it/s]


In [19]:
# add reverting information to the diff list
# optionally, can also bring in the content text from the text database
should_add_content_text = False
for diff in tqdm(diff_list):
    rev_id = diff['rev_id']
    diff['is_reverted'] = rev_id_is_reverted_dict[rev_id]
    if should_add_content_text:
        try:
            db = get_db(text_db_filepath)

            cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?", (rev_id,))
            result = cursor.fetchall()
            if len(result) > 1:
                raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
            if len(result) == 0:
                raise ValueError(f"Failed to find rev_id {rev_id} in database.")
            result = result[0]
            curr_content = result['content']
        finally:
            db.close()
        diff['content'] = curr_content

100%|██████████| 50000/50000 [00:00<00:00, 154547.81it/s]


#### Compute odds ratios to identify representative words

In [14]:
rev_id_is_reverted_dict = {row.rev_id: row.is_reverted for row in tqdm(rev_df[rev_df.rev_id.isin(rev_ids_with_text)].itertuples())}

1106018it [00:05, 207166.81it/s]


In [15]:
# compute counts for the reverted reverts only, in order to compute Odds Ratios
# oc = occurrence count (document frequency)
content_oc = Counter()
removed_oc = Counter()
inserted_oc = Counter()
reverted_content_oc = Counter()
reverted_removed_oc = Counter()
reverted_inserted_oc = Counter()

diff_json_filepath = os.path.join(audit_dir, 'diff_2020-07-23T13:08:38Z.ldjson')
with open(diff_json_filepath, 'r') as infile:
    for line in tqdm(infile, total=len(rev_ids_with_text)):
        diff = json.loads(line)
        content_set = set(diff['content_tokens'])
        removed_set = set(diff['removed_tokens'])
        inserted_set = set(diff['inserted_tokens'])
        content_oc.update(content_set)
        removed_oc.update(removed_set)
        inserted_oc.update(inserted_set)
        if rev_id_is_reverted_dict[diff['rev_id']] == 1:
            reverted_content_oc.update(content_set)
            reverted_removed_oc.update(removed_set)
            reverted_inserted_oc.update(inserted_set)
print(f"Content tokens: {len(content_oc)} (reverted {len(reverted_content_oc)})")
print(f"Removed tokens: {len(removed_oc)} (reverted {len(reverted_removed_oc)})")
print(f"Inserted tokens: {len(inserted_oc)} (reverted {len(reverted_inserted_oc)})")

 35%|███▍      | 385982/1106018 [37:39<1:10:14, 170.86it/s]

Content tokens: 8143191 (reverted 2910542)
Removed tokens: 499315 (reverted 227893)
Inserted tokens: 592580 (reverted 143858)





In [16]:
print(f"Content tokens: {len(content_oc)} (reverted {len(reverted_content_oc)})")
print(f"Removed tokens: {len(removed_oc)} (reverted {len(reverted_removed_oc)})")
print(f"Inserted tokens: {len(inserted_oc)} (reverted {len(reverted_inserted_oc)})")

Content tokens: 8143191 (reverted 2910542)
Removed tokens: 499315 (reverted 227893)
Inserted tokens: 592580 (reverted 143858)


In [17]:
# print some summary statistics
print("Token document frequency in reverted revisions")
for counter_name, counter in zip(['Article Content', 'Removals', 'Insertions'], [reverted_content_oc, reverted_removed_oc, reverted_inserted_oc]):
    print(counter_name)
    print('='*41)
    for token, count in counter.most_common(14):
        if token == '\n':
            token = 'NEWLINE'
        elif token == ' ':
            token = 'WHITESPACE'
        print(f"{token:>30} {count:>10}")
    print()

Token document frequency in reverted revisions
Article Content
                    WHITESPACE      55643
                       NEWLINE      55484
                            [[      55441
                            ]]      55440
                            

      55417
                            {{      55381
                            }}      55378
                             :      55369
                             ,      55354
                             |      55264
                             .      55251
                            of      55232
                           the      55224
                            ==      55157

Removals
                   REMOVAL_END      38050
                 REMOVAL_START      38050
                    WHITESPACE      22230
                             |      10171
                            ]]       9705
                            [[       9481
                             ,       8789
                             .       8393
   

In [18]:
def compute_token_odds_ratios(total_oc, reverted_oc, n=10000, min_freq=5):
    token_odds_ratio_list = []
    total_all_tokens_count = sum(total_oc.values())
    reverted_all_tokens_count = sum(reverted_oc.values())
    considered_tokens_count = 0
    for token, total_count in tqdm(total_oc.most_common(n)):
        if total_count < min_freq:
            break
        considered_tokens_count += 1
        reverted_count = reverted_oc[token] if token in reverted_oc else 0
        nonreverted_count = total_count - reverted_count
        otherToken_nonreverted_count = (total_all_tokens_count - reverted_all_tokens_count) - nonreverted_count
        otherToken_reverted_count = reverted_all_tokens_count - reverted_count
        
        if nonreverted_count == 0:
            odds_ratio = 999
        else:
            odds_ratio = (reverted_count * otherToken_nonreverted_count) / (otherToken_reverted_count * nonreverted_count)
        token_odds_ratio_list.append((token, odds_ratio, reverted_count, total_count))
    if considered_tokens_count != n:
        print(f"Due to minimum frequency threshold, considered only {considered_tokens_count} / {n} top tokens (total unique: {len(total_oc)}).")
    token_odds_ratio_list.sort(key=lambda tup: tup[1], reverse=True)
    return token_odds_ratio_list

In [24]:
for counter_name, total_oc, reverted_oc in zip(['Article Content', 'Removals', 'Insertions'], [content_oc, removed_oc, inserted_oc], [reverted_content_oc, reverted_removed_oc, reverted_inserted_oc]):
    token_odds_ratio_list = compute_token_odds_ratios(total_oc, reverted_oc, n=50000, min_freq=500)
    print(counter_name)
    print('='*41)
    for tup in token_odds_ratio_list[:40]:
        token, odds_ratio, reverted_count, total_count = tup
        if token == '\n':
            token = 'NEWLINE'
        elif token == ' ':
            token = 'WHITESPACE'
        elif token.isspace():
            token = 'WHITESPACE+'
        print(f"{token:>30} {odds_ratio:>10.3f} ({reverted_count} / {total_count} = {reverted_count / total_count*100:.2f}%)")

100%|██████████| 50000/50000 [00:00<00:00, 285876.00it/s]


Article Content
                           pc1      2.341 (1643 / 5496 = 29.89%)
                       Hungama      2.278 (434 / 1480 = 29.32%)
                         Chowk      2.213 (264 / 919 = 28.73%)
                           Bhi      2.155 (290 / 1029 = 28.18%)
                        Yamuna      2.142 (382 / 1361 = 28.07%)
                          bcdb      2.138 (308 / 1099 = 28.03%)
                         Taran      2.132 (278 / 994 = 27.97%)
                         Sun's      2.116 (360 / 1294 = 27.82%)
                     seafaring      2.110 (284 / 1023 = 27.76%)
                     Firstpost      2.077 (480 / 1749 = 27.44%)
                          IIFA      2.072 (286 / 1044 = 27.39%)
                           BJP      2.069 (648 / 2368 = 27.36%)
                         Ghazi      2.062 (563 / 2062 = 27.30%)
                          Kher      2.059 (261 / 957 = 27.27%)
                     Aurangzeb      2.055 (289 / 1061 = 27.24%)
                          

  3%|▎         | 1406/50000 [00:00<00:00, 279845.84it/s]


Due to minimum frequency threshold, considered only 1406 / 50000 top tokens (total unique: 499315).
Removals
                           Use      4.916 (528 / 796 = 66.33%)
                       Infobox      4.905 (633 / 955 = 66.28%)
                         dates      4.582 (538 / 831 = 64.74%)
                       Reflist      2.987 (328 / 602 = 54.49%)
                      External      2.633 (438 / 853 = 51.35%)
                    References      2.567 (535 / 1055 = 50.71%)
                       caption      2.536 (438 / 869 = 50.40%)
                     sometimes      2.426 (352 / 714 = 49.30%)
                      followed      2.375 (297 / 609 = 48.77%)
                        origin      2.319 (251 / 521 = 48.18%)
                          uses      2.317 (390 / 810 = 48.15%)
                       changes      2.273 (246 / 516 = 47.67%)
                    webarchive      2.253 (308 / 649 = 47.46%)
                         Early      2.244 (313 / 661 = 47.35%)
        

  4%|▎         | 1855/50000 [00:00<00:00, 343598.04it/s]

Due to minimum frequency threshold, considered only 1855 / 50000 top tokens (total unique: 592580).
Insertions
                           you      5.178 (628 / 1396 = 44.99%)
                            my      5.157 (395 / 880 = 44.89%)
                          your      4.727 (274 / 641 = 42.75%)
                            me      4.353 (275 / 675 = 40.74%)
                           big      3.894 (246 / 646 = 38.08%)
                          love      3.474 (231 / 652 = 35.43%)
                          know      3.337 (262 / 759 = 34.52%)
                           get      2.696 (336 / 1125 = 29.87%)
                           got      2.672 (157 / 529 = 29.68%)
                            we      2.566 (293 / 1016 = 28.84%)
                          good      2.563 (300 / 1041 = 28.82%)
                           sex      2.480 (161 / 572 = 28.15%)
                          like      2.331 (630 / 2341 = 26.91%)
                        famous      2.297 (168 / 631 = 26.62%)
  




#### Compute the vocabularies

In [20]:
content_counter = Counter()
removed_counter = Counter()
inserted_counter = Counter()

include_bigrams = False
def get_bigrams(token_list):
    ts = token_list
    return [ts[i] + "_" + ts[i+1] for i in range(len(ts) - 1)]

for diff in tqdm(diff_list, desc='Generating word counts'):
    content_counter.update(diff['content_tokens'])
    removed_counter.update(diff['removed_tokens'])
    inserted_counter.update(diff['inserted_tokens'])
    if include_bigrams:
        content_counter.update(get_bigrams(diff['content_tokens']))
        removed_counter.update(get_bigrams(diff['removed_tokens']))
        inserted_counter.update(get_bigrams(diff['inserted_tokens']))

len(content_counter), len(removed_counter), len(inserted_counter)

Generating word counts: 100%|██████████| 50000/50000 [03:27<00:00, 241.09it/s]


(2857456, 147161, 162179)

In [21]:
# print some summary statistics
for counter_name, counter in zip(['Article Content', 'Removals', 'Insertions'], [content_counter, removed_counter, inserted_counter]):
    print(counter_name)
    print('='*41)
    for token, count in counter.most_common(20):
        if token == '\n':
            token = 'NEWLINE'
        elif token == ' ':
            token = 'WHITESPACE'
        print(f"{token:>30} {count:>10}")
    print()

Article Content
                    WHITESPACE  359758534
                             |   47220198
                             =   26892684
                             ,   23929390
                             .   19249065
                            ]]   18765321
                            [[   18764881
                           the   17144852
                       NEWLINE   15312317
                            of   11832337
                             -   10752339
                           and    8849511
                            }}    7346042
                            {{    7339362
                            in    7090965
                            ''    6925157
                             "    6305490
                            to    5720933
                             )    5693251
                             (    5690791

Removals
                    WHITESPACE    1775020
                             |     223926
                 REMOVAL_START     156144
        

In [22]:
content_vocabulary = [token for token, count in content_counter.most_common(20000)]
removed_vocabulary = [token for token, count in content_counter.most_common(10000)]
inserted_vocabulary = [token for token, count in content_counter.most_common(10000)]

In [23]:
len(content_vocabulary), len(removed_vocabulary), len(inserted_vocabulary)

(20000, 10000, 10000)

In [24]:
# construct the vocabulary on all of the text documents
# this should only include TRAINING documents, not TESTING documents
# for now, it seems mostly innocent to compute the vocab from all documents

def dummy(doc):
    return doc

def get_count_vectorizer(vocabulary):
    vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        vocabulary=vocabulary
    )
    return vectorizer

def stream_dict_key(diffs, key):
    for diff in diffs:
        yield diff[key]

In [26]:
s = datetime.now()
content_vectorizer = get_count_vectorizer(content_vocabulary)
X_content = content_vectorizer.fit_transform(stream_dict_key(diff_list, 'content_tokens'))
print(f"Built CountVectorizer for full-page tokens in {datetime.now() - s}")

Built CountVectorizer for full-page tokens in 0:05:04.321360


In [25]:
s = datetime.now()
removed_vectorizer = get_count_vectorizer(removed_vocabulary)
X_removed = removed_vectorizer.fit_transform(stream_dict_key(diff_list, 'removed_tokens'))
print(f"Built CountVectorizer for removed tokens in {datetime.now() - s}")

s = datetime.now()
inserted_vectorizer = get_count_vectorizer(inserted_vocabulary)
X_inserted = inserted_vectorizer.fit_transform(stream_dict_key(diff_list, 'inserted_tokens'))
print(f"Built CountVectorizer for inserted tokens in {datetime.now() - s}")

Built CountVectorizer for removed tokens in 0:00:02.062464
Built CountVectorizer for inserted tokens in 0:00:02.311446


In [67]:
X_content.shape, X_removed.shape, X_inserted.shape

((50000, 20000), (50000, 10000), (50000, 10000))

In [28]:
X = scipy.sparse.hstack((X_content, X_removed, X_inserted))
X.shape

(50000, 40000)

In [29]:
y = np.array([diff['is_reverted'] for diff in diff_list])
y.shape

(50000,)

In [30]:
# percentage reverted in this sample
np.sum(y) / len(y)

0.18394

In [31]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.20, random_state=500)
s = datetime.now()

clf = sklearn.linear_model.LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000
)

#clf = sklearn.svm.LinearSVC(
#    C=0.1,
#    dual=False,
#)
clf = sklearn.linear_model.SGDClassifier(
    loss='log',
    penalty='l2',
    early_stopping=False,
    validation_fraction=0.05,
    verbose=1,
)

# scaling can help some of the solvers converge more rapidly...
X_train = sklearn.preprocessing.scale(X_train, with_mean=False)
X_test = sklearn.preprocessing.scale(X_test, with_mean=False)

# train the model
md = clf.fit(X_train, y_train)

print(f"{datetime.now() - s}")

-- Epoch 1
Norm: 3325.17, NNZs: 39215, Bias: -2.540590, T: 40000, Avg. loss: 1768.632292
Total training time: 0.35 seconds.
-- Epoch 2
Norm: 1943.07, NNZs: 39567, Bias: -2.668044, T: 80000, Avg. loss: 330.334960
Total training time: 0.58 seconds.
-- Epoch 3
Norm: 1435.10, NNZs: 39682, Bias: -2.703326, T: 120000, Avg. loss: 166.109555
Total training time: 0.84 seconds.
-- Epoch 4
Norm: 1161.72, NNZs: 39764, Bias: -2.717208, T: 160000, Avg. loss: 111.601127
Total training time: 1.11 seconds.
-- Epoch 5
Norm: 981.92, NNZs: 39795, Bias: -2.731896, T: 200000, Avg. loss: 80.941698
Total training time: 1.41 seconds.
-- Epoch 6
Norm: 849.42, NNZs: 39816, Bias: -2.724177, T: 240000, Avg. loss: 60.869392
Total training time: 1.76 seconds.
-- Epoch 7
Norm: 754.45, NNZs: 39828, Bias: -2.728652, T: 280000, Avg. loss: 49.893752
Total training time: 2.17 seconds.
-- Epoch 8
Norm: 681.54, NNZs: 39838, Bias: -2.728860, T: 320000, Avg. loss: 41.535114
Total training time: 2.53 seconds.
-- Epoch 9
Norm: 

Norm: 158.32, NNZs: 39930, Bias: -2.763552, T: 2720000, Avg. loss: 2.758155
Total training time: 23.23 seconds.
-- Epoch 69
Norm: 156.76, NNZs: 39930, Bias: -2.763567, T: 2760000, Avg. loss: 2.731135
Total training time: 23.52 seconds.
-- Epoch 70
Norm: 155.31, NNZs: 39931, Bias: -2.763865, T: 2800000, Avg. loss: 2.630099
Total training time: 23.83 seconds.
-- Epoch 71
Norm: 153.78, NNZs: 39931, Bias: -2.763297, T: 2840000, Avg. loss: 2.679160
Total training time: 24.19 seconds.
-- Epoch 72
Norm: 152.36, NNZs: 39931, Bias: -2.763759, T: 2880000, Avg. loss: 2.547628
Total training time: 24.54 seconds.
-- Epoch 73
Norm: 150.98, NNZs: 39931, Bias: -2.763252, T: 2920000, Avg. loss: 2.580961
Total training time: 24.86 seconds.
-- Epoch 74
Norm: 149.71, NNZs: 39931, Bias: -2.763464, T: 2960000, Avg. loss: 2.477265
Total training time: 25.15 seconds.
-- Epoch 75
Norm: 148.49, NNZs: 39932, Bias: -2.764203, T: 3000000, Avg. loss: 2.442070
Total training time: 25.45 seconds.
-- Epoch 76
Norm: 14

In [32]:
y_pred_test = md.predict(X_test)
y_pred_test_proba = md.predict_proba(X_test)[:,1]
#y_pred_test_proba = 1 / (1 + np.exp(-md.decision_function(X_test))) # can use as lazy eval for models without a probability output

pct_predicted_reverted = np.sum(y_pred_test) / len(y_pred_test)
test_acc = np.sum(y_test == y_pred_test) / len(y_test)
roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_test_proba)
pct_predicted_reverted, test_acc, roc_auc

(0.1786, 0.7693, 0.6843365312510097)

In [33]:
clf.coef_.shape

(1, 40000)

In [36]:
content_token_weights = list(zip(content_vocabulary, clf.coef_[0,:20000]))

In [38]:
removed_token_weights = list(zip(removed_vocabulary, clf.coef_[0,20000:30000]))
inserted_token_weights = list(zip(inserted_vocabulary, clf.coef_[0,30000:]))

In [49]:
content_token_weights.sort(key=lambda tup: abs(tup[1]), reverse=True)
removed_token_weights.sort(key=lambda tup: abs(tup[1]), reverse=True)
inserted_token_weights.sort(key=lambda tup: abs(tup[1]), reverse=True)

In [50]:
for token, weight in content_token_weights[:30]:
    print(f"{token:>20}   {weight:.3f}")

                SAGE   -2.964
                  Hi   1.875
                 725   1.853
    increaseNegative   -1.828
          References   -1.806
              emerge   1.760
                 443   -1.754
                 431   -1.720
           Frederick   1.710
           Messenger   -1.708
        dictatorship   1.688
         approximate   1.674
           frameless   1.670
                  cn   -1.635
                 pc1   1.629
                 618   1.589
      organizational   -1.589
           spokesman   -1.567
               Alive   1.566
            respects   -1.560
         subdivision   1.553
              select   1.540
                  ==   -1.535
                 224   -1.516
            sciences   1.512
           <nowiki/>   -1.510
             amazing   1.508
          attributed   -1.505
              Alonso   1.490
              image4   -1.484


In [48]:
for token, weight in removed_token_weights[:50]:
    print(f"{token:>20}   {weight:.3f}")

                2018   -5.142
                hold   3.699
        Publications   3.652
           Minnesota   3.641
                  me   -3.596
                wide   3.592
           sovereign   3.480
                 202   3.441
                  Is   -3.427
                  69   -3.423
                   ‘   -3.377
                 480   3.353
                  MF   -3.273
                soil   -3.129
                Fact   3.127
           Microsoft   3.108
            Examples   3.105
             article   -3.024
                  32   3.021
                1920   3.009
                  63   2.991
               until   2.970
                  SS   -2.963
             India's   -2.960
                 GER   -2.959
              causes   2.920
                   '   -2.880
                1952   2.854
            reliable   -2.803
                   [   -2.793
             cropped   2.774
                  43   2.763
            greatest   -2.754
               elite   -2.73

In [47]:
for token, weight in inserted_token_weights[:50]:
    print(f"{token:>20}   {weight:.3f}")

           platforms   -9.751
                  my   6.570
           Meanwhile   -4.333
                 big   4.331
              decade   -3.890
                 Jew   3.760
                 978   -3.692
                 209   3.676
                Jeff   3.673
           suspected   -3.648
               elite   3.531
             website   -3.498
                very   3.484
                  Is   3.415
              little   3.289
                 Use   -3.261
           practiced   3.200
                 124   3.195
                  IT   3.166
                  DF   -3.090
            duration   3.086
               rifle   -3.078
       controversial   3.072
               jstor   -3.031
           September   -2.940
              linear   2.928
          Portuguese   -2.923
              killed   2.922
                 you   2.921
             present   -2.911
             because   2.890
            affected   2.878
             archive   -2.874
             Opening   -2.862