Text Feature Classification
===

Text feature classification of reverts: an exploratory notebook experimenting with large linear models over text features.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import os
from tqdm import tqdm
import bz2
import sqlite3
import difflib
import gzip
import json
import base64
import pickle
import re
import hashlib
from datetime import datetime
from datetime import timezone
import nltk
import scipy.stats
import para
from itertools import groupby
from collections import Counter

In [4]:
import deltas

In [5]:
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.calibration

In [6]:
# Resolve the repository root by shelling out to git. IPython's `!` capture
# yields the command output as a list of lines, so keep only the first line.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
git_root_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback'

In [7]:
# Data locations: raw data lives at a machine-specific absolute path, while
# derived data is stored under <repo root>/data/derived.
raw_data_dir = "/export/scratch2/wiki_data"
derived_data_dir = os.path.join(git_root_dir, "data", "derived")
raw_data_dir, derived_data_dir

('/export/scratch2/wiki_data',
 '/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived')

In [8]:
# Directory under the derived-data root named 'stub-history-all-revisions'.
stub_history_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
stub_history_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/stub-history-all-revisions'

In [9]:
# revision_sample_dir: where the sampled-revision dataframe is read from.
# working_dir: the 'audit' directory, used below for the text files this notebook reads and writes.
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
working_dir = os.path.join(derived_data_dir, 'audit')
working_dir

'/export/scratch2/levon003/repos/wiki-ores-feedback/data/derived/audit'

### Read sample data

In [10]:
# read in the sample dataframe
# The pickle is large (the recorded run loaded ~34M rows in ~29s), so the load is timed.
s = datetime.now()
# NOTE: revision_sample_dir is re-derived here with the same value as in the earlier path cell.
revision_sample_dir = os.path.join(derived_data_dir, 'revision_sample')
sample3_filepath = os.path.join(revision_sample_dir, 'sample3_all.pkl')
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
rev_df = pd.read_pickle(sample3_filepath)
print(f"Sample 3 data loaded in {datetime.now() - s}.")
len(rev_df)

Sample 3 data loaded in 0:00:28.941707.


33964442

In [11]:
# Preview the first rows of the revision sample to check the available columns.
rev_df.head()

Unnamed: 0,page_id,rev_id,rev_timestamp,is_revert_target,is_reverted,is_reverting,is_sample_eligible,prev_rev_id,next_rev_id,prev_rev_timestamp,next_rev_timestamp,reverted_rev_ids,reverting_rev_id,reverting_rev_timestamp
1,12,818613649,1515102279,0,0,0,True,818611292,818624114,1515101356,1515106953,[],-1,-1
2,12,818624114,1515106953,1,0,0,True,818613649,820024812,1515102279,1515798752,[],-1,-1
3,12,820024812,1515798752,0,1,0,True,818624114,820025687,1515106953,1515799060,[],820025687,1515799060
4,12,820025687,1515799060,0,0,1,True,820024812,820703495,1515798752,1516095884,[820024812],-1,-1
5,12,820703495,1516095884,0,0,0,True,820025687,821673418,1515799060,1516597634,[],-1,-1


### Load texts into memory

In [12]:
# Path of the sqlite database holding revision texts.
# NOTE: audit_dir resolves to the same directory as working_dir defined earlier.
audit_dir = os.path.join(derived_data_dir, 'audit')
text_db_filepath = os.path.join(audit_dir, 'text_2020-07-23T13:08:38Z.sqlite')

In [13]:
def get_db(db_filename):
    """Open a sqlite3 connection to ``db_filename``.

    The connection is created with ``PARSE_DECLTYPES`` so declared column
    types are run through registered converters, and rows are returned as
    ``sqlite3.Row`` objects so columns can be accessed by name.

    The caller is responsible for closing the returned connection.
    """
    connection = sqlite3.connect(db_filename, detect_types=sqlite3.PARSE_DECLTYPES)
    connection.row_factory = sqlite3.Row
    return connection

In [15]:
# Load every (rev_id, content, comment) row of the revisionText table into memory.
text_dict_list = []
# Open the connection BEFORE entering `try`: if get_db() itself raises there is
# nothing to close, and a `finally: db.close()` would either fail with a
# NameError that masks the real error or close a stale `db` from an earlier run.
db = get_db(text_db_filepath)
try:
    cursor = db.execute("SELECT rev_id, content, comment FROM revisionText")
    # `total` only sizes the progress bar; it is the hard-coded row count
    # (it matches the count shown in this cell's recorded output).
    for result in tqdm(cursor, total=1106018):
        rev_id = result['rev_id']
        comment = result['comment']
        content = result['content']
        text_dict_list.append({
            'rev_id': rev_id,
            'content': content,
            'comment': comment
        })
finally:
    # Always release the database handle, even if iteration fails part-way.
    db.close()
len(text_dict_list)

100%|██████████| 1106018/1106018 [01:50<00:00, 9973.98it/s] 


1106018

In [16]:
# Build the text dataframe from the list of per-revision dicts
# (keys: rev_id, content, comment) and show its row count.
text_df = pd.DataFrame(text_dict_list)
len(text_df)

1106018

In [13]:
# Set the text feature filepath
# The file is read below as one JSON object per line.
labeled_revs_dir = os.path.join(derived_data_dir, 'labeled-revs')
sample3_dir = os.path.join(labeled_revs_dir, 'sample3-features')
sample3_damaging_filepath = os.path.join(sample3_dir, 'sample3.mock.w_cache.text.2020-07-23T13:08:38Z.json')
# Fail fast if the feature file is missing rather than part-way through the slow load below.
assert os.path.exists(sample3_damaging_filepath)

In [16]:
# Rebuild the per-revision text list from the JSON-lines feature file.
# Each line is a JSON object; lines without a 'cache' entry are skipped.
# The 'cache' value is a base85-encoded pickle mapping feature names to values.
# This is slow: the recorded run took over 4.5 hours for the full file.
content_key = 'feature.enwiki.revision.text.content'
comment_key = 'feature.enwiki.revision.text.comment'

text_dict_list = []
with open(sample3_damaging_filepath, 'r') as infile:
    for line in tqdm(infile, total=23157371):  # hard-coded length of input data
        observation = json.loads(line)
        if 'cache' not in observation:
            continue
        # NOTE(review): pickle.loads can execute arbitrary code; only run this
        # against feature files produced by a trusted pipeline.
        encoded_cache = bytes(observation['cache'], 'ascii')
        cache = pickle.loads(base64.b85decode(encoded_cache))
        # Missing content/comment features fall back to the empty string.
        text_dict_list.append({
            'rev_id': observation['rev_id'],
            'content': cache[content_key] if content_key in cache else "",
            'comment': cache[comment_key] if comment_key in cache else "",
        })
len(text_dict_list)

100%|██████████| 23157371/23157371 [4:40:49<00:00, 1374.40it/s] 


1106018

In [None]:
# TODO will need to write a multiprocessing implementation that takes the texts and puts them in a sqlite database

In [18]:
# Drop the intermediate list of dicts; text_df is what the rest of this
# section uses, so confirm it is still intact.
del text_dict_list
len(text_df)

1106018

In [19]:
# Preview the first rows of the text dataframe (rev_id, content, comment).
text_df.head()

Unnamed: 0,rev_id,content,comment
0,857219031,{{pp-vandalism|small=yes}}\n{{good article}}\n...,
1,857219035,{{main article|Toyota RAV4}}\n{{Infobox electr...,
2,857219034,{{For|the film of the same name|Loose Women (f...,
3,857219036,[[File:Georges Marty.jpg|thumb|Georges Marty]]...,/* External links */
4,857219037,{{Infobox country at games\n| NOC = I...,/* Judo */


In [None]:
# Persist the text dataframe so later sessions can reload it without
# re-parsing the source data. It is written as a pickle at the '.pickle'
# path because that is the file the reload cell reads with pd.read_pickle;
# previously this cell wrote an HDF5 '.h5' file that was never read back.
# NOTE(review): if anything else consumes the old '.h5' output, keep producing it there.
s = datetime.now()
text_df_filepath = os.path.join(working_dir, 'text_2020-07-23T13:08:38Z.pickle')
text_df.to_pickle(text_df_filepath)
print(f"{datetime.now() - s}")

In [None]:
# read the text data from the pickle file
# Requires that the pickle at text_df_filepath already exists; prints the elapsed load time.
s = datetime.now()
text_df_filepath = os.path.join(working_dir, 'text_2020-07-23T13:08:38Z.pickle')
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
text_df = pd.read_pickle(text_df_filepath)
print(f"{datetime.now() - s}")
len(text_df)

### Join 