In [324]:
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2
import wikidit

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [325]:
from wikidit.preprocessing import _load_backlog, WP10_LABELS
from wikidit.io import load_ndjson

# Train Model

In [326]:
import dill

In [327]:
import os
import os.path
import gzip
import json
import pandas as pd
from joblib import Parallel, delayed

In [328]:
output_dir = "../data/enwiki-labeling_revisions-w_features/"
filenames = [os.path.join(output_dir, f) for f in os.listdir(output_dir)]

In [329]:
def read_labeled(filename):
    out = []
    with gzip.open(filename, "rt") as f:
        for line in f:
            row = json.loads(line)
            del row['wikitext']
            del row['text']
            out.append(row)
    return pd.DataFrame.from_records(out)

In [330]:
revisions = pd.concat(Parallel(n_jobs=6)(delayed(read_labeled)(f) for f in filenames))
revisions['wp10'] = pd.Series(revisions['wp10'], dtype=WP10_DTYPE)

# Create a pipeline

In [331]:
from wikidit.preprocessing import WP10_DTYPE

In [332]:
from sklearn.model_selection import train_test_split

In [371]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn_pandas import DataFrameMapper
import xgboost as xgb
import dill

from sklearn_ordinal import OrdinalClassifier
from sklearn.preprocessing import FunctionTransformer

sqrt_cols = ['words',
             # infobox as a binary
             'backlog_accuracy',
             'backlog_content',
             'backlog_other',
             'backlog_style',
             'backlog_links']

per_words = [
             'headings_per_word',
             'sub_headings_per_word',
             # links
             'images_per_word',
             'categories_per_word',
             'wikilinks_per_word',
             'external_links_per_word',
             # templates
             'main_templates_per_word',
             'cite_templates_per_word',
             'ref_per_word'    
]

binarized_cols = ['coordinates', 'infoboxes']

mapper = DataFrameMapper([
    (sqrt_cols, FunctionTransformer(func=np.sqrt)),
    (binarized_cols, FunctionTransformer(func=lambda x: x.astype(bool))),
    (per_words, None)
])

xgb_params = {
              'n_estimators': 200,
              'silent': True,
              'booster': 'gbtree',
              'objective': 'binary:logistic',
              'seed': 1234
             }

clf = xgb.XGBClassifier(**xgb_params)

In [372]:
pipe = Pipeline([
    ('mapper', mapper),
    ('clf', OrdinalClassifier(clf))
])

# Fit Model on Full Sample

In [373]:
fitted = pipe.fit(X=revisions.copy(), y=revisions['wp10'])



In [374]:
revisions['pred'] = pipe.predict(X=revisions.copy())



In [380]:
rev_ct = pd.crosstab(index=revisions["wp10"], 
                     columns=revisions["pred"])
# rev_ct.index= ["wp10", "pred"]
rev_ct

pred,Stub,Start,C,B,GA,FA
wp10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stub,4696,729,53,8,0,0
Start,624,3278,1236,296,42,0
C,58,730,2684,1450,506,57
B,30,434,1458,2425,924,215
GA,2,10,207,811,3597,868
FA,0,3,9,248,1266,3470


In [381]:
for cat in ("Stub", "Start", "C", "B", "GA", "FA"): 
    print(cat, np.mean((revisions["pred"] <= cat) == (revisions["wp10"] <= cat)))

Stub 0.953614606464
Start 0.91049839625
C 0.860843819393
B 0.906427337774
GA 0.917776955342
FA 1.0


In [382]:
fitted.score(revisions, y=revisions['wp10'])



0.62145324451023931

In [383]:
revisions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32424 entries, 0 to 5485
Data columns (total 52 columns):
anon                          2265 non-null object
backlog_accuracy              32424 non-null int64
backlog_accuracy_templates    224 non-null object
backlog_content               32424 non-null int64
backlog_content_templates     764 non-null object
backlog_files                 32424 non-null int64
backlog_files_templates       0 non-null object
backlog_links                 32424 non-null int64
backlog_links_templates       699 non-null object
backlog_other                 32424 non-null int64
backlog_other_templates       452 non-null object
backlog_style                 32424 non-null int64
backlog_style_templates       357 non-null object
categories                    32424 non-null int64
categories_per_word           32424 non-null float64
cite_templates                32424 non-null int64
cite_templates_per_word       32424 non-null float64
comment                      

In [384]:
with open("../models/model.pkl", "wb")  as f:
    dill.dump(fitted, f)

# Evaluate Model Peformance

In [385]:
from wikidit.mw import Session, get_page
from wikidit.preprocessing import Featurizer

In [417]:
def predict_page_edits_api(title, model, featurizer=Featurizer(), session=None):
    if session is None:
        session = Session()
    page = get_page(session, title)
    return predict_page_edits(featurizer, page['content'], model)

In [418]:
def predict_page_edits(featurizer, content, pipeline):
    revision = featurizer.parse_content(content)
    del revision['text']

    revision = pd.DataFrame.from_records([revision])
    probs = pipeline.predict_proba(revision)[0, :]
    best_class = str(pipeline.predict(revision)[0])
    
    # If predicted to be FA - nothing else to do.
    if best_class == "FA":
        return {"predicted_class": best_class}
    
    # Create new pipeline for only that class
    pipe2 = Pipeline([('mapper', pipeline.named_steps['mapper']),
                      ('clf', pipeline.named_steps['clf'].named_estimators_[best_class])])

    # Predicted probability for > current predicted class
    prob_class = pipe2.predict_proba(revision)[0, 1]

    # Calc new probabilities for all types of edits
    edits = [(nm, pd.DataFrame.from_records([x])) 
             for nm, x in make_edits(revision.to_dict('records')[0])]
    new_probs = [(nm, pipe2.predict_proba(ed)[0, 1]) for nm, ed in edits]
    change_prob = [(nm, p - prob_class) for nm, p in new_probs]
    top_edits = sorted([(nm, p) for (nm, p) in change_prob if p > 0],
                       key=lambda x: -x[1])
    
    return {
        'predict': best_class,
        'proba': probs,
        'predicted_class_prob': prob_class,
        'change_prob': change_prob,
        'top_edits': top_edits
    }


In [424]:
results = predict_page_edits_api('Data science', fitted, featurizer)



In [426]:
np.round(results['proba'], 2)

array([ 0.  ,  0.01,  0.17,  0.24,  0.31,  0.27])