# Train Model

In [1]:
# Add the parent directory to the import path -- presumably so the local
# `wikidit` package imported further down can be found (confirm repo layout).
import sys
sys.path.append("..")
# (Re)load the autoreload extension and set mode 2, so edited modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn_pandas import cross_val_score, DataFrameMapper
from pandas import Categorical
from sklearn.ensemble import RandomForestClassifier
import mwapi

In [5]:
from wikidit.models import featurize, load_wp10
from sklearn_ordinal import OrdinalClassifier

In [6]:
# Gzipped CSV of labelled revisions with pre-computed features, addressed
# relative to this notebook's directory.
input_file = "../data/enwiki.labeling_revisions.w_features.nettrom_30k.csv.gz"
# `load_wp10` comes from wikidit.models. Later cells compare the 'wp10' column
# with `>` / `<=`, so it is presumably returned as an ordered categorical --
# TODO confirm in wikidit.models.
revisions = load_wp10(input_file)

# Create a pipeline

In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures

In [8]:
import xgboost as xgb
import dill

In [10]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.base import BaseEstimator, TransformerMixin

# Feature columns that are passed through a square-root transform by the mapper.
sqrt_cols = [
    # text volume
    'words',
    # structure and media
    'headings', 'sub_headings', 'images', 'categories', 'wikilinks',
    # templates ('infoboxes' is intentionally absent: it is used as a binary flag only)
    'who_templates', 'main_templates', 'cite_templates',
    'citation_needed', 'other_templates',
    # references and miscellaneous
    'ref', 'smartlists', 'coordinates',
]

# Feature columns that are reduced to booleans by the mapper.
# NOTE(review): 'coordinates' also appears in sqrt_cols, so it is fed to the
# model twice (sqrt value and boolean flag) -- confirm this is intended.
binarized_cols = ['coordinates', 'infoboxes']

# Column-wise preprocessing: square-root the columns listed in sqrt_cols and
# cast the columns listed in binarized_cols to bool. Only the columns named in
# these two lists are handed to the transformers.
mapper = DataFrameMapper(
    features=[
        (sqrt_cols, FunctionTransformer(func=np.sqrt)),
        (binarized_cols, FunctionTransformer(func=lambda x: x.astype(bool))),
    ],
)

# Base gradient-boosted classifier with a binary logistic objective; it is
# wrapped by OrdinalClassifier when the pipeline is assembled further down.
clf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    silent=True,
)

In [11]:
# Settings for the exploratory binary cross-validation run.
# The unlabelled DMatrix that used to be built here was never used: the
# labelled `dtrain` is created in the following cell, so building one here only
# repeated the full feature transform for nothing.
param = {'max_depth':10, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# Number of boosting rounds intended for the cross-validation run.
num_round = 10




In [12]:
# Training matrix for the binary task "is this revision rated above 'Start'?".
# The comparison relies on 'wp10' supporting ordered comparison with a class
# label (presumably an ordered categorical -- confirm in load_wp10).
X_train = mapper.fit_transform(revisions)
is_above_start = revisions['wp10'] > 'Start'
dtrain = xgb.DMatrix(X_train, label=is_above_start)



In [14]:
# 5-fold cross-validation of the binary booster. The number of boosting rounds
# is taken from `num_round` instead of the library default, so changing that
# setting actually affects this run (with num_round = 10 the result is the
# same as before).
xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.073379,0.001198,0.110258,0.003556
1,0.064119,0.001073,0.109394,0.00153
2,0.05957,0.001968,0.107235,0.002285
3,0.056486,0.001921,0.108284,0.002771
4,0.053225,0.001978,0.108037,0.001737
5,0.050472,0.001933,0.108253,0.001935
6,0.048128,0.001854,0.10887,0.001781
7,0.046077,0.00128,0.110196,0.001407
8,0.044419,0.001446,0.110196,0.001279
9,0.040263,0.00348,0.110165,0.001323


Get categories

In [40]:
# End-to-end model: column preprocessing followed by the XGBoost classifier
# wrapped in OrdinalClassifier (from sklearn_ordinal).
pipe = Pipeline(
    steps=[
        ('mapper', mapper),
        ('clf', OrdinalClassifier(clf)),
    ],
)

# Fit Model on Full Sample

In [41]:
# Fit on every labelled revision (no hold-out split). A copy of the frame is
# passed in, and the 'wp10' column is the target.
fitted = pipe.fit(
    revisions.copy(),
    revisions['wp10'],
)



In [44]:
# Store in-sample predictions next to the labels; the cells below compare the
# 'pred' and 'wp10' columns.
revisions['pred'] = pipe.predict(revisions.copy())



In [46]:
# Confusion table: rows are the labelled class ('wp10'), columns are the
# predicted class ('pred').
rev_ct = pd.crosstab(revisions["wp10"], revisions["pred"])
rev_ct

pred,Stub,Start,C,B,GA,FA
wp10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stub,4942,523,17,3,1,0
Start,1029,3819,502,98,25,3
C,164,1765,2835,299,301,121
B,111,1293,2102,1133,449,398
GA,131,258,1260,212,2294,1340
FA,114,39,276,236,701,3630


In [56]:
# For each class used as a threshold, print the share of revisions whose
# prediction and label fall on the same side of it (both at-or-below, or both
# above).
for grade in ("Stub", "Start", "C", "B", "GA", "FA"):
    pred_at_or_below = revisions["pred"] <= grade
    label_at_or_below = revisions["wp10"] <= grade
    print(grade, np.mean(pred_at_or_below == label_at_or_below))

Stub 0.935449050086
Start 0.860473723168
C 0.801535899334
B 0.882062669627
GA 0.90044411547
FA 1.0


In [None]:
# Score the fitted pipeline on the same data it was trained on, so this is an
# in-sample figure rather than an estimate of generalisation.
# The call was previously cut off after the first argument and could not run.
# NOTE(review): the metric is whatever OrdinalClassifier.score returns
# (presumably mean accuracy) -- confirm in sklearn_ordinal.
fitted.score(X=revisions.copy(), y=revisions['wp10'])

In [None]:
# Serialise the fitted pipeline with dill (the mapper contains a lambda).
with open("../models/model.pkl", "wb") as model_file:
    dill.dump(fitted, model_file)

# Evaluate Model Performance

# TODO

1. Cross validate
2. Out of sample
3. Other