# Train Model

In [101]:
import sys
sys.path.append("..")
%reload_ext autoreload
%autoreload 2

In [102]:
import pandas as pd
from sklearn_pandas import cross_val_score, DataFrameMapper
from pandas import Categorical
from sklearn.ensemble import RandomForestClassifier
import mwapi

In [103]:
from wikidit.models import featurize, load_wp10

In [104]:
# Path (relative to this notebook) of the gzipped CSV that is loaded below.
input_file = "../data/enwiki.labeling_revisions.w_features.nettrom_30k.csv.gz"
# load_wp10 is imported from wikidit.models; later cells index the result with
# revisions['wp10'] and pass revisions.copy() to the pipeline.
revisions = load_wp10(input_file)

# Create a pipeline

In [105]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures

In [106]:
import xgboost as xgb

import dill

In [107]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.base import BaseEstimator, TransformerMixin

# Columns that the mapper below passes through np.sqrt
# (presumably count-valued features -- TODO confirm against load_wp10).
sqrt_cols = ['words',
             'headings',
             'sub_headings',
             'images',
             'categories',
             'wikilinks',

             'who_templates',
             'main_templates',
             'cite_templates',
             # 'infoboxes' is deliberately absent here; it is binarized below instead
             'citation_needed',
             'other_templates',

             'ref',
             'smartlists',
             'coordinates']

# Columns that the mapper casts to bool.
# NOTE(review): 'coordinates' is listed here AND in sqrt_cols, so it is fed to
# both transformers -- confirm that emitting it twice is intended.
binarized_cols = ['coordinates', 'infoboxes']

# Feature-extraction step of the pipeline: one transformer per column group.
# NOTE(review): the lambda cannot be serialized by the standard pickle module;
# the fitted pipeline is saved with dill further down.
mapper = DataFrameMapper([
    (sqrt_cols, FunctionTransformer(func=np.sqrt)),
    (binarized_cols, FunctionTransformer(func=lambda x: x.astype(bool)))
])

# Earlier experiment, kept for reference only (not executed):
# clf = LogisticRegressionCV(multi_class='multinomial', 
#                           random_state=1234, 
#                           penalty='l2',
#                           fit_intercept=True, 
#                           n_jobs=-1)
# NOTE(review): this `clf` is rebound to xgb.XGBClassifier() in a later cell
# before the pipeline is built, so these hyper-parameters are not the ones
# used by the fitted model.
clf = xgb.XGBClassifier(max_depth=5, learning_rate=0.01, n_estimators=100, silent=True, objective='binary:logistic')

Get categories

In [108]:
import numpy as np
import warnings

from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.utils import Parallel, delayed
from sklearn.utils.validation import has_fit_parameter, check_is_fitted
from sklearn.utils.metaestimators import _BaseComposition
from sklearn.utils import Bunch


def _parallel_fit_estimator(estimator, X, y, cat):
    """Private function used to fit an estimator to a class within a job."""
    touse = (y >= cat)
    y_transformed = y[touse] > cat
    estimator.fit(X[touse, :], y_transformed)
    return estimator


class OrdinalClassifier(_BaseComposition, ClassifierMixin, TransformerMixin):
    """Ordinal classifier built from a chain of binary classifiers.

    For every category except the last, a clone of ``estimator`` is fitted
    (via ``_parallel_fit_estimator``) on the samples whose label is at or
    above that category, with the binary target "label is strictly above the
    category".  Class log-probabilities are obtained by accumulating these
    conditional log-probabilities along the category order.

    Parameters
    ----------
    estimator : estimator object
        Binary classifier cloned once per category threshold.  It must
        provide ``fit`` and either ``predict_log_proba`` or ``predict_proba``.
    n_jobs : int or None
        Forwarded to ``Parallel`` when fitting the per-threshold estimators.
    proba_transform : bool-like
        If truthy, ``transform`` returns class probabilities; otherwise it
        returns predicted labels.
    left : bool
        Stored on the instance; not used by any method of this class.

    Attributes
    ----------
    classes_ : pd.Index
        Categories of the training target, in order.
    estimators_ : list
        Fitted binary estimators, one per category except the last.
    named_estimators_ : Bunch
        The same estimators keyed by their threshold category.
    """

    def __init__(self, estimator, n_jobs=None, proba_transform=None, left=True):
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.proba_transform = proba_transform
        self.left = left

    @property
    def named_estimators(self):
        """Bunch exposing the (unfitted) base estimator under ``estimator``."""
        # ``self.estimator`` is a single estimator, not a list of
        # (name, estimator) pairs, so it cannot be fed to ``dict()``.
        return Bunch(estimator=self.estimator)

    def fit(self, X, y, categories='auto'):
        """Fit one binary estimator per category threshold.

        Parameters
        ----------
        X : array-like supporting ``X[mask, :]`` indexing
            Training features.
        y : pd.Series with an ordered Categorical dtype
            Training labels; the category order defines the ordinal scale.
        categories : ignored
            Kept for backward compatibility; the thresholds are always taken
            from ``y.cat.categories``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not a categorical ``pd.Series``, is not ordered, or
            has fewer than two categories.
        """
        if not (isinstance(y, pd.Series) and hasattr(y, "cat")):
            raise ValueError("y must be pd.Series object with dtype Categorical")
        if not y.cat.ordered:
            # The threshold comparisons (>=, >) are only defined for ordered
            # categoricals; fail early with a clear message.
            raise ValueError("y must be an ordered Categorical")

        # this is hard-coded for categorical variables
        self.classes_ = y.cat.categories

        # Every category but the last acts as a threshold.
        thresholds = self.classes_[:-1]
        if len(thresholds) == 0:
            raise ValueError("y must have at least two categories")

        # order of estimators follows the category order
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(self.estimator), X, y, cat)
                for cat in thresholds)

        self.named_estimators_ = Bunch(**dict())
        for k, e in zip(thresholds, self.estimators_):
            self.named_estimators_[k] = e
        return self

    def predict(self, X):
        """Return the most probable category for each sample as an ordered
        ``pd.Categorical`` over ``classes_``."""
        out = np.argmax(self.predict_proba(X), axis=1)
        out = pd.Categorical.from_codes(out, categories=self.classes_, ordered=True)
        return out

    def _collect_log_probas(self, X):
        """Return the log-probability output of every fitted estimator, in
        threshold order."""
        results = []
        for est in self.estimators_:
            # Decide per fitted estimator (not via some unrelated global)
            # whether log-probabilities are available natively.
            if hasattr(est, "predict_log_proba"):
                results.append(est.predict_log_proba(X))
            else:
                results.append(np.log(est.predict_proba(X)))
        return results

    def _predict_log_proba(self, X):
        """Predict log class probabilities for X.

        Estimator ``i`` yields two columns: the conditional log-probability
        of "label == category i" and of "label > category i", given
        "label >= category i".  Column ``i`` of the running result holds
        log P(label >= category i) from the previous step, so adding it turns
        the conditionals into joint log-probabilities.
        """
        log_probas = self._collect_log_probas(X)
        out = np.empty((log_probas[0].shape[0], len(self.classes_)))
        for i, logp in enumerate(log_probas):
            if i > 0:
                # add log conditional probability (without mutating the
                # array returned by the estimator)
                logp = logp + out[:, [i]]
            out[:, i:(i + 2)] = logp
        return out

    def predict_log_proba(self, X):
        """Compute log-probabilities of each category for samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Samples to score.

        Returns
        -------
        log_proba : ndarray, shape = [n_samples, n_classes]
            Log-probability of each category, columns ordered as
            ``classes_``.
        """
        return self._predict_log_proba(X)

    def predict_proba(self, X):
        """Compute probabilities of each category for samples in X."""
        return np.exp(self.predict_log_proba(X))

    def transform(self, X):
        """Return class probabilities if ``proba_transform`` is truthy,
        otherwise predicted labels."""
        if self.proba_transform:
            return self.predict_proba(X)
        else:
            return self.predict(X)

    def set_params(self, **params):
        """Set parameters of this estimator.

        Valid keys can be listed with ``get_params()``; parameters of the
        wrapped estimator use the ``estimator__<name>`` form.

        Returns
        -------
        self
        """
        # ``estimator`` is a single estimator, so the list-of-tuples helper
        # ``_BaseComposition._set_params`` does not apply; the plain
        # BaseEstimator implementation already handles nested parameters.
        super(OrdinalClassifier, self).set_params(**params)
        return self

    def get_params(self, deep=True):
        """Get parameters of this estimator.

        With ``deep=True`` the wrapped estimator's parameters are included
        under ``estimator__<name>`` keys.
        """
        return super(OrdinalClassifier, self).get_params(deep=deep)


In [109]:
# defaults for now
# (this rebinds `clf`, replacing the configured XGBClassifier created earlier)
clf = xgb.XGBClassifier()

# Two-step pipeline: the DataFrameMapper turns the revisions frame into
# features, then OrdinalClassifier fits clones of `clf` using 4 parallel jobs.
pipe = Pipeline([
    ('mapper', mapper),
    ('clf', OrdinalClassifier(clf, n_jobs=4))
])

# Fit Model on Full Sample

In [110]:
# Fit on every loaded revision; the target is the 'wp10' column, which
# OrdinalClassifier.fit requires to be a categorical pd.Series.
fitted = pipe.fit(X=revisions.copy(), y=revisions['wp10'])



In [111]:
# Number of revisions whose predicted class equals the true 'wp10' label.
# This is computed on the same data the model was fitted on, so it is an
# in-sample count, not an estimate of out-of-sample performance.
np.sum(fitted.predict(revisions.copy()) == revisions['wp10'])



20093

In [112]:
# Serialize the fitted pipeline with dill to a path relative to this notebook.
with open("../models/model-xgboost-ordinal.pkl", "wb")  as f:
    dill.dump(fitted, f)

In [114]:
# NOTE(review): scratch array; no visible cell reads `X` afterwards -- confirm
# whether this cell can be dropped.
X = np.array([[0.5, 0.2, 0.3], [0.1, 0.2, 0.3]])


array([[ 0.5,  0.2,  0.3],
       [ 0.1,  0.2,  0.3]])

# Evaluate Model Performance

# TODO

1. Cross validate
2. Out of sample
3. Other