In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from nltk.stem.snowball import SnowballStemmer
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
from nltk import word_tokenize



In [2]:
# Read Handelingen.csv (first column becomes the index), discard every row
# that contains a missing value, and convert the 'datum' column to datetimes.
df = (
    pd.read_csv("Handelingen.csv", index_col=0)
    .dropna()
    .assign(datum=lambda frame: pd.to_datetime(frame['datum']))
)
# Optional restriction to a date window, currently disabled. Recipe from
# https://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates
#mask = (df['datum'] > '2012-11-05') & (df['datum'] <= '2017-03-23')
#df = df.loc[mask]
len(df)

11179

In [3]:
class StemTokenizer(object):
    """Callable that tokenizes a document and stems every token.

    An instance can be handed to ``CountVectorizer`` as its ``tokenizer``.
    """

    def __init__(self):
        # A single Dutch Snowball stemmer is built once and reused on each call.
        self.sbs = SnowballStemmer("dutch")

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc`` as a list, in token order."""
        tokens = word_tokenize(doc)
        return list(map(self.sbs.stem, tokens))

In [4]:
# Build the TF-IDF matrix in one expression: the inner call turns the
# stemmed documents into raw term counts, the outer call re-weights them.
Tekst = TfidfTransformer().fit_transform(
    CountVectorizer(tokenizer=StemTokenizer()).fit_transform(df.tekst)
)

<11179x27274 sparse matrix of type '<class 'numpy.float64'>'
	with 722579 stored elements in Compressed Sparse Row format>

In [9]:
#https://stackoverflow.com/questions/46735847/save-best-params-in-gridsearch-in-a-pandas-dataframe
class EstimatorSelectionHelper:
    """Run one grid search per estimator and tabulate every candidate's CV scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an (unfitted) estimator.
    params : dict
        Maps the same names to the parameter grid searched for that estimator.

    Raises
    ------
    ValueError
        If some name in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by fit(): estimator name -> fitted GridSearchCV object.
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are passed
        straight through to ``GridSearchCV``. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    @staticmethod
    def _candidate_scores(search):
        """Yield ``(per_fold_scores, params)`` for each candidate of a fitted search.

        Reads ``cv_results_`` when the search object has it and falls back to
        the legacy ``grid_scores_`` list otherwise, so both generations of the
        scikit-learn grid-search API are supported.
        """
        results = getattr(search, 'cv_results_', None)
        if results is None:
            # Legacy API: a list of records carrying the fold scores directly.
            for candidate in search.grid_scores_:
                yield candidate.cv_validation_scores, candidate.parameters
            return

        # cv_results_ stores one array per fold, indexed by candidate.
        fold_keys = []
        while 'split%d_test_score' % len(fold_keys) in results:
            fold_keys.append('split%d_test_score' % len(fold_keys))
        if not fold_keys:
            raise ValueError("cv_results_ has no 'split<i>_test_score' entries; "
                             "only single-metric scoring is supported.")
        for index, candidate_params in enumerate(results['params']):
            yield [results[fold][index] for fold in fold_keys], candidate_params

    def score_summary(self, sort_by='mean_score'):
        """Return a DataFrame with one row per (estimator, parameter combination).

        Each row holds the estimator name, the min / mean / max / std of the
        per-fold scores, and one column per searched parameter (missing where a
        parameter does not apply to that estimator). Rows are sorted by
        ``sort_by`` in descending order.

        Raises
        ------
        KeyError
            If any registered estimator has no fitted grid search yet.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            # Summary fields come last so they win over a same-named parameter.
            return pd.Series({**params, **d})

        unfitted = [k for k in self.keys if k not in self.grid_searches]
        if unfitted:
            raise KeyError("No grid search results for %s; call fit() first." % unfitted)

        rows = [row(k, scores, params)
                for k in self.keys
                for scores, params in self._candidate_scores(self.grid_searches[k])]
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed summary columns first, then whatever parameter columns exist.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

# Inputs for the model comparison. Despite the `_iris` suffix these are the
# TF-IDF matrix `Tekst` and the `partij` column of `df`.
X_iris = Tekst
y_iris = df.partij

# SGDClassifier shuffles the training data while fitting; pinning the seed
# makes the cross-validation scores below reproducible across runs.
RANDOM_STATE = 42

models = {
    'SGDClassifier': SGDClassifier(random_state=RANDOM_STATE),
    'MultinomialNB': MultinomialNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

# One parameter grid per estimator; keys must match those of `models`.
params = {
    'SGDClassifier': {'loss': ('hinge', 'log'),
                      'penalty': ('none', 'l2', 'l1', 'elasticnet')},
    'MultinomialNB': {'alpha': (0, 1)},
    'KNeighborsClassifier': {'n_neighbors': (1, 5, 100)},
}

# 10-fold cross-validated grid search per estimator, scored by weighted F1.
helper = EstimatorSelectionHelper(models, params)
helper.fit(X_iris, y_iris, scoring='f1_weighted', cv=10)

helper.score_summary()

Running GridSearchCV for SGDClassifier.
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   51.6s finished


Running GridSearchCV for MultinomialNB.
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.7s finished


Running GridSearchCV for KNeighborsClassifier.
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   32.6s finished


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,alpha,loss,n_neighbors,penalty
1,SGDClassifier,0.235447,0.263933,0.299075,0.0212693,,hinge,,l2
4,SGDClassifier,0.229057,0.261744,0.284889,0.021122,,log,,none
0,SGDClassifier,0.204991,0.254615,0.285968,0.0267359,,hinge,,none
3,SGDClassifier,0.212515,0.251209,0.279021,0.0220961,,hinge,,elasticnet
5,SGDClassifier,0.208609,0.232645,0.262694,0.0148735,,log,,l2
7,SGDClassifier,0.201823,0.228563,0.250349,0.0130872,,log,,elasticnet
8,MultinomialNB,0.172683,0.223984,0.259127,0.024117,0.0,,,
6,SGDClassifier,0.196511,0.216995,0.2334,0.00998975,,log,,l1
2,SGDClassifier,0.171888,0.19331,0.218646,0.0163956,,hinge,,l1
10,KNeighborsClassifier,0.143785,0.188159,0.226244,0.0286439,,,1.0,
