In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from nltk.stem.snowball import SnowballStemmer
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
from nltk import word_tokenize
from nltk.corpus import stopwords



In [2]:
#source: http://scikit-learn.org/stable/modules/feature_extraction.html
class StemTokenizer(object):
    def __init__(self):
        self.sbs = SnowballStemmer("dutch")
    def __call__(self, doc):
        return [self.sbs.stem(t) for t in word_tokenize(doc)]

In [3]:
df = pd.read_csv("Handelingen.csv", index_col=0)
df = df.dropna()
#https://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates
df['datum'] = pd.to_datetime(df['datum'])
mask = (df['datum'] > '2012-11-05') & (df['datum'] <= '2017-03-23')
df = df.loc[mask]
df = df[df['partij'].isin(['50PLUS', 'CDA','ChristenUnie','D66','GroenLinks','PVV','PvdA','PvdD','SGP','SP','VVD'])]
Partij = df.partij
print(len(df))
Tekst = CountVectorizer(tokenizer=StemTokenizer(),stop_words=stopwords.words('dutch'), ngram_range=(1,2)).fit_transform(df.tekst)
df.head(5)

144336


Unnamed: 0,achternaam,partij,tekst,file,datum,tags
107136,Wilders,PVV,Mevrouw de voorzitter. Dit kabinet heeft ons m...,h-tk-20122013-100-3.xml,2013-06-26,"['Bestuur | Parlement', 'Financiën | Begroting']"
107137,Roemer,SP,"Voorzitter. Vorige week plaatsten werkgevers, ...",h-tk-20122013-100-3.xml,2013-06-26,"['Bestuur | Parlement', 'Financiën | Begroting']"
107138,Pechtold,D66,Voorzitter. Twee maanden geleden kreeg dit kab...,h-tk-20122013-100-3.xml,2013-06-26,"['Bestuur | Parlement', 'Financiën | Begroting']"
107139,Van Haersma Buma,CDA,Voorzitter. Alleen al in de eerste drie maande...,h-tk-20122013-100-3.xml,2013-06-26,"['Bestuur | Parlement', 'Financiën | Begroting']"
107141,Van Haersma Buma,CDA,Dan het sociaal akkoord. De voorzitter van MKB...,h-tk-20122013-100-3.xml,2013-06-26,"['Bestuur | Parlement', 'Financiën | Begroting']"


In [4]:
del df

In [8]:
#https://stackoverflow.com/questions/46735847/save-best-params-in-gridsearch-in-a-pandas-dataframe
class EstimatorSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters) 
                     for k in self.keys
                     for gsc in self.grid_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


In [6]:
df = pd.read_csv('Models.tsv', sep ='\t', index_col=0)
models = {}
for x,y in zip(df.Classifier,df.PIPELINE):
    exec(compile("a="+y,'','exec'))
    models[x] = Pipeline(a)    
params = {}
for clf in set(df.Classifier):
    test = df.loc[df.Classifier == clf]
    test = test.dropna(axis=1, how='all').drop(["PIPELINE","Classifier"], axis =1)
    test = test.to_dict(orient='list')
    testdict = {}
    for x,y in test.items():
        exec(compile("a="+y[0],'','exec'))
        testdict[x] = a   
    params[clf] = testdict
params

{'LogisticClassifier': {'SGD__penalty': ['none', 'l2', 'l1', 'elasticnet']},
 'MultinomialNB': {'MNB__alpha': [0, 0.5, 1]},
 'SVM': {'SGD__penalty': ['none', 'l2', 'l1', 'elasticnet']}}

In [9]:
helper = EstimatorSelectionHelper(models, params)
helper.fit(Tekst, Partij, scoring='f1_weighted', cv=2)

scoresdf = helper.score_summary()
scoresdf.to_csv("Scores.csv")

Running GridSearchCV for LogisticClassifier.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min finished


Running GridSearchCV for MultinomialNB.
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   55.5s finished


Running GridSearchCV for SVM.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.2min finished


In [10]:
scoresdf

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,MNB__alpha,SGD__penalty
8,SVM,0.340556,0.344019,0.347482,0.00346291,,l2
7,SVM,0.314872,0.31896,0.323048,0.00408798,,none
0,LogisticClassifier,0.30126,0.302487,0.303714,0.00122703,,none
10,SVM,0.26585,0.267044,0.268238,0.00119409,,elasticnet
4,MultinomialNB,0.240791,0.240808,0.240825,1.67727e-05,0.0,
1,LogisticClassifier,0.237929,0.239958,0.241987,0.00202891,,l2
3,LogisticClassifier,0.215631,0.218458,0.221285,0.0028271,,elasticnet
2,LogisticClassifier,0.209787,0.214837,0.219887,0.0050501,,l1
5,MultinomialNB,0.18685,0.189328,0.191806,0.00247772,0.5,
6,MultinomialNB,0.170105,0.172166,0.174228,0.00206138,1.0,


In [11]:
!git pull
!git add Algorithm.ipynb
!git add Scores.csv
!git add Models.tsv
!git commit -m Algorithm.ipynb
!git commit -m Scores.csv
!git commit -m Models.tsv
!git push

Already up to date.


The file will have its original line endings in your working directory.


[master e885d2e] Algorithm.ipynb
 3 files changed, 263 insertions(+), 79 deletions(-)
 rewrite Models.tsv (86%)
 rewrite Scores.csv (100%)
On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
	modified:   Scraper.ipynb

Untracked files:
	.ipynb_checkpoints/
	DataInfo.ipynb
	Handelingen.csv
	HandelingenTK/
	HandelingenTKmeta/
	Literatuur/
	Scraper.md
	Untitled.ipynb

no changes added to commit
On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
	modified:   Scraper.ipynb

Untracked files:
	.ipynb_checkpoints/
	DataInfo.ipynb
	Handelingen.csv
	HandelingenTK/
	HandelingenTKmeta/
	Literatuur/
	Scraper.md
	Untitled.ipynb

no changes added to commit


To https://github.com/jaspervdh96/Scriptie-Sprekers-TK.git
   2be15fc..e885d2e  master -> master
