In [1]:
import warnings
%matplotlib inline
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import itertools
# Sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import datasets
from nltk.corpus import stopwords
import nltk
import seaborn as sn
from collections import Counter
from gensim.sklearn_api.w2vmodel import W2VTransformer
from gensim.sklearn_api.d2vmodel import D2VTransformer
from itertools import chain
from decimal import Decimal
from IPython.display import display, HTML
from ast import literal_eval
import xlrd
import openpyxl



In [2]:
def row(key, scores, params):
    """Build one result record for a single grid-search parameter setting.

    Parameters
    ----------
    key : name of the estimator the scores belong to.
    scores : per-fold scores for this parameter setting.
    params : mapping of parameter name -> value used for this setting.

    Returns
    -------
    pandas.Series holding every entry of ``params`` followed by
    'estimator', 'mean_score' (mean of ``scores``) and 'f1-scores'
    (``scores`` unchanged). The three summary entries win if ``params``
    happens to contain a key with the same name.
    """
    record = dict(params)
    record['estimator'] = key
    record['mean_score'] = np.mean(scores)
    record['f1-scores'] = scores
    return pd.Series(record)
# Tokenizer whose tokens are the matches of the regular expression \w+.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# Snowball stemmer configured for the "dutch" language.
stemmer = nltk.stem.snowball.SnowballStemmer("dutch")

In [None]:
%%time
# Importing data
df = pd.read_csv("Handelingen.csv", index_col=0)
# Keep only the rows labelled 'Main Speech'. The explicit .copy() gives us an
# independent frame, so the column assignments below are not chained
# assignments on a slice (which pandas flags with SettingWithCopyWarning).
df = df.loc[df['speech category'] == 'Main Speech'].copy()

# list_text: stemmed tokens of every text; normal_text: those same tokens
# joined with single spaces. Both columns are used further down the notebook,
# so they are kept as two separate steps.
df['list_text'] = df.text.apply(lambda x: [stemmer.stem(t) for t in tokenizer.tokenize(x)])
df['normal_text'] = df.list_text.apply(' '.join)
df.head(5)

# Data Info

In [None]:
# Count the rows per value of the `party` column and export the table
# (without a header row) as LaTeX.
countdf = df['party'].value_counts()
with open("Verslag/Spreekbeurten.tex", mode="w") as tex_file:
    tex_file.write(countdf.to_latex(header=False))
countdf

In [None]:
# Document frequency per token: every document is reduced to its set of
# distinct tokens first, so a token is counted at most once per document.
tekst = chain.from_iterable(list(set(doc)) for doc in df.list_text)
count = Counter()
count.update(tekst)
# Only the frequencies themselves are needed for the distribution plot.
samples = [*count.values()]

In [None]:
# Distribution of document frequencies: for each document frequency (x) the
# number of distinct tokens that occur in exactly that many documents (y).
freq_hist = sorted(Counter(samples).items())
doc_freqs, token_counts = zip(*freq_hist)

# Explicit figure/axes plus labels so the figure is readable on its own.
fig, ax = plt.subplots()
ax.loglog(doc_freqs, token_counts)
ax.set(
    title='Token document-frequency distribution',
    xlabel='Number of documents containing the token',
    ylabel='Number of distinct tokens',
)
plt.show()

In [None]:
# Length (number of stemmed tokens) of every document.
tekst = list(map(len, df.list_text))

In [None]:
# Distribution of document lengths: for each length in tokens (x) the number
# of documents that have exactly that length (y).
length_hist = sorted(Counter(tekst).items())
doc_lengths, doc_counts = zip(*length_hist)

# Explicit figure/axes plus labels so the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(doc_lengths, doc_counts)
ax.set(
    title='Document length distribution',
    xlabel='Tokens per document',
    ylabel='Number of documents',
)
plt.show()

# Models

In [None]:
def modelsdownload(only_doen=False):
    """Load the model sheet 'Models.xlsx' and build one Pipeline per row.

    Parameters
    ----------
    only_doen : bool, default False
        When True, keep only the rows whose DOEN column equals True.

    Returns
    -------
    (models_df, models_dict)
        ``models_df`` is the (possibly filtered) sheet; ``models_dict`` maps
        every value of the Classifier column to a ``Pipeline`` built from the
        Python expression stored in the PIPELINE column of the same row.
    """
    models_df = pd.read_excel('Models.xlsx', index_col=0)
    if only_doen:
        models_df = models_df.loc[models_df.DOEN == True]
    models_dict = {}
    for name, steps_source in zip(models_df.Classifier, models_df.PIPELINE):
        # SECURITY: this evaluates Python source taken from the spreadsheet,
        # so Models.xlsx must be a trusted file. The expression is evaluated
        # against the module globals so it can reference the imported
        # estimator classes; the result stays in a local variable instead of
        # being written to a global name as a side effect.
        steps = eval(steps_source, globals())
        models_dict[name] = Pipeline(steps)
    return models_df, models_dict
def modelsdownload2():
    """Load every row of 'Models.xlsx' and build its pipelines.

    Kept for backward compatibility: this is the unfiltered variant of
    ``modelsdownload`` and simply delegates to it, so the loading logic
    lives in one place.

    Returns
    -------
    (models_df, models_dict) exactly as returned by
    ``modelsdownload(only_doen=False)``.
    """
    return modelsdownload(only_doen=False)

In [None]:
models_df, models_dict = modelsdownload(True)

# Parameter grid per classifier: every sheet column whose name contains "__"
# and that is not entirely empty for this classifier becomes one grid entry.
# The cell text of the first matching row is parsed with literal_eval.
params_dict = {}
for clf_name in set(models_df.Classifier):
    clf_rows = models_df.loc[models_df.Classifier == clf_name]
    clf_rows = clf_rows.loc[clf_rows.DOEN == True]
    clf_rows = clf_rows.dropna(axis=1, how='all')
    non_param_columns = [col for col in clf_rows.columns if "__" not in col]
    grid_frame = clf_rows.drop(non_param_columns, axis=1)
    params_dict[clf_name] = {
        param: literal_eval(cells[0])
        for param, cells in grid_frame.to_dict(orient='list').items()
    }

# Classifier name -> value of the RAW column of its row.
text_dict = dict(zip(models_df.Classifier, models_df.RAW))

In [None]:
# Grid-search every loaded pipeline, merge its scores into Scores.csv and mark
# the model as done in Models.xlsx.
# Uses the result attributes of the legacy sklearn.grid_search.GridSearchCV
# imported at the top of the notebook (grid_scores_, cv_validation_scores,
# parameters).
for name, pipe in models_dict.items():
    # Pipelines flagged RAW are fed the token lists, the others the joined text.
    text = list(df.list_text) if text_dict[name] else list(df.normal_text)
    gs = GridSearchCV(pipe, params_dict[name], cv=5, scoring='f1_weighted')
    gs.fit(text, df.party)
    rows = [row(name, gsc.cv_validation_scores, gsc.parameters) for gsc in gs.grid_scores_]

    score_frames = [pd.concat(rows, axis=1).T]
    try:
        score_frames.append(pd.read_csv('Scores.csv', index_col=0))
    except FileNotFoundError:
        # First run: there are no earlier scores to merge with yet.
        pass
    all_scores = pd.concat(score_frames, ignore_index=True)
    # Best mean score first; the "Best Score" section reads the first row.
    all_scores.sort_values(['mean_score'], ascending=False).to_csv('Scores.csv')

    # Re-read the sheet and clear the DOEN flag of the model that just finished.
    # The mask must be built from the model sheet itself; the speech frame `df`
    # is not the frame being updated here.
    models_sheet = pd.read_excel('Models.xlsx', index_col=0)
    models_sheet.loc[models_sheet.Classifier == name, "DOEN"] = False
    models_sheet.to_excel('Models.xlsx')
    
#https://stackoverflow.com/questions/46735847/save-best-params-in-gridsearch-in-a-pandas-dataframe
# https://stackoverflow.com/questions/36271413/pandas-merge-nearly-duplicate-rows-based-on-column-value?rq=1

# Best Score

In [None]:
# First row of Scores.csv (the file is written sorted by mean_score, best
# first), with the columns that are empty for that row dropped.
scores = pd.read_csv('Scores.csv', index_col=0).head(1)
scores = scores.dropna(axis=1).reset_index(drop=True)
# The n-gram range comes back from the CSV as text; parse it into a Python
# object again. `.at` writes the single cell directly instead of going through
# chained indexing (`scores.col[0] = ...`), and the guard covers a best row
# that has no such parameter.
if 'vect__ngram_range' in scores.columns:
    scores.at[0, 'vect__ngram_range'] = literal_eval(scores.at[0, 'vect__ngram_range'])

In [None]:
# Rebuild all pipelines (not only the ones flagged DOEN) and pick the one named
# in the best-score row.
models_df, models_dict = modelsdownload(False)
best_name = scores.estimator[0]
pipe = models_dict[best_name]
# Every column whose name contains "__" is a pipeline parameter; the remaining
# columns are dropped before the single row is turned into keyword arguments.
non_param_columns = [col for col in scores.columns if "__" not in col]
best_params = scores.drop(non_param_columns, axis=1).to_dict(orient='records')[0]
pipe.set_params(**best_params)

In [None]:
# Fixed seed so the 80/20 split, and therefore the reported scores, are the
# same on every run of the notebook.
SEED = 42
df_train = df.sample(frac=0.8, random_state=SEED)
# Test set: every row whose index label was not drawn into the training sample.
df_test = df[~df.index.isin(df_train.index)]
pipe.fit(list(df_train.normal_text), list(df_train.party))
predicted = pipe.predict(list(df_test.normal_text))
print(classification_report(df_test.party, predicted))

In [None]:
#source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : 2-D array of counts.
    classes : sequence of labels used for both the x and the y ticks.
    normalize : when True, every row is divided by its row sum before plotting
        and the cell values are rendered with two decimals instead of as integers.
    title : title placed above the plot.
    cmap : colormap passed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black text.
    half_max = cm.max() / 2.
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            value = cm[r, c]
            text_color = "white" if value > half_max else "black"
            plt.text(c, r, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plt.figure(figsize=(10, 10))
# Fix the label order once and use it for both the matrix and the tick labels.
# Without `labels=`, confusion_matrix only uses the labels that occur in the
# test data or the predictions, which can be fewer than the parties in `df`
# and would shift the tick labels relative to the rows and columns.
party_labels = sorted(df.party.unique())
cnf_matrix = confusion_matrix(df_test.party, predicted, labels=party_labels)
plot_confusion_matrix(cnf_matrix,
                      classes=party_labels,
                      title='Confusion matrix best estimator')
plt.show()

# Git

In [None]:
# Shell commands: pull from the remote, stage the notebooks, Scores.csv, the
# files under Verslag/ and Models.xlsx, commit with the message "Update", push.
!git pull
!git add CompleteNotebook.ipynb
!git add Bestscore.ipynb
!git add Scraper.ipynb
!git add Scores.csv
!git add Verslag/Scriptie_Sprekers_TK.pdf
!git add Verslag/Spreekbeurten.tex
!git add Verslag/MItable.tex
!git add Models.xlsx
!git commit -m Update
!git push