In [43]:
import corpus as corpus_class
import categories, filters
from filters import std_filters
from sklearn.metrics import f1_score
import pickle

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
import sklearn
import warnings
# Silence two warning categories for the whole notebook:
#   * NumPy's VisibleDeprecationWarning
#   * scikit-learn's UndefinedMetricWarning, which the F1 scorers emit when the
#     score is ill-defined for a class.
# UndefinedMetricWarning is imported from its public location (sklearn.exceptions)
# rather than via the private sklearn.metrics.classification module path.
from sklearn.exceptions import UndefinedMetricWarning

warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
warnings.simplefilter("ignore", UndefinedMetricWarning)

In [34]:
# --- Input files -----------------------------------------------------------
qfile_train, qcatfile_train = 'question_train.csv', 'question_category_train.csv'
catfile = 'category.csv'
qfile_test = 'question_test.csv'

# Keyword arguments forwarded to corpus.process() below.
filtees = std_filters()

# --- Build, process and persist the training corpus --------------------------
# NOTE(review): subcategories=False presumably restricts the labels to
# top-level categories — confirm against categories.categories.
category_scheme = categories.categories(subcategories=False)
corpus = corpus_class.corpus(category_scheme)
corpus.load(qfile_train, qcatfile_train)
corpus.process(corpus_size=1, test_corpus=False, **filtees)
# Trailing semicolon keeps the notebook from echoing the return value.
corpus.save();



In [91]:
# Rebind `corpus` to whatever corpus_class.load_from_file() returns.
# NOTE(review): presumably this reads back the file written by corpus.save()
# in the build cell — confirm the default path used by both calls.
corpus = corpus_class.load_from_file()

In [92]:
# Split the corpus before model selection; corpus.X_tr / corpus.y_tr are used by
# every grid search below.
# NOTE(review): the argument is presumably the held-out test fraction, so 0
# would keep all samples for training — confirm against corpus.simple_split.
corpus.simple_split(0);

# Multinomial Bayes

In [85]:
from sklearn.naive_bayes import MultinomialNB

In [90]:
# Search space for MultinomialNB: 50 smoothing strengths, log-spaced
# between 1e-4 and 1e4.
NB_PARAMS = {
    'alpha': np.logspace(start=-4, stop=4, num=50, base=10),
}

In [97]:
# Exhaustive 6-fold search over NB_PARAMS, ranking candidates by macro-averaged F1.
# NOTE(review): fit_params and iid are accepted only by older scikit-learn
# releases; they are kept because the notebook targets such a release.
NB_CV = GridSearchCV(
    MultinomialNB(),
    NB_PARAMS,
    scoring='f1_macro',
    fit_params=None,
    n_jobs=-1,                  # use every available core
    iid=False,                  # plain mean over folds, not weighted by fold size
    refit=True,                 # refit the best setting so best_estimator_ is usable
    cv=6,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score='raise',        # a failing fit aborts the search instead of scoring it
    return_train_score=True,
)

In [98]:
# Run the naive-Bayes grid search on the corpus training split
# (50 alpha candidates x 6 folds, plus the final refit).
# The trailing semicolon suppresses the echo of the fitted search object.
NB_CV.fit(corpus.X_tr, corpus.y_tr);

In [99]:
# Show the estimator that was refit with the best-scoring alpha
# (available because the search was constructed with refit=True).
NB_CV.best_estimator_

MultinomialNB(alpha=0.12648552168552957, class_prior=None, fit_prior=True)

In [102]:
# Persist the fitted naive-Bayes search so the results can be reloaded later
# without repeating the cross-validation.
with open("cv_final/NB_CV", 'wb') as sink:
    pickle.dump(NB_CV, sink)

# Logistic Regression

In [117]:
from sklearn.linear_model import LogisticRegression

In [118]:
# Search space for LogisticRegression: 50 values of C, log-spaced
# between 1e-4 and 1e2.
LR_PARAMS = {
    'C': np.logspace(start=-4, stop=2, num=50, base=10),
}

In [107]:
# 6-fold grid search over LR_PARAMS for logistic regression, ranked by
# macro-averaged F1.
# The result is bound to LR_CV, the name the following cells fit, inspect and
# pickle. It was previously assigned to LSVM_CV, which left LR_CV undefined on
# a fresh kernel and collided with the linear-SVM search further down.
LR_CV = GridSearchCV(LogisticRegression(), LR_PARAMS, scoring='f1_macro',
                     fit_params=None, n_jobs=-1, iid=False, refit=True,
                     cv=6, verbose=0, pre_dispatch='2*n_jobs', error_score='raise',
                     return_train_score=True)

In [108]:
# Run the logistic-regression grid search on the corpus training split.
# The trailing semicolon suppresses the echo of the fitted search object.
LR_CV.fit(corpus.X_tr, corpus.y_tr);

In [110]:
# Show the logistic-regression model refit with the best-scoring C.
LR_CV.best_estimator_

LogisticRegression(C=3.3932217718953299, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [111]:
# Persist the fitted logistic-regression search for later inspection.
with open("cv_final/LR_CV", 'wb') as sink:
    pickle.dump(LR_CV, sink)

# Linear Support Vector Machines

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.svm import LinearSVC

In [103]:
# Pipeline template for the linear SVM experiments:
#   1. 'tfidf'     - TF-IDF transform of the input features
#   2. 'selection' - keep a percentile of features, ranked by the chi2 score
#   3. 'lsvm'      - linear support vector classifier
# The step names are the prefixes used by the `<step>__<param>` grid keys.
lsvm_stages = [
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lsvm', LinearSVC()),
]
LinearSVM = Pipeline(steps=lsvm_stages)

# Hyper-parameter axes for the LinearSVM pipeline search.
PERCENTILE = np.linspace(50, 100, num=6)                 # 50, 60, ..., 100
C = np.logspace(-2, 2, num=31, base=10)                  # 1e-2 .. 1e2
INTERCEPT_SCALING = np.logspace(0, 3, num=11, base=10)   # 1 .. 1e3
CLASS_WEIGHT = [None, 'balanced']                        # not referenced by LSVM_PARAMS below


def _lsvm_subgrid(intercept_scaling, class_weight):
    """Return one sub-grid; every sub-grid shares the PERCENTILE and C axes."""
    return {
        'selection__percentile': PERCENTILE,
        'lsvm__C': C,
        'lsvm__intercept_scaling': intercept_scaling,
        'lsvm__class_weight': class_weight,
    }


# Two sub-grids: intercept scaling is only varied together with balanced
# class weights; the unweighted variant keeps intercept_scaling fixed at 1.
LSVM_PARAMS = [
    _lsvm_subgrid([1], [None]),
    _lsvm_subgrid(INTERCEPT_SCALING, ['balanced']),
]


In [104]:
# 6-fold grid search over LSVM_PARAMS for the LinearSVM pipeline.
# NOTE(review): this search ranks candidates by 'f1_micro', whereas the
# naive-Bayes and logistic-regression searches in this notebook use
# 'f1_macro' — confirm the difference is intentional.
LSVM_CV = GridSearchCV(
    LinearSVM,
    LSVM_PARAMS,
    scoring='f1_micro',
    fit_params=None,
    n_jobs=-1,                  # use every available core
    iid=False,                  # plain mean over folds, not weighted by fold size
    refit=True,                 # refit the best setting so best_estimator_ is usable
    cv=6,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score='raise',
    return_train_score=True,
)

In [None]:
# Run the LinearSVM pipeline search on the corpus training split.
# With the grid defined above this is 6*31 + 6*31*11 = 2232 candidates,
# each fitted on 6 folds, so expect this cell to be slow.
LSVM_CV.fit(corpus.X_tr, corpus.y_tr);

In [130]:
# Show the pipeline refit with the best-scoring parameter combination.
LSVM_CV.best_estimator_

Pipeline(steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('selection', SelectPercentile(percentile=90.0,
         score_func=<function chi2 at 0x7fa828522e18>)), ('lsvm', LinearSVC(C=0.29286445646252357, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
     tol=0.0001, verbose=0))])

In [None]:
# Persist the fitted LinearSVM search so it can be reloaded without re-running
# the cross-validation. Opened write-only ('wb'), matching the other dump
# cells; the read access implied by 'wb+' was never used.
with open("cv_final/LSVM_CV", 'wb') as file:
    pickle.dump(LSVM_CV, file)

# Reading

In [108]:
# Restore the LinearSVM grid search that was pickled to cv_final/LSVM_CV.
# pickle.load can execute arbitrary code, so only load files produced by this
# notebook.
with open("cv_final/LSVM_CV", 'rb') as source:
    LSVM_CV = pickle.load(source)

In [109]:
# List the columns available in the search results.
# The frame is built here directly from LSVM_CV.cv_results_ because `df` is only
# assigned in a later cell, so `df.columns` failed when run top to bottom.
import pandas as pd

pd.DataFrame(LSVM_CV.cv_results_).columns

Index(['mean_fit_time', 'mean_score_time', 'mean_test_score',
       'mean_train_score', 'param_lsvm__C', 'param_lsvm__class_weight',
       'param_lsvm__intercept_scaling', 'param_selection__percentile',
       'params', 'rank_test_score', 'split0_test_score', 'split0_train_score',
       'split1_test_score', 'split1_train_score', 'split2_test_score',
       'split2_train_score', 'split3_test_score', 'split3_train_score',
       'split4_test_score', 'split4_train_score', 'split5_test_score',
       'split5_train_score', 'std_fit_time', 'std_score_time',
       'std_test_score', 'std_train_score'],
      dtype='object')

In [110]:
import pandas as pd

# Rank every grid-search candidate by mean cross-validated test score, best first.
# DataFrame.sort(columns=...) was deprecated and then removed from pandas;
# sort_values(by=...) is its replacement and yields the same ordering.
df = pd.DataFrame(LSVM_CV.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Export the ranking, restricted to the score and the LinearSVC settings, as an
# HTML table. The file is only written, so plain "w" is sufficient.
with open("cv_final/html.html", "w") as file:
    file.write(df.to_html(columns=["mean_test_score", "param_lsvm__C", 'param_lsvm__intercept_scaling', 'param_lsvm__class_weight']))

  from ipykernel import kernelapp as app


In [124]:
# Re-split the corpus for the hold-out evaluation below; the call returns the
# corpus object, which the notebook echoes because there is no trailing semicolon.
# NOTE(review): 1/6 is presumably the held-out fraction that populates
# corpus.X_te / corpus.y_te — confirm against corpus.simple_split.
corpus.simple_split(1/6)

<corpus.corpus at 0x7fa819d496a0>

In [128]:
# Hold-out evaluation of the configuration selected by the grid search.
# Displays the tuple (macro-averaged F1, pipeline .score()) on the test split.
from sklearn.base import clone

# Fit an unfitted copy, so the estimator stored inside LSVM_CV is not
# overwritten by this refit on the new training split.
lsvm = clone(LSVM_CV.best_estimator_)
lsvm.fit(corpus.X_tr, corpus.y_tr)
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_score(corpus.y_te, lsvm.predict(corpus.X_te), average="macro"), lsvm.score(corpus.X_te, corpus.y_te)

(0.56271471755168823, 0.61480828558836487)

In [129]:
# Hold-out evaluation of a hand-picked configuration (balanced class weights).
# Displays the tuple (macro-averaged F1, pipeline .score()) on the test split.
from sklearn.base import clone

# set_params() returns the estimator it was called on, so configure a clone;
# otherwise the shared LinearSVM template would be modified in place.
lsvm2 = clone(LinearSVM).set_params(lsvm__C=0.48, selection__percentile=90, lsvm__class_weight='balanced', lsvm__intercept_scaling=10)
lsvm2.fit(corpus.X_tr, corpus.y_tr)
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_score(corpus.y_te, lsvm2.predict(corpus.X_te), average="macro"), lsvm2.score(corpus.X_te, corpus.y_te)

(0.56515586288149511, 0.60775672102247691)

In [119]:
# Scratch check: the values generated for the PERCENTILE axis of the SVM grid.
np.linspace(50,100,6)

array([  50.,   60.,   70.,   80.,   90.,  100.])

In [112]:
# Scratch check of a log-spaced axis from 1 to 1000.
# NOTE(review): this uses 7 points, whereas INTERCEPT_SCALING in the SVM grid
# uses 11 over the same range.
np.logspace(0,3,7,base=10)

array([    1.        ,     3.16227766,    10.        ,    31.6227766 ,
         100.        ,   316.22776602,  1000.        ])