In [1]:
import corpus as corpus_class
import categories, filters
from filters import std_filters
from sklearn.metrics import f1_score
import pickle

from transform import TrainingSupport

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
import sklearn
import warnings
# Silence two noisy warning categories for the rest of the session.
warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
# UndefinedMetricWarning is taken from its public home, sklearn.exceptions.
# The previous path (sklearn.metrics.classification) is a private module that
# later scikit-learn releases renamed, so it is not a stable way to reach it.
warnings.simplefilter("ignore", sklearn.exceptions.UndefinedMetricWarning)

In [2]:
# Obtain a corpus object from the project's corpus module (imported as
# corpus_class) via its load_from_file() helper.
# NOTE(review): the following cell rebinds `corpus` to a freshly built and
# saved corpus, so presumably only one of the two cells has to run — confirm.
corpus = corpus_class.load_from_file()

In [3]:
# CSV inputs used by the notebook. Only the two training files are passed to
# corpus.load() in this cell.
qfile_train, qcatfile_train = 'question_train.csv', 'question_category_train.csv'
catfile = 'category.csv'
qfile_test = 'question_test.csv'

# Keyword arguments produced by the project's std_filters(); forwarded as-is
# to corpus.process() below.
filtees = std_filters()

# Build the corpus from scratch: construct, load the training files,
# process with the standard filters, then save.
corpus = corpus_class.corpus(categories.categories(subcategories=False))
corpus.load(qfile_train, qcatfile_train)
corpus.process(corpus_size=1, test_corpus=False, **filtees)
corpus.save();

# Multinomial Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

In [5]:
# Naive Bayes pipeline: project TrainingSupport step -> optional tf-idf ->
# chi2 percentile feature selection -> MultinomialNB.
# The 'tfidf' slot is None here; the parameter grid decides whether a
# TfidfTransformer is plugged in.
MNB = Pipeline([
    ('tr_supp', TrainingSupport()),
    ('tfidf', None),
    ('selection', SelectPercentile(score_func=chi2)),
    ('mnb', MultinomialNB()),
])

# Search ranges for the Naive Bayes grid.
ALPHA = np.logspace(-4, 4, num=50)        # 50 log-spaced values, 1e-4 .. 1e4 (base 10 is the default)
PERCENTILE = np.linspace(50, 100, num=6)  # 50, 60, ..., 100
TFIF = list()                             # empty list; not referenced by the visible cells

# Two sub-grids that differ only in the 'tfidf' step: one with a
# TfidfTransformer (idf weighting on and off), one with the step disabled.
# The selection and classifier settings are the same in both.
NB_PARAMS = [
    dict(
        tfidf_options,
        **{
            'selection__percentile': PERCENTILE,
            'mnb__alpha': ALPHA,
            'mnb__fit_prior': [True, False],
        }
    )
    for tfidf_options in (
        {'tfidf': [TfidfTransformer()], 'tfidf__use_idf': [True, False]},
        {'tfidf': [None]},
    )
]

# Exhaustive 4-fold grid search over NB_PARAMS, ranked by macro-averaged F1.
# refit=True: the best configuration is refitted on everything passed to fit().
# error_score='raise': a failing candidate aborts the search instead of being
# scored as a failure.
NB_CV = GridSearchCV(
    estimator=MNB,
    param_grid=NB_PARAMS,
    scoring='f1_macro',
    cv=4,
    refit=True,
    iid=False,
    fit_params=None,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=0,
    error_score='raise',
    return_train_score=True,
)

In [6]:
import os

# Create the output directory up front: if it is missing or not writable we
# want to find out before the grid search runs, not after it.
os.makedirs("cv_final", exist_ok=True)

# Run the grid search on the corpus attributes X_all / y.
NB_CV.fit(corpus.X_all, corpus.y);

# Persist the fitted search object (cv_results_ and the refit best estimator).
with open("cv_final/NB_CV", 'wb') as file:
    pickle.dump(NB_CV, file)

In [7]:
# Show the (name, estimator) steps of the pipeline refitted with the best
# parameter combination found by the search.
NB_CV.best_estimator_.steps

[('tr_supp', <transform.TrainingSupport at 0x7f7ffa648a20>),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('selection', SelectPercentile(percentile=100.0,
           score_func=<function chi2 at 0x7f7ff28f9e18>)),
 ('mnb',
  MultinomialNB(alpha=0.18420699693267145, class_prior=None, fit_prior=False))]

In [51]:
from sklearn.base import clone

# Hold-out sanity check of the best Naive Bayes configuration.
# NOTE(review): FREEZE_RANDOM = False presumably makes simple_split() draw a
# fresh random split on every call — confirm in the corpus module.
corpus.FREEZE_RANDOM = False
corpus.simple_split(0.33);

# Fit an unfitted copy rather than NB_CV.best_estimator_ itself: fitting in
# place would overwrite the model GridSearchCV refitted on the full data with
# one trained on the training part of this split only.
nb_holdout = clone(NB_CV.best_estimator_)
nb_holdout.fit(corpus.X_tr, corpus.y_tr)

# Pipeline.score() delegates to the classifier's score (mean accuracy), so this
# number is not the f1_macro value the search optimised.
print( nb_holdout.score(corpus.X_te, corpus.y_te) )

# Called again with 0 after the evaluation.
# NOTE(review): presumably undoes the split — confirm.
corpus.simple_split(0);

0.585930543188


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression grid: 50 log-spaced values of C between 1e-4 and 1e2.
LR_PARAMS = dict(C=np.logspace(-4, 2, num=50))

In [None]:
# 6-fold grid search over C for a plain LogisticRegression, ranked by
# macro-averaged F1.
# Bound to LR_CV: this is the name the following cells fit, inspect and pickle.
# It was previously assigned to LSVM_CV, which left LR_CV undefined.
LR_CV = GridSearchCV(LogisticRegression(), LR_PARAMS, scoring='f1_macro',
                     fit_params=None, n_jobs=-1, iid=False, refit=True,
                     cv=6, verbose=0, pre_dispatch='2*n_jobs', error_score='raise',
                     return_train_score=True)

In [None]:
# Run the logistic-regression grid search on the corpus attributes X_tr / y_tr.
# The trailing semicolon suppresses the notebook's display of the return value.
LR_CV.fit(corpus.X_tr, corpus.y_tr);

In [None]:
# Display the estimator refitted with the best C found by the search.
LR_CV.best_estimator_

In [None]:
import os

# Make sure the output directory exists so the dump cannot fail on a missing
# folder, then persist the fitted search object.
os.makedirs("cv_final", exist_ok=True)
with open("cv_final/LR_CV", 'wb') as file:
    pickle.dump(LR_CV, file)

# Linear Support Vector Machines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM pipeline: tf-idf weighting -> chi2 percentile feature selection
# -> LinearSVC classifier.
LinearSVM = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lsvm', LinearSVC()),
])

# Search ranges for the coarse linear-SVM grid.
PERCENTILE = np.linspace(50, 100, num=6)        # 50, 60, ..., 100
C = np.logspace(-2, 2, num=31)                  # 31 log-spaced values, 1e-2 .. 1e2
INTERCEPT_SCALING = np.logspace(0, 3, num=11)   # 11 log-spaced values, 1 .. 1e3

# Coarse grid with two variants that share the percentile and C ranges:
#   * unweighted classes with intercept_scaling fixed at 1
#   * 'balanced' class weights with intercept_scaling swept over INTERCEPT_SCALING
LSVM_PARAMS = [
    {
        'selection__percentile': PERCENTILE,
        'lsvm__C': C,
        'lsvm__intercept_scaling': scaling_values,
        'lsvm__class_weight': [weighting],
    }
    for scaling_values, weighting in (([1], None), (INTERCEPT_SCALING, 'balanced'))
]

### ZOOM ###
# Narrower grid: C restricted to 20 log-spaced values in [0.1, 1], percentile
# to 70..100, and one fixed intercept_scaling per class-weight variant.
# NOTE(review): presumably a refinement around the best region of the coarse
# grid — confirm.
C_ZOOM2 = np.logspace(-1, 0, num=20)
LSVM_PARAMS_ZOOM2 = [
    {
        'selection__percentile': [70, 80, 90, 100],
        'lsvm__C': C_ZOOM2,
        'lsvm__intercept_scaling': [scaling],
        'lsvm__class_weight': [weighting],
    }
    for scaling, weighting in ((1, None), (25, 'balanced'))
]

In [None]:
# 3-fold grid search of the LinearSVM pipeline over the zoomed grid, ranked by
# macro-averaged F1; the best configuration is refitted (refit=True).
LSVM_CV_ZOOM2 = GridSearchCV(
    estimator=LinearSVM,
    param_grid=LSVM_PARAMS_ZOOM2,
    scoring='f1_macro',
    cv=3,
    refit=True,
    iid=False,
    fit_params=None,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=0,
    error_score='raise',
    return_train_score=True,
)

In [None]:
# Run the zoomed linear-SVM grid search on the corpus attributes X_tr / y_tr.
# The trailing semicolon suppresses the notebook's display of the return value.
LSVM_CV_ZOOM2.fit(corpus.X_tr, corpus.y_tr);

In [None]:
import os

# Make sure the output directory exists, then persist the fitted search.
# Opened write-only ('wb'), like the other dumps in this notebook: the file is
# never read back through this handle, so the '+' update mode was unnecessary.
os.makedirs("cv_final", exist_ok=True)
with open("cv_final/LSVM_CV_ZOOM2", 'wb') as file:
    pickle.dump(LSVM_CV_ZOOM2, file)

In [None]:
# Display the pipeline refitted with the best parameters of the zoomed search.
LSVM_CV_ZOOM2.best_estimator_

# Reading

In [None]:
import pandas as pd

## LSVM

In [None]:
# Reload the pickled linear-SVM search.
# pickle.load can execute arbitrary code, so only load files this notebook
# wrote itself.
with open("cv_final/LSVM_CV_ZOOM2", 'rb') as file:
    LSVM_CV = pickle.load(file)

# One row per parameter combination, best mean CV score first. The original
# row labels (candidate indices) are kept by sort_values.
df = pd.DataFrame(LSVM_CV.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Export the columns of interest as an HTML table.
with open("cv_final/LSVM_CV_ZOOM2.html", "w+") as file:
    file.write(df.to_html(columns=[
        "mean_test_score",
        "mean_fit_time",
        "param_lsvm__C",
        'param_lsvm__intercept_scaling',
        'param_lsvm__class_weight',
        'param_selection__percentile',
    ]))

In [None]:
# Re-split the corpus before the evaluations below, which read
# corpus.X_tr / y_tr / X_te / y_te.
# NOTE(review): 0.1 is presumably the held-out fraction — confirm in the
# corpus module.
corpus.simple_split(0.1)

In [None]:
# Refit the best linear-SVM configuration on the current training split and
# report (macro F1, accuracy) on the held-out part.
lsvm = LSVM_CV.best_estimator_
lsvm.fit(corpus.X_tr, corpus.y_tr)
# f1_score expects (y_true, y_pred); the arguments were previously passed the
# other way round. Macro F1 gives the same number either way, but the
# documented order keeps warnings and any later metric swap correct.
f1_score(corpus.y_te, lsvm.predict(corpus.X_te), average="macro"), lsvm.score(corpus.X_te, corpus.y_te)

In [None]:
from sklearn.base import clone

# Evaluate one specific candidate from the results table.
# df.loc[60] selects by row label, i.e. the candidate with index 60 in
# cv_results_ order, wherever the sort placed it.
# NOTE(review): why candidate 60 was chosen is not recorded here — confirm.
params = df.loc[60]['params']

# Configure a fresh copy of the pipeline. set_params() on LinearSVM itself
# would mutate the shared pipeline object (also the base estimator of the grid
# search) and make lsvm2 just another name for it.
lsvm2 = clone(LinearSVM).set_params(**params)
lsvm2.fit(corpus.X_tr, corpus.y_tr)
# f1_score expects (y_true, y_pred), so the true labels go first.
f1_score(corpus.y_te, lsvm2.predict(corpus.X_te), average="macro"), lsvm2.score(corpus.X_te, corpus.y_te)

## Multinomial NB

In [None]:
# Reload the pickled Naive Bayes search.
# pickle.load can execute arbitrary code, so only load files this notebook
# wrote itself.
with open("cv_final/NB_CV", 'rb') as file:
    NB_CV = pickle.load(file)

# One row per parameter combination, best mean CV score first.
df = pd.DataFrame(NB_CV.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Export the columns of interest as an HTML table.
with open("cv_final/NB_HTML.html", "w+") as file:
    file.write(df.to_html(columns=[
        "mean_test_score",
        "param_mnb__alpha",
        'param_mnb__fit_prior',
        'param_selection__percentile',
        'param_tfidf',
        'param_tfidf__use_idf',
    ]))

In [None]:
# Scratch expression: bool() of a non-zero integer evaluates to True.
# NOTE(review): looks unrelated to the analysis — candidate for removal.
bool(123124)