In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import f1_score
from sklearn.metrics.classification import UndefinedMetricWarning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import corpus as corpus_class
import categories, filters
from filters import std_filters
from transform import LinearSVC
from transform import TrainingSupport

# Suppress these two warning categories for the rest of the session.
for _ignored_category in (np.VisibleDeprecationWarning, UndefinedMetricWarning):
    warnings.simplefilter("ignore", _ignored_category)

In [24]:
# Restore a corpus object through the project's `corpus` module.
# NOTE(review): presumably this reads what `corpus.save()` wrote in an earlier
# session — confirm. The execution count (In [24]) shows this cell was run after
# the cell that builds the corpus from the CSV files, so on a fresh kernel the
# saved file may not exist yet.
corpus = corpus_class.load_from_file()

In [2]:
# Input files used by this notebook.
qfile_train = 'question_train.csv'
qcatfile_train = 'question_category_train.csv'
catfile = 'category.csv'
qfile_test = 'question_test.csv'

# Keyword arguments forwarded to corpus.process().
filtees = std_filters()

# Build the corpus from the training CSVs, process it and persist the result.
top_level_categories = categories.categories(subcategories=False)
corpus = corpus_class.corpus(top_level_categories)
corpus.load(qfile_train, qcatfile_train)
corpus.process(corpus_size=1, test_corpus=False, **filtees)
corpus.save();

In [40]:
import time
import pickle

def save_results(cv_results, clf_name):
    """Pickle a cross-validation result under ``cv_final/`` with a UTC timestamp.

    Parameters
    ----------
    cv_results : object
        Any picklable object (this notebook passes fitted ``GridSearchCV``
        instances).
    clf_name : str
        Prefix of the file name, e.g. ``"MNB"``.

    Returns
    -------
    str
        Relative path of the written file, of the form
        ``cv_final/<clf_name>_YYYY-MM-DD_HH-MM.pkl``.

    Notes
    -----
    The timestamp has minute resolution, so two calls with the same
    ``clf_name`` within one minute write to the same file.
    """
    out_dir = "cv_final/"
    # Create the output directory on first use instead of failing inside open().
    os.makedirs(out_dir, exist_ok=True)

    f_name = out_dir + clf_name
    f_name += time.strftime("_%Y-%m-%d_%H-%M", time.gmtime())
    f_name += ".pkl"
    with open(f_name, 'wb') as file:
        pickle.dump(cv_results, file)
    return f_name

# Multinomial Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

In [42]:
# Pipeline to tune. The 'tfidf' slot starts as None; the parameter grid below
# either fills it with a TfidfTransformer or leaves it as None.
MNB = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', None),
    ('selection', SelectPercentile(score_func=chi2)),
    ('mnb', MultinomialNB()),
])

ALPHA = np.logspace(-3, 2, num=30, base=10)
PERCENTILE = np.linspace(50, 100, num=6)
TFIF = []  # not read anywhere in the visible cells; kept so the namespace is unchanged

# Settings searched in both variants of the grid.
_nb_shared_grid = {
    'selection__percentile': PERCENTILE,
    'mnb__alpha': ALPHA,
    'mnb__fit_prior': [True, False],
}

NB_PARAMS = [
    # Variant with a tf-idf step (with and without idf weighting).
    dict(_nb_shared_grid,
         tfidf=[TfidfTransformer()],
         tfidf__use_idf=[True, False]),
    # Variant without a tf-idf step.
    dict(_nb_shared_grid, tfidf=[None]),
]

NB_CV = GridSearchCV(
    estimator=MNB,
    param_grid=NB_PARAMS,
    scoring='f1_macro',
    fit_params=None,
    n_jobs=-1,
    iid=False,
    refit=True,
    cv=4,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score='raise',
    return_train_score=True,
)

In [43]:
# Run the Naive Bayes grid search on corpus.X_all / corpus.y
# (the trailing ';' hides the estimator repr).
NB_CV.fit(corpus.X_all, corpus.y);

In [44]:
# Pickle the fitted grid search; the returned file path is shown as the cell output.
save_results(NB_CV, "MNB")

'cv_final/MNB_2017-02-12_10-52.pkl'

In [45]:
# Show the pipeline steps (with their parameters) of the best Naive Bayes candidate.
NB_CV.best_estimator_.steps

[('tr_supp', <transform.TrainingSupport at 0x7f4076c97710>),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('selection', SelectPercentile(percentile=100.0,
           score_func=<function chi2 at 0x7f4079c3d730>)),
 ('mnb',
  MultinomialNB(alpha=0.1743328822199989, class_prior=None, fit_prior=False))]

In [95]:
# Hold-out check of the tuned Naive Bayes pipeline.
# NOTE(review): presumably simple_split(0.1) holds out 10% of the corpus and
# simple_split(0) restores the unsplit state — confirm against `corpus`.
# The same corpus was passed to NB_CV.fit above, so this score is not an
# independent estimate.
corpus.FREEZE_RANDOM = False
corpus.simple_split(0.1);

# Fit a clone so that NB_CV.best_estimator_ (refit by the grid search) is not
# overwritten with a model trained on the smaller split.
nb_holdout = clone(NB_CV.best_estimator_)
nb_holdout.fit(corpus.X_tr, corpus.y_tr)
print( nb_holdout.score(corpus.X_te, corpus.y_te) )
corpus.simple_split(0);

0.612481857765


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

In [109]:
# Pipeline to tune. The 'tfidf' slot starts as None; the parameter grid below
# either fills it with a TfidfTransformer or leaves it as None.
LR = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', None),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lr', LogisticRegression(penalty='l1')),
])

C = np.logspace(-3, 2, num=50, base=10)
PERCENTILE = np.linspace(50, 100, num=6)

# Settings searched in both variants of the grid.
_lr_shared_grid = {
    'selection__percentile': PERCENTILE,
    'lr__C': C,
}

LR_PARAMS = [
    # Variant with a tf-idf step (with and without idf weighting).
    dict(_lr_shared_grid,
         tfidf=[TfidfTransformer()],
         tfidf__use_idf=[True, False]),
    # Variant without a tf-idf step.
    dict(_lr_shared_grid, tfidf=[None]),
]

LR_CV = GridSearchCV(
    estimator=LR,
    param_grid=LR_PARAMS,
    scoring='f1_macro',
    fit_params=None,
    n_jobs=-1,
    iid=False,
    refit=True,
    cv=4,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score='raise',
    return_train_score=True,
)

In [110]:
# Run the logistic-regression grid search on corpus.X_all / corpus.y
# (the trailing ';' hides the estimator repr).
LR_CV.fit(corpus.X_all, corpus.y);

In [111]:
# Pickle the fitted grid search; save_results returns the path of the written file.
save_results(LR_CV, "LR")

[('tr_supp', <transform.TrainingSupport at 0x7f4078404208>),
 ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False,
           use_idf=False)),
 ('selection', SelectPercentile(percentile=100.0,
           score_func=<function chi2 at 0x7f4079c3d730>)),
 ('lr', LogisticRegression(C=3.7275937203149416, class_weight=None, dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]

In [115]:
# Hold-out check of the tuned logistic-regression pipeline.
# NOTE(review): presumably simple_split(0.1) holds out 10% of the corpus and
# simple_split(0) restores the unsplit state — confirm against `corpus`.
# The same corpus was passed to LR_CV.fit above, so this score is not an
# independent estimate.
(X_tr, y_tr), (X_te, y_te) = corpus.simple_split(0.1);

# Fit a clone so that LR_CV.best_estimator_ (refit by the grid search) is not
# overwritten with a model trained on the smaller split.
lr_holdout = clone(LR_CV.best_estimator_)
lr_holdout.fit(X_tr, y_tr)
print( lr_holdout.score(X_te, y_te) )
corpus.simple_split(0);

0.604499274311


In [119]:
# Show the pipeline steps (with their parameters) of the best logistic-regression candidate.
LR_CV.best_estimator_.steps

[('tr_supp', <transform.TrainingSupport at 0x7f4078404208>),
 ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False,
           use_idf=False)),
 ('selection', SelectPercentile(percentile=100.0,
           score_func=<function chi2 at 0x7f4079c3d730>)),
 ('lr', LogisticRegression(C=3.7275937203149416, class_weight=None, dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]

# Linear Support Vector Machines

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

In [None]:
# Pipeline to tune. LinearSVC here is the class exported by the project's
# `transform` module; it is imported in the notebook's top import cell so this
# cell also runs on a fresh kernel.
LinearSVM = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lsvm', LinearSVC()),
])

PERCENTILE = np.linspace(50, 100, num=6)
C = np.logspace(-2, 2, num=31, base=10)
INTERCEPT_SCALING = np.logspace(0, 3, num=11, base=10)

# Settings searched in both variants of the grid.
_lsvm_shared_grid = {
    'selection__percentile': PERCENTILE,
    'lsvm__C': C,
}

LSVM_PARAMS = [
    # Unweighted classes: intercept scaling stays at 1.
    dict(_lsvm_shared_grid,
         lsvm__intercept_scaling=[1],
         lsvm__class_weight=[None]),
    # Balanced class weights: intercept scaling is searched as well.
    dict(_lsvm_shared_grid,
         lsvm__intercept_scaling=INTERCEPT_SCALING,
         lsvm__class_weight=['balanced']),
]

In [None]:
# Grid search for the LinearSVM pipeline, scored by macro-averaged F1 with 3-fold CV.
# Uses LSVM_PARAMS from the previous cell; the original referenced
# LSVM_PARAMS_ZOOM2, which is not defined anywhere in the notebook.
LSVM_CV = GridSearchCV(LinearSVM, LSVM_PARAMS, scoring='f1_macro',
                     fit_params=None, n_jobs=-1, iid=False, refit=True,
                     cv=3, verbose=0, pre_dispatch='2*n_jobs', error_score='raise',
                     return_train_score=True)

In [None]:
# Run the LinearSVM grid search on corpus.X_all / corpus.y
# (the trailing ';' hides the estimator repr).
LSVM_CV.fit(corpus.X_all, corpus.y);

In [None]:
# Pickle the fitted grid search, then show the steps of its best pipeline.
save_results(LSVM_CV, "LSVM")
LSVM_CV.best_estimator_.steps

In [None]:
# Show the best LinearSVM pipeline. The original referenced LSVM_CV_ZOOM2,
# which is not defined anywhere in the notebook.
LSVM_CV.best_estimator_

# Reading

## LSVM

In [None]:
# Load a pickled LSVM grid search and export its results, ranked by mean test
# score, as an HTML table.
# NOTE: pickle.load can execute arbitrary code; only open files this project wrote.
with open("cv_final/LSVM_CV_ZOOM2", 'rb') as pkl_in:
    LSVM_CV = pickle.load(pkl_in)

df = pd.DataFrame(LSVM_CV.cv_results_).sort_values("mean_test_score", ascending=False)

lsvm_report_columns = [
    "mean_test_score",
    "mean_fit_time",
    "param_lsvm__C",
    'param_lsvm__intercept_scaling',
    'param_lsvm__class_weight',
    'param_selection__percentile',
]
lsvm_report_html = df.to_html(columns=lsvm_report_columns)

with open("cv_final/LSVM_CV_ZOOM2.html", "w+") as html_out:
    html_out.write(lsvm_report_html)

In [None]:
# Split the corpus; the following cells read corpus.X_tr / y_tr / X_te / y_te.
# NOTE(review): presumably 0.1 is the held-out fraction — confirm against `corpus`.
corpus.simple_split(0.1)

In [None]:
# Hold-out check of the best LinearSVM pipeline: (macro F1, accuracy).
# Fit a clone so that LSVM_CV.best_estimator_ itself is not refit on the split.
lsvm = clone(LSVM_CV.best_estimator_)
lsvm.fit(corpus.X_tr, corpus.y_tr)
lsvm_pred = lsvm.predict(corpus.X_te)
# f1_score takes (y_true, y_pred) in that order.
f1_score(corpus.y_te, lsvm_pred, average="macro"), lsvm.score(corpus.X_te, corpus.y_te)

In [None]:
# Hold-out check of another grid candidate: (macro F1, accuracy).
# NOTE(review): 60 is an index label of `df` (the cv_results_ table loaded above);
# why this row was picked is not recorded here — confirm.
params = df.loc[60]['params']
# set_params() returns the estimator it was called on, so clone first; otherwise
# lsvm2 would be the global LinearSVM pipeline, modified in place.
lsvm2 = clone(LinearSVM).set_params(**params)
lsvm2.fit(corpus.X_tr, corpus.y_tr)
lsvm2_pred = lsvm2.predict(corpus.X_te)
# f1_score takes (y_true, y_pred) in that order.
f1_score(corpus.y_te, lsvm2_pred, average="macro"), lsvm2.score(corpus.X_te, corpus.y_te)

## Multinomial NB

In [66]:
# Load the pickled Naive Bayes grid search and export its results, ranked by
# mean test score, as an HTML table.
# NOTE: pickle.load can execute arbitrary code; only open files this project wrote.
with open("cv_final/MNB_2017-02-12_10-52.pkl", 'rb') as pkl_in:
    NB_CV = pickle.load(pkl_in)

df = pd.DataFrame(NB_CV.cv_results_).sort_values("mean_test_score", ascending=False)

nb_report_columns = [
    "mean_test_score",
    "param_mnb__alpha",
    'param_mnb__fit_prior',
    'param_selection__percentile',
    'param_tfidf',
    'param_tfidf__use_idf',
]
nb_report_html = df.to_html(columns=nb_report_columns)

with open("cv_final/NB_HTML.html", "w+") as html_out:
    html_out.write(nb_report_html)

## Logistic Regression

In [120]:
# Load the pickled logistic-regression grid search and export its results,
# ranked by mean test score, as an HTML table named after the pickle.
# NOTE: pickle.load can execute arbitrary code; only open files this project wrote.
filename = "LR_2017-02-12_11-44"
with open("cv_final/"+filename+".pkl", 'rb') as pkl_in:
    CV = pickle.load(pkl_in)

df = pd.DataFrame(CV.cv_results_).sort_values("mean_test_score", ascending=False)

lr_report_columns = [
    "mean_fit_time",
    "mean_test_score",
    "param_lr__C",
    'param_selection__percentile',
    'param_tfidf',
    'param_tfidf__use_idf',
]
lr_report_html = df.to_html(columns=lr_report_columns)

with open("cv_final/html/"+filename+".html", "w+") as html_out:
    html_out.write(lr_report_html)

In [117]:
# List the columns of the cv_results_ table currently bound to `df`.
df.columns

Index(['mean_fit_time', 'mean_score_time', 'mean_test_score',
       'mean_train_score', 'param_lr__C', 'param_selection__percentile',
       'param_tfidf', 'param_tfidf__use_idf', 'params', 'rank_test_score',
       'split0_test_score', 'split0_train_score', 'split1_test_score',
       'split1_train_score', 'split2_test_score', 'split2_train_score',
       'split3_test_score', 'split3_train_score', 'std_fit_time',
       'std_score_time', 'std_test_score', 'std_train_score'],
      dtype='object')

# Best Classifiers

### Multinomial Bayes MNB

In [13]:
# Rebuild the Naive Bayes pipeline with the selected hyper-parameters and pickle
# it. No fit() is called in this cell, so the pickled pipeline is unfitted.
MNB = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('mnb', MultinomialNB()),
])

MNB.set_params(
    mnb__alpha=0.174333,
    mnb__fit_prior=False,
    selection__percentile=100,
)

with open("best_estimators/MultinomialNB.pkl", "wb+") as pkl_out:
    pickle.dump(MNB, pkl_out)

### Logistic Regression LR

In [14]:
# Rebuild the logistic-regression pipeline with the selected hyper-parameters
# and pickle it. No fit() is called in this cell, so the pickled pipeline is
# unfitted.
LR = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lr', LogisticRegression()),
])

# NOTE(review): the best_estimator_ output shown earlier in this notebook reports
# C=3.7275937203149416, while 2.94705 is set here — confirm which value is intended.
LR.set_params(
    tfidf__use_idf=False,
    selection__percentile=100,
    lr__penalty='l1',
    lr__C=2.94705,
)

with open("best_estimators/LogisticRegression.pkl", "wb+") as pkl_out:
    pickle.dump(LR, pkl_out)

### Linear Support Vector Machine (LSVM)

In [15]:
from transform import LinearSVC

# Rebuild the LinearSVM pipeline with the selected hyper-parameters and pickle
# it. No fit() is called in this cell, so the pickled pipeline is unfitted.
LinearSVM = Pipeline(steps=[
    ('tr_supp', TrainingSupport()),
    ('tfidf', TfidfTransformer()),
    ('selection', SelectPercentile(score_func=chi2)),
    ('lsvm', LinearSVC()),
])

LinearSVM.set_params(
    lsvm__class_weight=None,
    lsvm__intercept_scaling=1.0,
    lsvm__C=0.615848,
    selection__percentile=70,
)

with open("best_estimators/LinearSVM.pkl", "wb+") as pkl_out:
    pickle.dump(LinearSVM, pkl_out)