In [1]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score as f1_scorer
import corpus as corpus_class
from filters import std_filters
import categories
import numpy as np
from scipy import sparse
import csv

# Build the corpus from the raw question/category CSV files, run the standard
# filters over it, and save the processed result.
corpus = corpus_class.corpus( categories.categories() )
corpus.load("question_train.csv", "question_category_train.csv")
# NOTE(review): corpus_size=-1 presumably means "use every document" -- confirm in corpus.process.
corpus.process(**std_filters(), corpus_size=-1)
corpus.save()

In [2]:
# Reload the corpus from disk (presumably the file written by corpus.save() -- confirm),
# rebinding the module-level `corpus` used by the cells below.
corpus = corpus_class.load_from_file()

In [4]:
# Print the corpus summary to check what was loaded and which filters were applied.
print(corpus)

14 categories. 
14417 docuemnts loaded from file. 
processed: True 
	 Training-set, Test-set size: None 
		 sentence_filters: ['punctuation_filter'] 
		 word_filters: ['small_word_filter', 'stopword_filter', 'stemming_filter'] 
made numeric features: False 



In [36]:
import time, csv, pickle
import itertools as it

def CV(corpus, clf_class, clf_name, clf_params, feat_params, n_folds):
    """Cross-validate a classifier over a grid of classifier and feature parameters.

    For every pair from ``clf_params`` x ``feat_params`` the corpus is iterated
    once (one iteration per fold), a fresh ``clf_class(**c_par)`` is fitted on
    ``corpus.X_tr`` / ``corpus.y_tr`` and evaluated on ``corpus.X_te`` /
    ``corpus.y_te``. Fold-averaged results are appended to a CSV file as they
    are produced, printed, and finally pickled together with the per-fold
    details.

    Parameters
    ----------
    corpus : object
        Must provide ``cv_split(n_folds)``, ``make_features(**f_par)``,
        ``reset()``, be iterable, and expose ``X_tr``, ``y_tr``, ``X_te`` and
        ``y_te`` after ``make_features``.
        NOTE(review): each iteration step presumably advances the corpus to
        the next fold in place -- confirm in the corpus class.
    clf_class : callable
        Classifier factory; instances need ``fit``, ``predict`` and ``score``.
    clf_name : str
        Used as the prefix of the output file names.
    clf_params : list of dict
        Keyword-argument sets for ``clf_class``. The keys of the first entry
        define the CSV header, so all entries should share the same keys.
    feat_params : list of dict
        Keyword-argument sets for ``corpus.make_features``; same key
        convention as ``clf_params``.
    n_folds : int
        Passed to ``corpus.cv_split``.

    Returns
    -------
    None
        Output goes to ``cv_results/<clf_name>_<UTC timestamp>.csv`` and
        ``.pkl``. The pickled dict maps a run id to a dict holding
        ``'feat_params'``, ``'clf_params'``, the mean ``'f1_score'``,
        ``'acc_score'`` and ``'time'``, plus one entry per integer fold index
        with that fold's ``'acc_score'``, ``'y_pred'``, ``'f1_score'`` and
        ``'time'``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    file_name = "cv_results/" + clf_name
    file_name += time.strftime("_%Y-%m-%d_%H-%M", time.gmtime())
    header = ["id", "f1", "accuracy", "time"] + list(feat_params[0].keys()) + list(clf_params[0].keys())

    # Make sure the output directory exists instead of failing in open().
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    results = {}

    corpus.cv_split(n_folds)

    # newline='' is what the csv module expects; the context manager closes the
    # file even if a fit/predict call raises part-way through the grid.
    with open(file_name + ".csv", 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

        for idx, (c_par, f_par) in enumerate(it.product(clf_params, feat_params)):
            results[idx] = {}
            results[idx]['feat_params'] = f_par
            results[idx]['clf_params'] = c_par

            f1_scores, acc_scores, times = [], [], []

            # Only the iteration itself matters here; the fold data is read
            # from the corpus attributes, not from the yielded value.
            for fold, _ in enumerate(corpus):
                corpus.make_features(**f_par)

                # Timed section: construction, fitting and prediction.
                t = time.time()

                clf = clf_class(**c_par)
                clf.fit(corpus.X_tr, corpus.y_tr)
                y_pred = clf.predict(corpus.X_te)

                dt = time.time() - t

                acc = clf.score(corpus.X_te, corpus.y_te)
                f1 = f1_scorer(corpus.y_te, y_pred, average="macro")

                f1_scores.append(f1)
                acc_scores.append(acc)
                times.append(dt)

                results[idx][fold] = {
                    'acc_score': acc,
                    'y_pred': y_pred,
                    'f1_score': f1,
                    'time': dt,
                }

            results[idx]['f1_score'] = np.mean(f1_scores)
            results[idx]['acc_score'] = np.mean(acc_scores)
            results[idx]['time'] = np.mean(times)

            row = [ idx ]
            row += [results[idx]['f1_score'], results[idx]['acc_score'], results[idx]['time']]
            row += list(f_par.values()) + list(c_par.values())
            csv_writer.writerow( row )
            # Flush so finished rows survive an interrupted (long) run.
            csv_file.flush()
            print(row)

            corpus.reset()

    # Written only once the grid is complete, so no empty .pkl is left behind
    # when the run fails.
    with open(file_name + ".pkl", 'wb') as pkl_file:
        pickle.dump(results, pkl_file)

In [37]:
# Display the (classifier class, name, classifier grid, feature grid) tuple that CV() will receive.
# NOTE(review): LogisticRegression_params is defined in a later cell; on a fresh
# kernel that definition has to run before this cell.
LogisticRegression_params()

(sklearn.linear_model.logistic.LogisticRegression,
 'LogisticRegression',
 [{'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 0.70710678118654757, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 1.4142135623730951, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 2.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 2.8284271247461903, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 4.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 5.6568542494923806, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 8.0, 'penalty': 'l1', 'solver': 'liblinear'}],
 [{'M': -1}])

In [63]:
# Run the logistic-regression grid through CV(); the trailing 3 is n_folds.
CV(corpus, *LogisticRegression_params(), 3)

[0, 0.52902782099113732, 0.5865322917191792, 0.53402058283487952, -1, 2.0, 'l1', 'liblinear']
[1, 0.53052358916347331, 0.58667116569293631, 3.2402540047963462, -1, 2.5, 'l1', 'liblinear']
[2, 0.52836630289326203, 0.58646299164987914, 0.66099397341410315, -1, 3.0, 'l1', 'liblinear']
[3, 0.52717926806137394, 0.58590672814301314, 16.885004997253418, -1, 3.5, 'l1', 'liblinear']
[4, 0.52684002910051975, 0.58521443522141725, 56.349968592325844, -1, 4.0, 'l1', 'liblinear']


In [65]:
# Rank the parameter combinations of one saved CV run by mean macro-F1, best first.
# NOTE: pickle.load can execute arbitrary code -- only open result files that
# were written locally by CV().
with open("cv_results/"+"LogisticRegression_2016-12-25_19-24.pkl", 'rb') as pkl_file:
    results = pickle.load(pkl_file)

# Negate the scores so that an ascending argsort yields a descending ranking.
run_ids = list(results)
f1_scores = -np.array([results[i]["f1_score"] for i in run_ids])
best_idx = f1_scores.argsort()

row_format = (
    "{0:<6} {f1_score:<15.3} {acc_score:<15.3} {feat_params[M]:<15} "
    "{clf_params[C]:<15.3} {time:<15.5}"
)

print("{0:6} {1:15} {2:15} {3:15} {4:15} {5:15}".format("id", "f1_score", "acc_score", "M", "C", "time"))
for pos in best_idx:
    # Map the argsort position back to the real result id.
    idx = run_ids[pos]
    # Each run also stores per-fold details under integer keys; ** unpacking
    # requires string keys, so pass only the string-keyed summary entries.
    summary = {key: value for key, value in results[idx].items() if isinstance(key, str)}
    print(row_format.format(idx, **summary))

id     f1_score        acc_score       M               C               time           
1      0.531           0.587           -1              2.5             3.2403         
0      0.529           0.587           -1              2.0             0.53402        
2      0.528           0.586           -1              3.0             0.66099        
3      0.527           0.586           -1              3.5             16.885         
4      0.527           0.585           -1              4.0             56.35          


In [7]:
def MultinomialNB_params():
    """Return the CV() arguments for a multinomial naive Bayes grid search.

    Reads the module-level ``corpus`` to size the feature grid.

    Returns
    -------
    tuple
        ``(clf_class, clf_name, clf_params, feat_params)`` where
        ``clf_params`` holds seven ``alpha`` values log-spaced from 1e-5 to
        1e1 and ``feat_params`` holds seven integer ``M`` values log-spaced
        (base 2) from 2**6 up to 2**(log2(len(corpus.all_terms)) - 1),
        followed by ``M = -1``.
    """
    alpha_grid = np.logspace(-5,1,7)

    # Largest exponent: one less than log2 of the number of terms in the corpus.
    top_exponent = np.log2(len(corpus.all_terms))-1
    size_grid = np.logspace(6,top_exponent,7, base=2)

    return (
        MultinomialNB,
        "MultinomialNB",
        [{"alpha":alpha} for alpha in alpha_grid],
        [{"M":int(size)} for size in size_grid] + [{"M":-1}],
    )

In [61]:
def LogisticRegression_params():
    """Return the CV() arguments for an L1 logistic-regression grid search.

    Returns
    -------
    tuple
        ``(clf_class, clf_name, clf_params, feat_params)`` where
        ``clf_params`` sweeps ``C`` over five evenly spaced values from 2 to 4
        with ``penalty='l1'`` and ``solver='liblinear'``, and ``feat_params``
        is the single setting ``M = -1``.
    """
    grid = []
    for c_value in np.linspace(2,4,5):
        # Key order is kept as C, penalty, solver: CV() derives its CSV columns from it.
        grid.append({"C":c_value, "penalty": 'l1', "solver": 'liblinear'})

    return LogisticRegression, "LogisticRegression", grid, [{"M":-1}]

In [62]:
# Display the parameter grid produced by the definition above as a sanity check.
LogisticRegression_params()

(sklearn.linear_model.logistic.LogisticRegression,
 'LogisticRegression',
 [{'C': 2.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 2.5, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 3.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 3.5, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 4.0, 'penalty': 'l1', 'solver': 'liblinear'}],
 [{'M': -1}])