In [1]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score as f1_scorer
import corpus as corpus_class
from filters import std_filters
import categories
import numpy as np
from scipy import sparse
import csv

# Build a corpus object for the project's category set, load the training
# questions and their category labels from the two CSV files, run the
# standard filter pipeline over it and save the result.
corpus = corpus_class.corpus( categories.categories() )
corpus.load("question_train.csv", "question_category_train.csv")
# NOTE(review): corpus_size=-1 presumably means "process the whole corpus" —
# confirm against corpus.process().
corpus.process(**std_filters(), corpus_size=-1)
# NOTE(review): save() takes no path here; the destination is decided inside
# the corpus module — confirm where it writes.
corpus.save()

In [2]:
# Rebind the global `corpus` to whatever corpus_class.load_from_file() returns.
# NOTE(review): presumably this reloads what corpus.save() wrote in the
# previous cell, so the expensive processing can be skipped — confirm.
corpus = corpus_class.load_from_file()

In [11]:
# Classifier used for a one-off experiment. The naive Bayes line is kept as a
# commented-out alternative; the active choice is L1-penalised logistic
# regression with C=4 on the liblinear solver.
#clf = MultinomialNB(alpha=0.01)
clf = LogisticRegression(penalty='l1', C=4, solver='liblinear')

[ 0.51191244  0.5299111   0.53441785] 0.525413794828


In [265]:
import time, csv, pickle
import itertools as it

def CV(corpus, clf_class, clf_name, clf_params, feat_params, n_folds):
    """Cross-validate every (classifier params, feature params) combination.

    For each pair taken from ``clf_params`` x ``feat_params`` the corpus is
    iterated once (one iteration per fold), features are rebuilt with the
    feature parameters, a fresh ``clf_class(**params)`` is fitted on
    ``corpus.X_tr`` / ``corpus.y_tr`` and evaluated on ``corpus.X_te`` /
    ``corpus.y_te``.

    Two files are written to ``cv_results/``, both named
    ``<clf_name>_<UTC timestamp>``:

    * ``.csv`` - one row per combination: id, mean macro-F1, mean accuracy,
      mean fit+predict time, then the feature and classifier parameter
      values (columns are taken from the first dict of each list).
    * ``.pkl`` - the full ``results`` dict, including per-fold scores,
      timings and predictions.

    Parameters
    ----------
    corpus : project corpus object
        Must provide ``cv_split(n_folds)``, ``make_features(**kwargs)``,
        ``reset()``, the attributes ``X_tr``, ``y_tr``, ``X_te``, ``y_te``,
        and be iterable.
    clf_class : callable
        Called as ``clf_class(**params)``; the returned object must offer
        ``fit``, ``predict`` and ``score``.
    clf_name : str
        Prefix of the output file names.
    clf_params : sequence of dict
        Keyword-argument sets for ``clf_class``.
    feat_params : sequence of dict
        Keyword-argument sets for ``corpus.make_features``.
    n_folds : int
        Forwarded to ``corpus.cv_split``.

    Returns
    -------
    None
        Results are only written to disk.
    """
    import os  # local import: only needed to make sure the output dir exists

    os.makedirs("cv_results", exist_ok=True)
    file_name = "cv_results/" + clf_name
    file_name += time.strftime("_%Y-%m-%d_%H-%M", time.gmtime())

    # Column names come from the first parameter dict of each list; an empty
    # list simply contributes no columns instead of raising IndexError.
    feat_keys = list(feat_params[0].keys()) if len(feat_params) > 0 else []
    clf_keys = list(clf_params[0].keys()) if len(clf_params) > 0 else []
    header = ["id", "f1", "accuracy", "time"] + feat_keys + clf_keys

    results = {}

    # Both files are opened up front so an unwritable location fails before
    # any training happens; `with` guarantees they are closed even when a
    # fit or predict call raises. newline='' is what the csv module expects.
    with open(file_name + ".csv", "w", newline="") as csv_file, \
            open(file_name + ".pkl", "wb") as pkl_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

        corpus.cv_split(n_folds)

        for idx, (c_par, f_par) in enumerate(it.product(clf_params, feat_params)):
            entry = {"feat_params": f_par, "clf_params": c_par}
            results[idx] = entry

            f1_scores, acc_scores, times = [], [], []

            # Iterating the corpus steps through the folds; the yielded value
            # is not used, the fold data is read from the corpus attributes.
            for fold, _ in enumerate(corpus):
                corpus.make_features(**f_par)

                # Time only model construction, fitting and prediction.
                started = time.perf_counter()
                clf = clf_class(**c_par)
                clf.fit(corpus.X_tr, corpus.y_tr)
                y_pred = clf.predict(corpus.X_te)
                dt = time.perf_counter() - started

                acc = clf.score(corpus.X_te, corpus.y_te)
                f1 = f1_scorer(corpus.y_te, y_pred, average="macro")

                f1_scores.append(f1)
                acc_scores.append(acc)
                times.append(dt)

                entry[fold] = {
                    "acc_score": acc,
                    "y_pred": y_pred,
                    "f1_score": f1,
                    "time": dt,
                }

            entry["f1_score"] = np.mean(f1_scores)
            entry["acc_score"] = np.mean(acc_scores)
            entry["time"] = np.mean(times)

            # Look values up by header key so every row lines up with the
            # header even if a later dict orders its keys differently.
            row = [idx, entry["f1_score"], entry["acc_score"], entry["time"]]
            row += [f_par.get(key, "") for key in feat_keys]
            row += [c_par.get(key, "") for key in clf_keys]
            csv_writer.writerow(row)
            # Push the row to disk now so a crash in a later combination
            # does not lose the summaries already computed.
            csv_file.flush()

            corpus.reset()

        pickle.dump(results, pkl_file)

([{'alpha': 1.0000000000000001e-05},
  {'alpha': 0.0001},
  {'alpha': 0.001},
  {'alpha': 0.01},
  {'alpha': 0.10000000000000001},
  {'alpha': 1.0},
  {'alpha': 10.0}],
 [{'M': 64},
  {'M': 134},
  {'M': 284},
  {'M': 600},
  {'M': 1265},
  {'M': 2667},
  {'M': 5624},
  {'M': -1}])

In [274]:
# Run the cross-validation grid for logistic regression with n_folds=3.
# NOTE(review): LogisticRegression_params is defined in a later cell of this
# notebook, so on a fresh kernel that cell has to be executed before this one.
CV(corpus, *LogisticRegression_params(), 3)

  'precision', 'predicted', average, warn_for)


In [275]:
# Load one pickled cross-validation run and print its parameter combinations
# ranked by mean f1_score, best first.
# NOTE(security): pickle.load can execute arbitrary code - only open result
# files that were produced locally by CV().
with open("cv_results/"+"MultinomialNB_2016-12-25_16-48.pkl", 'rb') as file:
    results = pickle.load(file)

# argsort returns positions, not dict keys, so keep the keys in a list and map
# each position back to its key before indexing `results`.
result_ids = list(results)
f1_scores = -np.array([results[i]["f1_score"] for i in result_ids])
# Negated scores + ascending sort = descending F1; a stable sort keeps ties
# in their original order.
best_idx = f1_scores.argsort(kind="stable")

row_format = ("{0:<6} {f1_score:<15.3} {acc_score:<15.3} {feat_params[M]:<15} "
              "{clf_params[alpha]:<15.2} {time:<15.2}")

print("{0:6} {1:15} {2:15} {3:15} {4:15} {5:15}".format("id", "f1_score", "acc_score", "M", "alpha", "time"))
for position in best_idx:
    idx = result_ids[position]
    # CV() stores per-fold entries under integer keys; only string keys are
    # valid keyword names, so drop the rest before expanding with **.
    fields = {key: value for key, value in results[idx].items() if isinstance(key, str)}
    print(row_format.format(idx, **fields))

id     f1_score        acc_score       M               alpha           time           
31     0.521           0.577           -1              0.01            0.026          
30     0.518           0.575           5624            0.01            0.047          
39     0.513           0.594           -1              0.1             0.032          
36     0.513           0.571           1265            0.1             0.023          
23     0.512           0.566           -1              0.001           0.03           
22     0.511           0.567           5624            0.001           0.027          
37     0.51            0.576           2667            0.1             0.019          
28     0.51            0.567           1265            0.01            0.028          
29     0.51            0.567           2667            0.01            0.019          
20     0.508           0.565           1265            0.001           0.029          
38     0.507           0.584           5624

In [None]:
def MultinomialNB_params():
    """Assemble the CV() search grid for multinomial naive Bayes.

    Reads the module-level ``corpus`` (``len(corpus.all_terms)``) to size the
    feature grid.

    Returns
    -------
    tuple
        ``(clf_class, clf_name, clf_params, feat_params)`` in the positional
        order that ``CV`` expects after its ``corpus`` argument.
    """
    # Smoothing grid: seven log-spaced alphas from 1e-5 up to 1e1.
    alpha_grid = [{"alpha": alpha} for alpha in np.logspace(-5, 1, 7)]

    # Feature-count grid: seven base-2 log-spaced sizes from 2**6 up to half
    # the number of terms, each truncated to int.
    top_exponent = np.log2(len(corpus.all_terms)) - 1
    size_grid = [{"M": int(size)} for size in np.logspace(6, top_exponent, 7, base=2)]
    # NOTE(review): M=-1 presumably tells make_features to keep every term —
    # confirm in the corpus module.
    size_grid.append({"M": -1})

    return MultinomialNB, "MultinomialNB", alpha_grid, size_grid

In [277]:
def LogisticRegression_params():
    """Assemble the CV() search grid for L1-penalised logistic regression.

    Returns
    -------
    tuple
        ``(clf_class, clf_name, clf_params, feat_params)`` in the positional
        order that ``CV`` expects after its ``corpus`` argument.
    """
    # Seven log-spaced values of C from 1e-5 up to 1e1; penalty and solver
    # are the same for every candidate.
    c_grid = []
    for strength in np.logspace(-5, 1, 7):
        c_grid.append({"C": strength, "penalty": 'l1', "solver": 'liblinear'})

    # A single feature setting.
    # NOTE(review): M=-1 presumably means "keep every term" — confirm in the
    # corpus module.
    feature_grid = [{"M": -1}]

    return LogisticRegression, "LogisticRegression", c_grid, feature_grid

In [278]:
# Display the (clf_class, clf_name, clf_params, feat_params) tuple that is
# unpacked into CV() for the logistic-regression run.
LogisticRegression_params()

(sklearn.linear_model.logistic.LogisticRegression,
 'LogisticRegression',
 [{'C': 1.0000000000000001e-05, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 0.0001, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 0.10000000000000001, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'},
  {'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}],
 [{'M': -1}])