In [1]:
import corpus as corpus_class
import categories, filters, vocabulary_builders
from feature_extractors import multinomial_model, tfidf
from filters import std_filters

import numpy as np
import time

from sklearn.metrics import f1_score as f1_scorer

from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

In [2]:
corpus = corpus_class.load_from_file()

## initializing corpus

In [3]:
qfile_train = 'question_train.csv'
qcatfile_train = 'question_category_train.csv'
catfile = 'category.csv'
qfile_test = 'question_test.csv'
filtees = std_filters()

In [4]:
corpus = corpus_class.corpus( categories.categories(subcategories=True) );
corpus.load(qfile_train, qcatfile_train);
corpus.process(corpus_size=0.8, test_corpus=True, **filtees);
corpus.save();



In [5]:
corpus.simple_split(0);



In [6]:
X_te, y_te = corpus.test_corpus
X_te = corpus.process_example( X_te )



# Standart Avaraging

In [8]:
clf_nb = MultinomialNB(alpha=0.1)
clf_lr = LogisticRegression(C=2.0)

In [11]:
bcf = VotingClassifier(estimators=[('lsvm_clf', lsvm_clf),('lr', clf_lr), ('nb', clf_nb)], voting='hard') #('svc', clf_svc)

In [12]:
bcf.fit(corpus.X_tr, corpus.y_tr);
clf_nb.fit(corpus.X_tr, corpus.y_tr);



In [13]:
bcf.score(X_te, y_te), f1_scorer(bcf.predict(X_te), y_te, average="macro")

  'recall', 'true', average, warn_for)


(0.50069348127600555, 0.33605760905696319)

## SVM

### with LDA

In [None]:
tfidf = TfidfTransformer()
selection = SelectPercentile(score_func=chi2, percentile=90)
lsa = TruncatedSVD(n_components=2000)
lda = LDA(n_components=26)
svm = SVC(kernel="rbf", C=5, probability=True)
lls_clf = Pipeline(steps=[('tfidf', tfidf), ('selection', selection), ('lsa', lsa), ('lda', lda), ('svm', svm)])

In [None]:
lls_clf.fit(corpus.X_tr, corpus.y_tr)

In [None]:
lls_clf.score(X_te, y_te), f1_scorer(lls_clf.predict(X_te), y_te, average="macro")

### LinearSVC

In [14]:
tfidf = TfidfTransformer()
selection = SelectPercentile(score_func=chi2, percentile=100)
lsvm = LinearSVC(C=0.48, intercept_scaling=0.5, class_weight=None, penalty='l2')
lsvm_clf = Pipeline(steps=[('tfidf', tfidf), ('selection', selection), ('lsvm', lsvm)])
def __pri_pro(X): return abs(lsvm_clf.decision_function(X))
lsvm_clf.predict_proba = __pri_pro

In [15]:
lsvm_clf.fit(corpus.X_tr, corpus.y_tr);



In [16]:
lsvm_clf.score(X_te, y_te), f1_scorer(lsvm_clf.predict(X_te), y_te, average="macro")

  'recall', 'true', average, warn_for)


(0.5079750346740638, 0.36385730719395654)

In [None]:
lsvm_clf.predict_proba(X_te)

# Using a Classfier for avaraging

In [None]:
corpus.cv_split(8)
SEED = corpus.random_seed

P = []
lP = []
corpus.reset()
for corpus in corpus:
    clf_nb = MultinomialNB(alpha=0.1)
    clf_nb.fit(corpus.X_tr, corpus.y_tr)
    
    clf_lr = LogisticRegression(C=2.0)
    clf_lr.fit(corpus.X_tr, corpus.y_tr)
    
    lsvm_clf.fit(corpus.X_tr, corpus.y_tr)
    
    CLFS = [clf_nb, clf_lr, lsvm_clf]
    A = [ clf.predict_proba( corpus.X_te ) for clf in CLFS]
    P += [ np.concatenate(A, axis=1) ]
    lP += [ corpus.y_te ]

Xp_tr = np.concatenate(P, axis=0)
yp_tr = np.concatenate(lP, axis=0)

In [None]:
corpus.simple_split(0)

for clf in CLFS:
    clf.fit(corpus.X_tr, corpus.y_tr)

X_te, yp_te = corpus.test_corpus
X_te = corpus.process_example( X_te )    
Xp_te = np.concatenate( [ clf.predict_proba( X_te ) for clf in CLFS] , axis=1)

In [None]:
clf_fin = RandomForestClassifier(n_estimators=500, max_features=6)
clf_fin.fit(Xp_tr, yp_tr)

In [None]:
clf_fin.score(Xp_te, yp_te), f1_scorer(clf_fin.predict(Xp_te), yp_te, average="macro")

# Correlation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
def corr_plot(y1, y2, corpus=corpus):
    C = np.zeros((14,14))
    for i, j in zip(y1, y2):
        C[i, j] += 1
    
    x = np.repeat(np.arange(14), 14)
    y = np.repeat(np.arange(14).reshape(1,14), 14, axis=0)
    plt.figure(figsize=(6, 4))
    
    lables = corpus.cats[:]
    plt.yticks(np.arange(14), lables, rotation=0, ha='right')

    plt.scatter(x, y, s=C.flatten())

# Evalutation

## Confidence Measures

In [None]:
clf_fin = RandomForestClassifier(n_estimators=500, max_features=5)
clf_fin.fit(Xp_tr, yp_tr)

### for all predicitons

In [None]:
PP = clf_fin.predict_proba(Xp_te)
y_pred = clf_fin.predict(Xp_te)
P = []
for c in range(len(corpus.cats)):
    P += [PP[y_pred == c][:,c]]

In [None]:
P = np.array(P)
sort = np.argsort( [-p.mean() for p in P] )
P = P[sort]

In [None]:
plt.figure(figsize=(7, 5))
plt.title('avarage convidence')
plt.boxplot(P, vert=True, widths=0.7)
lables = corpus.cats[:][sort]
plt.xticks(np.arange(14)+1, lables, rotation=30, ha='right')
plt.show()

### for wrong predicitons

In [None]:
PP = clf_fin.predict_proba(Xp_te)
y_pred = clf_fin.predict(Xp_te)
P = []
for c in range(len(corpus.cats)):
    ps = np.array([PP[i,c] for i in range(len(yp_te)) if (y_pred[i] == c)&(y_pred[i] != y_te[i])])
    P += [ps]

In [None]:
P = np.array(P)
sort = np.argsort( [-p.mean() for p in P] )
P = P[sort]

In [None]:
plt.figure(figsize=(7, 5))
plt.title('avarage convidence when wrong')
plt.boxplot(P, vert=True, widths=0.7)
lables = corpus.cats[:][sort]
plt.xticks(np.arange(14)+1, lables, rotation=30, ha='right')
plt.show() 

In [5]:
a = {1:"hund", 2:"Kate", 3:"Maus"}

In [8]:
a[[1,2,3]]

TypeError: unhashable type: 'list'