In [113]:
import corpus as corpus_class
import categories, filters, vocabulary_builders
from feature_extractors import multinomial_model, tfidf
from filters import std_filters

import numpy as np
import time

from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

## initializing corpus

In [2]:
# Input CSV file names (bare names, no directory component).
# Training data: questions and their question->category assignments.
qfile_train = 'question_train.csv'
qcatfile_train = 'question_category_train.csv'
# Category table and test questions.
catfile = 'category.csv'
qfile_test = 'question_test.csv'

In [3]:
# Standard filter settings; unpacked as keyword arguments into corpus.process.
filtees = filters.std_filters()

In [4]:
# Create the corpus for the project's category set and load the training
# question / question-category CSV files.
corpus = corpus_class.corpus(categories.categories())
corpus.load(qfile_train, qcatfile_train)
# `filtees` supplies the remaining keyword arguments. test_corpus=True
# presumably sets aside the examples later read via corpus.test_corpus
# -- TODO confirm against the corpus class.
corpus.process(corpus_size=12000, test_corpus=True, **filtees);

In [137]:
# corpus.test_corpus unpacks into (examples, labels); the examples go through
# corpus.process_example before being handed to the classifiers.
# NOTE(review): the execution counts show this cell (In [137]) was run after
# the split/feature cell that follows it in the notebook (In [129]); confirm
# it still works on a top-to-bottom run.
_raw_te, y_te = corpus.test_corpus
X_te = corpus.process_example(_raw_te)



In [146]:
# Display the repr of X_te (the result of corpus.process_example).
X_te

<2417x12972 sparse matrix of type '<class 'numpy.float64'>'
	with 8607 stored elements in Compressed Sparse Row format>

### simple split

In [129]:
# Re-split the corpus and rebuild its features; corpus.X_tr / corpus.y_tr are
# what the classifiers are fitted on afterwards.
# NOTE(review): the meaning of the arguments 0 and -1 is defined by the corpus
# class and is not visible in this notebook.
corpus.simple_split(0)
corpus.make_features(-1);



# Standard Averaging

In [139]:
# Base learners combined by the voting ensemble.
clf_lr = LogisticRegression(C=2.0)
clf_nb = MultinomialNB(alpha=0.1)

In [140]:
# Soft voting: the ensemble averages the members' predicted class probabilities.
# An additional ('svc', clf_svc) member was left disabled.
bcf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
    ],
    voting='soft',
)

In [141]:
# VotingClassifier fits clones of its estimators, so clf_nb itself is only
# fitted by the explicit call here.
clf_nb.fit(corpus.X_tr, corpus.y_tr)
bcf.fit(corpus.X_tr, corpus.y_tr);



In [142]:
# Mean accuracy of the soft-voting ensemble on (X_te, y_te).
bcf.score(X_te, y_te)

0.60570955730244103

## SVM

In [143]:
# Pipeline: TF-IDF weighting -> truncated SVD (LSA) -> LDA -> RBF-kernel SVM.
# NOTE(review): `tfidf` rebinds the name imported from feature_extractors at
# the top of the notebook.
tfidf = TfidfTransformer()
lsa = TruncatedSVD(n_components=1000)
lda = LDA(n_components=13)
svm = SVC(C=5, kernel="rbf", probability=True)
lls_clf = Pipeline(
    steps=[
        ('tfidf', tfidf),
        ('lsa', lsa),
        ('lda', lda),
        ('svm', svm),
    ]
)

In [144]:
# Fit every step of the TF-IDF/LSA/LDA/SVM pipeline on the training split;
# the fitted pipeline's repr is the cell output.
lls_clf.fit(corpus.X_tr, corpus.y_tr)



Pipeline(steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('lsa', TruncatedSVD(algorithm='randomized', n_components=1000, n_iter=5,
       random_state=None, tol=0.0)), ('lda', LinearDiscriminantAnalysis(n_components=13, priors=None, shrinkage=None,
          ...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [145]:
# Mean accuracy of the pipeline on the same (X_te, y_te) used for the
# voting ensemble.
lls_clf.score(X_te, y_te)



0.55192387256930076

# Using a Classifier for Averaging

In [86]:
# Set up the cross-validation split and show the seed the corpus reports.
# NOTE(review): the argument 8 is presumably the number of folds -- confirm
# against the corpus class. The next cell calls cv_split(8) again and re-reads
# corpus.random_seed, so the value shown here may not be the one used there.
corpus.cv_split(8)
SEED = corpus.random_seed
SEED

1671713064

In [88]:
# Build the training set for the second-level (stacking) classifier: for each
# cross-validation fold, fit the base learners on the fold's training part and
# record their class-probability predictions for the fold's held-out part.
corpus.cv_split(8)
SEED = corpus.random_seed

P = []   # one block per fold: the base learners' predict_proba outputs, side by side
lP = []  # one label array per fold, row-aligned with the matching block in P
corpus.reset()
# Iterate under a separate name so the loop does not rebind `corpus`;
# later cells keep working on the corpus object itself.
for fold in corpus:
    fold.make_features()

    clf_nb = MultinomialNB(alpha=0.1)
    clf_nb.fit(fold.X_tr, fold.y_tr)

    clf_lr = LogisticRegression(C=2.0)
    clf_lr.fit(fold.X_tr, fold.y_tr)

    # CLFS is read again by the following cell, which refits these estimators.
    CLFS = [clf_nb, clf_lr]
    A = [clf.predict_proba(fold.X_te) for clf in CLFS]
    P.append(np.concatenate(A, axis=1))
    lP.append(fold.y_te)

# Stack the per-fold blocks into one meta-feature matrix and label vector.
Xp_tr = np.concatenate(P, axis=0)
yp_tr = np.concatenate(lP, axis=0)



In [89]:
# Re-split, rebuild the features and refit the base learners on corpus.X_tr,
# then turn the test examples into meta-features: one predict_proba block per
# base learner, concatenated column-wise in CLFS order (same layout as Xp_tr).
corpus.simple_split(0)
corpus.make_features(-1)

for clf in CLFS:
    clf.fit(corpus.X_tr, corpus.y_tr)

_raw_te, yp_te = corpus.test_corpus
X_te = corpus.process_example(_raw_te)
_proba_blocks = [member.predict_proba(X_te) for member in CLFS]
Xp_te = np.concatenate(_proba_blocks, axis=1)



In [106]:
# Second-level classifier trained on the out-of-fold probability features.
# Alternatives that were left disabled here: LDA(), LogisticRegression(C=1.0).
# random_state is pinned so the forest, and therefore the reported score,
# is the same on every run.
clf_fin = RandomForestClassifier(n_estimators=500, max_features=5, random_state=0)
clf_fin.fit(Xp_tr, yp_tr)
clf_fin.score(Xp_te, yp_te)

0.61522548613984274

In [95]:
# Shape of the test meta-feature matrix: one row per test example, and the
# predict_proba columns of every classifier in CLFS side by side.
Xp_te.shape

(2417, 28)

# Correlation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Fresh, unfitted classifiers for the prediction-correlation plots.
# NOTE(review): C=1.0 here differs from the C=2.0 used for logistic
# regression in the earlier sections -- confirm this is intended.
clf_lr = LogisticRegression(C=1.0)
clf_nb = MultinomialNB(alpha=0.1)

In [None]:
def corr_plot(y1, y2, corpus=corpus):
    """Plot how often two sequences of predicted labels co-occur.

    For every pair of category indices (i, j) a marker is drawn at x=i, y=j
    whose area is the number of positions k with y1[k] == i and y2[k] == j.
    Agreement between the two label sequences shows up on the diagonal.

    Parameters
    ----------
    y1, y2 : sequences of int
        Category indices of equal length, each in range(len(corpus.cats)).
    corpus : object with a `cats` sequence
        Supplies the category names used as tick labels; its length sets the
        size of the grid. The default is whatever `corpus` referred to when
        this definition was executed.
    """
    labels = list(corpus.cats)
    n_cats = len(labels)

    # Co-occurrence counts; np.add.at accumulates repeated (i, j) pairs.
    first = np.asarray(y1, dtype=np.intp)
    second = np.asarray(y2, dtype=np.intp)
    counts = np.zeros((n_cats, n_cats))
    np.add.at(counts, (first, second), 1)

    # Grid coordinates in the same row-major order as counts.ravel():
    # xs carries the y1 index, ys the y2 index.
    xs, ys = np.indices((n_cats, n_cats))

    fig, ax = plt.subplots(figsize=(8, 6))
    ticks = np.arange(n_cats)
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels, rotation=0, ha='right')
    ax.set_xlabel('y1')
    ax.set_ylabel('y2')

    ax.scatter(xs.ravel(), ys.ravel(), s=counts.ravel())

In [None]:
# Fit both classifiers on the current training split; the repr of the last
# fitted estimator is the cell output.
clf_nb.fit(corpus.X_tr, corpus.y_tr)
clf_lr.fit(corpus.X_tr, corpus.y_tr)

In [None]:
# Naive Bayes vs. the LDA stage of the fitted `lls_clf` pipeline.
# The LSA step was fitted on TF-IDF-weighted features inside the pipeline, so
# the same TF-IDF step is applied before it.
# NOTE(review): assumes lls_clf has been fitted and that corpus.X_te has the
# same feature columns lls_clf was fitted on -- confirm.
y1 = clf_nb.predict(corpus.X_te)
_steps = lls_clf.named_steps
_X_te_lsa = _steps['lsa'].transform(_steps['tfidf'].transform(corpus.X_te))
y2 = _steps['lda'].predict(_X_te_lsa)
corr_plot(y1, y2)

In [None]:
# Naive Bayes vs. logistic regression on the same examples.
y1 = clf_nb.predict(corpus.X_te)
y2 = clf_lr.predict(corpus.X_te)
corr_plot(y1, y2)

In [None]:
# Logistic regression vs. the LDA stage of the fitted `lls_clf` pipeline.
# The LSA step was fitted on TF-IDF-weighted features inside the pipeline, so
# the same TF-IDF step is applied before it.
# NOTE(review): assumes lls_clf has been fitted and that corpus.X_te has the
# same feature columns lls_clf was fitted on -- confirm.
y1 = clf_lr.predict(corpus.X_te)
_steps = lls_clf.named_steps
_X_te_lsa = _steps['lsa'].transform(_steps['tfidf'].transform(corpus.X_te))
y2 = _steps['lda'].predict(_X_te_lsa)
corr_plot(y1, y2)