In [5]:
import corpus as corpus_class
import categories, filters, vocabulary_builders
from feature_extractors import multinomial_model, tfidf
from filters import std_filters

import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## initializing corpus

In [6]:
# Names of the CSV input files. Roles are inferred from the file names:
# training questions + their category labels, the category list, test questions.
qfile_train, qcatfile_train = 'question_train.csv', 'question_category_train.csv'
catfile = 'category.csv'
qfile_test = 'question_test.csv'

In [7]:
# Standard filter set; it is unpacked as keyword arguments into corpus.process().
filtees = filters.std_filters()

In [18]:
# Create the corpus for the project's categories, read the training questions
# together with their category file, then apply the filters.
corpus = corpus_class.corpus(categories=categories.categories())
corpus.load(filename_questions=qfile_train, filename_categories=qcatfile_train)
# NOTE(review): corpus_size=-1 presumably means "use every document" - confirm.
corpus.process(corpus_size=-1, **filtees)

<corpus.corpus at 0x107f75b70>

### simple split

In [35]:
# Single train/test split (0.33 -> roughly a third of the documents end up in
# the test set, see the sizes printed below), followed by feature extraction
# with the default vocabulary builder / feature extractor.
corpus.simple_split(0.33)
corpus.make_features(M=-1)
print(corpus)

14 categories. 
- loaded from file: True
	 14417 docuemnts loaded from file. 
- processed: True
	 sentence_filters: ['punctuation_filter'] 
	 word_filters: ['small_word_filter', 'stopword_filter', 'stemming_filter'] 
- corpus in simple split:
	 Training-set, Test-set size: (9659, 4758) 
- made numeric features: True
	 vocabulary_builder, M: ig_based_non_uniform, -1 
	 feature_extractor: multinomial_model 



# Standard Averaging

In [50]:
# Base classifiers for the soft-voting ensemble.
clf_lr = LogisticRegression(C=3.0)
clf_nb = MultinomialNB(alpha=0.1)
# probability=True is needed so that the SVM exposes predict_proba for soft voting.
clf_svc = SVC(C=1.0, probability=True)

In [51]:
# Soft voting: the ensemble averages the predicted class probabilities of the
# three base classifiers.
bcf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
        ('svc', clf_svc),
    ],
    voting='soft',
)

In [52]:
# Train the ensemble on the training part of the split
# (the trailing ';' hides the estimator repr).
bcf.fit(X=corpus.X_tr, y=corpus.y_tr);

In [53]:
# Mean accuracy of the voting ensemble on the held-out test part.
bcf.score(X=corpus.X_te, y=corpus.y_te)

0.61174755259320979

# Using a Classifier for Averaging

In [34]:
# Switch the corpus to 3-fold cross-validation and keep the random seed it
# reports, so the fold assignment can be reproduced later.
corpus.cv_split(n_folds=3)
SEED = corpus.random_seed
SEED

3258533570

In [46]:
# Build the meta-level (stacking) data set: for every cross-validation fold the
# three base classifiers are fitted on the fold's training part and their class
# probabilities for the fold's test part are recorded.
P = []   # per fold: [NB | LR | SVC] class probabilities, concatenated column-wise
lP = []  # per fold: true labels of the fold's test part
corpus.reset()
# Iterate with a separate name: re-binding `corpus` while iterating over it only
# works as long as the iterator hands back the corpus object itself.
for fold in corpus:
    fold.make_features()

    clf_nb = MultinomialNB(alpha=0.1)
    clf_nb.fit(fold.X_tr, fold.y_tr)
    # NOTE(review): C=2.0 here, while the voting ensemble uses C=3.0 - confirm this is intended.
    clf_lr = LogisticRegression(C=2.0)
    clf_lr.fit(fold.X_tr, fold.y_tr)
    clf_svc = SVC(C=1.0, probability=True)
    clf_svc.fit(fold.X_tr, fold.y_tr)

    # Column order (NB, LR, SVC) must be identical for every fold.
    fold_probas = [clf.predict_proba(fold.X_te) for clf in (clf_nb, clf_lr, clf_svc)]

    P.append(np.concatenate(fold_probas, axis=1))
    lP.append(fold.y_te)

if len(P) < 2:
    raise ValueError("stacking needs at least two folds: one or more for training, one for testing")

# The last fold is the meta-level test set, all earlier folds form the
# meta-level training set (works for any number of folds, not just 3).
Xp_tr = np.concatenate(P[:-1], axis=0)
yp_tr = np.concatenate(lP[:-1], axis=0)

Xp_te = P[-1]
yp_te = lP[-1]

In [49]:
# Meta-classifier: learns how to combine the base classifiers' probabilities.
# random_state pins the forest's own randomness so that re-running this cell on
# the same meta-features yields the same score. SEED comes from the cv_split
# cell (corpus.random_seed).
# NOTE(review): the SVC probability estimates feeding Xp_tr / Xp_te are still
# unseeded, so a full re-run of the notebook can differ slightly.
clf_fin = RandomForestClassifier(n_estimators=100, max_features=3, random_state=SEED)
# Alternative meta-learner that was tried: LogisticRegression(C=1.0)
clf_fin.fit(Xp_tr, yp_tr)
clf_fin.score(Xp_te, yp_te)

0.61508019162674443

In [44]:
# Print the corpus' textual summary (its __str__): load/processing status,
# the current split (including the fold when in cv-split mode) and feature settings.
print(corpus)

14 categories. 
- loaded from file: True
	 14417 docuemnts loaded from file. 
- processed: True
	 sentence_filters: ['punctuation_filter'] 
	 word_filters: ['small_word_filter', 'stopword_filter', 'stemming_filter'] 
- corpus in cv-split:
	 fold 3 / 3 
	 Training-set, Test-set size: (9616, 4801) 
- made numeric features: True
	 vocabulary_builder, M: ig_based_non_uniform, -1 
	 feature_extractor: multinomial_model 



In [54]:
import pydoc

In [55]:
# Show the API documentation of the project's corpus module
# (the built-in help() delegates to pydoc.help).
help(corpus_class)

Help on module corpus:

NAME
    corpus

CLASSES
    builtins.object
        corpus
    
    class corpus(builtins.object)
     |  Methods defined here:
     |  
     |  __init__(self, categories)
     |      Initialize self.  See help(type(self)) for accurate signature.
     |  
     |  __iter__(self)
     |  
     |  __next__(self)
     |  
     |  __str__(self)
     |      Return str(self).
     |  
     |  cv_split(self, n_folds, random_seed=None)
     |      This mehtod provides an efficient way to creat n_folds on the corpus for cross validation.
     |      It Counts term_frequencies for each folds seperatly so that they can simply be merged. After
     |      running this method, the corpus becomes an iterable object, which for each iteration creats a new
     |      traing-/ test-set split.
     |  
     |  load(self, filename_questions, filename_categories)
     |  
     |  make_features(self, M=-1, vocabulary_builder=<function ig_based_non_uniform at 0x1052d8d08>, feature_ex