In [6]:
import csv, importlib
import corpus as corpus_class
import categories, filters, vocabulary_builders
from feature_extractors import multinomial_model, tfidf
from filters import std_filters
import numpy as np
from scipy import sparse

In [13]:
# Build the corpus object around the project's category definitions and load
# the two training CSV files (presumably the questions and their category
# labels, judging by the file names — confirm in corpus.load).
corpus = corpus_class.corpus( categories.categories() )
corpus.load("question_train.csv", "question_category_train.csv")
# Preprocess using the keyword arguments supplied by std_filters().
# NOTE(review): corpus_size=-1 presumably means "use the whole corpus" — confirm in corpus.process.
corpus.process(**std_filters(), corpus_size=-1)
# NOTE(review): save() is called without a path; where the processed corpus is
# written is not visible from this notebook — confirm.
corpus.save()

In [14]:
# NOTE(review): 0.1 is presumably the fraction of the data held out for
# testing — confirm in corpus.simple_split.
corpus.simple_split(0.1)
# Presumably populates corpus.X_tr / corpus.X_te and corpus.y_tr / corpus.y_te,
# which the model cells below read — confirm. As the last expression of the
# cell, its return value is what gets displayed.
corpus.make_features()

<corpus.corpus at 0x103217358>

In [15]:
# Dimensions of the training feature matrix that the classifiers below are fitted on.
corpus.X_tr.shape

(12975, 13630)

In [83]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes baseline. The label sequences are converted to integer
# arrays both for fitting and for scoring on the held-out split.
nb_clf = BernoulliNB().fit(corpus.X_tr, np.array(corpus.y_tr, dtype=int))
nb_clf.score(corpus.X_te, np.array(corpus.y_te, dtype=int))

0.40932759791713835

In [101]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with smoothing parameter alpha=0.1, fitted on the
# training split and scored on the held-out split. Rebinds nb_clf.
nb_clf = MultinomialNB(alpha=0.1).fit(corpus.X_tr, corpus.y_tr)
nb_clf.score(corpus.X_te, corpus.y_te)

0.58093728775186781

In [108]:
from sklearn import tree
# Decision-tree baseline. random_state is pinned so that re-running this cell
# builds the same tree and reports the same held-out score; without it,
# scikit-learn's random tie-breaking between equally good splits can change
# the result from run to run.
tree_clf = tree.DecisionTreeClassifier(random_state=0)
tree_clf = tree_clf.fit(corpus.X_tr, corpus.y_tr)
tree_clf.score(corpus.X_te, corpus.y_te)

0.48019017432646594

In [113]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 trees, each split considering 100 candidate features.
# random_state is pinned because the bootstrap samples and per-split feature
# subsets are drawn at random; without a seed the held-out score differs on
# every re-run of the cell.
forrest_clf = RandomForestClassifier(n_estimators=20, max_features=100, random_state=0)
forrest_clf = forrest_clf.fit(corpus.X_tr, corpus.y_tr)
forrest_clf.score(corpus.X_te, corpus.y_te)

0.51845143762734891

In [16]:
from sklearn.linear_model import LogisticRegression
# L1-penalised logistic regression (liblinear solver, C=3) fitted on the
# training split.
lr_clf = LogisticRegression(penalty='l1', C=3, solver='liblinear').fit(corpus.X_tr, corpus.y_tr)
# `clf` is the name mk_pred_expl reads when predicting the example queries.
clf = lr_clf
lr_clf.score(corpus.X_te, corpus.y_te)

0.59431345353675447

In [30]:
from sklearn.linear_model import Lasso
# NOTE(review): Lasso is a regression estimator in scikit-learn, so score()
# here is presumably R^2 on the numeric category ids rather than classification
# accuracy, and is then not comparable with the classifier scores in the other
# cells — confirm whether this experiment is intended.
lasso_clf = Lasso(alpha=0.001).fit(corpus.X_tr, corpus.y_tr)
lasso_clf.score(corpus.X_te, corpus.y_te)

0.17696540866359312

# Examples

In [18]:
def mk_pred_expl(expl, model=None, source_corpus=None):
    """Print the active features and the predicted category for an example.

    Parameters
    ----------
    expl : list of str
        The example text(s), passed unchanged to ``process_example``. The
        notebook always passes a one-element list.
    model : object, optional
        Fitted estimator exposing ``predict``. When omitted, the notebook-level
        ``clf`` is used, which is the original behaviour.
    source_corpus : object, optional
        Object providing ``process_example``, ``term_space`` and ``cats``.
        When omitted, the notebook-level ``corpus`` is used.

    Returns
    -------
    None
        The result is printed, not returned, so calling this as the last
        statement of a cell produces no extra ``Out[...]`` value.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing one-argument calls keep working while the hidden-state
    # dependency becomes optional.
    if model is None:
        model = clf
    if source_corpus is None:
        source_corpus = corpus

    X_expl = source_corpus.process_example(expl)
    # sparse.find yields (rows, cols, values) of the non-zero entries; only the
    # column indices (term ids) and the values are needed here.
    _, J, V = sparse.find(X_expl)
    # A plain 2-D ndarray prints the same as np.matrix, which NumPy no longer
    # recommends. Row 0 holds the terms, row 1 their feature values.
    print(np.array([[*source_corpus.term_space[J]], [*V]]))
    # Only the first prediction is reported.
    print("->", source_corpus.cats[int(model.predict(X_expl)[0])])

In [19]:
# Example query (German): "I don't love my husband. Should I tell him?"
expl = ["Ich liebe meinen Mann nicht. Soll ich es ihm sagen?"]
mk_pred_expl(expl)

[['lieb' 'mann' 'sag' 'soll']
 ['0.396629973749' '0.339594061415' '0.506862224131' '0.685894469379']]
-> liebe_and_beziehung


In [20]:
# Example query (German): "What time does the German national football team play today?"
expl = ["Um wieviel uhr spielt die deutsche Fussball-Nationalmanschaft heute?"]
mk_pred_expl(expl)

[['fussball' 'spielt' 'wieviel' 'deutsch' 'heut' 'uhr' 'nationalmanschaft']
 ['0.323411558563' '0.292215750989' '0.232326079146' '0.284278827882'
  '0.318217206782' '0.425379810347' '0.626908993304']]
-> freizeit_and_sport


In [21]:
# Example query (German): "How much IQ does Paul Heller have"
expl = ["Wie viel IQ hat der Paul Heller"]
mk_pred_expl(expl)

[['hell' 'paul' 'iq']
 ['0.548327897495' '0.524703343875' '0.651170421438']]
-> stars_and_promis


In [22]:
# Example query (German): "How many inhabitants does Berlin have?"
expl = ["Wie viel Einwohner hat Berlin?"]
mk_pred_expl(expl)

[['einwohn' 'berlin']
 ['0.705801292081' '0.708409864483']]
-> schule


In [23]:
# Example query (German): "Where does Lionel Messi play?"
expl = ["Wo spielt Lionel Messi?"]
mk_pred_expl(expl)

[['spielt' 'lionel' 'messi']
 ['0.341987865576' '0.679991110965' '0.648580287094']]
-> freizeit_and_sport


In [24]:
# Example query (German): "What is the largest river in the world?"
expl = ["Was ist der größte Fluss der Welt?"]
mk_pred_expl(expl)

[['welt' 'grosst' 'fluss']
 ['0.379326640652' '0.495399804197' '0.781466783487']]
-> schule


In [26]:
# Example query (German): "Where do truffles grow?"
expl = ["Wo wächst Trüffel?"]
mk_pred_expl(expl)

[['wach']
 ['1.0']]
-> adult


In [27]:
# Example query (German): "What does Cornelia mean?"
expl = ["Was bedeutet Cornelia?"]
mk_pred_expl(expl)

[['bedeutet' 'cornelia']
 ['0.263991584221' '0.964524983326']]
-> namensforschung


In [28]:
# Example query (German): "What colour is the sky?"
expl = ["Welche Farbe hat der Himmel?"]
mk_pred_expl(expl)

[['himmel' 'farb']
 ['0.744590090358' '0.667521982665']]
-> wissen


In [29]:
# Show the categories that predicted class indices are mapped onto
# (print() uses the object's str() form).
print(corpus.cats)

0 (13): schule 
	 31	 32	 33	 34	 35	 36	 86
1 (12): literatur_and_sprache 
	 28	 29	 30	 85
2 (11): namensforschung 
	 26	 27
3 (7): film_and_musik 
	 20	 21	 22	 23
4 (8): stars_and_promis 
	 70	 71	 72	 73
5 (9): computer_and_pc 
	 68	 69
6 (10): alltag 
	 24	 25	 84
7 (14): mensch_and_koerper 
	 37	 38	 39	 40	 41	 42
8 (15): freizeit_and_sport 
	 43	 44	 45	 46	 47
9 (16): wissen 
	 48	 49	 50	 51	 52	 53	 54	 55	 56	 57	 87
10 (17): liebe_and_beziehung 
	 58	 59	 60	 61
11 (18): astrologie 
	 62	 63	 64	 65
12 (19): games_and_spiele 
	 66	 67
13 (74): adult 
	 76	 77	 78	 79	 80	 81	 82	 83



In [41]:
# Example query (German): "Does he really love me?"
expl = ["Liebt er mich wirklich?"]
mk_pred_expl(expl)

[['liebt' 'wirklich']
 ['0.742457703708' '0.669892945331']]
-> liebe_and_beziehung


In [33]:
# Example query (German): "How do I build a bomb?"
expl = ["Wie baue ich eine Bombe?"]
mk_pred_expl(expl)

[['bau' 'bomb']
 ['0.609097135707' '0.793095630598']]
-> wissen


In [34]:
# Example query (German): "How did Dr. Seltsam build a bomb"
expl = ["Wie baute Dr. Seltsam eine Bombe"]
mk_pred_expl(expl)

[['dr' 'seltsam' 'baut' 'bomb']
 ['0.419077142011' '0.536978176797' '0.497677831868' '0.536978176797']]
-> wissen


In [40]:
# Example query (German): "Which ascendant does Gemini have?"
expl = ["Welchen Aszentenden hat Zwilling?"]
mk_pred_expl(expl)

[['zwilling']
 ['1.0']]
-> astrologie
