In [36]:
import csv, importlib
import corpus as corpus_class
import categories, filters, vocabulary_builders
from feature_extractors import multinomial_model, tfidf
import feature_extractors
import numpy as np

In [37]:
# Obtain the corpus through corpus_class.load_from_file() (called without
# arguments; presumably it reads a default cache location - TODO confirm).
corpus = corpus_class.load_from_file()

# Display (training-set size, test-set size) as the cell result.
tuple(len(split) for split in (corpus.tr_set, corpus.te_set))

(10000, 4417)

In [20]:
# Build the corpus from the raw CSV files. Note that this rebinds the name
# `corpus`, replacing whatever object it referred to before this cell ran.

# Filters handed to corpus.process(): one list applied at sentence level,
# one at word level.
sentence_filters = [filters.punctuation_filter]
word_filters = [
    filters.small_word_filter,
    filters.stopword_filter,
    filters.stemming_filter,
]

# Category definitions are passed to the corpus constructor.
cats = categories.categories()
corpus = corpus_class.corpus(cats)
corpus.load("question_train.csv", "question_category_train.csv")

# tr_set_size=10000 is passed through to process(); presumably it fixes the
# size of the training split - TODO confirm against corpus.process.
corpus.process(sentence_filters, word_filters, tr_set_size=10000)

# Display (training-set size, test-set size) as the cell result.
tuple(len(split) for split in (corpus.tr_set, corpus.te_set))

(10000, 4417)

In [77]:
# Build the term space with vocabulary_builders.ig_based (M=200; presumably
# the number of terms kept - TODO confirm).
term_space = vocabulary_builders.ig_based(corpus, M=200)

# Have the corpus derive its features over that term space using the tfidf
# extractor. The return value is the cell's displayed result.
corpus.make_features(
    term_space,
    feature_extractor=feature_extractors.tfidf,
)

<corpus.corpus at 0x10aedc978>

In [81]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings. Labels are converted to an
# integer array before being handed to scikit-learn. The feature matrices are
# transposed before fitting/scoring (presumably stored as terms x documents -
# TODO confirm).
# NOTE: the name `nb_clf` is also assigned by the MultinomialNB cell, so later
# cells that use `nb_clf` see whichever of the two cells was executed last.
nb_clf = BernoulliNB().fit(
    corpus.X_tr.T,
    np.asarray(corpus.y_tr, dtype=int),
)

# Mean accuracy on the held-out test split (cell result).
nb_clf.score(
    corpus.X_te.T,
    np.asarray(corpus.y_te, dtype=int),
)

0.45687117953362011

In [78]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with smoothing parameter alpha=0.1, fitted on the
# transposed training features.
# NOTE: this rebinds `nb_clf`, which the BernoulliNB cell also assigns.
nb_clf = MultinomialNB(alpha=0.1).fit(corpus.X_tr.T, corpus.y_tr)

# Mean accuracy on the held-out test split (cell result).
nb_clf.score(corpus.X_te.T, corpus.y_te)

0.46411591577994116

In [10]:
from sklearn import tree

# Decision tree baseline. scikit-learn permutes candidate features at every
# split, so ties between equally good splits can resolve differently from run
# to run; pinning random_state makes the fitted tree, and therefore the score
# below, reproducible.
tree_clf = tree.DecisionTreeClassifier(random_state=0)
tree_clf = tree_clf.fit(corpus.X_tr.T, corpus.y_tr)

# Mean accuracy on the held-out test split (cell result).
tree_clf.score(corpus.X_te.T, corpus.y_te)

0.38

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 20 trees, each split considering 100 candidate features.
# Bootstrap sampling and per-split feature sub-sampling are random, so
# random_state is fixed to make the reported accuracy reproducible across
# kernel restarts.
forrest_clf = RandomForestClassifier(
    n_estimators=20,
    max_features=100,
    random_state=0,
)
forrest_clf = forrest_clf.fit(corpus.X_tr.T, corpus.y_tr)

# Mean accuracy on the held-out test split (cell result).
forrest_clf.score(corpus.X_te.T, corpus.y_te)

0.52705456191985511

In [35]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression (C=3).
# The solver is named explicitly: since scikit-learn 0.22 the default solver is
# 'lbfgs', which supports only L2/no penalty and raises ValueError when combined
# with penalty='l1'. 'liblinear' was the default in older releases, so naming it
# keeps the original behaviour there and makes the cell run on newer ones.
# NOTE(review): 'saga' is the other L1-capable solver; recent releases may warn
# about 'liblinear' on multiclass targets - confirm against the installed version.
lr_clf = LogisticRegression(penalty='l1', C=3, solver='liblinear')
lr_clf = lr_clf.fit(corpus.X_tr.T, corpus.y_tr)

# Mean accuracy on the held-out test split (cell result).
lr_clf.score(corpus.X_te.T, corpus.y_te)

0.57505093955173192

# Examples

In [53]:
# Classify a single hand-written question.
expl = ["Ich liebe meinen Mann nicht. Soll ich es ihm sagen?"]
X_expl = corpus.process_example(expl)

# Boolean mask over the term space: entries with a positive feature value.
# Computed once; the original also evaluated the same tuple as a bare
# expression before printing it, and that result was simply discarded.
nonzero_mask = X_expl.flatten() > 0

# Show the terms that were recognised together with their feature values.
print(term_space[nonzero_mask], X_expl[nonzero_mask].T)

# Map the predicted label back to its category name (cell result).
# `nb_clf` is whichever naive Bayes classifier was fitted most recently.
corpus.cats[int(nb_clf.predict(X_expl.T)[0])]

['mann' 'lieb' 'sag'] [[ 0.46802963  0.54975667  0.69189296]]


'liebe_and_beziehung'

In [58]:
# Classify a single hand-written question.
expl = ["Um wieviel uhr spielt die deutsche Fussball-Nationalmanschaft heute?"]
X_expl = corpus.process_example(expl)

# Show the terms with a positive feature value next to those values.
nonzero_mask = X_expl.flatten() > 0
print(term_space[nonzero_mask], X_expl[nonzero_mask].T)

# Map the predicted label back to its category name (cell result).
# `nb_clf` is whichever naive Bayes classifier was fitted most recently.
predicted_label = int(nb_clf.predict(X_expl.T)[0])
corpus.cats[predicted_label]

['wieviel' 'deutsch' 'spielt' 'heut' 'fussball' 'uhr'] [[ 0.29772197  0.36708525  0.377647    0.40277263  0.41598424  0.54655659]]


'freizeit_and_sport'

In [75]:
# Classify a bag of loosely related words to see which terms dominate.
expl = ["Wichsen Titte Fussball tobias spielen"]
X_expl = corpus.process_example(expl)

# Show the terms with a positive feature value next to those values.
nonzero_mask = X_expl.flatten() > 0
print(term_space[nonzero_mask], X_expl[nonzero_mask].T)

# Map the predicted label back to its category name (cell result).
# `nb_clf` is whichever naive Bayes classifier was fitted most recently.
predicted_label = int(nb_clf.predict(X_expl.T)[0])
corpus.cats[predicted_label]

['spiel' 'fussball' 'tobias' 'titt' 'wichs'] [[ 0.29040278  0.33238984  0.48834327  0.50804452  0.55551312]]


'freizeit_and_sport'