In [1]:
import pandas as pd

In [29]:
# Load the pickled frame stored at data/genres_df.pickle.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
genre_df = pd.read_pickle("data/genres_df.pickle")

In [30]:
# Load the pickled frame stored at data/novel_df.pickle (trusted local file;
# unpickling untrusted data can execute arbitrary code).
novel_df = pd.read_pickle("data/novel_df.pickle")

In [32]:
# Read the episode CSV; index_col=0 makes the file's first column the index.
episode_df = pd.read_csv("data/episode_df.csv", index_col=0)

In [33]:
# Cast the "ID" key to str before the ID-keyed merge below.
# Presumably the pickled frames store "ID" as strings — TODO confirm.
episode_df["ID"] = episode_df["ID"].astype("str")

In [34]:
# Join the three frames on "ID" (merge's default inner join); novel_df's
# "level" column is dropped before it is joined in.
new_df = (
    genre_df
    .merge(novel_df.drop("level", axis=1), on="ID")
    .merge(episode_df, on="ID")
)

In [95]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import LabelKFold
from sklearn.metrics import classification_report

In [45]:
from konlpy.tag import Twitter
# Single shared tagger instance; tokenize_pos below calls pos_tagger.pos on it.
pos_tagger = Twitter()

def tokenize_pos(doc):
    """Tokenize ``doc`` with the module-level ``pos_tagger``.

    Calls ``pos_tagger.pos(doc, norm=True, stem=True)`` and joins the parts
    of every returned item with ``'/'`` (presumably ``(token, tag)`` pairs,
    giving ``"token/tag"`` strings).

    Returns a list with one joined string per tagged item.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

# Two-step pipeline: token counts produced with the tokenize_pos tokenizer,
# fed into a multinomial naive Bayes classifier.
model = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('clf', MultinomialNB()),
])


In [98]:
# Features ("text"), targets ("genre") and the grouping key ("ID") that is
# handed to LabelKFold.
X_data, y_data, label_data = new_df["text"], new_df["genre"], new_df["ID"]

In [99]:
# Two-fold split grouped on the "ID" values: LabelKFold keeps all rows that
# share a label in the same fold, so an ID never appears in both train and test.
cv = LabelKFold(label_data, n_folds=2)

In [101]:
# Fit and score the pipeline once per fold.
# LabelKFold yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc / [] lookups would only agree with positions while the
# index is a default 0..n-1 range.
for train_index, test_index in cv:
    model.fit(X_data.iloc[train_index], y_data.iloc[train_index])
    fold_pred = model.predict(X_data.iloc[test_index])
    print(classification_report(y_data.iloc[test_index], fold_pred, digits=4))
    

  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

      101.0     0.6486    0.9318    0.7648       513
      102.0     0.9406    0.6404    0.7620       470
      103.0     0.9912    0.9741    0.9826       464
      104.0     0.4111    0.2176    0.2846       170
      106.0     0.0000    0.0000    0.0000         0
      108.0     0.6000    0.4615    0.5217        52

avg / total     0.8004    0.7741    0.7681      1669

             precision    recall  f1-score   support

      101.0     0.6751    0.9216    0.7793       523
      102.0     0.6364    0.7000    0.6667       320
      103.0     0.9713    0.9854    0.9783       480
      104.0     0.5000    0.1607    0.2432       168
      106.0     0.0000    0.0000    0.0000        12
      108.0     0.1897    0.0663    0.0982       166

avg / total     0.6821    0.7292    0.6876      1669



KeyboardInterrupt: 

In [68]:
%%time
predict = model.predict(X_test)
print(classification_report(y_test, model.predict(X_test), digits=4))

CPU times: user 4min 4s, sys: 668 ms, total: 4min 5s
Wall time: 3min 58s


'             precision    recall  f1-score   support\n\n      101.0     0.9751    0.9955    0.9852       670\n      102.0     0.9943    0.9796    0.9869       538\n      103.0     1.0000    1.0000    1.0000       455\n      104.0     1.0000    0.9686    0.9840       159\n      106.0     1.0000    1.0000    1.0000       103\n      108.0     1.0000    0.9938    0.9969       162\n\navg / total     0.9906    0.9904    0.9904      2087\n'

In [72]:
# Print the held-out report explicitly instead of relying on IPython's `_`
# (last-output cache), which depends on execution history and does not
# survive Restart & Run All.
print(classification_report(y_test, predict, digits=4))

             precision    recall  f1-score   support

      101.0     0.9751    0.9955    0.9852       670
      102.0     0.9943    0.9796    0.9869       538
      103.0     1.0000    1.0000    1.0000       455
      104.0     1.0000    0.9686    0.9840       159
      106.0     1.0000    1.0000    1.0000       103
      108.0     1.0000    0.9938    0.9969       162

avg / total     0.9906    0.9904    0.9904      2087



In [74]:
# Keep the class-probability matrix in `predict`: the later `predict[2]` cell
# displays a row of it, so the result must be bound rather than discarded.
predict = model.predict_proba(X_test)
predict

array([[  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   1.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
          1.94507552e-247,   9.73069138e-289,   2.52901497e-315],
       ..., 
       [  0.00000000e+000,   3.41306971e-318,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   4.39046198e-271],
       [  3.19829346e-111,   1.47690568e-201,   0.00000000e+000,
          1.00000000e+000,   1.45816618e-182,   8.68007817e-260],
       [  1.00000000e+000,   2.34850085e-236,   0.00000000e+000,
          5.28621093e-299,   3.83921116e-266,   0.00000000e+000]])

In [91]:
# Show row 2 of `predict`. NOTE(review): the recorded output is a vector of
# class probabilities, so `predict` must hold predict_proba results (not the
# label array from model.predict) when this cell runs — confirm.
predict[2]

array([  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
         1.94507552e-247,   9.73069138e-289,   2.52901497e-315])