In [1]:
import pandas as pd

In [29]:
main_df = pd.read_csv("data/main_webnovel_df.csv", index_col=0)

In [22]:
episode_df = pd.read_csv("data/episode_df.csv", index_col=0)
episode_df["ID"] = episode_df["ID"].astype("int64")

In [63]:
novel_19_df = pd.read_csv("data/novel_19.csv", encoding="cp949", index_col=0).dropna()

In [64]:
text_df = main_df.merge(episode_df, on="ID")[["ID", "genre", "text"]]

In [65]:
text_df = pd.concat([text_df, novel_19_df]).reset_index(drop=True)

In [66]:
X_train = text_df[text_df["ID"] != 466391].ix[:,2]
y_train = text_df[text_df["ID"] != 466391].ix[:,1]
X_test = text_df[text_df["ID"] == 466391].ix[:,2]
y_test = text_df[text_df["ID"] == 466391].ix[:,1]

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import LabelKFold
from sklearn.metrics import classification_report

In [89]:
vect = TfidfVectorizer()

In [90]:
%%time
vect.fit(test_df.ix[:,2])

Wall time: 3min 10s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [133]:
def make_predict_proba(ID, vect):
    
    X_train = text_df[text_df["ID"] != ID].ix[:,2]
    y_train = text_df[text_df["ID"] != ID].ix[:,1]
    X_test = text_df[text_df["ID"] == ID].ix[:,2]
    
    model = MultinomialNB()
    model.fit(vect.transform(X_train), y_train)
    predict = model.predict_proba(vect.transform(X_test))
    predict_df = pd.DataFrame(predict, columns=[101, 102, 103, 104, 106, 108, 109])
    if text_df[text_df["ID"] == ID]["ID"].unique()[0] == 101:
        predict_df["genre_proba"] = predict[:, 0]
    elif text_df[text_df["ID"] == ID]["ID"].unique()[0] == 102:
        predict_df["genre_proba"] = predict[:, 1]
    elif text_df[text_df["ID"] == ID]["ID"].unique()[0] == 103:
        predict_df["genre_proba"] = predict[:, 2]
    elif text_df[text_df["ID"] == ID]["ID"].unique()[0] == 104:
        predict_df["genre_proba"] = predict[:, 3]
    elif text_df[text_df["ID"] == ID]["ID"].unique()[0] == 106:
        predict_df["genre_proba"] = predict[:, 4]
    elif text_df[text_df["ID"] == ID]["ID"].unique()[0] == 108:
        predict_df["genre_proba"] = predict[:, 5]

    predict_df["ID"] = ID
    
    return predict_df
    
    

In [134]:
def make_predict_proba_df():
    
    vect = TfidfVectorizer()
    vect.fit(text_df.ix[:,2])
    
    proba_df = pd.DataFrame(columns = ["ID", 101, 102, 103, 104, 106, 108, 109])
    
    for ID in main_df["ID"]:
        predict_proba_df = make_predict_proba(ID, vect)
        proba_df = pd.concat([proba_df, predict_proba_df])

    proba_df.reset_index(drop=True)
    proba_df.to_csv("data/proba_df.csv")
    
    return proba_df

In [135]:
%%time
proba_df = make_predict_proba_df()

Wall time: 1h 35min 41s


In [247]:
proba_df = proba_df.reset_index()

In [250]:
text_df.head()

Unnamed: 0,ID,genre,text
0,466391,101.0,"\n프롤로그 철컥.현관문이 닫히는 소리와 함께 짙은 어둠이 내려앉았다.“아, 센서등..."
1,466391,101.0,결혼해 보라. 당신은 후회할 것이다.그러면 결혼하지 말라. 당신은 더욱 후회할 것이...
2,466391,101.0,"\n-하이네>야간자율학습 시간 내내, 해수는 노트에 낙서를 했다.결혼. 겨울방학. ..."
3,466391,101.0,\n-안톤 체호프>\n해수를 집 앞에 내려주고 돌아가는 길.혁준은 붉은색으로 바뀐 ...
4,466391,101.0,"\n-조지 고든 바이런>\n혁준은 정호, 유리와의 통화를 끝내고 휴대폰을 내려놓았다..."


In [252]:
predict_list = []
for i in range(len(proba_df)):
    predict_list.append(np.argmax(proba_df.loc[i][2:]))

In [257]:
print(classification_report(text_df[text_df["genre"] != 109]["genre"], predict_list))

             precision    recall  f1-score   support

      101.0       0.34      1.00      0.51      2792
      102.0       0.81      0.02      0.05      2025
      103.0       0.87      0.03      0.06      1829
      104.0       0.00      0.00      0.00       605
      106.0       0.00      0.00      0.00       467
      108.0       0.00      0.00      0.00       629

avg / total       0.50      0.35      0.19      8347



  'precision', 'predicted', average, warn_for)


In [179]:
group_proba_df = proba_df.groupby("ID", as_index=False).agg({101: np.mean, 102: np.mean, 103: np.mean, 104: np.mean, 106: np.mean, 108: np.mean, 109: np.mean})

In [180]:
all_df= main_df.merge(group_proba_df, on="ID")

In [181]:
all_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)

In [192]:
X_data = all_df[["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"]]

In [153]:
y_data = all_df[["concern_count"]]

In [188]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [189]:
scaler = StandardScaler()

In [194]:
X_data = pd.DataFrame(scaler.fit_transform(X_data), columns=["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"])

In [196]:
model2 = sm.OLS(y_data, sm.add_constant(X_data))

In [197]:
print(model2.fit().summary())

                            OLS Regression Results                            
Dep. Variable:          concern_count   R-squared:                       0.148
Model:                            OLS   Adj. R-squared:                  0.115
Method:                 Least Squares   F-statistic:                     4.476
Date:                Thu, 28 Jul 2016   Prob (F-statistic):           1.98e-05
Time:                        06:46:08   Log-Likelihood:                -2682.9
No. Observations:                 241   AIC:                             5386.
Df Residuals:                     231   BIC:                             5421.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------
const           1.946e+04   1088.247     17.

In [208]:
len(episode_df)

8347

In [209]:
len(proba_df)

8347

In [221]:
stack_df = pd.DataFrame(np.hstack([episode_df, proba_df]), columns=list(episode_df.columns)+list(proba_df.columns))

In [227]:
stack_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)

In [231]:
stack_df.ix[:,[2,3,4,5,8,9,10,11,12,13,14]]

Unnamed: 0,is_first,score,score_count,episode_comments_count,romance,SFfantasy,matial,mystery,lightnovel,fusion,adult
0,1,9.92,3889,1219,0.996761,0.00255418,0.000682002,9.6311e-07,2.32203e-07,8.7567e-07,1.18855e-06
1,0,9.93,3530,171,0.993403,0.00552969,0.00106461,8.25206e-07,2.91303e-07,8.75303e-07,6.88777e-07
2,0,9.96,3493,223,0.993623,0.00544217,0.000932973,4.32594e-07,1.31084e-07,4.8975e-07,4.01862e-07
3,0,9.96,3629,223,0.995663,0.00368673,0.000647183,1.15869e-06,2.55982e-07,9.03143e-07,1.09214e-06
4,0,9.97,3830,205,0.986212,0.0117887,0.00197172,8.94634e-06,2.76763e-06,7.55279e-06,8.65453e-06
5,0,9.96,3769,257,0.991155,0.00745319,0.0013853,1.82403e-06,5.89243e-07,1.80845e-06,2.20027e-06
6,0,9.98,3681,142,0.992543,0.00627144,0.00117913,1.83714e-06,6.84919e-07,1.76284e-06,2.42631e-06
7,0,9.99,3271,161,0.997269,0.00218077,0.000544956,1.59446e-06,6.25787e-07,1.50678e-06,1.76338e-06
8,0,9.98,3430,200,0.998006,0.00181251,0.000179985,4.67465e-07,2.11563e-07,4.18389e-07,5.28671e-07
9,0,9.98,3827,195,0.995896,0.00377251,0.000329461,7.33249e-07,2.45185e-07,6.28282e-07,7.12114e-07


In [237]:
model = sm.OLS.from_formula("score ~ is_first + romance + SFfantasy + matial + mystery + lightnovel + fusion + adult", stack_df)

In [240]:
result = model.fit()

MemoryError: 

In [None]:
print(result.sumarry())

In [259]:
all_df

Unnamed: 0,ID,level,genre,main_score,concern_count,episodes_count,comments_count,romance,SFfantasy,matial,mystery,lightnovel,fusion,adult
0,466391,webnovel,101.0,9.98,86094,95,179,0.993969,0.004980,0.001046,1.445343e-06,4.803589e-07,1.434219e-06,1.883553e-06
1,398090,webnovel,101.0,9.98,71748,138,189,0.960641,0.013836,0.025512,2.390496e-06,6.062998e-07,5.936056e-06,1.502595e-06
2,514809,webnovel,101.0,9.96,43120,51,125,0.998369,0.001436,0.000190,9.660413e-07,5.004053e-07,1.100571e-06,1.373403e-06
3,505096,webnovel,101.0,9.95,48354,59,76,0.984675,0.013187,0.002107,7.836291e-06,3.831237e-06,8.578221e-06,1.020554e-05
4,523286,webnovel,101.0,9.97,27443,43,29,0.996526,0.002925,0.000544,1.519839e-06,4.883162e-07,1.427922e-06,1.901099e-06
5,552533,webnovel,101.0,9.92,16120,7,27,0.985303,0.012440,0.002148,3.113375e-05,1.320339e-05,2.737527e-05,3.720595e-05
6,514807,webnovel,101.0,9.96,36196,51,56,0.998335,0.001381,0.000282,5.093238e-07,2.542612e-07,4.404917e-07,8.493947e-07
7,466374,webnovel,101.0,9.86,34715,95,92,0.974345,0.013147,0.012492,3.980548e-06,9.503504e-07,6.043869e-06,4.812399e-06
8,483047,webnovel,101.0,9.98,33926,77,29,0.996668,0.002617,0.000711,5.325223e-07,1.852298e-07,2.596288e-06,4.936195e-07
9,514808,webnovel,101.0,9.92,31654,51,33,0.999400,0.000522,0.000078,9.013786e-08,3.820966e-08,7.226822e-08,1.099518e-07


In [267]:
model = sm.OLS.from_formula("concern_count ~ C(genre) + episodes_count + romance + SFfantasy + matial", all_df)

In [268]:
result = model.fit()

In [269]:
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          concern_count   R-squared:                       0.181
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     8.604
Date:                Thu, 28 Jul 2016   Prob (F-statistic):           1.84e-08
Time:                        11:00:26   Log-Likelihood:                -2678.3
No. Observations:                 241   AIC:                             5371.
Df Residuals:                     234   BIC:                             5395.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept          2.443e+04   1557.19

In [150]:
all_df.head(0)

Unnamed: 0,ID,level,genre,main_score,concern_count,episodes_count,comments_count,101,102,103,104,106,108,109


In [28]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

ImportError: No module named 'konlpy'

In [None]:
def tokenize_pos(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

CountVectorizer(tokenizer=tokenize_pos)

In [None]:
pos_tagger = Twit

In [45]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize_pos(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

model = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_pos)), 
            ('clf', MultinomialNB()),
        ])


In [98]:
X_data = new_df["text"]
y_data = new_df["genre"]
label_data = new_df["ID"]

In [99]:
cv = LabelKFold(label_data, n_folds=2)

In [101]:
for train_index, test_index in cv:
    model.fit(X_data.loc[train_index], y_data.loc[train_index])
    print(classification_report(y_data.loc[test_index], model.predict(X_data[test_index]), digits=4))
    

  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

      101.0     0.6486    0.9318    0.7648       513
      102.0     0.9406    0.6404    0.7620       470
      103.0     0.9912    0.9741    0.9826       464
      104.0     0.4111    0.2176    0.2846       170
      106.0     0.0000    0.0000    0.0000         0
      108.0     0.6000    0.4615    0.5217        52

avg / total     0.8004    0.7741    0.7681      1669

             precision    recall  f1-score   support

      101.0     0.6751    0.9216    0.7793       523
      102.0     0.6364    0.7000    0.6667       320
      103.0     0.9713    0.9854    0.9783       480
      104.0     0.5000    0.1607    0.2432       168
      106.0     0.0000    0.0000    0.0000        12
      108.0     0.1897    0.0663    0.0982       166

avg / total     0.6821    0.7292    0.6876      1669



KeyboardInterrupt: 

In [68]:
%%time
predict = model.predict(X_test)
print(classification_report(y_test, model.predict(X_test), digits=4))

CPU times: user 4min 4s, sys: 668 ms, total: 4min 5s
Wall time: 3min 58s


'             precision    recall  f1-score   support\n\n      101.0     0.9751    0.9955    0.9852       670\n      102.0     0.9943    0.9796    0.9869       538\n      103.0     1.0000    1.0000    1.0000       455\n      104.0     1.0000    0.9686    0.9840       159\n      106.0     1.0000    1.0000    1.0000       103\n      108.0     1.0000    0.9938    0.9969       162\n\navg / total     0.9906    0.9904    0.9904      2087\n'

In [72]:
print(_)

             precision    recall  f1-score   support

      101.0     0.9751    0.9955    0.9852       670
      102.0     0.9943    0.9796    0.9869       538
      103.0     1.0000    1.0000    1.0000       455
      104.0     1.0000    0.9686    0.9840       159
      106.0     1.0000    1.0000    1.0000       103
      108.0     1.0000    0.9938    0.9969       162

avg / total     0.9906    0.9904    0.9904      2087



In [74]:
model.predict_proba(X_test)

array([[  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   1.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
          1.94507552e-247,   9.73069138e-289,   2.52901497e-315],
       ..., 
       [  0.00000000e+000,   3.41306971e-318,   1.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   4.39046198e-271],
       [  3.19829346e-111,   1.47690568e-201,   0.00000000e+000,
          1.00000000e+000,   1.45816618e-182,   8.68007817e-260],
       [  1.00000000e+000,   2.34850085e-236,   0.00000000e+000,
          5.28621093e-299,   3.83921116e-266,   0.00000000e+000]])

In [91]:
predict[2]

array([  1.00000000e+000,   3.74575540e-225,   0.00000000e+000,
         1.94507552e-247,   9.73069138e-289,   2.52901497e-315])

In [272]:
sum(all_df["comments_count"])

6310