In [1]:
import pandas as pd
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
import pickle
import numpy as np
from collections import namedtuple
from gensim.models import doc2vec

pos_tagger = Twitter()

In [2]:
with open('../input/x_train.pickle', 'rb') as f:
    x_train = pickle.load(f)
with open('../input/x_test.pickle', 'rb') as f:
    x_test = pickle.load(f)
with open('../input/y_train.pickle', 'rb') as f:
    y_train = pickle.load(f)
with open('../input/y_test.pickle', 'rb') as f:
    y_test = pickle.load(f)

In [3]:
stop_dr = "../input/stopword.txt"
f = open(stop_dr, 'r')
stopword_korean = set([value.strip() for value in f.readlines()])

In [4]:
def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

In [5]:
x_train  = [[i.split("/")[0] for i in tokenize(j) if i.split("/")[1] in ['Noun','Verb','Adjective'] and i.split("/")[0] not in stopword_korean ]for j in x_train ]
#x_train = [','.join(i) for i in x_train]

x_test  = [[i.split("/")[0] for i in tokenize(j) if i.split("/")[1] in ['Noun','Verb','Adjective'] and i.split("/")[0] not in stopword_korean ]for j in x_test ]
#x_test = [','.join(i) for i in x_test]

In [6]:
train_docs = [(token, np.where(label==1.0)[0][0]) for token, label in zip(x_train,y_train)]

In [7]:
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

In [8]:
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs,total_examples=doc_vectorizer.corpus_count,epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002 
    doc_vectorizer.min_alpha = doc_vectorizer.alpha

In [9]:
train_vector = [doc_vectorizer.infer_vector(i) for i in x_train]
test_vector = [doc_vectorizer.infer_vector(i) for i in x_test]

In [10]:
svm = OneVsRestClassifier(SGDClassifier(loss="log", penalty="l2", random_state=40,max_iter=10), n_jobs=-1).fit(train_vector, y_train)
predict = svm.predict_proba(test_vector)
for standard in [x * 0.1 for x in range(1, 10)]:
    print("Probability: ", standard)
    print(classification_report(y_test==1, predict>standard))

Probability:  0.1
             precision    recall  f1-score   support

          0       0.38      0.63      0.47       161
          1       0.21      0.36      0.27        55
          2       0.00      0.00      0.00         1
          3       0.27      0.82      0.41       240
          4       0.00      0.00      0.00        17
          5       0.33      1.00      0.50         1
          6       0.43      0.92      0.59       566
          7       0.44      0.05      0.10        75
          8       0.17      0.59      0.27       170
          9       0.34      0.37      0.35       199
         10       0.34      0.98      0.51       455

avg / total       0.34      0.76      0.45      1940

Probability:  0.2
             precision    recall  f1-score   support

          0       0.55      0.50      0.52       161
          1       0.26      0.15      0.19        55
          2       0.00      0.00      0.00         1
          3       0.39      0.62      0.48       240
      

  'precision', 'predicted', average, warn_for)
