# To 호현

- 기본 라이브러리 호출

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import json
import os
import pickle
import sys

from konlpy.tag import Kkma, Okt
import gc

import re
from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict

- 감정 사전 호출

In [2]:
from ksenticnet_kaist import *

ksenticnet = get_ksenticnet()

keys = list(ksenticnet.keys())
senticvals = [[float(i) for i in val[:4]] for val in  ksenticnet.values()]
sentiments = []
polarity = []
semantics = []
for key, val in ksenticnet.items():
    for i in val[4:]:
        if i in ['positive', 'negative']:
            polar_ind = val.index(i)
            sentiments.append(val[4 : polar_ind])
            polarity.append(val[polar_ind : polar_ind+2])
            semantics.append(val[polar_ind+2 :])
            break

ksenticnets = defaultdict(dict)
for key, val, senti, p, seman in zip(keys, 
                                     senticvals, 
                                     sentiments, 
                                     polarity, 
                                     semantics):
    ksenticnets[key]['sentic_value'] = val
    ksenticnets[key]['sentiment'] = senti
    ksenticnets[key]['polarity'] = p
    ksenticnets[key]['semantic'] = seman

f = lambda x : [i if i > 0 else 0 for i in x]
g = lambda x : [abs(i) if i < 0 else 0 for i in x]
scores = np.array(list(map(lambda x : f(x) + g(x), senticvals)))
scores /= scores.sum(axis=1).reshape(-1, 1)

class KSenticNet():
    keys = {j : i for i, j in  enumerate(keys)}
    scores = scores

- SentimentLDAGibbsSampler 객체

In [3]:
MAX_VOCAB_SIZE = 50000

def sampleFromDirichlet(alpha):
    return np.random.dirichlet(alpha)

def sampleFromCategorical(theta):
    theta = theta / np.sum(theta)
    return np.random.multinomial(1, theta).argmax()

def word_indices(wordOccurenceVec):
    for idx in wordOccurenceVec.nonzero()[0]:
        for i in range(int(wordOccurenceVec[idx])):
            yield idx
            
class KSenticNet():
    keys = {j : i for i, j in  enumerate(keys)}
    scores = scores
    
class SentimentLDAGibbsSampler:
    
    def __init__(self, numTopics, alpha, beta, gamma, numSentiments=2):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numTopics = numTopics
        self.numSentiments = numSentiments
        
    def processSingleReview(self, review, st, d=None, stopwords=None):
        letters_only = re.sub('[^ㄱ-하-ㅣ가-힣]', ' ', review).strip()
        if not stopwords:
            stops = list('의가이은들는좀잘걍과도를자에와한것') + ['으로', '하다']
        else:
            stops = stopwords
        words = st.morphs(letters_only, stem=True, norm=True)
        meaningful_words = [w for w in words if w not in stops]
        return ' '.join(meaningful_words)
    
    def processReviews(self, reviews, st, saveAs=None, saveOverride=False, 
                       do_preprocess=True, return_processed_review=False):
        import os
        import dill
        if not saveOverride and saveAs and os.path.isfile(saveAs):
            [wordOccurenceMatrix, self.vectorizer] = dill.load(open(saveAs, 'r'))
            return wordOccurenceMatrix
        if do_preprocess:
            processed_reviews = []
            for i, review in enumerate(reviews):
                if (i + 1) % 10000 == 0:
                    print(' Review {} of {}'.format(i + 1, len(reviews)))
                processed_reviews.append(self.processSingleReview(review, st, i))
        else:
            processed_reviews = reviews
        if return_processed_review:
            return processed_reviews
        self.vectorizer = CountVectorizer(analyzer='word',
                                          tokenizer=None,
                                          preprocessor=None,
                                          max_features=MAX_VOCAB_SIZE)
        train_data_features = self.vectorizer.fit_transform(processed_reviews)
        wordOccurenceMatrix = train_data_features
        if saveAs:
            dill.dump([wordOccurenceMatrix, self.vectorizer], open(saveAs, 'w'))
        return wordOccurenceMatrix
    
    def _initialize_(self, reviews, st, saveAs=None, saveOverride=False, do_preprocess=True):
        self.wordOccurenceMatrix = self.processReviews(reviews, st, saveAs, saveOverride, do_preprocess)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        
        # Pseudocounts
        self.n_dt = np.zeros((numDocs, self.numTopics))
        self.n_dts = np.zeros((numDocs, self.numTopics, self.numSentiments))
        self.n_d = np.zeros((numDocs))
        self.n_vts = np.zeros((vocabSize, self.numTopics, self.numSentiments))
        self.n_ts = np.zeros((self.numTopics, self.numSentiments))
        self.topics = {}
        self.sentiments = {}
        self.priorSentiment = {}
        
        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)
        
        print('--* KSenticNet으로 사전 확률 조작 중... *--')
        # 감정 사전 (KSenticNEt)을 사용하여 사전 확률을 조작 중.
        for i, word in enumerate(self.vectorizer.get_feature_names()):
            w = KSenticNet.keys.get(word)
            if not w: continue
            synsets = KSenticNet.scores[w, :]
            self.priorSentiment[i] = np.random.choice(self.numSentiments, p=synsets)
        
        print('--* initialize 작업 진행 중... *--')
        for d in range(numDocs):
            if d % 5000 == 0: print(' Doc {} of {} Reviews'.format(d, numDocs))
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))
            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
            for i, w in enumerate(word_indices(self.wordOccurenceMatrix[d, :].toarray()[0])):
                t = sampleFromCategorical(topicDistribution)
                s = sampleFromCategorical(sentimentDistribution[t, :])
                
                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self.n_dt[d, t] += 1
                self.n_dts[d, t, s] += 1
                self.n_d[d] += 1
                self.n_vts[w, t, s] += 1
                self.n_ts[t, s] += 1
                
    def conditionalDistribution(self, d, v):
        probabilites_ts = np.ones((self.numTopics, self.numSentiments))
        firstFactor = (self.n_dt[d] + self.alpha) / \
                (self.n_d[d] + self.numTopics * self.alpha)
        secondFactor = (self.n_dts[d, :, :] + self.gamma) / \
                (self.n_dt[d, :] + self.numSentiments * self.gamma)[:, np.newaxis]
        thirdFactor = (self.n_vts[v, :, :] + self.beta) / \
                (self.n_ts + self.n_vts.shape[0] * self.beta)
        probabilites_ts *= firstFactor[:, np.newaxis]
        probabilites_ts *= secondFactor * thirdFactor
        probabilites_ts /= np.sum(probabilites_ts)
        return probabilites_ts
                
    def run(self, reviews, st, maxIters=30, saveAs=None, saveOverride=False, do_preprocess=True):
        self._initialize_(reviews, st, saveAs, saveOverride, do_preprocess)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        for iteration in range(maxIters):
            gc.collect()
            print('Starting iteration {} of {}'.format(iteration + 1, maxIters))
            for d in range(numDocs):
                for i, v in enumerate(word_indices(self.wordOccurenceMatrix[d, :].toarray()[0])):
                    t = self.topics[(d, i)]
                    s = self.sentiments[(d, i)]
                    self.n_dt[d, t] -= 1
                    self.n_d[d] -= 1
                    self.n_dts[d, t, s] -= 1
                    self.n_vts[v, t, s] -= 1
                    self.n_ts[t, s] -= 1
                    
                    probabilites_ts = self.conditionalDistribution(d, v)
                    if v in self.priorSentiment:
                        s = self.priorSentiment[v]
                        t = sampleFromCategorical(probabilites_ts[:, s])
                    else:
                        ind = sampleFromCategorical(probabilites_ts.flatten())
                        t, s = np.unravel_index(ind, probabilites_ts.shape)
                    
                    self.topics[(d, i)] = t
                    self.sentiments[(d, i)] = s
                    self.n_dt[d, t] += 1
                    self.n_d[d] += 1
                    self.n_dts[d, t, s] += 1
                    self.n_vts[v, t, s] += 1
                    self.n_ts[t, s] += 1
        print('Done.')

# 준비 과정

In [4]:
# 데이터 호출 (spacing 전처리 수행 o)
df2 = pd.read_csv('spacing_nsmc_data.csv')

In [5]:
# 감정 분류 호출
with open('1st_jst_result.pkl', 'rb') as f:
    JST = pickle.load(f)

In [6]:
res = defaultdict(list)
for i, j in JST.sentiments.items():
    res[i[0]].append(j)

from collections import Counter
res = {i : Counter(j) for i, j in res.items()}

i2senti = {0 : 'joy', 1 : 'interest', 2 : 'anger', 3 : 'admiration',
           4 : 'sadness', 5 : 'surprise', 6 : 'fear', 7 : 'disgust'}

In [7]:
senti_label_each_review = [[] for _ in range(len(df2))]
for i in range(len(df2)):
    if res.get(i):
        for j in res.get(i).most_common(2):
            senti_label_each_review[i].append(i2senti[j[0]])
    else:
        senti_label_each_review[i].append(['neutral'])

In [8]:
X_train = df2['review'].tolist()
y_train = senti_label_each_review

In [9]:
X_train[:5]

['전체 관람가는 아닌 것 같아요',
 '디렉터스 컷으로 봐서 거의 3시간 짜리인데 참 흡인력 있다',
 '태어나 처음으로 가슴 아리는 영화였다. 20년 이상 지났지만.. 생각하면 또 가슴이 아리는.. 황순원의 소나기에서 또 한번 느꼈던 그 느낌!',
 '어린시절 고딩 때 봤던 때랑 또 결혼하고 나서 봤을 때의 느낌은 확실히 다르네요. 뭔가 알프레도를 더 이해하게 되고.. 토토와 알프레도의 우정이 정말 아름다운 것이었음을, 토토의 첫사랑이 참 풋풋했음을 느낍니다~~ 그리고 언제 들어도 사랑스러운 최고의 영화음악!',
 '토토에게 넓은 세상을 보여주고 픈 알프레도.. 그가 토토를 위해 정을 떼려고 했던 장면에 왠지 씁쓸했고, 친구, 스승을 떠나 아버지 같은 느낌을 받게 되었다.. 평생 못잊을 장면이자 추억이다.']

In [10]:
y_train[:5]

[['fear', 'admiration'],
 ['surprise', 'sadness'],
 ['admiration', 'sadness'],
 ['admiration', 'interest'],
 ['joy', 'interest']]

In [None]:
for i in range(len(X_train)):
    X_train[i] = tokenizer_twitter_morphs(X_train[i])
X_train[:5]

# 아래에 분류 모델 만들어 주세요!

# Word-embedding: TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, auc, f1_score
from sklearn import metrics
import matplotlib.pyplot as plt

In [12]:
import json
import pandas as pd
import numpy as np
from konlpy.tag import Komoran
from konlpy.tag import Okt
from time import time
import pickle
import os
import re

In [27]:
okt = Okt()
def tokenizer_okt_morphs(doc):
    return okt.morphs(doc)

def tokenizer_okt_noun(doc):
    return okt.nouns(doc)

def tokenizer_okt_pos(doc):
    return okt.pos(doc, norm=True, stem=True)

In [14]:
komoran = Komoran()
def tokenizer_noun(doc):
    return komoran.nouns(doc)

def tokenizer_morphs(doc):
    return komoran.morphs(doc)

In [25]:
result = []
for line in X_train:
    malist = okt.pos(line, norm = True, stem = True)
    r = []
    for word in malist:
        if not word[1] in ["Josa","Eomi","Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    result.append(rl)

In [29]:
X_train_t = result

In [26]:
result[:5]

['전체 관람 가다 아니다 것 같다',
 '디렉터 스 컷 보다 거의 3시간 짜다 리 차다 흡인 력 있다',
 '태어나다 처음 가슴 아리다 영화 이다 20년 이상 지나다 생각 하다 또 가슴 아리다 황순원 소나기 또 한번 느끼다 그 느낌',
 '어린시절 고딩 때 보다 때 또 결혼 나서다 보다 때 느낌 확실하다 다르다 뭔가 알 프레 도르다 더 이해 하다 되다 토토 알 프레 도의 우정 정말 아름답다 것 이다 음 토토 첫사랑 차다 풋풋하다 음 느끼다 그리고 언제 들다 사랑스럽다 최고 영화음악',
 '토토 넓다 세상 보여주다 픈 알 프레 그 토토 위해 정 떼다 하다 장면 왠지 씁쓸하다 친구 스승 떠나다 아버지 같다 느낌 받다 되어다 평생 못 잊다 장면 이자 추억']

In [30]:
for i in range(len(X_train_t)):
    X_train_t[i] = tokenizer_okt_morphs(X_train_t[i])
X_train_t[:5]

[['전체', '관람', '가다', '아니다', '것', '같다'],
 ['디렉터', '스', '컷', '보다', '거의', '3시간', '짜다', '리', '차다', '흡인', '력', '있다'],
 ['태어나다',
  '처음',
  '가슴',
  '아리',
  '다',
  '영화',
  '이다',
  '20년',
  '이상',
  '지나다',
  '생각',
  '하다',
  '또',
  '가슴',
  '아리',
  '다',
  '황순원',
  '소나기',
  '또',
  '한번',
  '느끼다',
  '그',
  '느낌'],
 ['어린시절',
  '고딩',
  '때',
  '보다',
  '때',
  '또',
  '결혼',
  '나서다',
  '보다',
  '때',
  '느낌',
  '확실하다',
  '다르다',
  '뭔가',
  '알',
  '프레',
  '도르다',
  '더',
  '이해',
  '하다',
  '되다',
  '토토',
  '알',
  '프레',
  '도의',
  '우정',
  '정말',
  '아름답다',
  '것',
  '이다',
  '음',
  '토토',
  '첫사랑',
  '차다',
  '풋풋하다',
  '음',
  '느끼다',
  '그리고',
  '언제',
  '들다',
  '사랑스럽다',
  '최고',
  '영화음악'],
 ['토토',
  '넓다',
  '세상',
  '보여주다',
  '픈',
  '알',
  '프레',
  '그',
  '토토',
  '위해',
  '정',
  '떼다',
  '하다',
  '장면',
  '왠지',
  '씁쓸하다',
  '친구',
  '스승',
  '떠나다',
  '아버지',
  '같다',
  '느낌',
  '받다',
  '되어다',
  '평생',
  '못',
  '잊다',
  '장면',
  '이자',
  '추억']]

In [34]:
X_train_t_10 = []
y_train_t_10 = []
for i in range(len(X_train_t)):
    if len(X_train_t[i]) >= 10:
        X_train_t_10.append(result[i])
        y_train_t_10.append(y_train[i])
print(len(X_train_t_10),len(y_train_t_10))
print(X_train_t_10[:5])

306743 306743
[['디렉터', '스', '컷', '보다', '거의', '3시간', '짜다', '리', '차다', '흡인', '력', '있다'], ['태어나다', '처음', '가슴', '아리', '다', '영화', '이다', '20년', '이상', '지나다', '생각', '하다', '또', '가슴', '아리', '다', '황순원', '소나기', '또', '한번', '느끼다', '그', '느낌'], ['어린시절', '고딩', '때', '보다', '때', '또', '결혼', '나서다', '보다', '때', '느낌', '확실하다', '다르다', '뭔가', '알', '프레', '도르다', '더', '이해', '하다', '되다', '토토', '알', '프레', '도의', '우정', '정말', '아름답다', '것', '이다', '음', '토토', '첫사랑', '차다', '풋풋하다', '음', '느끼다', '그리고', '언제', '들다', '사랑스럽다', '최고', '영화음악'], ['토토', '넓다', '세상', '보여주다', '픈', '알', '프레', '그', '토토', '위해', '정', '떼다', '하다', '장면', '왠지', '씁쓸하다', '친구', '스승', '떠나다', '아버지', '같다', '느낌', '받다', '되어다', '평생', '못', '잊다', '장면', '이자', '추억'], ['인생', '최고', '영화', '말', '필요', '없다', '감독판', '감동', '좀', '덜하다']]


In [None]:
len(X_train_c[3])

In [28]:
tfidf = TfidfVectorizer(tokenizer=tokenizer_okt_morphs)

### Gaussian Navie Bayes

In [19]:
class DenseTransformer():

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [20]:
Gaussian_nbc = Pipeline([('vect', tfidf), ('nbc', GaussianNB())], ('to_dense', DenseTransformer()))

In [21]:
start = time()
Gaussian_nbc.fit(X_train_t_10, y_train_t_10)
end = time()
print('Time: {:f}s'.format(end-start))

ValueError: 'memory' should be None, a string or have the same interface as joblib.Memory. Got memory='('to_dense', <__main__.DenseTransformer object at 0x0000023BB06FA048>)' instead.

In [None]:
y_pred_nb = Gaussian_nbc.predict(test["document"])

In [33]:
for i in range(len(X_train)):
    X_train[i] = tokenizer_twitter_morphs(X_train[i])
X_train[:5]

TypeError: No matching overloads found for kr.lucypark.okt.OktInterface.tokenize(list,java.lang.Boolean,java.lang.Boolean), options are:
	public java.util.List kr.lucypark.okt.OktInterface.tokenize(java.lang.String,java.lang.Boolean,java.lang.Boolean)

	at JPMethod::findOverload(native\common\jp_method.cpp:242)
	at JPMethod::findOverload(native\common\jp_method.cpp:245)
	at JPMethod::invoke(native\common\jp_method.cpp:253)
	at PyJPMethod::__call__(native\python\pyjp_method.cpp:142)


In [None]:
fpr_nb, tpr_nb, thresholds = metrics.roc_curve(test["label"], y_pred_nb)
print("Test Accuracy  : {:.3f}".format(accuracy_score(test["label"], y_pred_nb)))
print("Test Precision : {:.3f}".format(precision_score(test["label"], y_pred_nb)))
print("Test Recall    : {:.3f}".format(recall_score(test["label"], y_pred_nb)))
print("Test F1_score  : {:.3f}".format(f1_score(test["label"], y_pred_nb)))
print("Test auc       : {:.3f}".format(metrics.auc(fpr_nb, tpr_nb)))

### Random Forest

In [None]:
rndf_clf = Pipeline([('vect', tfidf), ('rndf', RandomForestClassifier())])

In [None]:
start = time()
rndf_clf.fit(X_train, y_train)
end = time()
print('Time: {:f}s'.format(end-start))

In [None]:
y_pred_rndf = rndf_clf.predict(test["document"])

In [None]:
fpr_rndf, tpr_rndf, thresholds = metrics.roc_curve(test["label"], y_pred_rndf)
print("Test Accuracy  : {:.3f}".format(accuracy_score(test["label"], y_pred_rndf)))
print("Test Precision : {:.3f}".format(precision_score(test["label"], y_pred_rndf)))
print("Test Recall    : {:.3f}".format(recall_score(test["label"], y_pred_rndf)))
print("Test F1_score  : {:.3f}".format(f1_score(test["label"], y_pred_rndf)))
print("Test auc       : {:.3f}".format(metrics.auc(fpr_sgd, tpr_rndf)))

## Logistic Regression