In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import json
import os
import pickle
import sys

from konlpy.tag import Kkma, Okt
import gc

import re
from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict

In [2]:
from ksenticnet_kaist import *

ksenticnet = get_ksenticnet()

keys = list(ksenticnet.keys())
senticvals = [[float(i) for i in val[:4]] for val in  ksenticnet.values()]
sentiments = []
polarity = []
semantics = []
for key, val in ksenticnet.items():
    for i in val[4:]:
        if i in ['positive', 'negative']:
            polar_ind = val.index(i)
            sentiments.append(val[4 : polar_ind])
            polarity.append(val[polar_ind : polar_ind+2])
            semantics.append(val[polar_ind+2 :])
            break

ksenticnets = defaultdict(dict)
for key, val, senti, p, seman in zip(keys, 
                                     senticvals, 
                                     sentiments, 
                                     polarity, 
                                     semantics):
    ksenticnets[key]['sentic_value'] = val
    ksenticnets[key]['sentiment'] = senti
    ksenticnets[key]['polarity'] = p
    ksenticnets[key]['semantic'] = seman

f = lambda x : [i if i > 0 else 0 for i in x]
g = lambda x : [abs(i) if i < 0 else 0 for i in x]
scores = np.array(list(map(lambda x : f(x) + g(x), senticvals)))
scores /= scores.sum(axis=1).reshape(-1, 1)

class KSenticNet():
    keys = {j : i for i, j in  enumerate(keys)}
    scores = scores

In [3]:
MAX_VOCAB_SIZE = 50000

def sampleFromDirichlet(alpha):
    return np.random.dirichlet(alpha)

def sampleFromCategorical(theta):
    theta = theta / np.sum(theta)
    return np.random.multinomial(1, theta).argmax()

def word_indices(wordOccurenceVec):
    for idx in wordOccurenceVec.nonzero()[0]:
        for i in range(int(wordOccurenceVec[idx])):
            yield idx
            
class KSenticNet():
    keys = {j : i for i, j in  enumerate(keys)}
    scores = scores
    
class SentimentLDAGibbsSampler:
    
    def __init__(self, numTopics, alpha, beta, gamma, numSentiments=2):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numTopics = numTopics
        self.numSentiments = numSentiments
        
    def processSingleReview(self, review, st, d=None, stopwords=None):
        letters_only = re.sub('[^ㄱ-하-ㅣ가-힣]', ' ', review).strip()
        if not stopwords:
            stops = list('의가이은을로들는좀잘걍과도를자에와한것') + ['으로', '하다']
        else:
            stops = stopwords
        words = st.morphs(letters_only, stem=True, norm=True)
        meaningful_words = [w for w in words if w not in stops]
        return ' '.join(meaningful_words)
    
    def processReviews(self, reviews, st, saveAs=None, saveOverride=False, 
                       do_preprocess=True, return_processed_review=False):
        import os
        import dill
        if not saveOverride and saveAs and os.path.isfile(saveAs):
            [wordOccurenceMatrix, self.vectorizer] = dill.load(open(saveAs, 'r'))
            return wordOccurenceMatrix
        if do_preprocess:
            processed_reviews = []
            for i, review in enumerate(reviews):
                if (i + 1) % 10000 == 0:
                    print(' Review {} of {}'.format(i + 1, len(reviews)))
                processed_reviews.append(self.processSingleReview(review, st, i))
        else:
            processed_reviews = reviews
        if return_processed_review:
            return processed_reviews
        self.vectorizer = CountVectorizer(analyzer='word',
                                          tokenizer=None,
                                          preprocessor=None,
                                          max_features=MAX_VOCAB_SIZE)
        train_data_features = self.vectorizer.fit_transform(processed_reviews)
        wordOccurenceMatrix = train_data_features
        if saveAs:
            dill.dump([wordOccurenceMatrix, self.vectorizer], open(saveAs, 'w'))
        return wordOccurenceMatrix
    
    def _initialize_(self, reviews, st, saveAs=None, saveOverride=False, do_preprocess=True):
        self.wordOccurenceMatrix = self.processReviews(reviews, st, saveAs, saveOverride, do_preprocess)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        
        # Pseudocounts
        self.n_dt = np.zeros((numDocs, self.numTopics))
        self.n_dts = np.zeros((numDocs, self.numTopics, self.numSentiments))
        self.n_d = np.zeros((numDocs))
        self.n_vts = np.zeros((vocabSize, self.numTopics, self.numSentiments))
        self.n_ts = np.zeros((self.numTopics, self.numSentiments))
        self.topics = {}
        self.sentiments = {}
        self.priorSentiment = {}
        
        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)
        
        print('--* KSenticNet으로 사전 확률 조작 중... *--')
        # 감정 사전 (KSenticNEt)을 사용하여 사전 확률을 조작 중.
        for i, word in enumerate(self.vectorizer.get_feature_names()):
            w = KSenticNet.keys.get(word)
            if not w: continue
            synsets = KSenticNet.scores[w, :]
            self.priorSentiment[i] = np.random.choice(self.numSentiments, p=synsets)
        
        print('--* initialize 작업 진행 중... *--')
        for d in range(numDocs):
            if d % 5000 == 0: print(' Doc {} of {} Reviews'.format(d, numDocs))
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))
            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
            for i, w in enumerate(word_indices(self.wordOccurenceMatrix[d, :].toarray()[0])):
                t = sampleFromCategorical(topicDistribution)
                s = sampleFromCategorical(sentimentDistribution[t, :])
                
                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self.n_dt[d, t] += 1
                self.n_dts[d, t, s] += 1
                self.n_d[d] += 1
                self.n_vts[w, t, s] += 1
                self.n_ts[t, s] += 1
                
    def conditionalDistribution(self, d, v):
        probabilites_ts = np.ones((self.numTopics, self.numSentiments))
        firstFactor = (self.n_dt[d] + self.alpha) / \
                (self.n_d[d] + self.numTopics * self.alpha)
        secondFactor = (self.n_dts[d, :, :] + self.gamma) / \
                (self.n_dt[d, :] + self.numSentiments * self.gamma)[:, np.newaxis]
        thirdFactor = (self.n_vts[v, :, :] + self.beta) / \
                (self.n_ts + self.n_vts.shape[0] * self.beta)
        probabilites_ts *= firstFactor[:, np.newaxis]
        probabilites_ts *= secondFactor * thirdFactor
        probabilites_ts /= np.sum(probabilites_ts)
        return probabilites_ts
                
    def run(self, reviews, st, maxIters=30, saveAs=None, saveOverride=False, do_preprocess=True):
        self._initialize_(reviews, st, saveAs, saveOverride, do_preprocess)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        for iteration in range(maxIters):
            gc.collect()
            print('Starting iteration {} of {}'.format(iteration + 1, maxIters))
            for d in range(numDocs):
                for i, v in enumerate(word_indices(self.wordOccurenceMatrix[d, :].toarray()[0])):
                    t = self.topics[(d, i)]
                    s = self.sentiments[(d, i)]
                    self.n_dt[d, t] -= 1
                    self.n_d[d] -= 1
                    self.n_dts[d, t, s] -= 1
                    self.n_vts[v, t, s] -= 1
                    self.n_ts[t, s] -= 1
                    
                    probabilites_ts = self.conditionalDistribution(d, v)
                    if v in self.priorSentiment:
                        s = self.priorSentiment[v]
                        t = sampleFromCategorical(probabilites_ts[:, s])
                    else:
                        ind = sampleFromCategorical(probabilites_ts.flatten())
                        t, s = np.unravel_index(ind, probabilites_ts.shape)
                    
                    self.topics[(d, i)] = t
                    self.sentiments[(d, i)] = s
                    self.n_dt[d, t] += 1
                    self.n_d[d] += 1
                    self.n_dts[d, t, s] += 1
                    self.n_vts[v, t, s] += 1
                    self.n_ts[t, s] += 1
        print('Done.')

# 준비

In [4]:
# 데이터 호출 (spacing 전처리 수행 o)
df2 = pd.read_csv('spacing_nsmc_data.csv')

In [5]:
# 감정 분류 호출
with open('1st_jst_result.pkl', 'rb') as f:
    JST = pickle.load(f)

In [6]:
res = defaultdict(list)
for i, j in JST.sentiments.items():
    res[i[0]].append(j)

from collections import Counter
res = {i : Counter(j) for i, j in res.items()}

i2senti = {0 : 'joy', 1 : 'interest', 2 : 'anger', 3 : 'admiration',
           4 : 'sadness', 5 : 'surprise', 6 : 'fear', 7 : 'disgust'}

In [7]:
senti_label_each_review = [[] for _ in range(len(df2))]
for i in range(len(df2)):
    if res.get(i):
        for j in res.get(i).most_common(2):
            senti_label_each_review[i].append(i2senti[j[0]])
    else:
        senti_label_each_review[i].append(['neutral'])

In [8]:
X_train = df2['review'].copy()
y_train = senti_label_each_review

In [9]:
X_train[:7]

0                                     전체 관람가는 아닌 것 같아요
1                     디렉터스 컷으로 봐서 거의 3시간 짜리인데 참 흡인력 있다
2    태어나 처음으로 가슴 아리는 영화였다. 20년 이상 지났지만.. 생각하면 또 가슴이...
3    어린시절 고딩 때 봤던 때랑 또 결혼하고 나서 봤을 때의 느낌은 확실히 다르네요. ...
4    토토에게 넓은 세상을 보여주고 픈 알프레도.. 그가 토토를 위해 정을 떼려고 했던 ...
5                  인생 최고의 영화. 말이 필요 없음. 감독판은 감동이 좀 덜함.
6                   아름다운 영화 지금까지 봤던 영화 중 끝까지 감동적이었던 영화
Name: review, dtype: object

In [10]:
y_train[:7]

[['fear', 'admiration'],
 ['surprise', 'sadness'],
 ['admiration', 'sadness'],
 ['admiration', 'interest'],
 ['joy', 'interest'],
 ['interest', 'admiration'],
 ['sadness', 'surprise']]

In [40]:
import pickle
with open('191110_1353_processed_review.pkl', 'rb') as f:
    processed_reviews = pickle.load(f)

In [42]:
len(processed_reviews)

712383

In [46]:
df = pd.read_csv('../sentiment_analysis/movie_review_sentiment/raw_data_nsmc.csv')

In [47]:
df.head()

Unnamed: 0,author,date,movie_id,rating,review,review_id,year,class
0,dhrl****,15.08.25,10001,10,전체관람가는 아닌것 같아요,10275182,15,POS
1,yuns****,15.08.25,10001,10,디렉터스컷으로봐서 거의 3시간짜리인데 참 흡인력있다,10272934,15,POS
2,supe****,15.08.23,10001,10,태어나 처음으로 가슴아리는 영화였다. 20년이상 지났지만.. 생각하면 또 가슴이...,10265507,15,POS
3,clai****,15.08.14,10001,10,어린시절 고딩때 봤던 때랑 또 결혼하고 나서 봤을때의 느낌은 확실히 다르네요. 뭔가...,10228406,15,POS
4,dlag****,15.08.11,10001,10,토토에게 넓은 세상을 보여주고픈 알프레도.. 그가 토토를 위해 정을 떼려고 했던 장...,10216349,15,POS


In [48]:
# 전처리 안된 놈 비교를 위해 tokenizing

In [50]:
from konlpy.tag import Okt
okt = Okt()
%time raw_reviews = df['review'].map(lambda x : okt.morphs(x, stem=True, norm=True))

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


Wall time: 34min 59s


In [51]:
with open('191110_1353_raw_review.pkl', 'wb') as f:
    pickle.dump(raw_reviews, f, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:
len(raw_reviews)

712383

# Word Embedding

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [113]:
raw_X_train = raw_reviews
y_train = senti_label_each_review

In [114]:
y_train2 = np.array([i[0] for i in y_train])

y_label = np.zeros((len(y_train), len(i2senti.values())))
senti2i = {j : i for i, j in i2senti.items()}
for ix, contents in enumerate(y_train):
    for j in contents:
        if j == ['neutral']:
            continue
        if y_label[ix, senti2i[j]] == 0:
            y_label[ix, senti2i[j]] += 1

In [115]:
ind = np.where(y_label.sum(axis=1) != 0)

In [116]:
y_label = y_label[ind]
y = y_train2[ind]

In [70]:
raw_reviews2 = pd.Series(raw_reviews).map(lambda x : ' '.join(x))
processed_reviews2 = pd.Series(processed_reviews).map(lambda x : ' '.join(x))

In [80]:
raw_X_train = raw_reviews2.values[ind]
processed_X_train = processed_reviews2.values[ind]

In [98]:
raw_X_test = raw_X_train[500000:]
raw_X_train = raw_X_train[:500000]
processed_X_test = processed_X_train[500000:]
processed_X_train = processed_X_train[:500000]

In [100]:
MAX_VOCAB_SIZE = 50000

# CountVectorizer
countvect = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            max_features=MAX_VOCAB_SIZE)
%time raw_cvect_X_train = countvect.fit_transform(raw_X_train)
raw_cvect_X_test = countvect.transform(raw_X_test)

countvect = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            max_features=MAX_VOCAB_SIZE)
%time processed_cvect_X_train = countvect.fit_transform(processed_X_train)
processed_cvect_X_test = countvect.transform(processed_X_test)

# TfidfVectorizer
tfidfvect = TfidfVectorizer(tokenizer=None, max_features=50000)
%time raw_tfidf_X_train = tfidfvect.fit_transform(raw_X_train)
raw_tfidf_X_test = tfidfvect.transform(raw_X_test)

tfidfvect = TfidfVectorizer(tokenizer=None, max_features=50000)
%time processed_tfidf_X_train = tfidfvect.fit_transform(processed_X_train)
processed_tfidf_X_test = tfidfvect.transform(processed_X_test)

Wall time: 6.15 s
Wall time: 5.35 s
Wall time: 6.93 s
Wall time: 5.32 s


In [180]:
r = tfidfvect.transform(['중국집에 짜장면먹으러 왔다'])

In [185]:
okt.morphs('중국집에 짜장면먹으러 왔다.')

['중국집', '에', '짜장면', '먹으러', '왔다', '.']

In [120]:
y_train = np.array(list(map(lambda x : senti2i[x], y[:500000])))
y_test = np.array(list(map(lambda x : senti2i[x], y[500000:])))

In [94]:
multi_nbc = MultinomialNB()

In [101]:
%%time
multi_nbc.fit(raw_cvect_X_train, y_train)
raw_cvect_y_pred = multi_nbc.predict(raw_cvect_X_test)

multi_nbc.fit(raw_tfidf_X_train, y_train)
raw_tfidf_y_pred = multi_nbc.predict(raw_tfidf_X_test)

multi_nbc.fit(processed_cvect_X_train, y_train)
processed_cvect_y_pred = multi_nbc.predict(processed_cvect_X_test)

multi_nbc.fit(processed_tfidf_X_train, y_train)
processed_tfidf_y_pred = multi_nbc.predict(processed_tfidf_X_test)

Wall time: 1.43 s


In [136]:
from sklearn.metrics import confusion_matrix

In [138]:
confu_mat1 = confusion_matrix(y_test, raw_cvect_y_pred)
confu_mat2 = confusion_matrix(y_test, raw_tfidf_y_pred)
confu_mat3 = confusion_matrix(y_test, processed_cvect_y_pred)
confu_mat4 = confusion_matrix(y_test, processed_tfidf_y_pred)

In [140]:
print('<Gaussian Naive Bayes>')
print('      Raw / CountVectorizer acc :', accuracy_score(y_test, raw_cvect_y_pred))
print('\t\t\t recall :', np.mean(np.diag(confu_mat1) / confu_mat1.sum(axis=1)))
print('      Raw / TfidfVectorizer acc :', accuracy_score(y_test, raw_tfidf_y_pred))
print('\t\t\t recall :', np.mean(np.diag(confu_mat2) / confu_mat2.sum(axis=1)))
print('Processed / CountVectorizer acc :', accuracy_score(y_test, processed_cvect_y_pred))
print('\t\t\t recall :', np.mean(np.diag(confu_mat3) / confu_mat3.sum(axis=1)))
print('Processed / TfidfVectorizer acc :', accuracy_score(y_test, processed_tfidf_y_pred))
print('\t\t\t recall :', np.mean(np.diag(confu_mat4) / confu_mat4.sum(axis=1)))

<Gaussian Naive Bayes>
      Raw / CountVectorizer acc : 0.5161420079555443
			 recall : 0.5135930879160164
      Raw / TfidfVectorizer acc : 0.5104509631571891
			 recall : 0.5067899330597001
Processed / CountVectorizer acc : 0.5279164121587732
			 recall : 0.5255563870936407
Processed / TfidfVectorizer acc : 0.5214903982241557
			 recall : 0.5178706321685447


In [165]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

knn_clf = KNeighborsClassifier(n_neighbors=7)
svm_clf = SVC(gamma='scale')
log_clf = LogisticRegression(multi_class='auto', solver='lbfgs')
tree_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier(n_estimators=200)
xgb_clf = XGBClassifier(n_estimators=200)
lgbm_clf = LGBMClassifier(n_estimators=200)

In [171]:
def print_model_result(model, train, test,
                       acc_text='', recall_text=''):
    model.fit(train, y_train)
    y_pred = model.predict(test)
    confu_mat = confusion_matrix(y_test, y_pred)
    print(acc_text + '   acc : {:.2%}'.format(
        accuracy_score(y_test, y_pred)))
    print(recall_text + 'recall : {:.2%}'.format(
        np.mean(np.diag(confu_mat) / confu_mat.sum(axis=1))))

In [172]:
def print_res(model, text):
    print(text)
    print_model_result(model, raw_cvect_X_train, raw_cvect_X_test,
                       acc_text='      Raw / CountVectorizer ',
                       recall_text='\t\t\t    ')
    print_model_result(model, processed_cvect_X_train, processed_cvect_X_test, 
                       acc_text='Processed / CountVectorizer ',
                       recall_text='\t\t\t    ')
    print_model_result(model, raw_tfidf_X_train, raw_tfidf_X_test, 
                       acc_text='Processed / CountVectorizer ',
                       recall_text='\t\t\t    ')
    print_model_result(model, processed_tfidf_X_train, processed_tfidf_X_test, 
                       acc_text='Processed / CountVectorizer ',
                       recall_text='\t\t\t    ')

In [168]:
print_res(multi_nbc, '<Gaussian Naive Bayes>')

<Gaussian Naive Bayes>
      Raw / CountVectorizer    acc : 51.61%
			    recall : 51.36%
Processed / CountVectorizer    acc : 52.79%
			    recall : 52.56%
Processed / CountVectorizer    acc : 51.05%
			    recall : 50.68%
Processed / CountVectorizer    acc : 52.15%
			    recall : 51.79%


In [None]:
print_res(knn_clf, '<K-Nearest Neighborhoods>')

In [None]:
print_res(log_clf, '<Logistic Regression>')

In [None]:
print_res(tree_clf, '<Decision Tree>')

In [169]:
print_res(rf_clf, '<Random Forest>')

<Random Forest>


KeyboardInterrupt: 

In [None]:
print_res(xgb_clf, '<XGBoost Classifier>')

In [None]:
print_res(lgbm_clf, '<LightGBM Classifier>')

In [174]:
rf_clf = RandomForestClassifier(n_estimators=200, verbose=1, n_jobs=6)

In [175]:
rf_clf.fit(raw_cvect_X_train, y_train)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 40.2min


KeyboardInterrupt: 

# Ensemble

In [177]:
JST.topics

{(0, 0): 1,
 (0, 1): 1,
 (0, 2): 1,
 (0, 3): 1,
 (0, 4): 3,
 (1, 0): 3,
 (1, 1): 0,
 (1, 2): 0,
 (1, 3): 2,
 (1, 4): 1,
 (1, 5): 1,
 (1, 6): 2,
 (1, 7): 3,
 (1, 8): 3,
 (2, 0): 1,
 (2, 1): 1,
 (2, 2): 1,
 (2, 3): 3,
 (2, 4): 1,
 (2, 5): 3,
 (2, 6): 1,
 (2, 7): 1,
 (2, 8): 3,
 (2, 9): 2,
 (2, 10): 2,
 (2, 11): 2,
 (2, 12): 2,
 (2, 13): 2,
 (2, 14): 1,
 (2, 15): 1,
 (2, 16): 0,
 (3, 0): 0,
 (3, 1): 0,
 (3, 2): 2,
 (3, 3): 1,
 (3, 4): 0,
 (3, 5): 3,
 (3, 6): 3,
 (3, 7): 2,
 (3, 8): 3,
 (3, 9): 3,
 (3, 10): 3,
 (3, 11): 3,
 (3, 12): 1,
 (3, 13): 1,
 (3, 14): 2,
 (3, 15): 2,
 (3, 16): 2,
 (3, 17): 1,
 (3, 18): 2,
 (3, 19): 2,
 (3, 20): 2,
 (3, 21): 3,
 (3, 22): 1,
 (3, 23): 3,
 (3, 24): 0,
 (3, 25): 2,
 (3, 26): 2,
 (3, 27): 2,
 (3, 28): 2,
 (3, 29): 2,
 (3, 30): 2,
 (3, 31): 2,
 (3, 32): 3,
 (4, 0): 2,
 (4, 1): 2,
 (4, 2): 0,
 (4, 3): 0,
 (4, 4): 3,
 (4, 5): 2,
 (4, 6): 0,
 (4, 7): 3,
 (4, 8): 2,
 (4, 9): 3,
 (4, 10): 0,
 (4, 11): 0,
 (4, 12): 3,
 (4, 13): 0,
 (4, 14): 3,
 (4, 15): 3,
 (4,