# Text to Sentiment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
import re
import gc
import sys
import time
import json
import pickle

from glob import glob
from collections import defaultdict, Counter

from khaiii import KhaiiiApi
from konlpy.tag import Okt, Komoran, Kkma, Mecab
from chatspace import ChatSpace

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def load_nsmc_data(pattern='./nsmc/raw/*.json'):
    """Load the raw NSMC review JSON files into one cleaned DataFrame.

    Every file matching ``pattern`` is expected to hold a JSON list of
    records that include a ``review`` field. Line breaks are stripped from
    each review, and rows whose review is empty afterwards, or is not a
    string at all (null / missing), are dropped.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the JSON files to read. Defaults to the original
        hard-coded location ``./nsmc/raw/*.json``.

    Returns
    -------
    pandas.DataFrame
        One row per remaining review, indexed ``0 .. n-1``.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    # glob returns files in filesystem order; sorting keeps the row order
    # (and therefore the index) reproducible between runs and machines.
    paths = sorted(path.replace('\\', '/') for path in glob(pattern))
    if not paths:
        raise FileNotFoundError('no NSMC json files match {!r}'.format(pattern))

    records = []
    for path in paths:
        with open(path, encoding='utf-8') as data_file:
            records.extend(json.load(data_file))

    df = pd.DataFrame(records)

    # Strip \n and \r. Non-string reviews (None / NaN) would make re.sub raise,
    # so they are mapped to '' and removed by the empty-review filter below.
    df['review'] = df['review'].map(
        lambda x: re.sub('[\n\r]', '', x) if isinstance(x, str) else ''
    )
    df = df[df['review'].map(len) > 0]
    df.index = range(len(df))
    return df

In [3]:
# Read the raw NSMC json files into one DataFrame with a cleaned `review` column.
df = load_nsmc_data()

In [4]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA instead of failing on a hard-coded 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
spacer = ChatSpace(device=device)

In [5]:
# Pass every review text through `spacer.space`; the recorded run below took
# roughly a minute and a half of wall time.
%time spacing_reviews = spacer.space(df.review.values)

CPU times: user 1min 24s, sys: 137 ms, total: 1min 24s
Wall time: 1min 24s


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Khaiii analyzer instance; a `tokenize` method is attached to its class
# at the end of this cell.
tokenizer = KhaiiiApi()

def tokenize(self, text, morphs=False, split=False):
    """Flatten the result of ``self.analyze(text)`` into morpheme tokens.

    Each analyzed word is converted with ``str()``; the part after the tab
    is taken as that word's ``lex/TAG`` items separated by ``' + '``
    (assumed to be the khaiii Python API's string format — confirm against
    the installed version).

    Parameters
    ----------
    text : str
        Text handed to ``self.analyze``.
    morphs : bool, optional
        If True, drop the ``/TAG`` suffix and keep only the lexeme.
    split : bool, optional
        If True, return a list of tokens; otherwise return the tokens joined
        with ``' + '``.

    Returns
    -------
    list of str or str
        The tokens, as a list when ``split`` is True. An empty analysis
        gives ``[]`` (or ``''`` when not split).
    """
    tokens = []
    for word in self.analyze(text):
        # Keep only the analysis half of "<surface>\t<analysis>".
        analysis = str(word).split('\t')[1]
        tokens.extend(analysis.split(' + '))

    if morphs:
        # rsplit on the LAST slash: the lexeme may itself contain '/'
        # (e.g. the symbol '/' renders as '//SP'), which split('/')[0]
        # would truncate to an empty or partial string.
        tokens = [token.rsplit('/', 1)[0] for token in tokens]

    return tokens if split else ' + '.join(tokens)

# Attach `tokenize` to the analyzer's class so instances can call
# `tokenizer.tokenize(text, ...)` as a bound method.
type(tokenizer).tokenize = tokenize

In [15]:
def process_single_review(text, tokenizer, *args, **kwargs):
    """Filter one review's characters, then tokenize what is left.

    Every character other than digits, Hangul jamo/syllables and the
    punctuation ``. , ! ? * ♡`` is replaced by a space, and the result is
    stripped of surrounding whitespace. Extra positional and keyword
    arguments are forwarded unchanged to ``tokenizer.tokenize``.
    """
    disallowed = re.compile('[^0-9ㄱ-ㅎㅏ-ㅣ가-힣.,!?*♡]')
    cleaned = disallowed.sub(' ', text)
    cleaned = cleaned.strip()
    return tokenizer.tokenize(cleaned, *args, **kwargs)

In [16]:
%%time
# Replace every character that is not a digit, Hangul, or one of . , ! ? * ♡
# with a space, for each spaced review.
reviews = [re.sub('[^0-9ㄱ-ㅎㅏ-ㅣ가-힣.,!?*♡]', ' ', s) for s in spacing_reviews]

CPU times: user 2.31 s, sys: 4.01 ms, total: 2.31 s
Wall time: 2.31 s


In [22]:
%%time
# Khaiii pass: one list of `lex/TAG` tokens per review.
# NOTE(review): this reads `spacing_reviews`, not the character-filtered
# `reviews` list built earlier — confirm that is intended.
_reviews = list(map(lambda review: tokenizer.tokenize(review, split=True), spacing_reviews))

CPU times: user 10min 7s, sys: 2.28 s, total: 10min 9s
Wall time: 10min 8s


In [17]:
from konlpy.tag import Okt
# Open Korean Text (Okt) analyzer from KoNLPy; used below via `okt.morphs`.
okt = Okt()

In [27]:
from tqdm.notebook import tqdm

# Okt pass: one list of morphemes per review, with `norm=True` and `stem=True`
# passed to `okt.morphs`. Written as a comprehension so the loop variable does
# not linger in the kernel namespace; tqdm still reports progress.
# NOTE(review): this rebinds `_reviews`, replacing the Khaiii tokens computed
# in an earlier cell.
_reviews = [
    okt.morphs(text, norm=True, stem=True)
    for text in tqdm(spacing_reviews)
]

HBox(children=(FloatProgress(value=0.0, max=712383.0), HTML(value='')))




In [29]:
import pickle

# Khaiii variant (disabled): the same save / load round trip, kept for
# reference, targeting 'tokenized_reviews.pkl'.
# with open('tokenized_reviews.pkl', 'wb') as f:
#     pickle.dump(_reviews, f, protocol=pickle.HIGHEST_PROTOCOL)

# with open('tokenized_reviews.pkl', 'rb') as f:
#     _reviews = pickle.load(f)

# Open Korean Text (Okt): persist the current `_reviews` to disk ...
with open('tokenized_reviews_okt.pkl', 'wb') as f:
    pickle.dump(_reviews, f, protocol=pickle.HIGHEST_PROTOCOL)
    
# ... and read it straight back.
# NOTE(review): on a fresh kernel this cell always overwrites the file before
# loading it, so the pickle never acts as a cache — consider loading only when
# the file exists. Only unpickle files produced by this notebook.
with open('tokenized_reviews_okt.pkl', 'rb') as f:
    _reviews = pickle.load(f)

In [30]:
# Number of tokenized reviews; use the built-in instead of calling the dunder.
len(_reviews)

712383

In [32]:
from collections import Counter

# Corpus-wide token frequencies. A generator feeds Counter directly so the
# full flat list of every token is never materialized in memory.
tokens = Counter(token for review in _reviews for token in review)

In [114]:
# NOTE(review): `SentimentLDAGibbsSampler` is neither defined nor imported in
# the visible part of this notebook — presumably it came from a cell that was
# removed or from another module; confirm and restore it so Run All works.
JST = SentimentLDAGibbsSampler()

In [116]:
# Display the sampler's `config` attribute.
JST.config

{'alpha': 10, 'beta': 0.1, 'gamma': 0.1, 'numTopics': 4, 'numSentiments': 8}