In [None]:
import keras, tensorflow
# Only the last bare expression of a cell is displayed, so print both versions explicitly.
print('keras', keras.__version__)
print('tensorflow', tensorflow.__version__)

## Load the NSMC (Naver Sentiment Movie Corpus) data

In [None]:
def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file and return its rows without the header.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so the result
            does not depend on the platform's default encoding.

    Returns:
        A list with one entry per line after the first; each entry is the
        list of tab-separated fields of that line (all strings).
    """
    with open(filename, 'r', encoding=encoding) as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    # The first line of the file is the column header, not data.
    return rows[1:]

# Load both NSMC splits (training first, then test) as lists of row fields.
train_data, test_data = (
    read_data('data/nsmcData/ratings_train.txt'),
    read_data('data/nsmcData/ratings_test.txt'),
)

In [None]:
# Check the loaded data: for each split, print the number of rows
# followed by the first row.
print(len(train_data))
print(train_data[0])
print(len(test_data))
print(test_data[0])

## Preprocessing of NSMC data

#### Using Okt(Open Korean Text) Class provided by KoNLPy

In [None]:
from konlpy.tag import Okt

# Create the Okt tagger once; tokenize() later reuses this same instance.
okt = Okt()
# Sanity check: print the tagger's output for a sample Korean sentence.
print(okt.pos(u'나는 저 하늘을 높이 날고 있어 그때 니가 내게 줬던 두날개로'))

In [None]:
import os, json
from pprint import pprint

def tokenize(doc):
    """Turn a document into a list of 'token/tag' strings via the shared Okt tagger.

    norm=True requests normalization of the text and stem=True requests that
    words be reported in their stem form.
    """
    tagged_pairs = okt.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged_pairs))

# Tokenizing every review is slow, so the tokenized documents are cached as JSON.
# Reuse the cache only when BOTH files exist; otherwise rebuild and rewrite both.
if (os.path.isfile('data/nsmcData/train_docs.json')
        and os.path.isfile('data/nsmcData/test_docs.json')):
    # Read with the same encoding the cache is written with below; the files
    # contain non-ASCII text (ensure_ascii=False), so the platform default
    # encoding is not reliable.
    with open('data/nsmcData/train_docs.json', encoding='utf-8') as f:
        train_docs = json.load(f)
    with open('data/nsmcData/test_docs.json', encoding='utf-8') as f:
        test_docs = json.load(f)
else:
    # Each entry is (tokens, label): row[1] is tokenized, row[2] is kept as the label.
    train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
    test_docs = [(tokenize(row[1]), row[2]) for row in test_data]

    # Save as JSON files so later runs can skip tokenization.
    with open('data/nsmcData/train_docs.json', 'w', encoding='utf-8') as make_file:
        json.dump(train_docs, make_file, ensure_ascii=False, indent="\t")
    with open('data/nsmcData/test_docs.json', 'w', encoding='utf-8') as make_file:
        json.dump(test_docs, make_file, ensure_ascii=False, indent="\t")

# Pretty-print the first tokenized training document as a sanity check.
pprint(train_docs[0])

In [None]:
# Flatten the per-document token lists into one corpus-wide list and show the first ten.
tokens = [tok for doc_tokens, _label in train_docs for tok in doc_tokens]
print(tokens[:10])

#### Preprocessing via NLTK

In [None]:
import nltk

# Wrap the flat token list in an nltk.Text object named 'NSMC'.
text = nltk.Text(tokens, name='NSMC')

print(len(text.tokens))        # Total number of tokens
print(len(set(text.tokens)))   # Number of distinct tokens

# Show the 10 most common entries of the vocabulary.
pprint(text.vocab().most_common(10))

#### Graph of 50 most common words using matplotlib

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
%matplotlib inline

# Register a Korean-capable font so the Hangul tokens on the axis render correctly.
# The path below is the macOS location of AppleGothic; on a machine without that
# file, loading it would raise and abort the cell, so fall back to the default font.
# (`os` is imported in the tokenization cell earlier in the notebook.)
font_fname = '/Library/Fonts/AppleGothic.ttf'
if os.path.isfile(font_fname):
    font_name = font_manager.FontProperties(fname=font_fname).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    print('Font file not found: {} - using the default matplotlib font'.format(font_fname))

# Plot the 50 most common tokens.
plt.figure(figsize=(20, 10))
text.plot(50)

#### Vectorize using the 10,000 most common tokens (count vectorization)

In [None]:
# Vocabulary used for vectorization: the (up to) 10,000 most common tokens, most frequent first.
selected_words = [word for word, _count in text.vocab().most_common(10000)]

def term_frequency(doc, vocabulary=None):
    """Return the count of each vocabulary word in a tokenized document.

    Args:
        doc: A sequence of (hashable) tokens, e.g. the output of tokenize().
        vocabulary: Ordered sequence of words to count. Defaults to the
            notebook-level `selected_words` list.

    Returns:
        A list of ints, one per vocabulary word and in vocabulary order,
        giving how many times that word occurs in `doc`.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if vocabulary is None:
        vocabulary = selected_words
    # Count the document once instead of rescanning it for every vocabulary word.
    counts = Counter(doc)
    return [counts[word] for word in vocabulary]

# Build one count vector per document (x) and gather the matching labels (y).
train_x = [term_frequency(doc_tokens) for doc_tokens, _label in train_docs]
train_y = [label for _doc_tokens, label in train_docs]
test_x = [term_frequency(doc_tokens) for doc_tokens, _label in test_docs]
test_y = [label for _doc_tokens, label in test_docs]

# Convert features and labels to float32 NumPy arrays
import numpy as np

x_train = np.asarray(train_x).astype(np.float32)
y_train = np.asarray(train_y).astype(np.float32)
x_test = np.asarray(test_x).astype(np.float32)
y_test = np.asarray(test_y).astype(np.float32)

## Model Definition and Learning

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

In [None]:
# Fully connected binary classifier over the count vectors:
# two 64-unit ReLU layers followed by a single sigmoid output.
model = models.Sequential()
# Take the input width from the data so the model still matches when the
# vocabulary holds fewer than 10,000 words.
model.add(layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# The learning rate is passed positionally: its keyword was `lr` in older Keras
# releases and is `learning_rate` in newer ones, while it is the first positional
# parameter of RMSprop in both.
model.compile(optimizer=optimizers.RMSprop(0.001), loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])
model.fit(x_train, y_train, epochs=10, batch_size=512)
# Evaluate on the held-out test split.
results = model.evaluate(x_test, y_test)

In [None]:
# Display the value returned by model.evaluate() on the test set.
results

## Predict results with new data

In [None]:
import pandas as pd

# Pick ONE comment dataset; the commented-out lines are the alternative inputs.
# df = pd.read_csv("data/commentsData/comments_Entertainment.csv", sep=",")
# df = pd.read_csv("data/commentsData/comments_Politics.csv", sep=",")
df = pd.read_csv("data/commentsData/comments_Social.csv", sep=",")

# Display the (rows, columns) shape of the loaded frame.
df.shape

In [None]:
# Column holding the raw comment texts to classify.
comments = df['comments']

print(comments[:5])

#### Predict whether comment is pos(1) or neg(0)

In [None]:
def predict_pos_neg(review):
    """Classify a review as positive or negative and print the verdict.

    The review is tokenized, turned into a count vector and fed to the
    trained model as a batch of one.

    Args:
        review: The review text.

    Returns:
        1 if the model's score is greater than 0.5 (positive), otherwise 0.
    """
    tokens = tokenize(review)
    counts = term_frequency(tokens)
    data = np.expand_dims(np.asarray(counts).astype('float32'), axis=0)
    # The prediction for a batch of one is an array, not a scalar; pick its
    # single element explicitly instead of calling float() on the whole array,
    # which NumPy deprecates for arrays with ndim > 0.
    score = float(np.ravel(model.predict(data))[0])

    if score > 0.5:
        print("[{}] is positive\n".format(review))
        return 1
    print("[{}] is negative\n".format(review))
    return 0

#### Calculate the probability of pos or neg

In [None]:
def percentage_pos_neg(review):
    """Print and return the model's confidence (in percent) for a review's predicted class.

    Args:
        review: The review text.

    Returns:
        The confidence as a percentage rounded to two decimals: score*100 when
        the score is greater than 0.5 (positive), otherwise (1-score)*100.
    """
    tokens = tokenize(review)
    counts = term_frequency(tokens)
    data = np.expand_dims(np.asarray(counts).astype('float32'), axis=0)
    # The prediction for a batch of one is an array, not a scalar; pick its
    # single element explicitly instead of calling float() on the whole array,
    # which NumPy deprecates for arrays with ndim > 0.
    score = float(np.ravel(model.predict(data))[0])

    if score > 0.5:
        print("[{}]는 {:.2f}% chance to be positive\n".format(review, score*100))
        return round(score*100, 2)
    print("[{}]는 {:.2f}% chance to be negative\n".format(review, (1-score)*100))
    return round((1-score)*100, 2)

In [None]:
# Predicted label (1 = positive, 0 = negative) for every comment, in order.
# predict_pos_neg also prints its verdict for each comment.
predict = [predict_pos_neg(comment) for comment in comments]

In [None]:
# Confidence (in percent) of the predicted class for every comment, in order.
# percentage_pos_neg also prints its result for each comment.
percentage = [percentage_pos_neg(comment) for comment in comments]

#### Add column to csv file (predict / percent / truelike / wholelike)

In [None]:
# Attach the model outputs to the comment table.
df['predict'] = predict
df['percent'] = percentage

# Net reactions (like - dislike) and total reactions (like + dislike) per comment.
truelike = df['like'] - df['dislike']
wholelike = df['like'] + df['dislike']

df['truelike'] = truelike
df['wholelike'] = wholelike

# Overwrite rather than append: with mode='a' every re-run of this cell added a
# second header line and a duplicate copy of the rows to the same file, which
# breaks the read_csv of this file in the sorting step.
df.to_csv('data/commentsData/comments_Social_.csv', mode='w', encoding='utf-8', index=False)

#### Sorting comment data

In [None]:
# Pick ONE annotated dataset; the commented-out lines are the alternative inputs.
# df = pd.read_csv("data/commentsData/comments_Entertainment_.csv", sep=",")
# df = pd.read_csv("data/commentsData/comments_Politics_.csv", sep=",")
df = pd.read_csv("data/commentsData/comments_Social_.csv", sep=",")

# Positive comments first, highest confidence first ...
descendData = df.loc[df['predict'] == 1].sort_values(by='percent', ascending=False)
# ... then negative comments, lowest confidence first.
ascendData = df.loc[df['predict'] == 0].sort_values(by='percent', ascending=True)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same order and keeps their original index labels.
total = pd.concat([descendData, ascendData])

# Overwrite rather than append so re-running the cell does not duplicate rows
# (and header lines) in the output file.
# total.to_csv('data/commentSorting/comments_Entertainment(sort).csv', mode='w', encoding='utf-8', index=False)
# total.to_csv('data/commentSorting/comments_Politics(sort).csv', mode='w', encoding='utf-8', index=False)
total.to_csv('data/commentSorting/comments_Social(sort).csv', mode='w', encoding='utf-8', index=False)

#### Save the model as a file

In [None]:
from keras.models import load_model
from keras.models import model_from_json

# Persist the trained model to 'data/modelData/model.h5'.
model.save('data/modelData/model.h5')

# Alternative (disabled): write the output of model.to_json() to a JSON file.
# model_json = model.to_json()
# with open('data/model.json', 'w') as json_file:
#     json_file.write(model_json)

In [None]:
# Export the trained Keras model with the tensorflowjs converter.
# Library versions noted by the author for this step:
# keras version: 2.2.2
# tensorflow version: 1.11.0
import tensorflowjs as tfjs
# Write the converted model under the 'data/' directory.
tfjs.converters.save_keras_model(model, 'data/')