# CNN_aug


## Mount Google Drive

In [None]:
# When running on Google Colab: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

## Install

In [None]:
# !java -version
# %pip install PyKomoran
%pip install konlpy

## Import Library

In [None]:
# data
import pandas as pd
import pickle

# preprocessing
from konlpy.tag import Komoran
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# model
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# visualization
import matplotlib.pyplot as plt
import seaborn as sn

# env
import os
from tqdm import tqdm
# import hangul_font

## GPU Setting

In [None]:
# A falsy (empty) device name is reported as "no GPU".
print('GPU found' if tf.test.gpu_device_name() else "No GPU found")

## Hyper_parameters

In [None]:
# Colab paths (all located under the mounted Google Drive)
DATA_DIR = '/content/drive/MyDrive/miso/sentiment/data/'
SAVE_DIR = '/content/drive/MyDrive/miso/sentiment/model/CNN_aug/'
LOAD_DIR = '/content/drive/MyDrive/miso/sentiment/model/CNN_aug/'

# Model
# MAX_LEN is set later, in the padding step
OUTPUT_DIM = 256 # Embedding output dimension
FILTERS = 128 # Conv1D filters
KERNEL_SIZE = 3 # Conv1D kernel size
UNITS = 128 # hidden Dense units
RATE = 0.3 # Dropout rate
OUTPUT = 6 # units of the final softmax Dense layer (one per emotion label)

# Training settings (passed to model.fit and the callbacks)
EPOCHS = 10
VERBOSE = 1
BATCH_SIZE = 256
PATIENCE = 5
VALIDATION_SPLIT=0.2


## Load Data

In [None]:
df = pd.read_csv(DATA_DIR+ 'final_sentiment.csv')

In [None]:
df[:1]

In [None]:
df.info()

In [None]:
label = df['label']

plt.figure(figsize=(16, 8))
plt.hist(label)
plt.show()

### Train, Test split

In [None]:
# Features are the raw text column; targets are the label column.
X = df.text
y = df.label

# Hold out 20% as the test set. Stratifying on the label keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=34, stratify=y
)

In [None]:
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('\nX_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

## Train Data PreProcessing

### Tokenization

In [None]:
# Komoran morphological analyzer (konlpy); tokenize() calls komoran.pos()
komoran = Komoran()

# POS tags whose morphemes are dropped as stop words during tokenization
stop_pos_tags =  ['IC', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX',
                   'EF', 'ETN', 'ETM', 'XSA', 'SF', 'SP', 'SS', 'SE', 'SO', 'SL', 'SH',
                   'SW', 'NF', 'NV', 'SN', 'NA']

def tokenize(corpus, stop_pos_tags):
    """Split a sentence into morphemes, dropping stop-word POS tags.

    The text is analysed with the module-level ``komoran`` tagger. Morphemes
    whose POS tag is in ``stop_pos_tags`` are skipped. Verb/adjective-type
    stems (VV, VA, VX, VCP, VCN) get '다' appended so they appear in their
    dictionary (base) form.

    Args:
        corpus: the sentence to analyse.
        stop_pos_tags: collection of POS tags to discard.

    Returns:
        list of morpheme strings, in the order the tagger produced them.
    """
    predicate_tags = ('VV', 'VA', 'VX', 'VCP', 'VCN')
    tokens = []
    for morph, pos in komoran.pos(corpus):
        if pos in stop_pos_tags:
            continue
        # Restore the base form of predicates by appending the '다' ending.
        tokens.append(morph + '다' if pos in predicate_tags else morph)
    return tokens

In [None]:
# Tokenize every training sample and collect the token lists
clean_train_text = [tokenize(sample, stop_pos_tags) for sample in tqdm(X_train)]

In [None]:
clean_train_text[:3]

## Test Data PreProcessing

### Tokenization

In [None]:
# Tokenize every test sample and collect the token lists
clean_test_text = [tokenize(sample, stop_pos_tags) for sample in tqdm(X_test)]

### Vectorization

In [None]:
tokenizer_name = 'tokenizer.pickle'
save_path = os.path.join(SAVE_DIR, tokenizer_name)

# The out-of-vocabulary placeholder must be a string: it is stored as a key of
# word_index next to the real tokens.
tokenizer = Tokenizer(oov_token='<OOV>')

# Fit the vocabulary on the training tokens only, then encode both splits.
# Errors are deliberately not swallowed: a failure here must stop the cell,
# otherwise an unfitted tokenizer would be pickled over a previously saved one.
tokenizer.fit_on_texts(clean_train_text)
train_sequences = tokenizer.texts_to_sequences(clean_train_text)
test_sequences = tokenizer.texts_to_sequences(clean_test_text)

# Persist the fitted tokenizer so inference can reuse the same vocabulary.
os.makedirs(SAVE_DIR, exist_ok=True)
with open(save_path, 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is not assigned to any word
print('vocab size:', vocab_size)

train_sequences[:3]

### Set vocab_size

In [None]:
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

### Get vocab items

In [None]:
token_dic = tokenizer.word_counts
d2 = sorted(token_dic.items(), key=lambda x: x[1], reverse=True)
print("sorted")
print(d2)

### Padding

In [None]:
# Token count of every encoded training sample, computed once and reused below
seq_lengths = [len(seq) for seq in train_sequences]

print('텍스트의 최대 길이 :',max(seq_lengths))
print('텍스트의 평균 길이 :',sum(seq_lengths)/len(seq_lengths))

# Distribution of sample lengths
plt.hist(seq_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Pad and truncate at the end of each sequence
trunc_type = 'post'
padding_type = 'post'
# NOTE: this is 95% of the longest training sequence length, not the 95th
# percentile of the length distribution.
MAX_LEN = int(max(len(text) for text in train_sequences) * 95 / 100)

# X_train / X_test are rebound here from raw text to fixed-length integer arrays
X_train = pad_sequences(train_sequences, maxlen=MAX_LEN, padding=padding_type, truncating=trunc_type)
X_test= pad_sequences(test_sequences, maxlen=MAX_LEN, padding=padding_type, truncating=trunc_type)

In [None]:
print(X_train[:3])

### Labeling

In [None]:
y_train.unique()

In [None]:
emo_dic = {
    'angry': 0, 'disgusting': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5
}

In [None]:
y_train = y_train.replace(emo_dic)
y_test = y_test.replace(emo_dic)
y_train.unique()

## Model definition

In [None]:
# 1D-CNN text classifier:
# embedding -> dropout -> conv -> global max pooling -> dense -> dropout -> softmax
# NOTE(review): Conv1D may not consume the mask produced by mask_zero=True —
# confirm whether masking has any effect in this stack.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=OUTPUT_DIM, mask_zero=True),
    Dropout(rate=RATE),
    Conv1D(filters=FILTERS, kernel_size=KERNEL_SIZE, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=UNITS, activation='relu'),
    Dropout(rate=RATE),
    Dense(units=OUTPUT, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping watches validation loss; the checkpoint keeps the epoch with
# the best validation accuracy.
es = EarlyStopping(
    monitor='val_loss', mode='min', verbose=VERBOSE, patience=PATIENCE, restore_best_weights=True
)
mcp = ModelCheckpoint(
    SAVE_DIR + 'best-model.h5', monitor='val_accuracy', mode='max', verbose=VERBOSE, save_best_only=True
)

## Train Model

In [None]:
history = model.fit(
    X_train, y_train, epochs=EPOCHS, callbacks=[es, mcp], batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT
)

## Test Model

In [None]:
loaded_model = load_model(SAVE_DIR + 'best-model.h5')
loss, acc = loaded_model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', acc)

### Visualization

In [None]:
fig, loss_ax = plt.subplots()
acc_ax = loss_ax.twinx()

loss_ax.plot(history.history['loss'], 'y', label='train loss')
loss_ax.plot(history.history['val_loss'], 'r', label='val loss')

acc_ax.plot(history.history['accuracy'], 'b', label='train acc')
acc_ax.plot(history.history['val_accuracy'], 'g', label='val acc')

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuracy')

loss_ax.legend(loc='upper left')
acc_ax.legend(loc='lower left')

plt.show()

## Save Model

In [None]:
model_name = 'trained-model.h5'

# Save model and weights
model_path = os.path.join(SAVE_DIR, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

## Evaluation

### confusion matrix

In [None]:
y_pred = loaded_model.predict(X_test)
y_pred = y_pred.argmax(axis=-1)
print(y_pred)

In [None]:
# Class names, listed in the order of their integer label ids (0..5)
classes = ('angry', 'disgusting', 'fear', 'happy', 'neutral', 'sad')

# Build the confusion matrix. Passing the label ids explicitly keeps the matrix
# len(classes) x len(classes) even when a class never occurs in y_test / y_pred,
# so it always lines up with the class names used for the DataFrame below.
cf_matrix = confusion_matrix(y_test, y_pred, labels=list(range(len(classes))))
print(cf_matrix)

# Row-normalise: each row shows how samples of one true class were predicted.
# keepdims avoids hard-coding the number of classes in a reshape.
row_totals = cf_matrix.sum(axis=1, keepdims=True)
df_cm = pd.DataFrame(cf_matrix / row_totals, index=list(classes), columns=list(classes))

plt.figure(figsize = (12,10))
sn.heatmap(df_cm, annot=True, cmap='Blues')
plt.savefig(SAVE_DIR + 'output.png')

### precision, recall, f1 score

In [None]:
# Macro averaging weights every class equally, regardless of its sample count.
acc_score = accuracy_score(y_test, y_pred)
prec_score = precision_score(y_test, y_pred, average='macro')
re_score = recall_score(y_test, y_pred, average='macro')
# Store the result under its own name: assigning it to `f1_score` would
# overwrite the imported sklearn function and make this cell fail when re-run.
f1_macro = f1_score(y_test, y_pred, average='macro')
print("accuracy_score: ", acc_score)
print("precision_score: ", prec_score)
print("recall_score: ", re_score)
print("f1_score: ", f1_macro)

## Load Model

In [None]:
def load_tokenizer(path):
    """Load and return the pickled object stored at ``path``.

    The file is opened in binary mode and unpickled. Unpickling can execute
    arbitrary code, so only load files you created yourself.

    Args:
        path: location of the pickle file.

    Returns:
        the unpickled object.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

model_name = 'trained-model.h5'
tokenizer_name = 'tokenizer.pickle'
model_path = os.path.join(LOAD_DIR, model_name)
tokenizer_path = os.path.join(LOAD_DIR, tokenizer_name)

model = load_model(model_path)
tokenizer = load_tokenizer(tokenizer_path)


In [None]:
def getResult(predict):
    """Return the index of the largest score in ``predict``.

    Ties resolve to the first occurrence. An empty input yields 0. The running
    maximum starts from the first usable element rather than from 0, so score
    vectors that are entirely zero or negative are handled correctly.

    Args:
        predict: 1-D sequence of scores (e.g. one row of model output).

    Returns:
        int index of the maximum score.
    """
    best_idx = 0
    best = None
    for idx, score in enumerate(predict):
        # NaN compares unequal to itself; skip it so it can never become the maximum.
        if score != score:
            continue
        if best is None or score > best:
            best_idx, best = idx, score
    return best_idx

def predict_sentiment(text, model):
    """Predict the emotion label of a single sentence.

    The sentence is tokenized with ``tokenize``, encoded with the module-level
    ``tokenizer``, padded to ``MAX_LEN`` and passed to ``model``.

    Args:
        text: the sentence to classify.
        model: a trained model exposing ``predict``.

    Returns:
        One of 'angry', 'disgusting', 'fear', 'happy', 'neutral', 'sad'.
        Returns the message "너무 짧아" ("too short") when no tokens remain
        after stop-word filtering, and None if the predicted index has no label.
    """
    labels = {0: 'angry', 1: 'disgusting', 2: 'fear', 3: 'happy', 4: 'neutral', 5: 'sad'}

    tokenized = tokenize(text, stop_pos_tags)
    if not tokenized:
        return "너무 짧아"

    sequences = tokenizer.texts_to_sequences([tokenized])
    # Pad/truncate at the end ('post') so inference sees the same layout as the
    # training data; the pad_sequences default is 'pre' for both.
    x_test = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    predict = model.predict(x_test)
    return labels.get(getResult(predict[0]))

# TEST

angry 라벨에 대한 예측

In [None]:
predict_sentiment("연락이 또 안돼. 짜증난다.", model)

disgusting 라벨에 대한 예측

In [None]:
predict_sentiment("음식물 냄새가 너무 심해.", model)

fear 라벨에 대한 예측

In [None]:
predict_sentiment("갑자기 사라질까봐 무서워.", model)

happy 라벨에 대한 예측

In [None]:
predict_sentiment("부모님께서 건강하다는 사실에 대해 감사하고 있어.", model)

neutral 라벨에 대한 예측

In [None]:
predict_sentiment("지금 밥 먹으러 가는중이야.", model)

sad 라벨에 대한 예측

In [None]:
predict_sentiment("강아지가 세상을 떠났어.", model)

angry 라벨에 대한 예측

In [None]:
predict_sentiment("연락이 또 안돼. 짜증난다.", model)

disgusting 라벨에 대한 예측

In [None]:
predict_sentiment("음식물 냄새가 너무 심해.", model)

fear 라벨에 대한 예측

In [None]:
predict_sentiment("갑자기 사라질까봐 무서워.", model)

happy 라벨에 대한 예측

In [None]:
predict_sentiment("부모님께서 건강하다는 사실에 대해 감사하고 있어.", model)

neutral 라벨에 대한 예측

In [None]:
predict_sentiment("지금 밥 먹으러 가는중이야.", model)

sad 라벨에 대한 예측

In [None]:
predict_sentiment("강아지가 세상을 떠났어.", model)

In [41]:
predict_sentiment("연락이 또 안돼. 짜증난다.", model)

'angry'

disgusting 라벨에 대한 예측

In [42]:
predict_sentiment("음식물 냄새가 너무 심해.", model)

'disgusting'

fear 라벨에 대한 예측

In [43]:
predict_sentiment("갑자기 사라질까봐 무서워.", model)

'fear'

happy 라벨에 대한 예측

In [44]:
predict_sentiment("부모님께서 건강하다는 사실에 대해 감사하고 있어.", model)

'happy'

neutral 라벨에 대한 예측

In [45]:
predict_sentiment("지금 밥 먹으러 가는중이야.", model)

'sad'

sad 라벨에 대한 예측

In [46]:
predict_sentiment("강아지가 세상을 떠났어.", model)

'sad'