<a href="https://colab.research.google.com/github/ksee1230/NL_team12/blob/master/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC_Friends_12%EC%A1%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

모듈 import, 데이터 로드 및 one-hot encoding을 위한 전처리

In [None]:
import pandas as pd
import json
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk
from keras.utils import np_utils
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used further down: the English stop-word list and
# the "punkt" models (presumably what word_tokenize needs — TODO confirm).
nltk.download('stopwords')
nltk.download('punkt')

# Download the training data (JSON) and the test utterances (CSV) from the
# team repository into the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/ksee1230/NL_team12/master/friends_train.json", filename="friends_train.json")
urllib.request.urlretrieve("https://raw.githubusercontent.com/ksee1230/NL_team12/master/en_data.csv", filename="friends_test.csv")

test_data = pd.read_csv('friends_test.csv', encoding='utf-8')

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open('friends_train.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# json_data is a list of lists of utterance records; flatten it into a single
# list. extend() appends in place, avoiding the full copy that
# `train_data = train_data + data` made on every iteration.
train_data = []
for data in json_data:
  train_data.extend(data)

# Integer code assigned to each emotion label found in the training records.
emotion_codes = {
    'non-neutral': 0,
    'neutral': 1,
    'joy': 2,
    'sadness': 3,
    'fear': 4,
    'anger': 5,
    'surprise': 6,
    'disgust': 7,
}

# Attach the numeric code to every record. A record whose label is not in the
# table is left without an 'emotion_num' key.
for record in train_data:
  label = record['emotion']
  if label in emotion_codes:
    record['emotion_num'] = emotion_codes[label]

# Build the training frame, keeping only these four fields of each record.
result = pd.DataFrame(train_data, columns=['annotation', 'emotion', 'emotion_num', 'utterance'])

# Report the number of training rows.
print('훈련용 데이터 개수 :',len(result))

# 데이터 정제


(1) 중복 데이터 제거 및 문장 부호 제거

In [None]:
# Count the distinct utterance strings to see how many rows are duplicates.
result['utterance'].nunique()

In [None]:
# Remove rows whose utterance text repeats another row; `result` is modified
# in place (pandas keeps the first occurrence by default).
result.drop_duplicates(subset=['utterance'], inplace=True)

In [None]:
# Row count after de-duplication.
print('총 샘플의 수 :',len(result))

In [None]:
# Prints True if any cell of the frame holds a missing value.
print(result.isnull().values.any())

In [None]:
# Strip every character that is not an ASCII letter or a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged.
result['utterance'] = result['utterance'].str.replace(r"[^a-zA-Z ]", "", regex=True)
# Show the first five rows to eyeball the cleaned text.
result[:5]

In [None]:
# Drop rows that have a missing value in any column, then print the row count.
# NOTE(review): utterances reduced to an empty string by the character
# cleaning are not NaN, so they survive this filter — confirm that is intended.
result = result.dropna(how = 'any')
print(len(result))

(2) 불용어 제거 및 토큰화

In [None]:
# English stop words held in a set for fast membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

In [None]:
# Tokenize each training utterance and keep only the tokens that are not in
# stop_words; X_train ends up as one token list per utterance.
X_train = [
    [token for token in word_tokenize(utterance) if token not in stop_words]
    for utterance in result['utterance']
]

In [None]:
# Peek at the first three tokenized utterances.
print(X_train[:3])

In [None]:
# Tokenize the test utterances and drop stop words, one token list per row.
# The training utterances have every non-letter character stripped before
# tokenization; apply the identical pattern here so punctuation and
# contractions do not produce tokens that the training text never contains.
X_test = []
for sentence in test_data['utterance']:
    cleaned = re.sub(r"[^a-zA-Z ]", "", sentence)
    X_test.append([word for word in word_tokenize(cleaned) if word not in stop_words])

(3) 정수 인코딩

In [None]:
# Build the word index from the training tokens only (the test tokens are not
# used for fitting).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Number of distinct words the tokenizer indexed.
total_cnt = len(tokenizer.word_index)
# +1 presumably reserves index 0 for the padding value — TODO confirm.
vocab_size = total_cnt + 1
print('단어 집합의 크기 :',vocab_size)

In [None]:
# Replace the tokens of each utterance with their integer indices from the
# fitted tokenizer, for both the training and the test set.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Intentionally left without statements: the fitted tokenizer is kept as-is.
# Re-creating it here with Tokenizer(vocab_size) replaced the fitted instance
# with an unfitted one after the sequences had already been encoded, so any
# later use of `tokenizer` would have had no vocabulary to work with.

In [None]:
# One-hot encode the integer emotion codes; the number of columns of the
# result is used as the number of output classes of the model.
# NOTE(review): keras.utils.np_utils appears to be unavailable in recent Keras
# releases — confirm the runtime version, or switch to
# tensorflow.keras.utils.to_categorical.
y_train = np_utils.to_categorical(result['emotion_num'])
num_classes = y_train.shape[1]

(4) 패딩

In [None]:
# Inspect the distribution of encoded sequence lengths: maximum, mean, and a
# 50-bin histogram.
sample_lengths = [len(seq) for seq in X_train]
print('문장의 최대 길이 :',max(sample_lengths))
print('문장의 평균 길이 :',sum(sample_lengths)/len(sample_lengths))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
  """Print the percentage of sequences whose length is at most ``max_len``.

  Args:
    max_len: Inclusive length threshold.
    nested_list: Collection of sized sequences. It must be non-empty,
      because the ratio is divided by its length.

  Returns:
    None. The percentage is written to standard output.
  """
  within = sum(1 for seq in nested_list if len(seq) <= max_len)
  ratio = (within / len(nested_list))*100
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

# Chosen sequence length; print what share of the training sequences fit in it.
max_len = 24
below_threshold_len(max_len, X_train)

In [None]:
# Bring every encoded sequence to length max_len so both sets have a fixed
# input width for the model.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# 모델 설계 및 학습

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Embedding -> LSTM -> softmax classifier with one output per emotion class.
model = Sequential([
    Embedding(vocab_size, 100),  # 100-dimensional word vectors
    LSTM(128),                   # 128 recurrent units
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Stop training when validation loss has not improved for 4 epochs.
earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# Write best_model.h5 only when validation accuracy reaches a new maximum.
ckpt = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
# The metric is named 'acc' so that its validation counterpart is 'val_acc',
# the name the checkpoint callback monitors.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
# Train for at most 15 epochs in batches of 128, holding out 20% of the
# training data for validation.
history = model.fit(X_train, y_train, epochs=15, callbacks=[earlystop, ckpt], batch_size=128, validation_split=0.2)

모델을 통해 test 데이터에 대한 결과 예측 및 csv 변환

In [None]:
# Reload the checkpointed model and classify the test sequences.
loaded_model = load_model('best_model.h5')
# Sequential.predict_classes is no longer available in current TensorFlow/Keras
# releases; the argmax over the softmax output gives the same class index.
prediction = np.argmax(loaded_model.predict(X_test, verbose=0), axis=-1)

prediction = prediction.flatten().tolist()

# Position in this list is the integer code given to each emotion label when
# the training labels were encoded.
emotion_labels = ['non-neutral', 'neutral', 'joy', 'sadness',
                  'fear', 'anger', 'surprise', 'disgust']

# Direct indexing keeps exactly one label per prediction; an unexpected index
# raises instead of being skipped, which would shift every following Id.
prediction_emotion = [emotion_labels[num] for num in prediction]

# Sequential row ids; named row_ids so the builtin id() is not shadowed.
row_ids = list(range(len(prediction_emotion)))

dic = {'Id': row_ids,
       'Predicted': prediction_emotion}

# Write the submission file with columns Id and Predicted (no index column).
df = pd.DataFrame(dic)
df.to_csv('sample.csv', sep=',', index = False)