## 네이버 영화평 감성분석 - LSTM

In [1]:
!pip install Konlpy > /dev/null

Collecting Konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from Konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, Konlpy
Successfully installed JPype1-1.4.1 Konlpy-0.6.0


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Download the NSMC train/test rating files from GitHub; both are tab-separated text files.
train_df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", sep='\t')
test_df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", sep='\t')

In [4]:
# (rows, columns) of the train and test frames, in that order
tuple(frame.shape for frame in (train_df, test_df))

((150000, 3), (50000, 3))

In [5]:
# Preview the first three training rows
train_df.head(n=3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


#### 1. 데이터 전처리
- train dataset

In [6]:
# Count missing values per column
train_df.isnull().sum()

id          0
document    5
label       0
dtype: int64

In [7]:
# Drop every row that has a missing value in any column (modifies train_df in place),
# then show the remaining shape.
train_df.dropna(how='any', inplace=True)
train_df.shape

(149995, 3)

In [8]:
# Check for duplicates: number of distinct review texts
train_df['document'].nunique()

146182

In [9]:
# Remove rows whose review text duplicates an earlier row (modifies train_df in place),
# then show the remaining shape.
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(146182, 3)

In [10]:
# Class balance of the training labels
train_df['label'].value_counts()

0    73342
1    72840
Name: label, dtype: int64

In [11]:
# Same row-level cleaning for the test set: drop rows with missing values,
# then drop rows whose review text is a duplicate (both in place).
test_df.dropna(how='any', inplace=True)
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(49157, 3)

In [12]:
# Class balance of the test labels
test_df['label'].value_counts()

1    24711
0    24446
Name: label, dtype: int64

#### 2. 텍스트 전처리
- train dataset

In [13]:
# Replace every non-Hangul character with a space, then trim leading/trailing whitespace
hangul_only = train_df['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', regex=True)
train_df['document'] = hangul_only.str.strip()
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [14]:
# Reviews that contained no Hangul are left as '' by the cleaning step.
# Convert those empty strings to NaN so that dropna() can remove them.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the `train_df.document` accessor is a chained in-place operation, which
# warns on pandas 2.2 and leaves train_df unchanged under Copy-on-Write.
train_df['document'] = train_df['document'].replace('', np.nan)
train_df['document'].isna().sum()

789

In [15]:
# Drop the rows whose review became NaN (no Hangul content) and show the remaining shape.
train_df.dropna(how='any', inplace=True)
train_df.shape

(145393, 3)

- test dataset

In [16]:
# Apply the same text cleaning to the test set: keep Hangul only, trim,
# and turn reviews that end up empty into NaN so they can be dropped.
test_df['document'] = (
    test_df['document']
    .str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', regex=True)
    .str.strip()
    # Done in the same column assignment rather than as a chained in-place
    # replace(), which does not propagate to test_df under pandas Copy-on-Write.
    .replace('', np.nan)
)
test_df.dropna(how='any', inplace=True)
test_df.shape

(48852, 3)

#### 3. 한글 형태소 분석

In [17]:
# Create the Okt morphological analyzer from KoNLPy; it is reused for both train and test reviews.
from konlpy.tag import Okt
okt = Okt()

In [18]:
# Colab-only: upload the stop-word file (한글불용어100.txt) into the runtime's working directory.
# NOTE(review): this step is interactive, so the notebook cannot be re-run unattended.
from google.colab import files
up = files.upload()

Saving 한글불용어100.txt to 한글불용어100.txt


In [20]:
# Read the uploaded stop-word file. The encoding is stated explicitly because the
# file contains Hangul and the platform default encoding is not UTF-8 everywhere.
with open('한글불용어100.txt', encoding='utf-8') as st:
    lines = st.readlines()

# Only the first tab-separated field of each line is kept as the stop word.
stop_words = [line.split('\t')[0] for line in lines]
stop_words[:10]

['이', '있', '하', '것', '들', '그', '되', '수', '이', '보']

In [21]:
from tqdm import tqdm

# Build the lookup set once: membership tests are O(1), whereas the stop-word
# list (which contains duplicates) would be scanned linearly for every morpheme.
stop_word_set = set(stop_words)

# For each training review: split into stemmed morphemes with Okt, drop stop
# words, and join the remainder back into one space-separated string.
X_train = []
for review in tqdm(train_df.document):
    morphs = okt.morphs(review, stem=True)
    clean_morph_review = ' '.join(morph for morph in morphs if morph not in stop_word_set)
    X_train.append(clean_morph_review)

100%|██████████| 145393/145393 [03:22<00:00, 716.93it/s]


In [22]:
%%time
X_test = []
for review in test_df.document:
    morphs = okt.morphs(review, stem=True)
    clean_morph_review = ' '.join([morph for morph in morphs if morph not in stop_words])
    X_test.append(clean_morph_review)

CPU times: user 1min, sys: 93.8 ms, total: 1min 1s
Wall time: 1min 1s


#### 4. Keras Tokenizer

In [23]:
# Fix the NumPy and TensorFlow random seeds so that repeated runs are comparable.
import tensorflow as tf
seed = 2023
np.random.seed(seed)
tf.random.set_seed(seed)

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Fit a tokenizer without a vocabulary cap, only to see how many distinct tokens the training corpus has.
t = Tokenizer()
t.fit_on_texts(X_train)
len(t.word_index)

43068

In [26]:
# Encode using only the 10,000 most frequent words.
# `t` is rebound here, replacing the uncapped tokenizer fitted in the previous cell.
num_words = 10000
t = Tokenizer(num_words=num_words)
t.fit_on_texts(X_train)

In [27]:
# Convert the cleaned review strings into lists of integer word indices.
# NOTE: X_train / X_test are overwritten (text -> sequences), so this cell must not
# be run a second time without re-running the morpheme-extraction cells first.
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [28]:
# Longest and mean sequence length (in tokens) of the training data
seq_lengths = [len(seq) for seq in X_train]
max(seq_lengths), sum(seq_lengths) / len(seq_lengths)

(67, 10.90946606782995)

In [29]:
# Maximum length of one review, in tokens. Every sequence is brought to exactly
# this length: longer ones are truncated and shorter ones padded (Keras defaults).
max_len = 20

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [30]:
# Target vectors as NumPy arrays, taken from the cleaned frames
y_train = train_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

#### 5. LSTM 모델 정의/설정/학습

In [31]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [32]:
# Embedding (100-dim) -> LSTM (128 units) -> single sigmoid unit for binary sentiment
model = Sequential()
model.add(Embedding(num_words, 100, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 1117377 (4.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
# Keyword arguments are used on purpose: the positional order of `metrics` and
# `loss_weights` in compile() differs between Keras 2 and Keras 3, so passing
# ['accuracy'] positionally is not portable.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best_naver_movie_lstm.h5'
# Save the model only when validation loss improves, so the file always holds the best epoch.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training once validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=5)

In [34]:
# Train for up to 30 epochs, holding out 20% of the training data for validation;
# the checkpoint and early-stopping callbacks decide what is saved and when to stop.
hist = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=128,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.36663, saving model to best_naver_movie_lstm.h5
Epoch 2/30
  2/909 [..............................] - ETA: 1:01 - loss: 0.2855 - accuracy: 0.8828

  saving_api.save_model(


Epoch 2: val_loss improved from 0.36663 to 0.35050, saving model to best_naver_movie_lstm.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.35050
Epoch 4/30
Epoch 4: val_loss did not improve from 0.35050
Epoch 5/30
Epoch 5: val_loss did not improve from 0.35050
Epoch 6/30
Epoch 6: val_loss did not improve from 0.35050
Epoch 7/30
Epoch 7: val_loss did not improve from 0.35050


In [35]:
# Reload the checkpointed (lowest validation loss) model and report [loss, accuracy] on the test set.
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.35575976967811584, 0.8428314328193665]