In [3]:
import numpy as np
import pandas as pd
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
from collections import Counter

from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# 1. 데이터 로드 및 전처리
df_sp_sample = pd.read_csv("movie_sample.csv", encoding="utf-8")

def clean_text(text):
    from konlpy.tag import Okt
    import re
    okt = Okt()
    text = re.sub(r"[^가-힣\s]", "", str(text))  # 한글과 공백만 남기기
    text = text.strip()
    tokens = okt.morphs(text, stem=True)  # 형태소 분석(어간추출)
    stopwords = set(['은', '는', '이', '가', '을', '를', '의', '에', '과', '와', '도', '한'])
    return " ".join([word for word in tokens if word not in stopwords])

df_sp_sample["document"] = df_sp_sample["document"].fillna("").apply(clean_text)

# 2. 학습 및 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
    df_sp_sample["document"], df_sp_sample["label"], test_size=0.2, random_state=42
)

- keras 
    - pad_sequences()
        - maxlen - 패딩 후 최대 길이
        - dtype - 데이터 타입 지정
        - padding - (pre, post)- 앞과 뒤 중 어디에 0을 추가할지
        - truncating - pre, post 앞과 뒤 중 어디를 길이가 초과할 때 자를지?
        - value - 값 채우기

In [4]:
#토크나이징 패딩 처리
tokenizer=Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index)+1

In [5]:
X_train_seq=tokenizer.texts_to_sequences(X_train)
X_test_seq=tokenizer.texts_to_sequences(X_test)

max_len = max(len(seq) for seq in X_train_seq)

In [6]:
X_train_pad=pad_sequences(X_train_seq, maxlen= max_len, padding='post')
X_test_pad=pad_sequences(X_test_seq, maxlen= max_len, padding='post')

In [7]:
## 레이블인코딩과 원핫 인코딩으로 변환 
encoder =LabelEncoder()
y_train_enc=encoder.fit_transform(y_train)
y_test_enc=encoder.transform(y_test)
y_train_cat=to_categorical(y_train_enc)
y_test_cat=to_categorical(y_test_enc)

In [8]:
## keras 딥러닝 모델을 간단하게 구현해서 만들 예정 
## 입력층
## 특징을 추출하는 layer
## 정규화 기타 일반화 층 작업 Dropout 등등
## 출력층
## base model 만들기
model = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 100, input_length = max_len),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(y_train_cat.shape[1], activation='softmax')
])

## model compile

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## model 학습

model.fit(X_train_pad, y_train_cat, epochs =10, batch_size=32, validation_data=(X_test_pad, y_test_cat))

Epoch 1/10




[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 29ms/step - accuracy: 0.5012 - loss: 0.6940 - val_accuracy: 0.5080 - val_loss: 0.6931
Epoch 2/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 30ms/step - accuracy: 0.4997 - loss: 0.6935 - val_accuracy: 0.4920 - val_loss: 0.6935
Epoch 3/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 30ms/step - accuracy: 0.4944 - loss: 0.6934 - val_accuracy: 0.5080 - val_loss: 0.6931
Epoch 4/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.5024 - loss: 0.6933 - val_accuracy: 0.5080 - val_loss: 0.6931
Epoch 5/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.4976 - loss: 0.6932 - val_accuracy: 0.5080 - val_loss: 0.6933
Epoch 6/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.4967 - loss: 0.6936 - val_accuracy: 0.4920 - val_loss: 0.6937
Epoch 7/10
[1m750/750[0m 

<keras.src.callbacks.history.History at 0x2c145fcb950>

In [9]:
## Dropout의 비율을 조정

model_dev = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 100, input_length = max_len),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(y_train_cat.shape[1], activation='softmax')
])

## model compile

model_dev.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## model 학습

model_dev.fit(X_train_pad, y_train_cat, epochs =10, batch_size=32, validation_data=(X_test_pad, y_test_cat))

Epoch 1/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 31ms/step - accuracy: 0.5030 - loss: 0.6941 - val_accuracy: 0.4920 - val_loss: 0.6932
Epoch 2/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.5030 - loss: 0.6932 - val_accuracy: 0.4920 - val_loss: 0.6933
Epoch 3/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.5019 - loss: 0.6932 - val_accuracy: 0.4920 - val_loss: 0.6932
Epoch 4/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 30ms/step - accuracy: 0.5068 - loss: 0.6931 - val_accuracy: 0.5080 - val_loss: 0.6931
Epoch 5/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.5048 - loss: 0.6932 - val_accuracy: 0.4920 - val_loss: 0.6933
Epoch 6/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 31ms/step - accuracy: 0.5050 - loss: 0.6932 - val_accuracy: 0.4920 - val_loss: 0.6936
Epoch 7/10
[1m7

<keras.src.callbacks.history.History at 0x2c1948b7fe0>

In [10]:
from tensorflow.keras.layers import BatchNormalization

## Dropout 비율 조정  & 배치정규화 추가 

model_dev2 = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 200, input_length = max_len),
    LSTM(64, return_sequences=True),
    BatchNormalization(), #배치 정규화 추가 
    LSTM(32),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(y_train_cat.shape[1], activation='softmax')
])

## model compile

model_dev2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## model 학습

model_dev2.fit(X_train_pad, y_train_cat, epochs =10, batch_size=32, validation_data=(X_test_pad, y_test_cat))

Epoch 1/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 39ms/step - accuracy: 0.4958 - loss: 0.6972 - val_accuracy: 0.4918 - val_loss: 0.6935
Epoch 2/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 38ms/step - accuracy: 0.4958 - loss: 0.6933 - val_accuracy: 0.4920 - val_loss: 0.7371
Epoch 3/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 37ms/step - accuracy: 0.5070 - loss: 0.6930 - val_accuracy: 0.4920 - val_loss: 0.6952
Epoch 4/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 38ms/step - accuracy: 0.4998 - loss: 0.6930 - val_accuracy: 0.5097 - val_loss: 0.6928
Epoch 5/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 37ms/step - accuracy: 0.4945 - loss: 0.6935 - val_accuracy: 0.4920 - val_loss: 0.6929
Epoch 6/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 37ms/step - accuracy: 0.5031 - loss: 0.6930 - val_accuracy: 0.4920 - val_loss: 0.6933
Epoch 7/10
[1m7

<keras.src.callbacks.history.History at 0x2c19912cc80>

# 필수과제
## 성능을 개선하기 위해 한 작업
- **1. model을 통해 성능을 개선해 보자!**
    - model 두 가지 작업을 추가 
        - Dropout 비율 조정  & 배치정규화 추가 
        - 다른 레이어를 추가하거나 model을 더욱 고도화해서 만들면 성능 분명 개선될 것
        - accuracy 0.85 이상 만들기 ( 0.85를 넘지 않으면 당연히 과제를 수행하지 않은 것 입니다. ) 
   

In [11]:
model_dev = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 100, input_length = max_len),
    LSTM(64, return_sequences=True),
    LSTM(32),
    BatchNormalization(),
    Dropout(0.45),
    Dense(16, activation='relu'),
    Dense(y_train_cat.shape[1], activation='softmax')
])

## model compile

model_dev.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## model 학습

model_dev.fit(X_train_pad, y_train_cat, epochs =4, batch_size=16, validation_data=(X_test_pad, y_test_cat))

Epoch 1/4
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 27ms/step - accuracy: 0.5006 - loss: 0.6938 - val_accuracy: 0.5080 - val_loss: 1.1304
Epoch 2/4
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 26ms/step - accuracy: 0.7342 - loss: 0.5135 - val_accuracy: 0.8200 - val_loss: 0.4335
Epoch 3/4
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 26ms/step - accuracy: 0.8799 - loss: 0.2984 - val_accuracy: 0.7678 - val_loss: 0.6096
Epoch 4/4
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 27ms/step - accuracy: 0.9215 - loss: 0.2091 - val_accuracy: 0.8220 - val_loss: 0.4701


<keras.src.callbacks.history.History at 0x2c1a25a4f50>

- **2. embedding**
    - model 최적으로 고정한 상태로 
    - padding(keras 내에 조절 가능), tf_idf, word2vec, 임베딩 최소 3개 이상을 비교해서 어떤 식으로 진행하는 게 가장 좋은 성능 보여주는지?
    - 각 임베딩별로 성능이 어떻게 나오는지? 
        - 최적의 임베딩은 어떤 것인지 정리해 보고 
        - 왜 다른 것들과 비교했을 때 성능이 좋은지?

In [15]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()  
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()  

X_train_tfidf_seq = np.expand_dims(X_train_tfidf, axis=1)  
X_test_tfidf_seq = np.expand_dims(X_test_tfidf, axis=1)

print("TF-IDF 변환 후 차원:", X_train_tfidf.shape)  
print("LSTM 입력을 위한 차원 확장:", X_train_tfidf_seq.shape)  


TF-IDF 변환 후 차원: (24000, 17651)
LSTM 입력을 위한 차원 확장: (24000, 1, 17651)


In [19]:
from sklearn.metrics import accuracy_score

def train_eval_lstm_tfidf(X_train, X_test, y_train, y_test):
    model = Sequential([
        LSTM(64, return_sequences=True, input_shape=(1, X_train.shape[2])),
        LSTM(32),
        BatchNormalization(),
        Dropout(0.45),
        Dense(16, activation='relu'),
        Dense(y_train.shape[1], activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test), verbose=0)

    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)

    return accuracy_score(y_test, y_pred)

In [17]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.train(X_train_tokens, total_examples=len(X_train_tokens), epochs=10)

embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

print("Word2Vec 임베딩 행렬 크기:", embedding_matrix.shape)

def sentence_vector(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

X_train_w2v = np.array([sentence_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v = np.array([sentence_vector(tokens, w2v_model) for tokens in X_test_tokens])

print("Word2Vec 변환 후 데이터 크기:", X_train_w2v.shape) 

Word2Vec 임베딩 행렬 크기: (18690, 100)
Word2Vec 변환 후 데이터 크기: (24000, 100)


In [18]:
def train_eval_lstm(X_train, X_test, y_train, y_test, embedding_matrix=None, trainable=True):
    model = Sequential()

    if embedding_matrix is None: 
        model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len))
    else: 
        model.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=max_len, trainable=trainable))
    
    model.add(LSTM(64, return_sequences=True))
    model.add(LSTM(32))
    model.add(BatchNormalization())
    model.add(Dropout(0.45))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(y_train.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test), verbose=0)
    
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)

    return accuracy_score(y_test, y_pred)

In [20]:
acc_keras = train_eval_lstm(X_train_pad, X_test_pad, y_train_cat, y_test_cat)
print(f"LSTM Accuracy - Keras: {acc_keras:.4f}")



[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step
LSTM Accuracy - Keras: 0.8117


In [21]:
acc_w2v = train_eval_lstm(X_train_w2v, X_test_w2v, y_train_cat, y_test_cat, embedding_matrix=embedding_matrix)
print(f"LSTM Accuracy - Word2Vec: {acc_w2v:.4f}")



[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step
LSTM Accuracy - Word2Vec: 0.5582


In [22]:
acc_tfidf = train_eval_lstm_tfidf(X_train_tfidf_seq, X_test_tfidf_seq, y_train_cat, y_test_cat)
print(f"LSTM Accuracy - TF-IDF: {acc_tfidf:.4f}")

  super().__init__(**kwargs)


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
LSTM Accuracy - TF-IDF: 0.7705


### 결론
- 최적 임베딩 방식 : **Keras 임베딩 (0.8117)**
- 이유 : 데이터셋을 기반으로 학습하며 최적의 임베딩 벡터를 찾아서 
    - Word2Vec : 데이터셋이 작고 도메인이 특화될 경우 성능 떨어짐
    - TF-IDF : 단어·역문서 빈도 가중치로 인해 성능이 좋으나, 단어 간 문맥 고려 제한