In [1]:
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
from gensim.models import Word2Vec
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split

from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, Flatten, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the data: tab-separated review file; rows with missing values and
# duplicated `document` texts are dropped right after loading.
df = pd.read_csv("https://raw.githubusercontent.com/hongsukyi/Lectures/main/data/nsmc.txt", sep="\t")
df = df.dropna().drop_duplicates(['document']).reset_index(drop=True)

# Rank reviews by their whitespace-separated word count and keep the 10,000
# longest ones. NOTE: `word_count` describes the RAW text; it is not updated
# by the cleaning steps below.
df['word_count'] = df['document'].apply(lambda x: len(str(x).split()))
df_sorted = df.sort_values(by='word_count', ascending=False).reset_index(drop=True)

df = df_sorted[:10000].copy()

# Remove every character that is not Hangul (jamo or syllable) or a space.
df['document'] = df['document'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
print('한글 아닌 문자 제거 이후:',len(df))

# Drop whitespace-separated tokens of two characters or fewer. This shortens
# documents; it does not remove rows, so the printed size is unchanged here.
df['document'] = df['document'].apply(lambda x: ' '.join([token for token in x.split() if len(token) > 2]))
print('리뷰 길이가 짧은 것 제거 :',len(df))

# Empty-document and duplicate removal must run AFTER the cleaning above:
# only the cleaning can turn a review into "" or make two reviews identical.
# Running these steps before cleaning (as previously) never removed anything.
df = df[df['document'] != ""].dropna().reset_index(drop=True)
print('결측치 처리 이후:',len(df))
df = df.drop_duplicates(['document']).reset_index(drop=True)
print('중복 제거 이후:',len(df))

df.head()

결측치 처리 이후: 10000
중복 제거 이후: 10000
한글 아닌 문자 제거 이후: 10000
리뷰 길이가 짧은 것 제거 : 10000


Unnamed: 0,id,document,label,word_count
0,6928782,쌕기들이 종교랑 보솤ㅋㅋㅋㅋㅋ 그라제 우덜이 미군이 써보지도 완전히 전쟁도 있었당께...,0,41
1,8330404,정유미 정준영 부부막둥이 윤한부부 나오면 채널로 최초로 부부가 재미없음 막둥이 윤한...,0,40
2,8525988,측정용 영화임 당신은 마음이 남에게 상처를 못하는 당신은 인문학예술은 당신의 분야가...,0,40
3,8153927,황당한 쓰레기같은 스토리 수준을 초월한 칭찬해 차라리 디워는 나와서 신기하기라도 도...,0,40
4,8120918,남자고 아비라면 김윤진 역보다 어미니의 아들입장이라면 저보다 평만보면 공자시구만ㅋㅋ...,1,40


In [None]:
# Remove short sentences: keep only documents with at least 3 morphemes.
okt = Okt()


def token_count(text):
    """Return how many morphemes `okt.morphs(text, stem=True)` yields for `text`."""
    morphemes = okt.morphs(text, stem=True)
    return len(morphemes)


# Boolean mask: True for documents that are long enough to keep.
long_enough = df['document'].apply(lambda doc: token_count(doc) >= 3)
df = df.loc[long_enough].reset_index(drop=True)

print(f"최종 데이터 크기: {len(df)}")

In [None]:
# Morphemes that are filtered out of every tokenised document.
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로',
             '자', '에', '와', '한', '하다', '을', '적', '로', '인', '만',
             '다', '이다', '에서', '되다', '하고', '않다', '못', '고', '안', '것', '나', '그']

# Hash-based copy of `stopwords` so the per-morpheme membership test does not
# scan the whole list. Rebuilt whenever this cell is re-run; later in-place
# edits of `stopwords` are not reflected until then.
_stopword_set = frozenset(stopwords)


def clean_tokenize(text):
    """Split `text` into morphemes and drop the stopwords.

    Args:
        text: Document string passed to `okt.morphs(..., stem=True)`.

    Returns:
        List of morphemes, in their original order, that are not stopwords.
    """
    return [w for w in okt.morphs(text, stem=True) if w not in _stopword_set]


df['tokens'] = df['document'].apply(clean_tokenize)


In [None]:
# Bar chart of the most frequent tokens across all documents.
top_n = 20  # number of tokens to show

df_samples = df['tokens']
all_tokens = [token for tokens in df_samples for token in tokens]

counter = Counter(all_tokens)
common_words = counter.most_common(top_n)
# zip(*[]) cannot be unpacked into two names, so handle an empty corpus explicitly.
words, freqs = zip(*common_words) if common_words else ((), ())

plt.figure(figsize=(12,4))
plt.bar(words, freqs)
# The title reports the number of bars actually drawn (it used to say "Top 7"
# while 20 tokens were plotted).
plt.title(f"Top {len(words)} Most Common Tokens")
plt.xlabel("Tokens")
plt.ylabel("Frequency")
plt.show()

In [None]:
## Show the 95% quantile of the sequence lengths, used to choose the padding length
# Measure the token lists that are later turned into padded sequences.
# `df['word_count']` was computed on the raw text before cleaning and
# morphological analysis, so it does not describe what gets padded.
token_lengths = df['tokens'].apply(len)
q95 = token_lengths.quantile([0.95])

plt.figure(figsize=(7, 3))
plt.hist(token_lengths, bins=50, color='lightblue', edgecolor='black')

# Dashed vertical line at the 95% quantile.
plt.axvline(x=q95.iloc[0], color='gray', linestyle='--', label=f'95%: {int(q95.iloc[0])}')
plt.title('문서별 단어 수 분포 및 분위수 기준선', fontsize=10)
plt.xlabel('단어 수', fontsize=10)
plt.ylabel('문서 수', fontsize=10)
plt.legend(frameon=False,fontsize=10)
plt.grid(False)
plt.show()

In [None]:
# max_len: length every sequence is padded/truncated to.
# embedding_dim: size of each learned token vector.
max_len, embedding_dim = 35, 64

In [None]:
# The Keras tokenizer is fed space-joined token strings, so build them once
# and reuse them for both fitting and encoding.
joined_docs = [' '.join(doc_tokens) for doc_tokens in df['tokens']]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(joined_docs)
sequences = tokenizer.texts_to_sequences(joined_docs)

# Pad at the end of each sequence up to `max_len`.
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = np.array(df['label'])
print(f"X shape: {X.shape}, y shape: {y.shape}")

In [None]:
# One more than the number of known words; presumably the extra slot is for
# index 0 used by padding -- the max index printed below lets you confirm
# that every id in X is smaller than vocab_size.
n_known_words = len(tokenizer.word_index)
vocab_size = n_known_words + 1
print(f"vocab_size: {vocab_size}")
print(f"max index in sequences: {X.max()}")

In [None]:
# Hold out 20% of the samples for the final evaluation (fixed seed).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Widths of the two hidden Dense layers and the shared dropout rate.
d1, d2 = 128, 64
drop_rate = 0.5

In [None]:
# Three variants of the same classifier: embedding -> average pooling ->
# two ReLU Dense layers -> sigmoid output.
#
# The input shape is declared with an explicit Input instead of the Embedding
# `input_length` argument: Keras 3 deprecates that argument and ignores it,
# which would leave the models unbuilt when `model.summary()` is called.

# Variant 1: dropout after each hidden Dense layer.
model_keras_dropout = Sequential([
    Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    GlobalAveragePooling1D(),
    Dense(d1, activation='relu'),
    Dropout(drop_rate),
    Dense(d2, activation='relu'),
    Dropout(drop_rate),
    Dense(1, activation='sigmoid')
])

# Variant 2: baseline without any regularisation.
model_keras = Sequential([
    Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    GlobalAveragePooling1D(),
    Dense(d1, activation='relu'),
    Dense(d2, activation='relu'),
    Dense(1, activation='sigmoid')
])


# Variant 3: dropout on the embeddings and after the first Dense layer, plus
# L2 weight penalties on both hidden Dense layers.
model_keras_opt = Sequential([
    Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Dropout(drop_rate),
    GlobalAveragePooling1D(),
    Dense(d1, activation='relu', kernel_regularizer=regularizers.l2(0.0001)),
    Dropout(drop_rate),
    Dense(d2, activation='relu', kernel_regularizer=regularizers.l2(0.0001)),
    Dense(1, activation='sigmoid')
])

In [None]:
# Registry of the available model variants, keyed by a short name.
model_dict = dict(
    keras_drop=model_keras_dropout,
    keras=model_keras,
    keras_opt=model_keras_opt,
)

# Name of the variant that the following cells train and evaluate.
choice = 'keras_opt'
model = model_dict[choice]
model.summary()

In [None]:
# Adam with an explicit learning rate instead of the 'adam' string default.
optimizer = Adam(learning_rate=0.00005)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop when the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 20% of the training data is set aside for validation during fitting.
fit_options = dict(
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Training vs. validation curves: accuracy first, then loss.
# Each entry: (key in history.history, short legend name, axis/title name).
curve_specs = (
    ('accuracy', 'Acc', 'Accuracy'),
    ('loss', 'Loss', 'Loss'),
)
for metric_key, short_name, axis_name in curve_specs:
    plt.plot(history.history[metric_key], label=f'Train {short_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Val {short_name}')
    plt.title(f"{axis_name} Curve with {choice}")
    plt.xlabel("Epochs")
    plt.ylabel(axis_name)
    plt.legend()
    plt.show()

# Final evaluation on the held-out test split.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}, Loss: {loss:.4f}")
