In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 데이터 불러오기

In [2]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [3]:
### 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import urllib.request
import random
import torch
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

### 폰트 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

!apt-get install -qq fonts-nanum
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(font_path)
plt.rc('font', family='NanumGothic')

Selecting previously unselected package fonts-nanum.
(Reading database ... 123594 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...


In [4]:
### 시드 고정
random.seed(2024)
np.random.seed(2024)
torch.manual_seed(2024)
torch.cuda.manual_seed(2024)
torch.cuda.manual_seed_all(2024)
torch.backends.cudnn.deterministic = True

In [6]:
### 데이터
%cd '/content/drive/MyDrive/[2024-1] EURON 6기/프로젝트'
train = pd.read_csv('./review_train.csv', encoding='utf-8')
test = pd.read_csv('./review_test.csv', encoding='utf-8')
print(train.shape, test.shape)

/content/drive/MyDrive/[2024-1] EURON 6기/프로젝트
(2026, 2) (400, 2)


In [7]:
### label encoding
from sklearn.preprocessing import LabelEncoder
n = train.shape[0]
y = pd.concat([train,test])

le = LabelEncoder()
y['keyword2'] = le.fit_transform(y['keyword2'])
train = y[:n]
test = y[n:]

### NLP

In [8]:
### okt
from konlpy.tag import Okt
okt = Okt()
train['okt'] = train['reviews'].apply(okt.morphs)
test['okt'] = test['reviews'].apply(okt.morphs)

In [9]:
### 불용어 제거
with open('./new_stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = file.readlines()
stop_words = [word.strip() for word in stop_words]


def remove_stopwords(review):
    return [word for word in review if word not in stop_words]


train['okt'] = [remove_stopwords(review) for review in train['okt']]
test['okt'] = [remove_stopwords(review) for review in test['okt']]

In [10]:
### TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_okt = TfidfVectorizer(tokenizer=okt.morphs, ngram_range=(1,2), min_df=2,
                            max_df=0.95, stop_words=stop_words)
tfidf_okt_matrix = tfidf_okt.fit_transform(train['reviews'])
tfidf_okt_matrix_test = tfidf_okt.transform(test['reviews'])

vocab_size = tfidf_okt_matrix.shape[1]

In [11]:
### X/y split
X = tfidf_okt_matrix
y = train['keyword2']
X_test = tfidf_okt_matrix_test
y_test = test['keyword2']

### train/valid split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2024
)

print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

(1620, 92075) (406, 92075) (400, 92075)
(1620,) (406,) (400,)


### 7.1.2

In [12]:
### 라이브러리
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split


### 장치 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
### 베이스라인 LSTM 모델 구축
class LSTMModel(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
    super(LSTMModel, self).__init__()
    self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True,
                        dropout=0.4, bidirectional=False, num_layers=num_layers)
    # self.layer_norm = nn.LayerNorm(hidden_dim)
    self.tanh = nn.Tanh()
    self.dropout = nn.Dropout(0.5)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def _initialize_weights(self):
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'bias' in name:
                nn.init.constant_(param.data, 0)

  def forward(self, x):
    x, _ = self.lstm(x)
    x = x[:, -1, :]
    # x = self.layer_norm(x)
    x = self.tanh(x)
    x = self.dropout(x)
    x = self.fc(x)
    return x

In [14]:
### 모델 훈련 함수
def train(model, train_loader, criterion, optimizer, device):
  model.train()
  total_loss = 0
  correct = 0

  for X, y in tqdm(train_loader, desc='Training', leave=False):
    X = X.unsqueeze(1)  # (batch_size, seq_length, input_dim)
    X, y = X.to(device), y.to(device)
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    _, predicted = torch.max(outputs, 1)
    correct += (predicted == y).sum().item()

  return total_loss / len(train_loader), correct / len(train_loader.dataset)

### 모델 평가 함수
def evaluate(model, test_loader, criterion, device):
  model.eval()
  total_loss = 0
  correct = 0
  with torch.no_grad():
    for X, y in test_loader:
      X = X.unsqueeze(1)  # (batch_size, seq_length, input_dim)
      X, y = X.to(device), y.to(device)
      outputs = model(X)
      loss = criterion(outputs, y)
      total_loss += loss.item()
      _, predicted = torch.max(outputs, 1)
      correct += (predicted == y).sum().item()

  return total_loss / len(test_loader), correct / len(test_loader.dataset)

In [17]:
X_tensor = torch.tensor(tfidf_okt_matrix.toarray(), dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

### cross-validation
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=2024)
BATCH_SIZE = 32

### 모델 설정
input_dim = tfidf_okt_matrix.shape[1]
hidden_dim = 64
output_dim = 13
lr = 0.001
epochs = 10

fold_val_loss = []
fold_val_acc = []

for fold, (train_indices, val_indices) in enumerate(kfold.split(X_tensor)):
  print(f'Fold {fold+1}/{kfold.n_splits}')

  train_dataset = TensorDataset(X_tensor[train_indices], y_tensor[train_indices])
  valid_dataset = TensorDataset(X_tensor[val_indices], y_tensor[val_indices])

  train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

  model = LSTMModel(input_dim, hidden_dim, output_dim)
  model.to(device)

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

  for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    print(f"[Training] Epoch {epoch+1}: Loss = {train_loss:.4f}, Accuracy = {train_acc:.4f}")

  valid_loss, valid_acc = evaluate(model, valid_loader, criterion, device)
  print(f"[Validation] Loss = {valid_loss:.4f}, Accuracy = {valid_acc:.4f}")

  fold_val_loss.append(valid_loss)
  fold_val_acc.append(valid_acc)

print(f'Average Validation Loss: {np.mean(fold_val_loss):.4f}')
print(f'Average Validation Accuracy: {np.mean(fold_val_acc):.4f}')

Fold 1/10




[Training] Epoch 1: Loss = 2.5298, Accuracy = 0.1201




[Training] Epoch 2: Loss = 2.2604, Accuracy = 0.5118




[Training] Epoch 3: Loss = 1.5697, Accuracy = 0.8031




[Training] Epoch 4: Loss = 0.8763, Accuracy = 0.9457




[Training] Epoch 5: Loss = 0.4597, Accuracy = 0.9890




[Training] Epoch 6: Loss = 0.2450, Accuracy = 0.9967




[Training] Epoch 7: Loss = 0.1424, Accuracy = 0.9989




[Training] Epoch 8: Loss = 0.0883, Accuracy = 0.9989




[Training] Epoch 9: Loss = 0.0637, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0460, Accuracy = 1.0000
[Validation] Loss = 0.1842, Accuracy = 0.9704
Fold 2/10




[Training] Epoch 1: Loss = 2.5346, Accuracy = 0.1651




[Training] Epoch 2: Loss = 2.2755, Accuracy = 0.5837




[Training] Epoch 3: Loss = 1.5702, Accuracy = 0.8683




[Training] Epoch 4: Loss = 0.8583, Accuracy = 0.9781




[Training] Epoch 5: Loss = 0.4492, Accuracy = 0.9912




[Training] Epoch 6: Loss = 0.2387, Accuracy = 0.9978




[Training] Epoch 7: Loss = 0.1407, Accuracy = 0.9978




[Training] Epoch 8: Loss = 0.0932, Accuracy = 0.9984




[Training] Epoch 9: Loss = 0.0608, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0477, Accuracy = 1.0000
[Validation] Loss = 0.2482, Accuracy = 0.9360
Fold 3/10




[Training] Epoch 1: Loss = 2.5346, Accuracy = 0.1744




[Training] Epoch 2: Loss = 2.2790, Accuracy = 0.6018




[Training] Epoch 3: Loss = 1.5973, Accuracy = 0.8459




[Training] Epoch 4: Loss = 0.8923, Accuracy = 0.9654




[Training] Epoch 5: Loss = 0.4785, Accuracy = 0.9901




[Training] Epoch 6: Loss = 0.2505, Accuracy = 0.9978




[Training] Epoch 7: Loss = 0.1434, Accuracy = 0.9995




[Training] Epoch 8: Loss = 0.0894, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0633, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0492, Accuracy = 1.0000
[Validation] Loss = 0.2488, Accuracy = 0.9507
Fold 4/10




[Training] Epoch 1: Loss = 2.5305, Accuracy = 0.1739




[Training] Epoch 2: Loss = 2.2763, Accuracy = 0.5963




[Training] Epoch 3: Loss = 1.6146, Accuracy = 0.7855




[Training] Epoch 4: Loss = 0.9187, Accuracy = 0.9748




[Training] Epoch 5: Loss = 0.4781, Accuracy = 0.9962




[Training] Epoch 6: Loss = 0.2430, Accuracy = 0.9989




[Training] Epoch 7: Loss = 0.1364, Accuracy = 0.9989




[Training] Epoch 8: Loss = 0.0878, Accuracy = 0.9995




[Training] Epoch 9: Loss = 0.0585, Accuracy = 0.9989




[Training] Epoch 10: Loss = 0.0450, Accuracy = 0.9995
[Validation] Loss = 0.2318, Accuracy = 0.9655
Fold 5/10




[Training] Epoch 1: Loss = 2.5301, Accuracy = 0.1876




[Training] Epoch 2: Loss = 2.2719, Accuracy = 0.5507




[Training] Epoch 3: Loss = 1.5841, Accuracy = 0.8256




[Training] Epoch 4: Loss = 0.8855, Accuracy = 0.9556




[Training] Epoch 5: Loss = 0.4609, Accuracy = 0.9918




[Training] Epoch 6: Loss = 0.2426, Accuracy = 0.9995




[Training] Epoch 7: Loss = 0.1397, Accuracy = 0.9989




[Training] Epoch 8: Loss = 0.0877, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0634, Accuracy = 0.9995




[Training] Epoch 10: Loss = 0.0451, Accuracy = 0.9995
[Validation] Loss = 0.2598, Accuracy = 0.9310
Fold 6/10




[Training] Epoch 1: Loss = 2.5304, Accuracy = 0.2457




[Training] Epoch 2: Loss = 2.2606, Accuracy = 0.6506




[Training] Epoch 3: Loss = 1.5765, Accuracy = 0.8415




[Training] Epoch 4: Loss = 0.8798, Accuracy = 0.9753




[Training] Epoch 5: Loss = 0.4505, Accuracy = 0.9967




[Training] Epoch 6: Loss = 0.2366, Accuracy = 0.9967




[Training] Epoch 7: Loss = 0.1344, Accuracy = 0.9984




[Training] Epoch 8: Loss = 0.0860, Accuracy = 0.9995




[Training] Epoch 9: Loss = 0.0607, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0444, Accuracy = 1.0000
[Validation] Loss = 0.2746, Accuracy = 0.9458
Fold 7/10




[Training] Epoch 1: Loss = 2.5330, Accuracy = 0.1678




[Training] Epoch 2: Loss = 2.2734, Accuracy = 0.5554




[Training] Epoch 3: Loss = 1.5811, Accuracy = 0.8311




[Training] Epoch 4: Loss = 0.8758, Accuracy = 0.9698




[Training] Epoch 5: Loss = 0.4686, Accuracy = 0.9940




[Training] Epoch 6: Loss = 0.2460, Accuracy = 0.9973




[Training] Epoch 7: Loss = 0.1405, Accuracy = 0.9995




[Training] Epoch 8: Loss = 0.0930, Accuracy = 0.9989




[Training] Epoch 9: Loss = 0.0646, Accuracy = 0.9995




[Training] Epoch 10: Loss = 0.0479, Accuracy = 1.0000
[Validation] Loss = 0.2691, Accuracy = 0.9455
Fold 8/10




[Training] Epoch 1: Loss = 2.5342, Accuracy = 0.1656




[Training] Epoch 2: Loss = 2.2753, Accuracy = 0.5970




[Training] Epoch 3: Loss = 1.5840, Accuracy = 0.8525




[Training] Epoch 4: Loss = 0.8899, Accuracy = 0.9808




[Training] Epoch 5: Loss = 0.4511, Accuracy = 0.9967




[Training] Epoch 6: Loss = 0.2404, Accuracy = 0.9989




[Training] Epoch 7: Loss = 0.1323, Accuracy = 0.9989




[Training] Epoch 8: Loss = 0.0881, Accuracy = 0.9995




[Training] Epoch 9: Loss = 0.0623, Accuracy = 0.9995




[Training] Epoch 10: Loss = 0.0441, Accuracy = 0.9995
[Validation] Loss = 0.2047, Accuracy = 0.9505
Fold 9/10




[Training] Epoch 1: Loss = 2.5283, Accuracy = 0.2029




[Training] Epoch 2: Loss = 2.2597, Accuracy = 0.5312




[Training] Epoch 3: Loss = 1.5640, Accuracy = 0.8333




[Training] Epoch 4: Loss = 0.8576, Accuracy = 0.9764




[Training] Epoch 5: Loss = 0.4471, Accuracy = 0.9956




[Training] Epoch 6: Loss = 0.2354, Accuracy = 0.9978




[Training] Epoch 7: Loss = 0.1383, Accuracy = 0.9973




[Training] Epoch 8: Loss = 0.0882, Accuracy = 0.9984




[Training] Epoch 9: Loss = 0.0625, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0443, Accuracy = 1.0000
[Validation] Loss = 0.1951, Accuracy = 0.9901
Fold 10/10




[Training] Epoch 1: Loss = 2.5286, Accuracy = 0.2368




[Training] Epoch 2: Loss = 2.2572, Accuracy = 0.6124




[Training] Epoch 3: Loss = 1.5621, Accuracy = 0.8580




[Training] Epoch 4: Loss = 0.8726, Accuracy = 0.9775




[Training] Epoch 5: Loss = 0.4592, Accuracy = 0.9929




[Training] Epoch 6: Loss = 0.2449, Accuracy = 0.9973




[Training] Epoch 7: Loss = 0.1352, Accuracy = 0.9995




[Training] Epoch 8: Loss = 0.0926, Accuracy = 0.9989




[Training] Epoch 9: Loss = 0.0648, Accuracy = 0.9995




[Training] Epoch 10: Loss = 0.0490, Accuracy = 1.0000
[Validation] Loss = 0.2221, Accuracy = 0.9604
Average Validation Loss: 0.2338
Average Validation Accuracy: 0.9546


In [18]:
### 모델 평가
y_test = torch.tensor(np.array(y_test), dtype=torch.long)
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test.toarray(), dtype=torch.float32), y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"[Test] Loss = {test_loss:.4f}, Accuracy = {test_acc:.4f}")

[Test] Loss = 1.7499, Accuracy = 0.3775
