### 데이터 불러오기

In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [None]:
### 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import urllib.request
import random
import torch
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

### 폰트 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

!apt-get install -qq fonts-nanum
# Register the NanumGothic TTF with matplotlib's font manager, then make it
# the default font family so Korean text in figures can be rendered.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(path=font_path)
plt.rcParams['font.family'] = 'NanumGothic'

In [None]:
### Fix random seeds for reproducibility
SEED = 2024  # single source of truth for every generator seeded below

random.seed(SEED)                 # Python's built-in RNG
np.random.seed(SEED)              # NumPy's global RNG
torch.manual_seed(SEED)           # PyTorch's default generator
torch.cuda.manual_seed(SEED)      # current CUDA device
torch.cuda.manual_seed_all(SEED)  # every CUDA device

# Ask cuDNN for deterministic kernels, and pin the autotuner off explicitly so
# an earlier cell that enabled `benchmark` cannot reintroduce run-to-run
# differences in algorithm selection.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
### 데이터
%cd '/content/drive/MyDrive/[2024-1] EURON 6기/프로젝트'
# Read the train and test review tables (UTF-8 CSV files resolved against the
# current working directory) and report their shapes.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./review_train.csv', './review_test.csv')
)
print(train.shape, test.shape)

/content/drive/MyDrive/[2024-1] EURON 6기/프로젝트
(2026, 2) (400, 2)


In [None]:
### Label encoding
# One encoder is fitted on train and test stacked together so both splits share
# the same class-to-integer mapping; the stacked frame is then cut back into
# its two parts by the original number of training rows.
from sklearn.preprocessing import LabelEncoder

n = len(train)
y = pd.concat([train, test])

le = LabelEncoder()
y['keyword2'] = le.fit_transform(y['keyword2'])
train, test = y[:n], y[n:]

### NLP

In [None]:
### Okt morphological analyzer
from konlpy.tag import Okt
okt = Okt()

### Load the stop-word list (one entry per line)
with open('./new_stopwords.txt', 'r', encoding='utf-8') as file:
    # Skip blank lines: after stripping they would become '' entries in the
    # stop-word list, which can never match a real token.
    stop_words = [word.strip() for word in file if word.strip()]

### TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# okt.morphs replaces the built-in regex tokenizer, so token_pattern is unused;
# passing None states that explicitly instead of relying on a silently ignored
# default.
tfidf_okt = TfidfVectorizer(tokenizer=okt.morphs, token_pattern=None,
                            ngram_range=(1,2), min_df=2,
                            max_df=0.95, stop_words=stop_words)
# The vectorizer is fitted on the training reviews only; the test reviews are
# merely transformed with it.
tfidf_okt_matrix = tfidf_okt.fit_transform(train['reviews'])
tfidf_okt_matrix_test = tfidf_okt.transform(test['reviews'])

In [None]:
### Features / labels for the training and test sets
X, y = tfidf_okt_matrix, train['keyword2']
X_test, y_test = tfidf_okt_matrix_test, test['keyword2']

### Hold out 20% of the training rows as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=2024,
)

# Shapes of the feature matrices, then of the label vectors
print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

(1620, 92075) (406, 92075) (400, 92075)
(1620,) (406,) (400,)


### Baseline

In [None]:
### 라이브러리
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split


### Device selection: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
### 베이스라인 LSTM 모델 구축
class LSTMModel(nn.Module):
  """Single-layer, unidirectional LSTM followed by a linear output layer.

  Args:
    input_dim: size of the feature vector at each time step.
    hidden_dim: size of the LSTM hidden state.
    output_dim: number of logits produced by the final linear layer.
  """

  def __init__(self, input_dim, hidden_dim, output_dim):
    super().__init__()
    # nn.LSTM applies `dropout` only between stacked layers. With the default
    # single layer a non-zero value is a no-op that just raises a UserWarning,
    # so it is not passed here; the computation is unchanged.
    self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True,
                        bidirectional=False)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Map x of shape (batch, seq_len, input_dim) to logits of shape (batch, output_dim)."""
    x, _ = self.lstm(x)
    x = x[:, -1, :]  # keep only the output at the last time step
    x = self.fc(x)
    return x

In [None]:
### 모델 훈련 함수
def train(model, train_loader, criterion, optimizer, device):
  """Run one optimisation pass over every batch in `train_loader`.

  The model is switched to training mode. Each batch gets an extra axis
  inserted at position 1, is moved to `device`, and drives one
  forward / backward / optimizer step.

  Returns:
    A tuple (loss, accuracy): the sum of the per-batch losses divided by the
    number of batches, and the number of correct predictions divided by
    len(train_loader.dataset).
  """
  model.train()
  running_loss, n_correct = 0, 0

  progress = tqdm(train_loader, desc='Training', leave=False)
  for features, labels in progress:
    # Insert a length-1 axis at position 1
    # (assumes batches arrive as (batch_size, input_dim) - TODO confirm).
    features = features.unsqueeze(1).to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logits = model(features)
    batch_loss = criterion(logits, labels)
    batch_loss.backward()
    optimizer.step()

    running_loss += batch_loss.item()
    predicted = logits.max(dim=1).indices
    n_correct += (predicted == labels).sum().item()

  n_batches = len(train_loader)
  n_samples = len(train_loader.dataset)
  return running_loss / n_batches, n_correct / n_samples

### 모델 평가 함수
def evaluate(model, test_loader, criterion, device):
  """Score `model` on every batch in `test_loader` without tracking gradients.

  The model is switched to evaluation mode. Each batch gets an extra axis
  inserted at position 1 and is moved to `device` before the forward pass.

  Returns:
    A tuple (loss, accuracy): the sum of the per-batch losses divided by the
    number of batches, and the number of correct predictions divided by
    len(test_loader.dataset).
  """
  model.eval()
  running_loss, n_correct = 0, 0

  with torch.no_grad():
    for features, labels in test_loader:
      # Insert a length-1 axis at position 1
      # (assumes batches arrive as (batch_size, input_dim) - TODO confirm).
      features = features.unsqueeze(1).to(device)
      labels = labels.to(device)

      logits = model(features)
      running_loss += criterion(logits, labels).item()

      predicted = logits.max(dim=1).indices
      n_correct += (predicted == labels).sum().item()

  n_batches = len(test_loader)
  n_samples = len(test_loader.dataset)
  return running_loss / n_batches, n_correct / n_samples

In [None]:
# Densify the sparse TF-IDF training matrix into a float32 tensor and wrap the
# encoded labels as a long tensor.
# NOTE(review): .toarray() materialises every (row, feature) cell - confirm
# the dense matrix fits in memory.
# NOTE(review): the TF-IDF vectorizer was fitted on all training rows before
# the folds below are split, so each validation fold has already influenced
# the vocabulary and IDF weights.
X_tensor = torch.tensor(tfidf_okt_matrix.toarray(), dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

### Cross-validation
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=2024)
BATCH_SIZE = 2

### Model settings
input_dim = tfidf_okt_matrix.shape[1]
hidden_dim = 64
output_dim = 13  # NOTE(review): presumably the number of keyword2 classes - confirm against le.classes_
lr = 0.001
epochs = 10

# Validation loss / accuracy recorded once per fold
fold_val_loss = []
fold_val_acc = []

for fold, (train_indices, val_indices) in enumerate(kfold.split(X_tensor)):
  print(f'Fold {fold+1}/{kfold.n_splits}')

  # Slice this fold's training and validation rows out of the full tensors
  train_dataset = TensorDataset(X_tensor[train_indices], y_tensor[train_indices])
  valid_dataset = TensorDataset(X_tensor[val_indices], y_tensor[val_indices])

  train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

  # A fresh model is built for every fold, so nothing carries over between folds
  model = LSTMModel(input_dim, hidden_dim, output_dim)
  model.to(device)

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    print(f"[Training] Epoch {epoch+1}: Loss = {train_loss:.4f}, Accuracy = {train_acc:.4f}")

  # Validation runs once per fold, after the final epoch
  valid_loss, valid_acc = evaluate(model, valid_loader, criterion, device)
  print(f"[Validation] Loss = {valid_loss:.4f}, Accuracy = {valid_acc:.4f}")

  fold_val_loss.append(valid_loss)
  fold_val_acc.append(valid_acc)

# NOTE(review): after the loop, `model` and `criterion` are the objects from
# the last fold only; the test-evaluation cell further down reuses them.
print(f'Average Validation Loss: {np.mean(fold_val_loss):.4f}')
print(f'Average Validation Accuracy: {np.mean(fold_val_acc):.4f}')

Fold 1/10




[Training] Epoch 1: Loss = 1.5690, Accuracy = 0.6078




[Training] Epoch 2: Loss = 0.1215, Accuracy = 0.9945




[Training] Epoch 3: Loss = 0.0119, Accuracy = 0.9995




[Training] Epoch 4: Loss = 0.0053, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0027, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0014, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0004, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1101, Accuracy = 0.9704
Fold 2/10




[Training] Epoch 1: Loss = 1.5914, Accuracy = 0.6067




[Training] Epoch 2: Loss = 0.1219, Accuracy = 0.9940




[Training] Epoch 3: Loss = 0.0124, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0055, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0029, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0015, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1736, Accuracy = 0.9360
Fold 3/10




[Training] Epoch 1: Loss = 1.5924, Accuracy = 0.5869




[Training] Epoch 2: Loss = 0.1334, Accuracy = 0.9929




[Training] Epoch 3: Loss = 0.0117, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0052, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0027, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0015, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1180, Accuracy = 0.9606
Fold 4/10




[Training] Epoch 1: Loss = 1.5953, Accuracy = 0.6199




[Training] Epoch 2: Loss = 0.1268, Accuracy = 0.9962




[Training] Epoch 3: Loss = 0.0113, Accuracy = 0.9995




[Training] Epoch 4: Loss = 0.0050, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0026, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0014, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0007, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0004, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1469, Accuracy = 0.9606
Fold 5/10




[Training] Epoch 1: Loss = 1.6211, Accuracy = 0.5924




[Training] Epoch 2: Loss = 0.1242, Accuracy = 0.9956




[Training] Epoch 3: Loss = 0.0111, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0051, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0026, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0014, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0004, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1741, Accuracy = 0.9409
Fold 6/10




[Training] Epoch 1: Loss = 1.6037, Accuracy = 0.5864




[Training] Epoch 2: Loss = 0.1390, Accuracy = 0.9956




[Training] Epoch 3: Loss = 0.0127, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0055, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0028, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0015, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1550, Accuracy = 0.9507
Fold 7/10




[Training] Epoch 1: Loss = 1.5853, Accuracy = 0.6014




[Training] Epoch 2: Loss = 0.1236, Accuracy = 0.9923




[Training] Epoch 3: Loss = 0.0127, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0056, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0028, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0015, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1305, Accuracy = 0.9554
Fold 8/10




[Training] Epoch 1: Loss = 1.5924, Accuracy = 0.5921




[Training] Epoch 2: Loss = 0.1406, Accuracy = 0.9956




[Training] Epoch 3: Loss = 0.0134, Accuracy = 1.0000




[Training] Epoch 4: Loss = 0.0060, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0030, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0016, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0009, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1692, Accuracy = 0.9554
Fold 9/10




[Training] Epoch 1: Loss = 1.5997, Accuracy = 0.5883




[Training] Epoch 2: Loss = 0.1277, Accuracy = 0.9951




[Training] Epoch 3: Loss = 0.0123, Accuracy = 0.9989




[Training] Epoch 4: Loss = 0.0057, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0029, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0016, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0009, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.0682, Accuracy = 0.9851
Fold 10/10




[Training] Epoch 1: Loss = 1.5864, Accuracy = 0.6003




[Training] Epoch 2: Loss = 0.1291, Accuracy = 0.9929




[Training] Epoch 3: Loss = 0.0133, Accuracy = 0.9995




[Training] Epoch 4: Loss = 0.0058, Accuracy = 1.0000




[Training] Epoch 5: Loss = 0.0029, Accuracy = 1.0000




[Training] Epoch 6: Loss = 0.0015, Accuracy = 1.0000




[Training] Epoch 7: Loss = 0.0008, Accuracy = 1.0000




[Training] Epoch 8: Loss = 0.0005, Accuracy = 1.0000




[Training] Epoch 9: Loss = 0.0003, Accuracy = 1.0000




[Training] Epoch 10: Loss = 0.0002, Accuracy = 1.0000
[Validation] Loss = 0.1490, Accuracy = 0.9505
Average Validation Loss: 0.1394
Average Validation Accuracy: 0.9566


In [None]:
### Model evaluation on the test set
# `model` and `criterion` are whatever the preceding cross-validation loop
# left bound, i.e. the objects from its final fold.
y_test = torch.tensor(np.array(y_test), dtype=torch.long)
test_dataset = TensorDataset(
    torch.tensor(X_test.toarray(), dtype=torch.float32),
    y_test,
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"[Test] Loss = {test_loss:.4f}, Accuracy = {test_acc:.4f}")

[Test] Loss = 2.1224, Accuracy = 0.3625
