# 텍스트 임베딩과 텍스트 분류

## 개요

- 사용 데이터: News Group 20

- 설계
| 태스크 | 기술 |
|----------------|-----------------------------|
| 데이터 전처리 | 데이터 로드, 전처리 |
| 토큰화 | nltk, BPE |
| 임베딩 | World2Vec, FastText, Glove  |
| 시퀀스 처리 | LSTM, GRU  |
| 출력 | 소프트맥스 분류  |

# 1. 데이터 전처리 및 토큰화

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# 파일 경로 및 파일 읽기 라이브러리
from pathlib import Path
from dataclasses import dataclass
import zipfile
import urllib.request
import tarfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from collections import Counter

# 모델 다운로드
from gensim.models import Word2Vec, FastText

# 토큰 관련 라이브러리
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase, Sequence

# 사이킷런 관련 라이브러리
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 파이토치 관련 라이브러리
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

# 훈련 시 시각화
from tqdm import tqdm

In [None]:
"""
Config 클래스 정의

- 상수 변수 정의
- @dataclass 데코레이터 추가
"""
@dataclass
class Config:
    # 데이터 폴더 생성
    root = Path(".")
    raw_dir = root / "data"
    data_dir = raw_dir / "20news-bydate"
    train_dir = data_dir / "20news-bydate-train"
    test_dir = data_dir / "20news-bydate-test"
    model_dir = root / "models"

    # 데이터로더용 변수 선언
    batch_size = 16
    num_workers = 2
    max_len = 512

    # 데이터 분리용 변수 선언
    seed = 42

    # 학습용 변수 선언: 학습률, 에포크, 인내심, 최소 개선 폭
    lr = 1e-3
    epochs = 11
    patience = 3
    min_delta = 0.01

    # 디바이스 설정
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# 변수 생성 및 폴더 생성
cfg = Config()

# 모델, 데이터 저장 폴더
cfg.model_dir.mkdir(parents=True, exist_ok=True)
cfg.raw_dir.mkdir(parents=True, exist_ok=True)

## 1.1. 데이터 다운로드 및 데이터셋 생성

In [None]:
"""
데이터 다운로드

- 다운로드 이슈로 직접 다운로드
- https://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz
"""
with tarfile.open(f"{str(cfg.raw_dir)}/20news-bydate.tar.gz", "r:gz") as t:
    t.extractall(path=f"{str(cfg.raw_dir)}/20news-bydate")

  t.extractall(path=f"{str(cfg.raw_dir)}/20news-bydate")


In [None]:
# 데이터셋 클래스 정의
class DocData(Dataset):
    def __init__(self, data_dir: Path):
        """
        데이터셋 클래스 초기화

        Args:
            data_dir (Path): 데이터 디렉토리 경로
        """
        self.data_dir = data_dir

        # 데이터셋 내 모든 파일 경로 수집
        self.file_paths = [p for p in self.data_dir.rglob("*") if p.is_file()]

        # 클래스 정수 라벨 매핑
        self.classes = sorted({p.parent.name for p in self.file_paths})
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}

    def __len__(self) -> int:
        """
        데이터셋 크기 반환

        Returns:
            int: 데이터셋 크기
        """
        return len(self.file_paths)

    def __getitem__(self, idx: int) ->  tuple[str, str]:
        """
        인덱스에 해당하는 문서 반환

        Args:
            idx (int): 문서 인덱스

        Returns:
            str: 문서 내용
        """
        file_path = self.file_paths[idx]
        label = self.class_to_idx[file_path.parent.name]
        with open(file_path, "r", encoding="latin-1") as f:
            content = f.read()
        return content, label

In [None]:
train_data = DocData(cfg.train_dir)
texts = train_data[0]
labels = train_data.classes


In [None]:
print(f"문서 길이: {len(texts)}")
print(f"문서 샘플: \n{texts}")
print(f"라벨: {len(labels)}")

문서 길이: 2
문서 샘플: 
('From: mtt@kepler.unh.edu (Matthew T Thompson)\nSubject: music censorship survey - please fill out\nOrganization: University of New Hampshire  -  Durham, NH\nLines: 68\nNNTP-Posting-Host: kepler.unh.edu\n\nHello, I\'m doing a paper on censorship in music and I would appreciate it if you took the time to participate in this survey.  Please answer as each question asks (\'why?\' simply means that you have room to explain your answer, if you chose.).  The last question is for any comments, questions, or suggestions.  Thank you in advance, please E-mail to the address at the end.\n\nI)  are you [male/female]\nII) what is your age? \nIII)what is your major/occupation?\nIV) what type of music do you listen to (check all that apply)?\n      a.  hard rock   b.  metal   c.  alternative   d.  blues    e.  rap\n      f.  jazz    g.  soft rock   h.  easy listening   i.  country   \n      j.  classical   k.  hard core   l.  dance   m.  new age\n      n.  others (did I miss any?)__

In [None]:
test_data = DocData(cfg.test_dir)
print(f"테스트 데이터 길이: {len(test_data)}")

테스트 데이터 길이: 7532


## 1.2. NLTK 기반 단어 토큰화

In [None]:
# 필요한 NLTK 리소스 다운로드
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# 불용어 집합 (한 번만 생성)
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/ahnhs2k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ahnhs2k/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ahnhs2k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 텍스트 전처리 함수 정의
def clean_text(text: str) -> str:
    """
    텍스트 전처리 함수
    - 소문자 변환
    - 불필요한 공백 정리
    - 과도한 특수문자 제거는 하지 않음
    """
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# 토큰화 함수
def tokenize(text: str) -> list[str]:
    """
    토큰화 함수
    - NLTK word_tokenize 사용
    - 불용어 제거
    """
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

## 1.3. BPE 기반 토큰화

In [None]:
# BPE 토크나이저 학습 함수
def train_bpe_tokenizer(
    texts: list[str],
    vocab_size: int = 20000,
    min_freq: int = 2
) -> Tokenizer:
    """
    BPE 토크나이저 학습
    - vocab_size: 서브워드 vocab 최대 크기
    - min_freq: 최소 등장 빈도
    """
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # 정규화: 소문자화
    tokenizer.normalizer = Sequence([Lowercase()])

    # 1차 분리: 공백 기준
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=["[PAD]", "[UNK]"]
    )

    tokenizer.train_from_iterator(texts, trainer)
    return tokenizer

In [None]:
# BPE 토크나이저를 위한 텍스트 리스트 생성
texts_for_bpe = [text for text, _ in train_data]

In [None]:
# BPE 토크나이저 생성
bpe_tokenizer = train_bpe_tokenizer(texts_for_bpe)
bpe_tokenizer_path = cfg.model_dir / "bpe_tokenizer.json"
bpe_tokenizer.save(str(bpe_tokenizer_path))
print(f"BPE 토크나이저 vocab size: {bpe_tokenizer.get_vocab_size()}")




BPE 토크나이저 vocab size: 20000


In [None]:
# BPE 토큰화 함수
def bpe_tokenize(text: str) -> list[int]:
    """
    BPE 토큰화 함수
    - 텍스트를 BPE 토크나이저로 토큰화
    """
    encoding = bpe_tokenizer.encode(text)
    return encoding.ids


In [None]:
# 인코딩 테스트
test_t, test_l = train_data[0]
test_ids = bpe_tokenize(test_t)

print(f"원본 텍스트: {test_t[:30]}")
print(f"토큰 아이디: {test_ids[:30]}")

원본 텍스트: From: mtt@kepler.unh.edu (Matt
토큰 아이디: [193, 34, 59, 1023, 40, 17681, 22, 14622, 22, 196, 16, 3278, 66, 9417, 17, 259, 34, 4312, 15380, 6357, 21, 755, 2949, 211, 271, 34, 410, 130, 438, 14834]


# 2. 임베딩

## 2.1. Word2Vec

In [None]:
# 학습용 코퍼스 준비
train_sentences = [tokenize(text) for text, _ in train_data]

In [None]:
"""
Word2Vec 모델 생성 및 학습
- sg=1 : Skip-gram 방식
- vector_size : 임베딩 차원
- window : 주변 단어 탐색 범위
- min_count : 최소 등장 빈도 (희귀 단어 제거)
- workers : 멀티프로세싱 스레드 수
"""

w2v_model = Word2Vec(
    sentences=train_sentences,  # 토큰 시퀀스
    vector_size=300,            # 임베딩 차원
    window=5,                   # 컨텍스트 윈도우 크기
    min_count=2,                # 최소 등장 빈도
    workers=4,                  # 병렬 처리
    sg=1                        # Skip-gram 사용
)

# 학습 완료 후 vocab 크기 출력
print("Word2Vec vocab 사이즈:", len(w2v_model.wv))

Word2Vec vocab 사이즈: 88479


In [None]:
# Word2Vec 임베딩 벡터
vec_w2v = w2v_model.wv["name"]
print("Word2Vec vector shape:", vec_w2v.shape)

Word2Vec vector shape: (300,)


In [None]:
# 특수 토큰 먼저 정의
w2v_token_to_id = {
    "[PAD]": 0,
    "[UNK]": 1,
}

# Word2Vec vocab 그대로 추가
for token in w2v_model.wv.index_to_key:
    w2v_token_to_id[token] = len(w2v_token_to_id)

w2v_vocab_size = len(w2v_token_to_id)
print("최종 Vocab 사이즈:", w2v_vocab_size)

최종 Vocab 사이즈: 88481


In [None]:
# Word2Vec에서 학습된 임베딩 차원
w2v_embed_dim = w2v_model.vector_size

# 임베딩 행렬 초기화
# 각 행 = 하나의 토큰 ID에 대응하는 임베딩 벡터
w2v_embedding_matrix = np.zeros(
    (w2v_vocab_size, w2v_embed_dim),
    dtype=np.float32
)

# w2v_token_to_id 순서에 맞춰 임베딩 채우기
for token, idx in w2v_token_to_id.items():
    if token in w2v_model.wv:
        # Word2Vec이 학습한 단어면 해당 벡터 사용
        w2v_embedding_matrix[idx] = w2v_model.wv[token]
    else:
        # [PAD], [UNK] 같은 특수 토큰은
        # 작은 랜덤값으로 초기화
        w2v_embedding_matrix[idx] = np.random.normal(
            scale=0.01,
            size=(w2v_embed_dim,)
        )

In [None]:
# numpy -> torch tensor 변환
w2v_embedding_tensor = torch.tensor(w2v_embedding_matrix)

# 사전 학습된 임베딩을 사용하는 Embedding 레이어
w2v_embedding_layer = nn.Embedding.from_pretrained(
    w2v_embedding_tensor,
    freeze=False,
    padding_idx=w2v_token_to_id["[PAD]"]
)

In [None]:
# 모델 입력용 토큰-아이디 변환 함수
def w2v_tokens_to_ids(tokens: list[str]) -> list[int]:
    """
    토큰 리스트를 정수 ID 리스트로 변환
    - 사전에 없는 토큰은 [UNK] ID로 치환
    """
    unk_id = w2v_token_to_id["[UNK]"]
    return [w2v_token_to_id.get(token, unk_id) for token in tokens]

## 2.2. FastText

In [None]:
# FastText는 n-gram(subword)을 함께 학습.

ft_model = FastText(
    sentences=train_sentences,   # 토큰 시퀀스
    vector_size=300,             # 임베딩 차원
    window=5,                    # 컨텍스트 윈도우 크기
    min_count=2,                 # 최소 등장 빈도
    workers=4                    # 병렬 처리
)

# 학습 완료 후 vocab 크기 출력
print("FastText vocab 사이즈:", len(ft_model.wv))

FastText vocab 사이즈: 88479


In [None]:
vec_ft = ft_model.wv["computer"]
print("FastText vector shape:", vec_ft.shape)

FastText vector shape: (300,)


In [None]:
# 특수 토큰 먼저 정의
ft_token_to_id = {
    "[PAD]": 0,
    "[UNK]": 1,
}

# FastText가 학습한 vocab 그대로 사용
for token in ft_model.wv.index_to_key:
    ft_token_to_id[token] = len(ft_token_to_id)

ft_vocab_size = len(ft_token_to_id)

In [None]:
# FastText에서 학습된 임베딩 차원
ft_embed_dim = ft_model.vector_size

ft_embedding_matrix = np.zeros((ft_vocab_size, ft_embed_dim), dtype=np.float32)

for token, idx in ft_token_to_id.items():
    if token in ft_model.wv:
        ft_embedding_matrix[idx] = ft_model.wv[token]
    else:
        # [PAD], [UNK]
        ft_embedding_matrix[idx] = np.random.normal(
            scale=0.01,
            size=(ft_embed_dim,)
        )

In [None]:
# numpy -> torch tensor 변환
ft_embedding_tensor = torch.tensor(ft_embedding_matrix)

ft_embedding_layer = nn.Embedding.from_pretrained(
    ft_embedding_tensor,
    freeze=False,
    padding_idx=ft_token_to_id["[PAD]"]
)

In [None]:
# 모델 입력용 토큰-아이디 변환 함수
def ft_tokens_to_ids(tokens: list[str]) -> list[int]:
    """
    토큰 리스트를 정수 ID 리스트로 변환
    - 사전에 없는 토큰은 [UNK] ID로 치환
    """
    unk_id = ft_token_to_id["[UNK]"]
    return [ft_token_to_id.get(token, unk_id) for token in tokens]

## 2.3. Glove

In [None]:
# GloVe 다운로드
GLOVE_URL = "https://nlp.stanford.edu/data/glove.6B.zip"
GLOVE_ZIP = cfg.model_dir / "glove.6B.zip"
GLOVE_FILE = cfg.model_dir / "glove.6B.300d.txt"

urllib.request.urlretrieve(GLOVE_URL, GLOVE_ZIP)
with zipfile.ZipFile(GLOVE_ZIP, 'r') as zip_ref:
    zip_ref.extractall(cfg.model_dir)

print("GloVe embeddings downloaded and extracted.")

GloVe embeddings downloaded and extracted.


In [None]:
# GloVe 임베딩 로드 함수 정의
def load_glove_embeddings(glove_path: str, embedding_dim: int) -> dict[str, np.ndarray]:
    """
    GloVe 텍스트 파일을 읽어
    {단어(str): 임베딩 벡터(np.ndarray)} 형태로 변환한다.

    GloVe 파일 한 줄의 실제 형태:
        word 0.123 0.134 ... 0.532

    Args:
        glove_path (str): GloVe 임베딩 파일 경로
        embedding_dim (int): 임베딩 차원

    Returns:
        dict[str, np.ndarray]: 단어와 해당 임베딩 벡터의 딕셔너리

    처리 순서:
    1. 한 줄을 공백 기준으로 분리
    2. 첫 번째 인덱스: 단어 문자열
    3. 나머지: 임베딩 벡터 값들
    """
    glove_dict = {}

    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")

            # GloVe 포맷: 첫 칸은 단어
            word = parts[0]

            # 나머지는 해당 단어의 임베딩 벡터 값
            vector = np.asarray(parts[1:], dtype=np.float32)

            # 차원이 맞지 않으면 무시 (파일/설정 불일치 방어)
            if vector.shape[0] != embedding_dim:
                continue

            glove_dict[word] = vector

    return glove_dict


In [None]:
# 특수 토큰 먼저 정의
gl_token_to_id = {
    "[PAD]": 0,
    "[UNK]": 1,
}

# Glove 사전 객체 생성
gl_dict = load_glove_embeddings(GLOVE_FILE, embedding_dim=300)

# Glove용 token_to_id 생성
for token in gl_dict.keys():
    gl_token_to_id[token] = len(gl_token_to_id)

gl_vocab_size = len(gl_token_to_id)


In [None]:
# (vocab_size, embed_dim) 크기의 임베딩 행렬 생성
gl_embedding_matrix = np.zeros((gl_vocab_size, 300), dtype=np.float32)

# PAD 토큰은 0 설정
pad_id = gl_token_to_id["[PAD]"]
gl_embedding_matrix[pad_id] = 0.0

# GloVe에 존재하는 단어만 사전학습 벡터로 덮어씀
for token, idx in gl_token_to_id.items():
    if token in gl_dict:
        gl_embedding_matrix[idx] = gl_dict[token]

In [None]:
# numpy -> torch tensor 변환
gl_embedding_tensor = torch.tensor(gl_embedding_matrix)

# Glove 임베딩 레이어 생성
gl_embedding_layer = nn.Embedding.from_pretrained(
    embeddings=torch.tensor(gl_embedding_matrix),
    freeze=False,
    padding_idx=pad_id
)

In [None]:
# 모델 입력용 토큰-아이디 변환 함수
def gl_tokens_to_ids(tokens: list[str]) -> list[int]:
    """
    토큰 리스트를 정수 ID 리스트로 변환
    - 사전에 없는 토큰은 [UNK] ID로 치환
    """
    unk_id = gl_token_to_id["[UNK]"]
    return [gl_token_to_id.get(token, unk_id) for token in tokens]

## 2.3. collate_fn 함수 정의

In [None]:
# DataLoader용 collate_fn 함수 정의 (Word2Vec, FastText, GloVe 공통)
def make_collate_fn(tokens_to_ids, token_to_id, max_len: int = cfg.max_len):
    """
    DataLoader용 collate_fn 생성 함수

    - cfg.max_len으로 지나치게 긴 텍스트 truncate
    - cfg.max_len보다 짧은 텍스트는 pad 추가 + 실제 길이도 반환

    Args:
        tokens_to_ids (callable):
            list[str] -> list[int] 변환 함수
        token_to_id (dict[str, int]):
            토큰 -> ID 매핑 딕셔너리
        max_len (int):
            문서당 최대 토큰 길이 (OOM 방지용)

    Returns:
        collate_fn (callable)
    """
    pad_id = token_to_id["[PAD]"]

    def collate_fn(batch):
        # batch = [(text, label), ...]
        texts, labels = zip(*batch)

        # 1. 텍스트 → 토큰 (길이 제한!!)
        token_lists = [
            tokenize(clean_text(text))[:max_len]
            for text in texts
        ]

        # 2. 토큰 -> ID
        id_lists = [tokens_to_ids(tokens) for tokens in token_lists]

        # 3. 실제 길이 (last_valid_hidden / pack용)
        lengths = torch.tensor(
            [len(ids) for ids in id_lists],
            dtype=torch.long
        )

        # 4. padding
        cur_max_len = max(lengths).item()
        padded_ids = [
            ids + [pad_id] * (cur_max_len - len(ids))
            for ids in id_lists
        ]

        # 5. tensor 변환
        input_ids = torch.tensor(padded_ids, dtype=torch.long)
        labels_t = torch.tensor(labels, dtype=torch.long)

        return input_ids, labels_t, lengths

    return collate_fn


In [None]:
# BPE DataLoader용 collate_fn 함수 정의
def make_bpe_collate_fn(bpe_tokenizer, max_len: int = cfg.max_len):
    """
    BPE용 collate_fn

    Args:
        bpe_tokenizer: HuggingFace Tokenizer (BPE)
        max_len (int): 문서당 최대 subword 길이 (OOM 방지용)

    Returns:
        collate_fn
    """
    pad_id = bpe_tokenizer.token_to_id("[PAD]")

    def collate_fn(batch):
        texts, labels = zip(*batch)

        # 1. BPE 토큰화 + ID 변환 + 길이 제한
        id_lists = [
            bpe_tokenizer.encode(text).ids[:max_len]
            for text in texts
        ]

        # 2. 실제 길이
        lengths = torch.tensor(
            [len(ids) for ids in id_lists],
            dtype=torch.long
        )

        # 3. batch 내 최대 길이
        cur_max_len = max(lengths).item()

        # 4. padding
        padded_ids = [
            ids + [pad_id] * (cur_max_len - len(ids))
            for ids in id_lists
        ]

        # 5. tensor 변환
        input_ids = torch.tensor(padded_ids, dtype=torch.long)
        labels_t = torch.tensor(labels, dtype=torch.long)

        return input_ids, labels_t, lengths

    return collate_fn


## 2.4. 데이터로더 생성

In [None]:
# Word2Vec용 collate_fn 생성
w2v_collate_fn = make_collate_fn(w2v_tokens_to_ids, w2v_token_to_id)

# Word2Vec용 훈련 DataLoader 생성
w2v_train_loader = DataLoader(
    train_data,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=w2v_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=True
)

# Word2Vec용 테스트 DataLoader 생성
w2v_test_loader = DataLoader(
    test_data,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=w2v_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=False
)

In [None]:
# Word2Vec DataLoader 테스트
wx, wy, wz  = next(iter(w2v_train_loader))

print(wx.shape)  # (batch_size, seq_len)
print(wy.shape)  # (batch_size,)
print(wz)        # (batch_size,) 실제 길이 *cfg.max_len

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([16, 391])
torch.Size([16])
tensor([248, 223,  52, 138, 223, 132, 263, 391, 339, 382, 122,  96, 168, 143,
        150,  85])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# FastText용 collate_fn 생성
ft_collate_fn = make_collate_fn(ft_tokens_to_ids, ft_token_to_id)

# FastText용 훈련 DataLoader 생성
ft_train_loader = DataLoader(
    train_data,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=ft_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=True
)

# FastText용 테스트 DataLoader 생성
ft_test_loader = DataLoader(
    test_data,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=ft_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=False
)

In [None]:
# FastText DataLoader 테스트
fx, fy, fz = next(iter(ft_train_loader))

print(fx.shape)  # (batch_size, seq_len)
print(fy.shape)  # (batch_size,)
print(fz)  # (batch_size,) 실제 길이

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([16, 512])
torch.Size([16])
tensor([152, 244, 111, 214, 156, 447, 237, 105, 135, 382,  60, 512, 217, 322,
        136, 229])


In [None]:
# Glove용 collate_fn 생성
gl_collate_fn = make_collate_fn(gl_tokens_to_ids, gl_token_to_id)

# Glove용 훈련 DataLoader 생성
gl_train_loader = DataLoader(
    train_data,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=gl_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=True
)

# Glove용 테스트 DataLoader 생성
gl_test_loader = DataLoader(
    test_data,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=gl_collate_fn,
    num_workers=cfg.num_workers,
    drop_last=False
)

In [None]:
# Glove DataLoader 테스트
gx, gy, gz = next(iter(gl_train_loader))

print(gx.shape)  # (batch_size, seq_len)
print(gy.shape)  # (batch_size,)
print(gz)  # (batch_size,) 실제 길이

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([16, 512])
torch.Size([16])
tensor([512, 138, 176, 103, 276, 365, 164,  93, 132, 512, 397, 144, 119, 512,
        205,  42])


In [None]:
# bpe용 collate_fn 생성
bpe_collate_fn = make_bpe_collate_fn(bpe_tokenizer)

# BPE용 훈련 DataLoader 생성
bpe_train_loader = DataLoader(
    train_data,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=bpe_collate_fn,
    drop_last=True
)

# BPE용 테스트 DataLoader 생성
bpe_test_loader = DataLoader(
    test_data,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=bpe_collate_fn,
    drop_last=False
)

In [None]:
# BPE DataLoader 테스트
bx, by, bz = next(iter(bpe_train_loader))

print(bx.shape)  # (batch_size, seq_len)
print(by.shape)  # (batch_size,)
print(bz)  # (batch_size,) 실제 길이

torch.Size([16, 349])
torch.Size([16])
tensor([180, 171, 105, 193, 128, 230, 118, 127, 202, 281, 349,  67, 147, 296,
        272, 141])


# 3. 분류 모델링

## 3.1. 공통 유틸

In [None]:
# 시퀀스 출력에서 유효한 마지막 타임스텝의 은닉 상태 추출 함수
def last_valid_hidden(outputs, lengths):
    """
    outputs: (batch, seq_len, hidden)
    lengths: (batch,) 실제 길이
    """
    # gather 연상을 위해 view 사용 (batch, 1, 1)
    idx = (lengths - 1).view(-1, 1, 1)
    idx = idx.expand(
        outputs.size(0),  # batch size
        1,                # 하나의 time dimension
        outputs.size(2)   # hidden dim
    )
    # 시간 축 기준으로 연산
    return outputs.gather(1, idx).squeeze(1)

## 3.2. LSTM

In [None]:
# LSTM 분류기 정의 (w2v, ft, bpe 교체 허용)
class LSTMClassifier(nn.Module):
    def __init__(self, embedding, hidden_dim, num_classes, dropout=0.3):
        """
        LSTM 모델 설계
        Args:
            embedding: 앞서 정의한 w2v, ft, glove, bpe 임베딩 레이어 중 하나
            hidden_dim: 모델의 은닉 차원
            num_classes: 마지막 선형 층에서 사용할 클래스 수

        Returns:
            logits: 전체 클래스 수에 대한 점수
        """
        super().__init__()
        self.embedding = embedding                 # w2v, ft, glove, bpe 교체 지점
        self.lstm = nn.LSTM(
            input_size=embedding.embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    # collate_fn의 lengths를 넘겨 패딩을 제외한 마지막 은닉 차원을 선형 변환
    def forward(self, input_ids, lengths):
        x = self.embedding(input_ids)              # (B, T, D)
        x = x.contiguous()
        out, (h_n, c_n) = self.lstm(x)             # (B, T, H)
        last = last_valid_hidden(out, lengths)     # (B, H)
        last = self.dropout(last)
        return self.fc(last)


## 3.3. GRU

In [None]:
# GRU 분류기 정의 (w2v, ft, bpe 교체 허용)
class GRUClassifier(nn.Module):
    def __init__(self, embedding, hidden_dim, num_classes, dropout=0.3, bidir=False):
        """
        GRU 모델 설계
        Args:
            embedding: 앞서 정의한 w2v, ft, glove, bpe 임베딩 레이어 중 하나
            hidden_dim: 모델의 은닉 차원
            num_classes: 마지막 선형 층에서 사용할 클래스 수

        Returns:
            logits: 전체 클래스 수에 대한 점수
        """
        super().__init__()
        self.embedding = embedding                 # w2v, ft, bpe 교체 지점
        self.gru = nn.GRU(
            input_size=embedding.embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=bidir
        )
        out_dim = hidden_dim * (2 if bidir else 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(out_dim, num_classes)

    def forward(self, input_ids, lengths):
        x = self.embedding(input_ids)
        x = x.contiguous()
        out, _ = self.gru(x)
        last = last_valid_hidden(out, lengths)
        last = self.dropout(last)
        return self.fc(last)

## 3.4. 모델 인스턴스화

### A. Word2Vec

In [None]:
# 앞서 만든 w2v_embedding_layer 사용
w2v_lstm = LSTMClassifier(
    embedding=w2v_embedding_layer,
    hidden_dim=128,
    num_classes=20
).to(cfg.device)

w2v_gru = GRUClassifier(
    embedding=w2v_embedding_layer,
    hidden_dim=128,
    num_classes=20,
    bidir=True
).to(cfg.device)

### B. FastText

In [None]:
# 앞서 만든 ft_embedding_layer 사용
ft_lstm = LSTMClassifier(
    embedding=ft_embedding_layer,
    hidden_dim=128,
    num_classes=20
).to(cfg.device)

ft_gru = GRUClassifier(
    embedding=ft_embedding_layer,
    hidden_dim=128,
    num_classes=20,
    bidir=True
).to(cfg.device)

### C. Glove

In [None]:
# 앞서 만든 gl_embedding_layer 사용
gl_lstm = LSTMClassifier(
    embedding=gl_embedding_layer,
    hidden_dim=128,
    num_classes=20
).to(cfg.device)

gl_gru = GRUClassifier(
    embedding=gl_embedding_layer,
    hidden_dim=128,
    num_classes=20,
    bidir=True
).to(cfg.device)

### D. BPE

In [None]:
# BPE vocab_size, embed_dim
bpe_embedding_bpe = nn.Embedding(
    num_embeddings=bpe_tokenizer.get_vocab_size(),
    embedding_dim=300,
    padding_idx=bpe_tokenizer.token_to_id("[PAD]")
)

bpe_lstm = LSTMClassifier(
    embedding=bpe_embedding_bpe,
    hidden_dim=128,
    num_classes=20
).to(cfg.device)

bpe_gru = GRUClassifier(
    embedding=bpe_embedding_bpe,
    hidden_dim=128,
    num_classes=20,
    bidir=True
).to(cfg.device)

# 4. 훈련

## 4.1. 훈련용 공용 유틸

In [None]:
# 조기 종료 클래스 (w2v, ft, glove용)
class EarlyStopping:
    def __init__(self, patience: int=cfg.patience, min_delta: float = 0.0, save_path: str | Path = "model.pt"):
        """
        Args:
            patience (int): 개선이 없을 때 허용 epoch 수
            min_delta (float): 최소 개선 폭
            save_path (str | Path): best model 저장 경로
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")
        self.counter = 0
        self.save_path = Path(save_path)

    def step(self, val_loss: float, model: torch.nn.Module, token_to_id, embedding_matrix) -> bool:
        """
        Args:
            val_loss (float): 현재 epoch의 validation loss
            model (nn.Module): 현재 모델

        Returns:
            bool: True면 학습 중단, False면 계속
        """
        if val_loss < self.best_loss - self.min_delta:
            # 성능 개선
            self.best_loss = val_loss
            self.counter = 0

            # best model 저장
            torch.save({
                "model_state": model.state_dict(),
                "token_to_id": token_to_id,
                "embedding_matrix": embedding_matrix,
            }, self.save_path)
            print(f"Validation loss improved. Best model saved to {self.save_path}")

            return False
        else:
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter} / {self.patience}")

            return self.counter >= self.patience

In [None]:
# bpe용 EarlyStopping
class EarlyStopping_bpe:
    def __init__(
        self,
        patience: int = cfg.patience,
        min_delta: float = 0.0,
        save_path: str | Path = "best_bpe_model.pt",
        tokenizer_path: str | Path = "best_bpe_tokenizer.json",
    ):
        """
        Args:
            patience (int): 개선이 없을 때 허용 epoch 수
            min_delta (float): 최소 개선 폭
            save_path (str | Path): best model 저장 경로 (.pt)
            tokenizer_path (str | Path): BPE tokenizer 저장 경로 (.json)
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")
        self.counter = 0

        self.save_path = Path(save_path)
        self.tokenizer_path = Path(tokenizer_path)

    def step(self, val_loss: float, model: torch.nn.Module, tokenizer) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0

            # 1. 모델 가중치 저장
            torch.save(
                {"model_state": model.state_dict()},
                self.save_path
            )

            # 2. BPE 토크나이저 저장 (JSON)
            tokenizer.save(str(self.tokenizer_path))

            print(
                f"[BPE] Best model saved to {self.save_path}, "
                f"tokenizer saved to {self.tokenizer_path}"
            )
            return False
        else:
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter}/{self.patience}")
            return self.counter >= self.patience


In [None]:
# 훈련 함수 정의
def my_trainer(model, dataloader, optimizer, loss_fn, device=cfg.device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in dataloader:
        # batch: (input_ids, labels, lengths)
        input_ids, labels, lengths = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        optimizer.zero_grad()

        logits = model(input_ids, lengths)          # (B, C)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)

        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc

In [None]:
@torch.no_grad()
def evaluate(model, dataloader, loss_fn, device=cfg.device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids, labels, lengths = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        logits = model(input_ids, lengths)
        loss = loss_fn(logits, labels)

        total_loss += loss.item() * labels.size(0)

        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc

## 4.2. Word2Vec LSTM & GRU

In [None]:
# 공용
loss_fn = nn.CrossEntropyLoss()
early_stopping = EarlyStopping()
bpe_early_stopping = EarlyStopping_bpe()

# w2v 전용
wl_optimizer = torch.optim.Adam(w2v_lstm.parameters(), lr=cfg.lr)
wg_optimizer = torch.optim.Adam(w2v_gru.parameters(), lr=cfg.lr)

In [None]:
# LSTM 모델 훈련
wl_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_w2v_lstm.pt"
)


for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        w2v_lstm, w2v_train_loader, wl_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        w2v_lstm, w2v_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if wl_early_stopping.step(val_loss, model=w2v_lstm, token_to_id=w2v_token_to_id, embedding_matrix=w2v_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[1] train_loss=2.7942 acc=0.1239 | val_loss=2.9971 acc=0.1000


processing:  10%|█         | 1/10 [01:04<09:44, 64.97s/it]

Validation loss improved. Best model saved to best_w2v_lstm.pt
Epoch 2/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2] train_loss=2.0597 acc=0.3273 | val_loss=1.9028 acc=0.3561


processing:  20%|██        | 2/10 [02:10<08:40, 65.01s/it]

Validation loss improved. Best model saved to best_w2v_lstm.pt
Epoch 3/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3] train_loss=1.3034 acc=0.5706 | val_loss=1.4140 acc=0.5376


processing:  30%|███       | 3/10 [03:13<07:31, 64.47s/it]

Validation loss improved. Best model saved to best_w2v_lstm.pt
Epoch 4/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[4] train_loss=0.5764 acc=0.8141 | val_loss=1.1132 acc=0.6512


processing:  40%|████      | 4/10 [04:16<06:22, 63.77s/it]

Validation loss improved. Best model saved to best_w2v_lstm.pt
Epoch 5/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[5] train_loss=0.2541 acc=0.9242 | val_loss=1.2582 acc=0.6455
EarlyStopping counter: 1 / 3
Epoch 6/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[6] train_loss=0.1143 acc=0.9702 | val_loss=1.2118 acc=0.7034
EarlyStopping counter: 2 / 3
Epoch 7/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[7] train_loss=0.0704 acc=0.9824 | val_loss=1.1443 acc=0.7224
EarlyStopping counter: 3 / 3
Early stopping triggered.





In [None]:
w2v_lstm_ckpt = torch.load("best_w2v_lstm.pt", map_location=cfg.device, weights_only=False)

w2v_lstm.load_state_dict(w2v_lstm_ckpt["model_state"])
w2v_lstm.to(cfg.device)

w2v_token_to_id = w2v_lstm_ckpt["token_to_id"]

In [None]:
# GRU 모델 훈련
wg_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_w2v_gru.pt"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        w2v_gru, w2v_train_loader, wg_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        w2v_gru, w2v_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if wg_early_stopping.step(val_loss, model=w2v_gru, token_to_id=w2v_token_to_id, embedding_matrix=w2v_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[1] train_loss=1.2687 acc=0.6244 | val_loss=1.0317 acc=0.6875


processing:  10%|█         | 1/10 [00:59<08:54, 59.44s/it]

Validation loss improved. Best model saved to best_w2v_gru.pt
Epoch 2/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2] train_loss=0.2168 acc=0.9404 | val_loss=0.9559 acc=0.7362


processing:  20%|██        | 2/10 [01:56<07:46, 58.28s/it]

Validation loss improved. Best model saved to best_w2v_gru.pt
Epoch 3/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3] train_loss=0.0682 acc=0.9836 | val_loss=1.0550 acc=0.7306
EarlyStopping counter: 1 / 3
Epoch 4/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[4] train_loss=0.0321 acc=0.9927 | val_loss=1.1292 acc=0.7423
EarlyStopping counter: 2 / 3
Epoch 5/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[5] train_loss=0.0215 acc=0.9943 | val_loss=1.2397 acc=0.7254
EarlyStopping counter: 3 / 3
Early stopping triggered.





In [None]:
w2v_gru_ckpt = torch.load("best_w2v_gru.pt", map_location=cfg.device, weights_only=False)

w2v_gru.load_state_dict(w2v_gru_ckpt["model_state"])
w2v_gru.to(cfg.device)

w2v_token_to_id = w2v_gru_ckpt["token_to_id"]

## 4.3. FastText LSTM & GRU

In [None]:
fl_optimizer = torch.optim.Adam(ft_lstm.parameters(), lr=cfg.lr)
fg_optimizer = torch.optim.Adam(ft_gru.parameters(), lr=cfg.lr)

In [None]:
# LSTM 모델 훈련
fl_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_ft_lstm.pt"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        ft_lstm, ft_train_loader, fl_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        ft_lstm, ft_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if fl_early_stopping.step(val_loss, model=ft_lstm, token_to_id=ft_token_to_id, embedding_matrix=ft_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[1] train_loss=2.7241 acc=0.1535 | val_loss=2.4164 acc=0.2306


processing:  10%|█         | 1/10 [00:33<05:03, 33.73s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 2/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2] train_loss=2.1609 acc=0.2969 | val_loss=2.1046 acc=0.3005


processing:  20%|██        | 2/10 [01:03<04:11, 31.48s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 3/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3] train_loss=1.7706 acc=0.4155 | val_loss=1.8123 acc=0.3995


processing:  30%|███       | 3/10 [01:33<03:36, 30.87s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 4/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[4] train_loss=1.4021 acc=0.5290 | val_loss=1.6834 acc=0.4456


processing:  40%|████      | 4/10 [02:05<03:08, 31.34s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 5/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[5] train_loss=1.0765 acc=0.6446 | val_loss=1.5103 acc=0.4942


processing:  50%|█████     | 5/10 [02:35<02:34, 30.88s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 6/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[6] train_loss=0.8134 acc=0.7284 | val_loss=1.4501 acc=0.5262


processing:  60%|██████    | 6/10 [03:07<02:04, 31.15s/it]

Validation loss improved. Best model saved to best_ft_lstm.pt
Epoch 7/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[7] train_loss=0.6514 acc=0.7874 | val_loss=1.5243 acc=0.5182
EarlyStopping counter: 1 / 3
Epoch 8/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[8] train_loss=0.5340 acc=0.8248 | val_loss=1.5274 acc=0.5377
EarlyStopping counter: 2 / 3
Epoch 9/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[9] train_loss=0.3972 acc=0.8761 | val_loss=1.5143 acc=0.5645
EarlyStopping counter: 3 / 3
Early stopping triggered.





In [None]:
ft_lstm_ckpt = torch.load("best_ft_lstm.pt", map_location=cfg.device, weights_only=False)

ft_lstm.load_state_dict(ft_lstm_ckpt["model_state"])
ft_lstm.to(cfg.device)

ft_token_to_id = ft_lstm_ckpt["token_to_id"]

In [None]:
# GRU 모델 훈련
fg_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_ft_gru.pt"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        ft_gru, ft_train_loader, fg_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        ft_gru, ft_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if fg_early_stopping.step(val_loss, model=ft_gru, token_to_id=ft_token_to_id, embedding_matrix=ft_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
ft_gru_ckpt = torch.load("best_ft_gru.pt", map_location=cfg.device, weights_only=False)

ft_gru.load_state_dict(ft_gru_ckpt["model_state"])
ft_gru.to(cfg.device)

ft_token_to_id = ft_gru_ckpt["token_to_id"]

## 4.4. Glove LSTM & GRU

In [None]:
gl_optimizer = torch.optim.Adam(gl_lstm.parameters(), lr=cfg.lr)
gg_optimizer = torch.optim.Adam(gl_gru.parameters(), lr=cfg.lr)

In [None]:
# LSTM 모델 훈련
gl_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_gl_lstm.pt"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        gl_lstm, gl_train_loader, gl_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        gl_lstm, gl_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if gl_early_stopping.step(val_loss, model=gl_lstm, token_to_id=gl_token_to_id, embedding_matrix=gl_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[1] train_loss=2.6503 acc=0.1600 | val_loss=2.2749 acc=0.2718


processing:  10%|█         | 1/10 [00:20<03:00, 20.10s/it]

Validation loss improved. Best model saved to best_gl_lstm.pt
Epoch 2/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2] train_loss=1.9275 acc=0.3725 | val_loss=1.9015 acc=0.4077


processing:  20%|██        | 2/10 [00:39<02:37, 19.73s/it]

Validation loss improved. Best model saved to best_gl_lstm.pt
Epoch 3/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3] train_loss=1.1423 acc=0.6170 | val_loss=1.1068 acc=0.6337


processing:  30%|███       | 3/10 [00:59<02:18, 19.80s/it]

Validation loss improved. Best model saved to best_gl_lstm.pt
Epoch 4/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[4] train_loss=0.6201 acc=0.7918 | val_loss=0.9553 acc=0.6945


processing:  40%|████      | 4/10 [01:20<02:00, 20.10s/it]

Validation loss improved. Best model saved to best_gl_lstm.pt
Epoch 5/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[5] train_loss=0.3677 acc=0.8857 | val_loss=0.9273 acc=0.7363


processing:  50%|█████     | 5/10 [01:39<01:40, 20.02s/it]

Validation loss improved. Best model saved to best_gl_lstm.pt
Epoch 6/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[6] train_loss=0.1932 acc=0.9455 | val_loss=1.0099 acc=0.7370
EarlyStopping counter: 1 / 3
Epoch 7/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[7] train_loss=0.0906 acc=0.9775 | val_loss=1.0134 acc=0.7576
EarlyStopping counter: 2 / 3
Epoch 8/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[8] train_loss=0.0454 acc=0.9898 | val_loss=1.2161 acc=0.7256
EarlyStopping counter: 3 / 3
Early stopping triggered.





In [None]:
gl_lstm_ckpt = torch.load("best_gl_lstm.pt", map_location=cfg.device, weights_only=False)

gl_lstm.load_state_dict(gl_lstm_ckpt["model_state"])
gl_lstm.to(cfg.device)

gl_token_to_id = gl_lstm_ckpt["token_to_id"]

In [None]:
# GRU 모델 훈련
gg_early_stopping = EarlyStopping(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_gl_gru.pt"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        gl_gru, gl_train_loader, gg_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        gl_gru, gl_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if gg_early_stopping.step(val_loss, model=gl_gru, token_to_id=gl_token_to_id, embedding_matrix=gl_embedding_matrix):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[1] train_loss=1.5873 acc=0.5121 | val_loss=0.9870 acc=0.6925


processing:  10%|█         | 1/10 [00:18<02:49, 18.84s/it]

Validation loss improved. Best model saved to best_gl_gru.pt
Epoch 2/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2] train_loss=0.2894 acc=0.9205 | val_loss=0.8626 acc=0.7576


processing:  20%|██        | 2/10 [00:38<02:32, 19.05s/it]

Validation loss improved. Best model saved to best_gl_gru.pt
Epoch 3/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3] train_loss=0.0825 acc=0.9810 | val_loss=0.9848 acc=0.7540
EarlyStopping counter: 1 / 3
Epoch 4/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[4] train_loss=0.0341 acc=0.9932 | val_loss=1.1111 acc=0.7539
EarlyStopping counter: 2 / 3
Epoch 5/11


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[5] train_loss=0.0222 acc=0.9952 | val_loss=1.0883 acc=0.7562
EarlyStopping counter: 3 / 3
Early stopping triggered.





In [None]:
gl_gru_ckpt = torch.load("best_gl_gru.pt", map_location=cfg.device, weights_only=False)

gl_gru.load_state_dict(gl_gru_ckpt["model_state"])
gl_gru.to(cfg.device)

gl_token_to_id = gl_gru_ckpt["token_to_id"]

## 4.5. BPE LSTM & GRU

In [None]:
bl_optimizer = torch.optim.Adam(bpe_lstm.parameters(), lr=cfg.lr)
bg_optimizer = torch.optim.Adam(bpe_gru.parameters(), lr=cfg.lr)

In [None]:
# LSTM 모델 훈련
bl_early_stopping = EarlyStopping_bpe(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_bpe_lstm.pt",
    tokenizer_path="best_bpe_lstm_tokenizer.json"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        bpe_lstm, bpe_train_loader, bl_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        bpe_lstm, bpe_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if bl_early_stopping.step(val_loss, model=bpe_lstm, tokenizer=bpe_tokenizer):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


processing:  10%|█         | 1/10 [00:10<01:30, 10.02s/it]

[1] train_loss=2.7858 acc=0.1780 | val_loss=2.6660 acc=0.2083
[BPE] Best model saved to best_bpe_lstm.pt, tokenizer saved to best_bpe_lstm_tokenizer.json
Epoch 2/11


processing:  20%|██        | 2/10 [00:19<01:19,  9.94s/it]

[2] train_loss=2.1969 acc=0.3625 | val_loss=2.4549 acc=0.2796
[BPE] Best model saved to best_bpe_lstm.pt, tokenizer saved to best_bpe_lstm_tokenizer.json
Epoch 3/11


processing:  30%|███       | 3/10 [00:29<01:09,  9.90s/it]

[3] train_loss=1.6138 acc=0.5198 | val_loss=2.3925 acc=0.3096
[BPE] Best model saved to best_bpe_lstm.pt, tokenizer saved to best_bpe_lstm_tokenizer.json
Epoch 4/11


processing:  40%|████      | 4/10 [00:39<00:59,  9.86s/it]

[4] train_loss=1.1469 acc=0.6614 | val_loss=2.4112 acc=0.3396
EarlyStopping counter: 1/3
Epoch 5/11


processing:  50%|█████     | 5/10 [00:49<00:49,  9.82s/it]

[5] train_loss=0.7930 acc=0.7719 | val_loss=2.5143 acc=0.3372
EarlyStopping counter: 2/3
Epoch 6/11


processing:  50%|█████     | 5/10 [00:59<00:59, 11.82s/it]

[6] train_loss=0.5323 acc=0.8502 | val_loss=2.5861 acc=0.3581
EarlyStopping counter: 3/3
Early stopping triggered.





In [None]:
# checkpoint 로드
bpe_lstm_ckpt = torch.load("best_bpe_lstm.pt", map_location=cfg.device)

# state_dict만 꺼내서 로드
bpe_lstm.load_state_dict(bpe_lstm_ckpt["model_state"])

# 토크나이저 로드
bpe_tokenizer = Tokenizer.from_file("best_bpe_lstm_tokenizer.json")

# 디바이스 이동
bpe_lstm.to(cfg.device)

LSTMClassifier(
  (embedding): Embedding(20000, 300, padding_idx=0)
  (lstm): LSTM(300, 128, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=20, bias=True)
)

In [None]:
# GRU 모델 훈련
bg_early_stopping = EarlyStopping_bpe(
    patience=cfg.patience,
    min_delta=cfg.min_delta,
    save_path="best_bpe_gru.pt",
    tokenizer_path="best_bpe_gru_tokenizer.json"
)

for epoch in tqdm(range(1, cfg.epochs), desc=f"processing"):
    print(f"Epoch {epoch}/{cfg.epochs}")
    train_loss, train_acc = my_trainer(
        bpe_gru, bpe_train_loader, bg_optimizer, loss_fn, cfg.device
    )
    val_loss, val_acc = evaluate(
        bpe_gru, bpe_test_loader, loss_fn, cfg.device
    )

    print(f"[{epoch}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")

    if bg_early_stopping.step(val_loss, model=bpe_gru, tokenizer=bpe_tokenizer):
        print("Early stopping triggered.")
        break

processing:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/11


processing:  10%|█         | 1/10 [00:10<01:30, 10.08s/it]

[1] train_loss=2.5704 acc=0.2335 | val_loss=2.4949 acc=0.2484
[BPE] Best model saved to best_bpe_gru.pt, tokenizer saved to best_bpe_gru_tokenizer.json
Epoch 2/11


processing:  20%|██        | 2/10 [00:19<01:19,  9.95s/it]

[2] train_loss=1.7498 acc=0.4699 | val_loss=2.3085 acc=0.3282
[BPE] Best model saved to best_bpe_gru.pt, tokenizer saved to best_bpe_gru_tokenizer.json
Epoch 3/11


processing:  30%|███       | 3/10 [00:29<01:09,  9.88s/it]

[3] train_loss=1.0993 acc=0.6592 | val_loss=2.1053 acc=0.3796
[BPE] Best model saved to best_bpe_gru.pt, tokenizer saved to best_bpe_gru_tokenizer.json
Epoch 4/11


processing:  40%|████      | 4/10 [00:39<00:59,  9.88s/it]

[4] train_loss=0.6573 acc=0.7951 | val_loss=2.2055 acc=0.3933
EarlyStopping counter: 1/3
Epoch 5/11


processing:  50%|█████     | 5/10 [00:49<00:49,  9.85s/it]

[5] train_loss=0.3983 acc=0.8838 | val_loss=2.1645 acc=0.4424
EarlyStopping counter: 2/3
Epoch 6/11


processing:  50%|█████     | 5/10 [00:59<00:59, 11.85s/it]

[6] train_loss=0.2251 acc=0.9382 | val_loss=2.2963 acc=0.4539
EarlyStopping counter: 3/3
Early stopping triggered.





In [None]:
# checkpoint 로드
bpe_gru_ckpt = torch.load("best_bpe_gru.pt", map_location=cfg.device)

# state_dict만 꺼내서 로드
bpe_gru.load_state_dict(bpe_gru_ckpt["model_state"])

# 토크나이저 로드
bpe_tokenizer = Tokenizer.from_file("best_bpe_gru_tokenizer.json")

# 디바이스 이동
bpe_gru.to(cfg.device)

GRUClassifier(
  (embedding): Embedding(20000, 300, padding_idx=0)
  (gru): GRU(300, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=20, bias=True)
)

# 5. 평가

In [None]:
# 평가 전용 infer 함수
@torch.no_grad()
def evaluate_metrics(model, dataloader, device):
    model.eval()

    all_preds = []
    all_labels = []

    for batch in dataloader:
        # batch: (input_ids, labels, lengths)
        input_ids, labels, lengths = batch

        input_ids = input_ids.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        logits = model(input_ids, lengths)   # (B, num_classes)
        preds = torch.argmax(logits, dim=1)

        all_preds.append(preds.cpu())
        all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_preds,
        average="macro",
        zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
results_r = {}

results_r["Word2Vec-LSTM"] = evaluate_metrics(
    w2v_lstm, w2v_test_loader, cfg.device
)

results_r["FastText-LSTM"] = evaluate_metrics(
    ft_lstm, ft_test_loader, cfg.device
)

results_r["GloVe-LSTM"] = evaluate_metrics(
    gl_lstm, gl_test_loader, cfg.device
)

results_r["BPE-LSTM"] = evaluate_metrics(
    bpe_lstm, bpe_test_loader, cfg.device
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
df_results_r = pd.DataFrame(results_r).T
df_results_r

Unnamed: 0,accuracy,precision,recall,f1
Word2Vec-LSTM,0.673394,0.67313,0.661502,0.661256
FastText-LSTM,0.538237,0.542024,0.529939,0.53116
GloVe-LSTM,0.754647,0.75291,0.745128,0.743761
BPE-LSTM,0.319835,0.326344,0.317058,0.305976


In [None]:
results_g = {}

results_g["Word2Vec-GRU"] = evaluate_metrics(
    w2v_gru, w2v_test_loader, cfg.device
)

results_g["FastText-GRU"] = evaluate_metrics(
    ft_gru, ft_test_loader, cfg.device
)

results_g["GloVe-GRU"] = evaluate_metrics(
    gl_gru, gl_test_loader, cfg.device
)

results_g["BPE-GRU"] = evaluate_metrics(
    bpe_gru, bpe_test_loader, cfg.device
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
df_results_g = pd.DataFrame(results_g).T
df_results_g

Unnamed: 0,accuracy,precision,recall,f1
Word2Vec-GRU,0.736192,0.741849,0.726332,0.724996
FastText-GRU,0.675783,0.67899,0.667696,0.667967
GloVe-GRU,0.757568,0.753262,0.747903,0.748167
BPE-GRU,0.37958,0.376939,0.3747,0.365848


## 5.1. 코멘트

- BPE 기반 LSTM, GRU 모델은 Word2Vec, FastText, GloVe 대비 현저히 낮은 성능을 보였습니다.
- 이는 BPE가 사전학습된 의미 임베딩을 제공하지 않으며, 서브워드 단위 토큰화로 인해 시퀀스 길이가 크게 증가한 상태에서 최대 길이를 512로 제한하여 입력을 절단함으로써 문맥 정보 손실이 크게 발생했기 때문으로 해석할 수 있습니다.
- 동일한 모델 구조와 학습 조건 하에서 비교했을 때, 사전학습 임베딩을 사용하는 Word2Vec, FastText, GloVe는 RNN 계열 모델과 상대적으로 높은 적합성을 보인 반면, 사전학습이 되지 않은 BPE는 RNN 기반 구조와 가장 부적합한 조합으로 나타났습니다.
- BPE는 사전학습되지 않았다는 점을 고려하여 최대 길이와 훈련 에포크를 늘리면 성능이 크게 개선될 것으로 보입니다