<a href="https://colab.research.google.com/github/jasmis1229/midterm_new/blob/main/%ED%8A%B8%EB%A0%8C%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ CBOW 실험 초기화

# 1. 라이브러리 불러오기
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import re
import random
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# 2. Load the data
df = pd.read_csv(
    filepath_or_buffer='/content/신조어_스타일_패턴_완성본_UTF8SIG.csv',
)  # adjust the file name/path to match your environment

In [2]:
# ✅ CBOW 데이터 정제

# Strip the leading list number from a slang word
def clean_word(word):
    """Return *word* as a string without its leading "<digits>." numbering.

    Args:
        word: The raw cell value. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The text with one leading ``<digits>.`` prefix removed (if present)
        and surrounding whitespace stripped.
    """
    word = str(word)
    # Allow whitespace before the number: the final strip() runs only after
    # the substitution, so an anchor of '^\d+\.' would miss " 3. word".
    return re.sub(r'^\s*\d+\.', '', word).strip()

# Create the cleaned slang-word column
df['클린 신조어'] = df['신조어'].apply(func=clean_word)

In [3]:
# ✅ CBOW 설명 토크나이징

# Simple whitespace tokenizer
def simple_tokenize(text):
    """Split *text* into whitespace-separated tokens of at least two characters.

    The input is converted with ``str()``, then every character that is
    neither a word character nor whitespace is deleted before splitting.

    Args:
        text: The value to tokenize.

    Returns:
        A list of tokens, in their original order, each longer than one
        character.
    """
    # Delete special characters (anything that is not \w or \s).
    without_symbols = re.sub(r"[^\w\s]", "", str(text))
    # split() with no arguments already ignores leading/trailing whitespace.
    return [piece for piece in without_symbols.split() if len(piece) >= 2]

# Tokenize every description
descriptions = df['설명'].tolist()
tokenized_descriptions = list(map(simple_tokenize, descriptions))

In [4]:
# ✅ CBOW 어휘 사전 구축

# Word <-> index mappings over every distinct token in the descriptions.
# The tokens are sorted before numbering so that each word receives the same
# index on every run; iterating the raw set would follow string-hash order,
# which Python randomizes per process.
vocab = {
    word: idx
    for idx, word in enumerate(
        sorted({token for tokens in tokenized_descriptions for token in tokens})
    )
}
idx_to_word = {idx: word for word, idx in vocab.items()}
vocab_size = len(vocab)

In [5]:
# ✅ CBOW 학습 데이터셋 생성

# Number of neighbours taken on each side of the center word.
window_size = 2
context_center_pairs = []

for tokens in tokenized_descriptions:
    # Map the whole description to vocabulary indices once.
    token_ids = [vocab[token] for token in tokens]
    for position, center_id in enumerate(token_ids):
        # Slices are clipped at the sequence boundaries, so words near an
        # edge simply get fewer context ids.
        left_ids = token_ids[max(0, position - window_size):position]
        right_ids = token_ids[position + 1:position + 1 + window_size]
        neighbour_ids = left_ids + right_ids
        # A one-token description has no neighbours and yields no pair.
        if neighbour_ids:
            context_center_pairs.append((neighbour_ids, center_id))

In [6]:
# ✅ CBOW Dataset 클래스 정의

class CBOWDataset(Dataset):
    """Dataset over a sequence of ``(context, center)`` pairs.

    Each item is returned as a pair of tensors built from the stored values.
    """

    def __init__(self, data):
        """Keep a reference to *data*, an indexable sequence of pairs."""
        self.data = data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context_tensor, center_tensor)`` for the pair at *idx*."""
        context_ids, center_id = self.data[idx]
        context_tensor = torch.tensor(context_ids)
        center_tensor = torch.tensor(center_id)
        return context_tensor, center_tensor

# Build the DataLoader
dataset = CBOWDataset(context_center_pairs)
train_loader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True,
    # Transpose the list of (context, center) samples into
    # [contexts, centers] instead of using the default collation.
    collate_fn=lambda samples: list(zip(*samples)),
)

In [7]:
# ✅ CBOW 모델 클래스 정의

class CBOWModel(nn.Module):
    """Continuous bag-of-words model: mean context embedding -> vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim=100):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output logits.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, contexts):
        """Return one row of vocabulary logits per context.

        Args:
            contexts: An iterable of index tensors, one per sample. Each is
                embedded and averaged over its first dimension separately.

        Returns:
            The linear layer's output for the stacked per-context mean
            embeddings.
        """
        averaged = torch.stack(
            [self.embeddings(ids).mean(dim=0) for ids in contexts]
        )
        return self.linear(averaged)

In [8]:
# ✅ CBOW 모델 초기화

# Model, loss function and optimizer used for training.
model = CBOWModel(vocab_size=vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [9]:
# ✅ CBOW 학습 루프

num_epochs = 20

for epoch in range(num_epochs):
    # Running sum of the per-batch loss values for this epoch (not an average).
    total_loss = 0
    for contexts, centers in train_loader:
        # `centers` is a sequence of scalar tensors; stack them into a single
        # 1-D target tensor rather than rebuilding one with torch.tensor().
        targets = torch.stack(centers)

        optimizer.zero_grad()
        outputs = model(contexts)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}')

Epoch [1/20], Loss: 102.8183
Epoch [2/20], Loss: 99.6956
Epoch [3/20], Loss: 96.8455
Epoch [4/20], Loss: 94.1754
Epoch [5/20], Loss: 91.3577
Epoch [6/20], Loss: 88.4945
Epoch [7/20], Loss: 85.8633
Epoch [8/20], Loss: 83.0899
Epoch [9/20], Loss: 80.3510
Epoch [10/20], Loss: 77.5348
Epoch [11/20], Loss: 74.8449
Epoch [12/20], Loss: 72.0077
Epoch [13/20], Loss: 69.2950
Epoch [14/20], Loss: 66.4486
Epoch [15/20], Loss: 63.9486
Epoch [16/20], Loss: 61.0662
Epoch [17/20], Loss: 58.3412
Epoch [18/20], Loss: 55.7727
Epoch [19/20], Loss: 53.1345
Epoch [20/20], Loss: 50.5563
