<a href="https://colab.research.google.com/github/hzzz15/Soulsync/blob/model/modeling_last.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Colab 환경 설정 및 라이브러리 설치

In [None]:
import torch
# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
!pip install transformers datasets sentencepiece --upgrade
!git clone https://github.com/SKTBrain/KoBERT.git
%cd KoBERT/kobert_hf
!pip install . --no-deps
%cd /content

2. 기본 import 및 토크나이저, 모델 로드

In [None]:
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import get_cosine_schedule_with_warmup, AutoTokenizer, BertModel, AutoConfig
from torch.optim import AdamW
from kobert_tokenizer import KoBERTTokenizer

# Tokenizer and BERT backbone are both loaded from the same pretrained checkpoint
pretrained_name = 'skt/kobert-base-v1'
tokenizer = KoBERTTokenizer.from_pretrained(pretrained_name)
bert_model = BertModel.from_pretrained(pretrained_name)

3. 하이퍼파라미터 정의

In [None]:
# Hyperparameters
max_len = 64           # tokens per example, including the [CLS] and [SEP] markers
batch_size = 32
learning_rate = 5e-5
epochs = 5
max_grad_norm = 1.0    # threshold passed to gradient-norm clipping in the training loop

# Seed every RNG the run touches (DataLoader shuffling, dropout, classifier-head
# initialisation) so a fresh "Restart & Run All" reproduces the same training run.
# The value matches the random_state used for the train/test split.
import random

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # safe to call when CUDA is unavailable

4. 데이터 로딩 및 전처리

In [None]:
# Load the merged emotion dataset
import pandas as pd
from sklearn.model_selection import train_test_split

file_path = '/content/merged_full_dataset.csv'
df = pd.read_csv(file_path)

# Emotion string -> integer class id
emotion_label_map = {
    "행복": 0, "놀람": 1, "분노": 2, "공포": 3, "혐오": 4, "슬픔": 5, "중립": 6
}
df["label"] = df["Emotion"].map(emotion_label_map)

# Emotion values missing from the map become NaN and are removed by dropna()
# below. Report them so a typo or an unexpected class does not vanish silently.
unmapped = df["label"].isna() & df["Emotion"].notna()
if unmapped.any():
    unmapped_values = sorted(df.loc[unmapped, "Emotion"].astype(str).unique())
    print(f"Dropping {int(unmapped.sum())} rows with unmapped Emotion values: {unmapped_values}")

# Keep only the model inputs and discard rows with a missing sentence or label
df = df[["Sentence", "label"]].dropna()

# Convert to a list of [sentence, label] pairs
data_list = [[s, int(l)] for s, l in zip(df["Sentence"], df["label"])]

# Train/test split (80/20, fixed seed)
train_data, test_data = train_test_split(data_list, test_size=0.2, random_state=42)

5. Dataset, DataLoader 정의

In [None]:
class BERTSentenceTransform:
    """Convert one raw sentence into BERT input arrays.

    Args:
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_length: maximum number of tokens per example, counting the
            ``[CLS]`` and ``[SEP]`` markers. Must be at least 2.
        pad: when true, right-pad the ids with ``[PAD]`` (and the segment ids
            with 0) up to ``max_seq_length``; when false, return the
            unpadded sequence.
        pair: stored for interface compatibility; sentence pairs are not
            implemented, only ``line[0]`` is encoded.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """

    def __init__(self, tokenizer, max_seq_length, pad=True, pair=False):
        if max_seq_length < 2:
            # With fewer than two slots the truncation slice below would go
            # negative and the output would exceed max_seq_length.
            raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.pad = pad
        self.pair = pair

    def __call__(self, line):
        """Encode ``line[0]``.

        Returns:
            A tuple ``(input_ids, valid_length, segment_ids)`` of NumPy arrays:
            the token ids, the number of non-padding tokens (0-d array), and
            the segment ids (all zeros).
        """
        text = line[0]
        # Reserve two positions for the [CLS] / [SEP] markers
        body = self.tokenizer.tokenize(text)[:self.max_seq_length - 2]
        tokens = ['[CLS]'] + body + ['[SEP]']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        valid_length = len(input_ids)
        segment_ids = [0] * valid_length

        if self.pad:
            pad_len = self.max_seq_length - valid_length
            if pad_len > 0:
                pad_id = self.tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
                input_ids = input_ids + [pad_id] * pad_len
                segment_ids = segment_ids + [0] * pad_len

        return np.array(input_ids), np.array(valid_length), np.array(segment_ids)

class BERTDataset(Dataset):
    """In-memory dataset that pre-encodes every sentence at construction time.

    Each row of ``dataset`` is indexed with ``sent_idx`` for the sentence and
    ``label_idx`` for the label. Items are returned as the encoded sentence
    fields followed by the label.
    """

    def __init__(self, dataset, sent_idx, label_idx, tokenizer, max_len, pad, pair):
        encode = BERTSentenceTransform(tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.transform = encode

        # The transform reads element 0 of its argument, hence the one-item list
        self.sentences = []
        for row in dataset:
            self.sentences.append(encode([row[sent_idx]]))

        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        encoded = tuple(self.sentences[i])
        return encoded + (self.labels[i],)

    def __len__(self):
        n_examples = len(self.labels)
        return n_examples

In [None]:
# Encode both splits up front; column 0 of each row is the sentence, column 1 the label
train_dataset = BERTDataset(
    train_data, sent_idx=0, label_idx=1,
    tokenizer=tokenizer, max_len=max_len, pad=True, pair=False,
)
test_dataset = BERTDataset(
    test_data, sent_idx=0, label_idx=1,
    tokenizer=tokenizer, max_len=max_len, pad=True, pair=False,
)

# Only the training loader shuffles; both share batch size and worker count
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

6. 모델 정의 + Optimizer/Scheduler

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT backbone's pooled output.

    Args:
        bert: backbone module; it is called with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and must return a 2-tuple whose second element is the pooled
            output.
        hidden_size: width of the pooled output fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or 0) disables dropout.
    """

    def __init__(self, bert, hidden_size=768, num_classes=7, dr_rate=None):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.dr_rate = dr_rate
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row i.

        The mask is created on ``token_ids.device`` with a single vectorized
        comparison instead of a per-row Python loop.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        # return_dict=False makes the backbone return a tuple; only the pooled
        # output (second element) is used.
        _, pooled_output = self.bert(input_ids=token_ids,
                                     token_type_ids=segment_ids.long(),
                                     attention_mask=attention_mask,
                                     return_dict=False)
        if self.dr_rate:
            pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Put the classification head on the KoBERT backbone and move it to the device
model = BERTClassifier(bert_model, dr_rate=0.5)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) x (epochs); the first 10% of the steps are warm-up.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * epochs
warmup_steps = int(total_steps * 0.1)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

7. 학습 루프

In [None]:
from tqdm import tqdm

def calc_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label."""
    predicted = torch.max(preds, dim=1)[1]
    n_correct = (predicted == labels).sum().item()
    return n_correct / len(labels)

def save_checkpoint(model, optimizer, epoch, path="./kobert_best.pt"):
    """Write the epoch number plus the model and optimizer state dicts to ``path``."""
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, path)

# Train for `epochs` epochs and checkpoint the weights whenever validation
# accuracy improves.
# NOTE(review): the held-out test split is also the validation set used for
# checkpoint selection, so the reported best accuracy is optimistically biased.
best_val_acc = 0.0

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}")

    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0
    train_correct = 0.0
    train_seen = 0

    for batch in tqdm(train_dataloader):
        token_ids, valid_length, segment_ids, label = [x.to(device) for x in batch]
        label = label.long()

        optimizer.zero_grad()
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimisation step, not per epoch

        # Weight each batch by its size: the final batch can be smaller, and a
        # plain mean of per-batch means would over-weight it.
        n = label.size(0)
        train_loss_sum += loss.item() * n
        train_correct += calc_accuracy(out, label) * n
        train_seen += n

    avg_train_loss = train_loss_sum / max(train_seen, 1)
    avg_train_acc = train_correct / max(train_seen, 1)
    print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {avg_train_acc:.4f}")

    # ---- validation pass ----
    model.eval()
    val_loss_sum = 0.0
    val_correct = 0.0
    val_seen = 0

    with torch.no_grad():
        for batch in test_dataloader:
            token_ids, valid_length, segment_ids, label = [x.to(device) for x in batch]
            label = label.long()
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)

            n = label.size(0)
            val_loss_sum += loss.item() * n
            val_correct += calc_accuracy(out, label) * n
            val_seen += n

    avg_val_loss = val_loss_sum / max(val_seen, 1)
    avg_val_acc = val_correct / max(val_seen, 1)
    print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {avg_val_acc:.4f}")

    # Keep only the best-scoring weights on disk
    if avg_val_acc > best_val_acc:
        best_val_acc = avg_val_acc
        save_checkpoint(model, optimizer, epoch+1)
        print(f"Best model saved at epoch {epoch+1}")

8. 라벨 포함 config 저장 및 모델 export

In [None]:
import os
from transformers import AutoConfig

# Label vocabulary, listed in the same order as the ids in emotion_label_map
labels = ["행복", "놀람", "분노", "공포", "혐오", "슬픔", "중립"]
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

# Build a config that carries the label mapping
config = AutoConfig.from_pretrained(
    "skt/kobert-base-v1",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Create the export folder
save_path = "./soulsync_export"
os.makedirs(save_path, exist_ok=True)

# After the training loop `model` holds the last epoch's weights, which are not
# necessarily the best ones. Restore the best checkpoint written by
# save_checkpoint (default path) before exporting.
best_ckpt_path = "./kobert_best.pt"
if os.path.exists(best_ckpt_path):
    best_ckpt = torch.load(best_ckpt_path, map_location=device)
    model.load_state_dict(best_ckpt['model_state_dict'])
    print(f"Exporting best checkpoint from epoch {best_ckpt['epoch']}")
else:
    print("No best checkpoint found; exporting the weights currently in memory")

# Save only the model's state_dict (plain PyTorch format)
torch.save(model.state_dict(), os.path.join(save_path, "pytorch_model.bin"))

# Save the config
config.save_pretrained(save_path)

# Save the tokenizer
tokenizer.save_pretrained(save_path)

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. Run this before the upload cell below;
# presumably the upload needs an account with write access to the target repo (confirm).
notebook_login()

In [None]:
from huggingface_hub import upload_folder

# Push the contents of the export folder to the root of the Hub repository
hub_upload_args = {
    "repo_id": "hzz15/soulsync",
    "folder_path": "./soulsync_export",
    "path_in_repo": ".",
    "commit_message": "감정 라벨 포함된 KoBERT 모델 업로드",
}
upload_folder(**hub_upload_args)