In [1]:
!pip install -U lightning lightning_utilities typing_extensions pytorch-lightning torchmetrics fsspec[http] --no-deps 



In [2]:
%env TOKENIZERS_PARALLELISM=true
!pip install "torch>2" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

env: TOKENIZERS_PARALLELISM=true
Looking in indexes: https://download.pytorch.org/whl/cu118


In [3]:
# 라이브러리 불러오기
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 설치한 폰트를 matplotlib에서 사용할 수 있도록 설정
import matplotlib.font_manager as fm

# 나눔 폰트 경로 설정
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'

# 폰트 매니저에 폰트 추가
fm.fontManager.addfont(font_path)
plt.rc('font', family='NanumGothic')  # 폰트 설정


import re
import torch

In [4]:
torch.cuda.empty_cache() 

import gc
gc.collect()  # Additional step to ensure garbage collection

0

In [5]:
train_data_path ="/aiffel/aiffel/dlthon-minions/share/data/conversations.csv"
origin_data = pd.read_csv(train_data_path)

In [6]:
# 'class'를 'type'으로 매핑하는 딕셔너리 생성하기
class_to_type = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3,
    '일반 대화': 4
}

In [7]:
# 'class' 열을 기반으로 새로운 'type' 열 추가하기
origin_data['label'] = origin_data['class'].map(class_to_type)
origin_data.drop(['idx', 'class'], axis=1, inplace=True)

In [8]:
# 전처리 함수 정의하기
def preprocess_sentence(sentence): 
    # \n을 공백으로 바꾸기
    sentence = re.sub("\n", " ", sentence)
    
    # (ㄱ-ㅎ, ㅏ-ㅣ, ".", "?", "!", ",", ' ')를 제외한 모든 문자를 없애기
    sentence = re.sub("[^ㄱ-ㅣ가-힣.?!, ]", "", sentence)
    
    # 단어와 구두점(punctuation) 사이에 공백 추가하기
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    
    return sentence

In [9]:
# 전처리 데이터 새로운 column에 저장하기
preprocessed = origin_data['conversation'].apply(preprocess_sentence).tolist()

In [10]:
# 대화 최대 길이 128으로 설정
MAX_LENGTH = 128

In [11]:
PRETRAINED_MODEL = "gogamza/kobart-base-v2"

from transformers import BartTokenizerFast, PreTrainedTokenizerFast
# 각 conversation을 토큰화하여 새로운 열 'tokenized'에 저장
tokenizer=PreTrainedTokenizerFast.from_pretrained(PRETRAINED_MODEL)

In [12]:
# 단어사전 크기 20000으로 제한
VOCAB_SIZE=len(tokenizer.vocab)

In [13]:
tokenized = tokenizer(
    preprocessed, 
    max_length=MAX_LENGTH,
    padding='max_length',  # Pad to the max_length
    truncation=True,       # Truncate sequences to the max_length
    return_tensors='pt')

In [14]:
VOCAB_SIZE

30000

In [15]:
from torch.utils.data import Dataset, StackDataset

dataset = StackDataset(**dict(tokenized), 
                       labels=origin_data['label'].values)

In [16]:
import torch
from torch.utils.data import random_split, default_collate
generator2 = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset  = random_split(dataset, [0.8, 0.1, 0.1], generator=generator2)

In [17]:
print('Number of training data:', len(train_dataset))
print('Number of validation data:', len(val_dataset))
print('Number of test data:', len(test_dataset))


Number of training data: 3960
Number of validation data: 495
Number of test data: 495


In [18]:
# 데이터 로더 준비 함수 정의하기
from torch.utils.data import DataLoader

def prepare_dataloaders(train_dataset, val_dataset, test_dataset, batch_size, **kwargs):
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, **kwargs)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, **kwargs)
    
    return train_loader, val_loader, test_loader

In [19]:
import wandb
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, StackDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
from transformers import BartModel, BartConfig

In [20]:
!pip install wandb==0.16.0 -qq

In [21]:
wandb.login(key='746fb761ab2f1b53db2dafef7340caad69224513')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /aiffel/.netrc


True

In [22]:
def plot_table(cm, epoch=''):
    title = "Overall Prediction Result"
    # 실제 클래스명으로 변환
    classes = [
        '협박 대화 (0)',
        '갈취 대화 (1)',
        '직장 내 괴롭힘 대화 (2)',
        '기타 괴롭힘 대화 (3)',
        '일반 대화 (4)'
    ]
    fig = plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes, annot_kws={'size': 30})
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    return fig

In [23]:
from sklearn.metrics import confusion_matrix

class CompareResultsCallback:
    def __init__(self, class_num):
        self.class_num = class_num

    def plot_confusion_matrix(self, model, test_loader, device):
        model.eval()
        pred_ = []
        y_ = []
        with torch.no_grad():
            for batch in test_loader:
                batch = {key: d.to(device) for key, d in batch.items()}
                y_test_batch = batch['labels']
                loss, logits = model(**batch)
                pred = logits.argmax(dim=-1)
                pred_.extend(pred.cpu().numpy())
                y_.extend(y_test_batch.cpu().numpy())
        
        cm = confusion_matrix(y_, pred_)
        
        # 표 그리기
        cm_plot = plot_table(cm)

        # wandb에 로그로 저장
        cm_image = wandb.Image(cm_plot)
        wandb.log({"Overall Prediction Result": cm_image})

    def __call__(self, model, test_loader, device):
        self.plot_confusion_matrix(model, test_loader, device)

In [24]:

# BART 분류기 클래스 정의하기
class BartForSequenceClassification(nn.Module):
    def __init__(self, num_labels, wandb_config):
        super(BartForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.config = BartConfig.from_pretrained(PRETRAINED_MODEL, num_labels=num_labels)
        self.bart = BartModel.from_pretrained(PRETRAINED_MODEL, config=self.config)
        
        self.classifier = nn.Sequential()
        for _ in range(wandb_config.classifier_num_layer-1):
            self.classifier.append(nn.Dropout(0.1))
            self.classifier.append(nn.Linear(self.config.hidden_size, self.config.hidden_size))
            self.classifier.append( nn.GELU() )
        self.classifier.append(nn.Dropout(0.1))
        self.classifier.append(nn.Linear(self.config.hidden_size, num_labels))
        
    def forward(self, *args, token_type_ids=None, labels=None, **kwargs, ): #input_ids, attention_mask=None, labels=None):
        outputs = self.bart(*args, **kwargs,)
        pooled_output = outputs[0][:, -1, :]  # 마지막 토큰의 출력 사용
#         pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return loss, logits

from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch import Trainer

wandb_logger = WandbLogger(log_model="all")
trainer = Trainer(logger=wandb_logger)

In [25]:
# 옵티마이저 설정 함수 정의하기
def get_optimizer(optimizer_name, parameters, learning_rate):
    if optimizer_name == "adam":
        return torch.optim.Adam(parameters, lr=learning_rate)
    elif optimizer_name == "sgd":
        return torch.optim.SGD(parameters, lr=learning_rate)
    elif optimizer_name == "rmsprop":
        return torch.optim.RMSprop(parameters, lr=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")

In [26]:
from tqdm import tqdm

def save_model(log_dir, model_state_dict, epoch, val_metrics_dict):
    model_path = os.path.join(log_dir, f'model_epoch{epoch}.pth')
    torch.save(model_state_dict, model_path)
        
    # Create an artifact
    meta_data = {
        'epoch': epoch, 
        'pre_trained_model': PRETRAINED_MODEL
    }
    meta_data.update(val_metrics_dict)
    
    artifact = wandb.Artifact(
        name=f'model-epoch-{epoch}',
        type='model',
        metadata=meta_data,
#         description='A model trained on the XYZ dataset for 10 epochs',
    )

    # Add a file to the artifact
    artifact.add_file(model_path)

    # Log the artifact
    wandb.log_artifact(artifact)

def eval_model(model, val_loader, device, prefix='', verbose=False):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    pred_val = []
    y_val = []
    with torch.no_grad():
        for batch in tqdm(val_loader) if verbose else val_loader:
            batch = {key: d.to(device) for key, d in batch.items()}
            y_val_batch = batch['labels']
            loss, logits = model(**batch)
            val_loss += loss.item()
            pred = logits.argmax(dim=-1)
            correct += (pred == y_val_batch).sum().item()
            total += y_val_batch.size(0)
            y_val.extend(y_val_batch.cpu().numpy())
            pred_val.extend(pred.cpu().numpy())

    val_loss /= len(val_loader)
    accuracy = correct / total
    f1_score_mic = f1_score(y_val, pred_val, average='micro')
    f1_score_mac = f1_score(y_val, pred_val, average='macro')
    metrics_dict = {
        f"{prefix}loss": val_loss, 
        f"{prefix}accuracy": accuracy,
        f"{prefix}f1_micro": f1_score_mic,
        f"{prefix}f1_macro": f1_score_mac,
    }
    wandb.log(metrics_dict)
    model.train()
    return metrics_dict


# 훈련 함수 정의하기
def train(train_dataset, val_dataset, test_dataset, default_config, log_dir='logs'):
    wandb.init(config=default_config)
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    log_dir = os.path.join(log_dir, f'{wandb.run.id}') # current_run_id
    os.makedirs(log_dir, exist_ok=True)
    
    train_loader, val_loader, test_loader = prepare_dataloaders(
        train_dataset, val_dataset, test_dataset, 
        config.batch_size,
        pin_memory=True,
        pin_memory_device="cuda"
    )
    
    model = BartForSequenceClassification(config.class_num, config).to(device)
    
    if config.freeze_backbone:
        for w in model.bart.parameters():
            w._trainable = False
        training_params = model.classifier.parameters()
    else:
        training_params = model.parameters()

    optimizer = get_optimizer(config.optimizer, training_params, config.learning_rate)
    
    cm_callback = CompareResultsCallback(config.class_num)
    
    step = 0
    best_val_loss = 1e9
    for epoch in range(config.epoch):
        model.train()
        for batch in tqdm(train_loader):
            step += 1
            batch = {key: d.to(device) for key, d in batch.items()}
            
            optimizer.zero_grad()
            loss, logits = model(**batch)
            loss.backward()
            optimizer.step()
            
            if step // 100 == 0:
                wandb.log({'trn_batch_loss': loss})
                val_metrics_dict = eval_model(model, val_loader, device, prefix='val_')
                if val_metrics_dict['val_loss'] < best_val_loss:
                    best_val_loss = val_metrics_dict['val_loss']
                    best_model_state_dict = model.state_dict()
                    
        # end of epoch
        save_model(log_dir, model.state_dict(), epoch, val_metrics_dict)
        cm_callback(model, val_loader, device)

    # end of training
    save_model(log_dir, best_model_state_dict, 'best', val_metrics_dict)
    val_metrics_dict = eval_model(model, val_loader, device, prefix='val_')
    best_val_loss = min(best_val_loss, val_metrics_dict['val_loss'])

    # 테스트 단계
    cm_callback(model, test_loader, device)
    metrics_dict = eval_model(model, test_loader, device, prefix='test_')
    save_model(log_dir, model.state_dict(), 'latest', metrics_dict)  
    wandb.log({
        "Test Accuracy Rate": metrics_dict['test_accuracy'],
        "Test F1 Score (macro)": metrics_dict['test_f1_macro'],
        "Test Error Rate": 1 - metrics_dict['test_accuracy'],
    })
    wandb.finish()

In [27]:
import torch
from torch.utils.data import random_split, default_collate

# 스윕 훈련 함수 정의하기
def sweep_train():
    # 데이터 분할하기
    generator2 = torch.Generator().manual_seed(42)
    train_dataset, val_dataset, test_dataset  = random_split(dataset, [0.8, 0.1, 0.1], generator=generator2)
    # y 데이터의 최대값 + 1 을 class_num으로 설정하기
    default_config["class_num"] = max(origin_data['label']) + 1
    
    train(
        train_dataset, val_dataset, test_dataset, 
        default_config=default_config)


In [29]:
# 기본 설정
default_config = {
    "batch_size": 24,
    "epoch": 5,
    "learning_rate": 0.001,
    "optimizer": "adam",
    'freeze_backbone': True,
    "class_num": 5,  # 클래스 수 (필요에 따라 수정)
    "classifier_num_layer": 1
}

In [31]:
# 스윕 구성하기
sweep_config = {
    'method': 'random',
    'parameters': {
        'batch_size': {
            'values': [8, 16, 24, 32]
        },
        'epoch': {
            'values': [5, 10]
        },
        'learning_rate': {
            'max': 0.1,
            'min': 0.001
        },
        "classifier_num_layer": {
            'values': [1, 2]
        },
        "freeze_backbone": {
            'values': [True, False]
        },
        'optimizer': {
            'values': ['adam', 'sgd', 'rmsprop']
        }
    }
}

In [None]:
# 스윕 생성 및 에이전트 실행하기
sweep_id = wandb.sweep(sweep_config, 
                       entity='aiffel_minions', 
                       project='DLthon_finetune_KoBART')
wandb.agent(sweep_id, 
            function=sweep_train, 
            count=10)

Create sweep with ID: n6a4o84l
Sweep URL: https://wandb.ai/aiffel_minions/DLthon_finetune_koBart/sweeps/n6a4o84l


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9x0ocr7n with config:
[34m[1mwandb[0m: 	batch_size: 24
[34m[1mwandb[0m: 	classifier_num_layer: 2
[34m[1mwandb[0m: 	epoch: 5
[34m[1mwandb[0m: 	freeze_backbone: True
[34m[1mwandb[0m: 	learning_rate: 0.07274125430796448
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: Currently logged in as: [33mhojae-choi[0m ([33maiffel_minions[0m). Use [1m`wandb login --relogin`[0m to force relogin


Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

 36%|███▋      | 60/165 [05:24<09:29,  5.43s/it]

In [None]:
import torch
from torch.utils.data import random_split, default_collate
generator2 = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset  = random_split(dataset, [0.8, 0.1, 0.1], generator=generator2)

train(train_dataset, val_dataset, test_dataset, default_config)