## 필요한 라이브러리 임포트

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, matthews_corrcoef
import numpy as np
from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments
import torch
from transformers import AlbertTokenizer
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
from sklearn.utils import shuffle
from transformers import get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback
from google.colab import drive
# Mount Google Drive so files under /content/drive become reachable.
drive.mount('/content/drive')
# Base directory (on the mounted Drive) used as the prefix for data file paths.
root = '/content/drive/MyDrive/국민대학교/D&A Conference/data/데이터 최종/'

Mounted at /content/drive


### 데이터 로드

In [None]:
# Read the train / validation / test spreadsheets from `root`, in that order.
train_data, val_data, test_data = (
    pd.read_excel(root + file_name)
    for file_name in ('train_data2.xlsx', 'val_data.xlsx', 'test_data.xlsx')
)

In [None]:
# 'Unnamed: 0' is presumably a leftover index column from an earlier export.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second run no longer raises KeyError.
train_data = train_data.drop(columns='Unnamed: 0', errors='ignore')
# Show how many rows each target class has.
print(train_data['y'].value_counts())

y
no     116570
yes     14800
Name: count, dtype: int64


### 데이터 인코딩

In [None]:
# Load the pretrained ALBERT tokenizer for the 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', clean_up_tokenization_spaces=True)

def encode_data(data, tokenizer, max_length=160):
    """Tokenize the ``'text'`` column of ``data``.

    Args:
        data: Table-like object whose ``'text'`` column supports ``.tolist()``.
        tokenizer: Callable invoked with the list of texts plus the padding,
            truncation, length and tensor-format options below.
        max_length: Upper bound on sequence length passed to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns for the batch of texts.
    """
    texts = data['text'].tolist()
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize each split.
train_encodings = encode_data(train_data, tokenizer)
val_encodings = encode_data(val_data, tokenizer)
test_encodings = encode_data(test_data, tokenizer)

# Labels -> int64 tensors: 'yes' becomes 1, every other value becomes 0.
train_labels = torch.tensor((train_data['y'] == 'yes').astype(int).tolist(), dtype=torch.long)
val_labels = torch.tensor((val_data['y'] == 'yes').astype(int).tolist(), dtype=torch.long)
test_labels = torch.tensor((test_data['y'] == 'yes').astype(int).tolist(), dtype=torch.long)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

### 평가지표

In [None]:
def compute_metrics(p):
    """Compute the positive-class F1 score and MCC for an evaluation pass.

    Args:
        p: A ``(predictions, label_ids)`` pair. ``predictions`` holds the
            per-class scores with classes on axis 1, or a tuple whose first
            element is that array.

    Returns:
        dict with ``'f1_label: 1'`` (F1 of class 1) and ``'mcc'`` (Matthews
        correlation coefficient), both as Python floats.
    """
    predictions, labels = p
    # Some models hand back a tuple of outputs; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=1)

    # Pin the label set so index 1 always refers to class 1, even when only one
    # class shows up in labels/preds (the unpinned call would then return a
    # single-element array and `[1]` would raise IndexError).
    f1_scores = f1_score(labels, preds, labels=[0, 1], average=None, zero_division=0)
    f1_class_1 = f1_scores[1]

    mcc = matthews_corrcoef(labels, preds)

    # No 'eval_loss' entry here: the mean of the raw class scores is not a loss.
    # The Trainer reports the real evaluation loss under that key itself.
    return {
        'f1_label: 1': float(f1_class_1),
        'mcc': float(mcc),
    }

### 데이터셋으로 변환

In [None]:
class CustomerDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable container and
    ``labels`` is an indexable, sized container. Sample ``i`` is built from
    position ``i`` of every field plus ``labels[i]``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label container defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split in a CustomerDataset.
# No manual shuffle of the training set: sklearn.utils.shuffle targets
# array-likes, and applied to a Dataset it indexes every sample into a plain
# Python list of per-row dicts. The Trainer's training DataLoader already
# draws samples in random order, so the extra pass only cost time and memory.
train_dataset = CustomerDataset(train_encodings, train_labels)
val_dataset = CustomerDataset(val_encodings, val_labels)
test_dataset = CustomerDataset(test_encodings, test_labels)

### class_weight 정의 및 커스텀 Trainer 정의

In [None]:
# Label names in class-index order (index 0 -> 'no', index 1 -> 'yes').
classes = np.array(['no', 'yes'])
# Hand-set loss weight per class index: errors on class 1 count 3x.
class_weights = [1.0, 3.0]
# Use the GPU when one is available; fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)


# FocalLoss class that was tried in place of class weights. It is disabled by
# being wrapped in a string literal; enabling it also requires changing the
# loss used inside the CustomTrainer class.
'''
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, labels):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(logits, labels)
        pt = torch.exp(-ce_loss)  # 확률 예측값
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss  # Focal Loss 공식

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss
'''

# Custom Trainer that replaces the default loss with class-weighted cross-entropy
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Mapping of model inputs; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword can call the override; unused because the loss
                is a per-batch mean.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.

        NOTE(review): this override computes the loss directly, so a
        ``label_smoothing_factor`` set on the training arguments does not
        appear to be applied here -- confirm whether that is intended.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model does not also compute its own
        # unweighted loss, which would be discarded anyway. `inputs` itself is
        # left unmodified.
        model_inputs = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Put the weights on the logits' device rather than relying on the
        # module-level device choice.
        loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Echo the per-class weights as a sanity check.
print("Class weights:", class_weights)

Class weights: [1.0, 3.0]


### 모델 하이퍼파라미터 설정 및 훈련

In [None]:
# Load pretrained ALBERT with a 2-class sequence-classification head.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Training configuration (same values as before, grouped by concern).
training_args = TrainingArguments(
    # Output and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Batching: 128 samples x 4 accumulation steps per optimizer update, per device
    per_device_train_batch_size=128,
    per_device_eval_batch_size=62,
    gradient_accumulation_steps=4,
    # Optimisation
    num_train_epochs=10,
    learning_rate=4e-5,
    lr_scheduler_type="linear",  # alternative tried: cosine
    warmup_steps=500,
    weight_decay=0.01,
    label_smoothing_factor=0.05,
    # Evaluate and checkpoint once per epoch; reload the lowest-eval-loss checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Build the trainer; training stops early after 3 evaluations without improvement.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Label: 1,Mcc
0,0.4701,0.45797,0.5,0.431888
1,0.4689,0.453122,0.492509,0.423208
2,0.4571,0.444869,0.501908,0.434134
4,0.4655,0.444506,0.510407,0.444411
5,0.4584,0.444871,0.509624,0.443264
6,0.4437,0.44221,0.508507,0.441727


TrainOutput(global_step=1797, training_loss=0.4709636984630366, metrics={'train_runtime': 5802.1401, 'train_samples_per_second': 226.416, 'train_steps_per_second': 0.441, 'total_flos': 6867642311712000.0, 'train_loss': 0.4709636984630366, 'epoch': 6.9990262901655305})

In [None]:
# Evaluate on the validation split and report the final metrics.
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(
    f"Validation Loss: {eval_results['eval_loss']}",
    f"Validation F1: {eval_results['eval_f1_label: 1']}",
    f"Validation MCC: {eval_results['eval_mcc']}",
    sep="\n",
)

Validation Loss: 0.44146978855133057
Validation F1: 0.5124423963133641
Validation MCC: 0.4464687002310551


In [None]:
# Evaluate on the held-out test split; metric keys keep the 'eval_' prefix.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(
    f"Test Loss: {test_results['eval_loss']}",
    f"Test F1: {test_results['eval_f1_label: 1']}",
    f"Test MCC: {test_results['eval_mcc']}",
    sep="\n",
)

Test Loss: 0.44197291135787964
Test F1: 0.508411214953271
Test MCC: 0.4416509073814116


### 모델 저장

In [None]:
save_path = '/content/drive/MyDrive/국민대학교/D&A Conference/data/데이터 최종/12data_1:3_modeling_model.pth'

# Save the weights as CPU tensors. Copying each tensor to CPU (instead of
# calling model.to('cpu'), which moves the live model in place) leaves the
# model on its current device, so it stays usable after this cell.
torch.save(
    {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()},
    save_path,
)

In [None]:
# Report the on-disk size of the saved checkpoint file.
!du -hsc "{save_path}"

45M	/content/drive/MyDrive/국민대학교/D&A Conference/data/데이터 최종/12data_1:3_modeling_model.pth
45M	total
