## Load Libraries

In [1]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split


# wandb 프로젝트 설정
# os.environ["WANDB_PROJECT"] = "Level2_data-centric-label-estimate"

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '../data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:
data = pd.read_csv(os.path.join(DATA_DIR, 'self_train/contrastive_ver1_step1_multi.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, random_state=SEED)

In [7]:
dataset_train

Unnamed: 0,ID,text,target
1040,ynat-v1_train_01040,에어로스페이스 아시아 인사이트 부인…분기 적자 축소,5
1599,ynat-v1_train_01599,민주일반연맹 비정규직 차별철폐 공동행동,1
194,ynat-v1_train_00194,서울자유시민대학 초대 학장에 정재권 전 한겨레신문 논설위원,1
2461,ynat-v1_train_02461,아베 개헌논의 안 해도 좋은지 선거서 물을 것…개헌 이슈화,1
1479,ynat-v1_train_01479,신간,0
...,...,...,...
2543,ynat-v1_train_02543,박기원 감독 눈치 보지 말고…비예나 눈치 본 건 아닌데,1
2090,ynat-v1_train_02090,성 베드로 대학 베트남 캠퍼스 개교…중국 가전 기업 동남아 공략,4
2649,ynat-v1_train_02649,아이팩토리 상장폐지 이의신청서 제출,1
613,ynat-v1_train_00613,LG전자 미국서 G6 사면 구글 홈 준다,1


In [8]:
dataset_valid

Unnamed: 0,ID,text,target
2711,ynat-v1_train_02711,신동빈 경영복귀 첫 일성 적극적 투자로 국가 경제 이바지종합,1
2156,ynat-v1_train_02156,나스닥 3개월만에 최저 폭락 종합,5
549,ynat-v1_train_00549,中공안 장쑤 화학공단 폭발참사 책임자들 구금,1
933,ynat-v1_train_00933,더위 식히는 장맛비…남부·제주도 밤에 대부분 그쳐,1
186,ynat-v1_train_00186,KB증권 농심 4분기 라면 부문 실적개선…목표주가↑,1
...,...,...,...
873,ynat-v1_train_00873,청소년 짙은 안경 착용 항공 회사의 운,0
1144,ynat-v1_train_01144,비례대표^^권마다^^복되는 논쟁...해법책^없나,3
1775,ynat-v1_train_01775,시즌 첫 골 손흥민 모든 상황 준비해 좋은 결과로 이어졌다,1
394,ynat-v1_train_00394,중국군 시리아 순찰…뒤이어,6


In [9]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt', return_token_type_ids=False)
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [10]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Contrastive Loss

In [12]:
def multi_class_contrastive_loss(embeddings, labels, margin=1.0):
    # 거리 계산
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)
    
    # 각 클래스의 긍정적 쌍과 부정적 쌍을 계산
    pos_loss = 0.0
    neg_loss = 0.0
    num_classes = 7
    for class_id in range(num_classes):  # num_classes는 전체 클래스 수
        # 긍정적 쌍: 현재 클래스의 임베딩
        pos_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) == class_id)
        pos_pairs = distance_matrix[pos_mask]
        # print("class_id = ",class_id,"pos_pairs", pos_pairs)
        pos_loss += pos_pairs.sum()  # 긍정적 쌍에 대한 손실

        # 부정적 쌍: 현재 클래스의 임베딩이 아닌 것
        neg_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) != class_id)
        neg_pairs = distance_matrix[neg_mask]
        neg_loss += F.relu(margin - neg_pairs).sum()  # 부정적 쌍에 대한 손실

    # 평균 손실 계산
    total_loss = (pos_loss / (pos_mask.sum() + 1e-6)) + (neg_loss / (neg_mask.sum() + 1e-6))
    
    return total_loss

# Trainer

In [13]:
class ContrastiveTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # 입력을 모델에 통과시키고 임베딩을 추출
        # outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], output_hidden_states=True)
        outputs = model(**inputs, output_hidden_states=True)
        logits = outputs.logits
        # embeddings = outputs.hidden_states[-1][:, 1:, :].mean(dim=1)  # [CLS]를 제외한 평균 임베딩
        # embeddings = outputs.hidden_states[-1].mean(dim=1)
        embeddings = outputs.hidden_states[-1][:, 0, :]  # [CLS] 임베딩
        # print("compute_loss(inputs) :::: ", inputs)
        print("compute_loss(embeddings) ::::", embeddings)
        # print("compute_loss(logits) :::: ", logits )
        labels = inputs.get("labels")

        # 대조 손실 계산
        # loss = contrastive_loss(embeddings, labels)
        loss = multi_class_contrastive_loss(embeddings, labels) 
        # print("loss-ContrastiveTrainer : ", loss)
        # loss_clamp = torch.clamp(loss, max=20)

        return (loss, outputs) if return_outputs else loss

## Define Metric

In [14]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


In [15]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print("Predictions::", predictions)  # 예측값 출력

    # print("Predictions shape:", prediction.shape)  # 예측값 출력
    # print("Labels shape:", labels.shape)            # 레이블 출력
    # print("Predictions[0] : ", predictions[0])
    print("Predictions[0][0] : ", predictions[0][0])
    print("Predictions[0][1] : ", predictions[0][1])
    print("type predictions[0][1] : ", type(predictions[0][1]))
    print("len(predictions[1]) : ", len(predictions[1]))
    # print("Predictions[1] : ", predictions[1])
    predictions = np.argmax(predictions[0], axis=1)
    print("Predictions argmax ::", predictions)  # 예측값 출력
    
    print("Labels::", labels)            # 레이블 출력
    metrics = f1.compute(predictions=predictions, references=labels, average='macro')
    print("Computed metrics:", metrics)  # 계산된 메트릭 출력

    return metrics

## Train Model

In [16]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [17]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR+"/contrastive_ver1_step2",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=50,
    save_steps=350,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

In [18]:
trainer = ContrastiveTrainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


compute_loss(embeddings) :::: tensor([[ 0.5649, -0.7605,  0.5872,  ..., -0.2095, -1.0537,  0.2122],
        [-0.4860,  0.6894, -0.3407,  ..., -0.0779, -0.9592,  0.9828],
        [-0.0970, -1.2212,  0.7742,  ..., -2.4614, -1.6063,  0.1938],
        ...,
        [-1.6097, -0.8362, -0.4895,  ..., -0.6628, -1.2558,  0.6308],
        [-0.9141, -0.3129, -0.8909,  ..., -0.6258, -0.8752,  1.1629],
        [-0.3520, -1.4728, -0.9667,  ..., -1.3288, -1.1729, -0.5008]],
       device='cuda:0', grad_fn=<SliceBackward0>)
compute_loss(embeddings) :::: tensor([[-0.7048, -0.6835, -1.1906,  ..., -0.1505, -0.7902,  0.4441],
        [-0.3045, -0.1296, -0.4870,  ..., -0.1429, -0.5141, -0.8573],
        [ 0.2335,  0.2850, -0.6048,  ..., -0.3927, -0.9117,  0.3440],
        ...,
        [ 0.0690, -1.6204, -1.3426,  ..., -0.6698, -1.2513, -0.1908],
        [ 0.7391, -0.9000, -0.9082,  ...,  0.3295, -0.7247, -0.7030],
        [ 0.2717, -0.4802, -0.6515,  ...,  0.0042, -0.7416, -0.0752]],
       device='cuda:0'

Step,Training Loss,Validation Loss,F1
50,No log,11701172.0,0.091775
100,42845132.800000,14048970.0,0.091775
150,42845132.800000,15171713.0,0.091775
200,11579627.520000,15531454.0,0.091775
250,11579627.520000,15586308.0,0.091775
300,10121146.880000,15572484.0,0.091775
350,10121146.880000,15536219.0,0.091775


compute_loss(embeddings) :::: tensor([[-0.4638, -0.0912, -0.5899,  ...,  0.0124, -2.0367,  0.6780],
        [ 0.1962, -0.4925,  0.5105,  ..., -0.0673, -0.8647,  0.3757],
        [ 0.5860, -0.8879,  0.2918,  ..., -0.3863,  0.1104,  0.6815],
        ...,
        [ 0.0386, -0.4080, -0.5852,  ..., -0.0836, -1.2761, -0.2352],
        [ 0.7410, -1.4769, -0.5602,  ...,  0.0267, -0.8232,  0.1352],
        [-0.9839, -1.0233, -0.9057,  ..., -0.9175, -0.7389,  0.2307]],
       device='cuda:0', grad_fn=<SliceBackward0>)
compute_loss(embeddings) :::: tensor([[-0.0586, -0.5106, -0.5799,  ...,  0.0391, -1.3535,  0.5903],
        [ 0.6494, -0.6674, -0.3191,  ..., -0.4002, -0.5756, -0.0187],
        [-0.2457, -0.6176, -0.8403,  ...,  0.5093, -0.8911,  0.5601],
        ...,
        [ 0.8292, -1.0140, -0.3589,  ...,  0.6133, -0.8316,  0.3191],
        [-0.4714, -1.0781, -0.6193,  ...,  0.0691, -1.7980,  1.4779],
        [ 0.4463, -1.8912, -1.3018,  ...,  0.0363, -1.2924, -0.2045]],
       device='cuda:0'

TrainOutput(global_step=350, training_loss=19903977.508571427, metrics={'train_runtime': 473.1689, 'train_samples_per_second': 23.67, 'train_steps_per_second': 0.74, 'total_flos': 2946976112640000.0, 'train_loss': 19903977.508571427, 'epoch': 5.0})

In [20]:
labeled_data_path = os.path.join(DATA_DIR, "recovery_data")

# noise_list = []
# for label in range(7) :
#     df = pd.read_csv(labeled_data_path+f"/c1_label_dropped_{label}.csv")
#     noise_list.append(df)

# noise_df = pd.concat(noise_list, ignore_index=True)
df = pd.read_csv(labeled_data_path+f"/converted_train_ver2_fewshot.csv")
noise_df = df[df['is_noisy'] == 1]

In [21]:
ids = noise_df['ID'].values
ids

array(['ynat-v1_train_00000', 'ynat-v1_train_00001',
       'ynat-v1_train_00002', ..., 'ynat-v1_train_02790',
       'ynat-v1_train_02792', 'ynat-v1_train_02797'], dtype=object)

In [22]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'self_train/contrastive_ver1_step1_multi.csv'))

model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+"/contrastive_ver1_step2/checkpoint-350").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']

    if target_id in ids :
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

new_train.to_csv(f'../data/self_train/contrastive_ver1_step2_multi.csv', index=False)

Evaluating: 100%|██████████| 2800/2800 [00:11<00:00, 239.33it/s]


## Evaluate Model