## Load Libraries

In [2]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split


# wandb 프로젝트 설정

# os.environ["WANDB_API_KEY"]="6ee5e8ab1b91bbcf8be3098caaec592b8a6682c4"
# os.environ["WANDB_PROJECT"] = "level2_data-centric-contrastive-relabeling"

# # # wandb 초기화
# # # wandb.finish()

# wandb.init()


## Set Hyperparameters

In [3]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [4]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [5]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '../data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

# labeled_data_path = os.path.join(DATA_DIR, "cleaning_step1")

## Load Tokenizer and Model

In [6]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [8]:

df = pd.read_csv(DATA_DIR+f"/recovery_data/converted_train_ver2_fewshot.csv")
train_df = df[df['is_noisy'] == 1]
dataset_train, dataset_valid = train_test_split(train_df, test_size=0.2, random_state=SEED)

In [9]:
dataset_train

Unnamed: 0,ID,text,target,ascii_ratio,is_noisy
766,ynat-v1_train_00766,"T타임즈, 최고의 음식 애호가들을 위한 스마트폰 앱",4,0.451613,1
1010,ynat-v1_train_01010,노키리아·피씨손민…겨울은 베트남 연휴,0,0.566667,1
950,ynat-v1_train_00950,부처님 상원 공식 표창 받,6,0.562500,1
592,ynat-v1_train_00592,에어프라이어 업력 높아져 이지로 바꿔 연결,4,0.560000,1
2676,ynat-v1_train_02676,클래식 공연 즐겨 낮도 자…영화의 변신,0,0.240000,1
...,...,...,...,...,...
1573,ynat-v1_train_01573,여경투톱 체제 반 평…민주적재 한국 우선,2,0.387097,1
846,ynat-v1_train_00846,슬기로운 해양 경에 난민 대응 군 배치,6,0.428571,1
65,ynat-v1_train_00065,북 리수이 국제사회 임시 일원 되는데 기대감,2,0.500000,1
1016,ynat-v1_train_01016,오바마 폭스마켓 사^^ 대규모 감^^...누적 ^자^^^,6,0.583333,1


In [10]:
dataset_valid

Unnamed: 0,ID,text,target,ascii_ratio,is_noisy
2001,ynat-v1_train_02001,노동자 한낮 초록 더 미세·오존 위험,0,0.458333,1
1098,ynat-v1_train_01098,주택금융^^카^^뱅크 택금융 개발 협력,5,0.375000,1
2046,ynat-v1_train_02046,책을 쓸 때 비어 있는 페이지는 ^맞아요,0,0.483871,1
958,ynat-v1_train_00958,러시아 정식 제안 온면 푸틴 만나게 될 것,6,0.392857,1
329,ynat-v1_train_00329,김정은 평양 제2래과학자거리 착공 지시,2,0.307692,1
...,...,...,...,...,...
231,ynat-v1_train_00231,드라마 쉽게 열리지 않는 멕시코 골목,1,0.400000,1
646,ynat-v1_train_00646,삼성전자 설계전문가,4,0.454545,1
475,ynat-v1_train_00475,콘텐츠 왜 나한텐 페이스북은 유명하지,4,0.321429,1
527,ynat-v1_train_00527,롯데건설·폐수 슬러지 건조기술 환경에서 녹색인증,5,0.250000,1


In [11]:
train_counts = dataset_train['target'].value_counts().to_dict()
valid_counts = dataset_valid['target'].value_counts().to_dict()

# 결과 출력
print(train_counts)
print(valid_counts)

{3: 196, 1: 188, 0: 186, 4: 185, 5: 182, 2: 181, 6: 178}
{4: 56, 6: 50, 0: 45, 1: 45, 2: 45, 5: 44, 3: 39}


In [12]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [13]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
dataset_train['target']

766     4
1010    0
950     6
592     4
2676    0
       ..
1573    2
846     6
65      2
1016    6
1033    4
Name: target, Length: 1296, dtype: int64

# Contrastive loss

In [16]:
# # 대조 손실 함수 정의
# def contrastive_loss(embeddings, labels, margin=1.0):
#     distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # 샘플 간 거리 계산
#     positive_pairs = (labels.unsqueeze(1) == labels.unsqueeze(0)).float()  # 긍정적 쌍
#     negative_pairs = 1 - positive_pairs  # 부정적 쌍

#     loss = (positive_pairs * (distance_matrix ** 2)).sum() + \
#            (negative_pairs * F.relu(margin - distance_matrix)).sum()

#     return loss / (embeddings.shape[0] ** 2)  # 평균 손실 반환

In [69]:
# 대조 손실 함수 정의
def contrastive_loss(embeddings, labels, margin=1.0):
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # 샘플 간 거리 계산
    positive_pairs = (labels.unsqueeze(1) == labels.unsqueeze(0)).float()  # 긍정적 쌍
    negative_pairs = 1 - positive_pairs  # 부정적 쌍

    # 손실 계산
    pos_loss = (positive_pairs * (distance_matrix ** 2)).sum()  # 긍정적 손실
    neg_loss = (negative_pairs * F.relu(margin - distance_matrix)).sum()  # 부정적 손실

    # 긍정적 쌍과 부정적 쌍의 수
    num_positive_pairs = positive_pairs.sum() + 1e-6  # 0으로 나누는 것을 방지
    num_negative_pairs = negative_pairs.sum() + 1e-6

    # 평균 손실 계산
    total_loss = (pos_loss / num_positive_pairs) + (neg_loss / num_negative_pairs)
    
    return total_loss

In [None]:
def multi_class_contrastive_loss(embeddings, labels, margin=1.0):
    # 거리 계산
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)
    
    # 각 클래스의 긍정적 쌍과 부정적 쌍을 계산
    pos_loss = 0.0
    neg_loss = 0.0
    num_classes = 7
    for class_id in range(num_classes):  # num_classes는 전체 클래스 수
        # 긍정적 쌍: 현재 클래스의 임베딩
        pos_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) == class_id)
        pos_pairs = distance_matrix[pos_mask]
        # print("class_id = ",class_id,"pos_pairs", pos_pairs)
        pos_loss += pos_pairs.sum()  # 긍정적 쌍에 대한 손실

        # 부정적 쌍: 현재 클래스의 임베딩이 아닌 것
        neg_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) != class_id)
        neg_pairs = distance_matrix[neg_mask]
        neg_loss += F.relu(margin - neg_pairs).sum()  # 부정적 쌍에 대한 손실

    # 평균 손실 계산
    total_loss = (pos_loss / (pos_mask.1esum() + -6)) + (neg_loss / (neg_mask.sum() + 1e-6))
    
    return total_loss

# Trainer

In [None]:
class ContrastiveTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # 입력을 모델에 통과시키고 임베딩을 추출
        # outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], output_hidden_states=True)
        outputs = model(**inputs, output_hidden_states=True)
        logits = outputs.logits
        # embeddings = outputs.hidden_states[-1][:, 1:, :].mean(dim=1)  # [CLS]를 제외한 평균 임베딩
        # embeddings = outputs.hidden_states[-1].mean(dim=1)
        embeddings = outputs.hidden_states[-1][:, 0, :]  # [CLS] 임베딩
        # print("compute_loss(inputs) :::: ", inputs)
        print("compute_loss(embeddings) ::::", embeddings)
        # print("compute_loss(logits) :::: ", logits )
        labels = inputs.get("labels")

        # 대조 손실 계산
        # loss = contrastive_loss(embeddings, labels)
        loss = multi_class_contrastive_loss(embeddings, labels) 
        # print("loss-ContrastiveTrainer : ", loss)
        # loss_clamp = torch.clamp(loss, max=20)

        return (loss, outputs) if return_outputs else loss

In [None]:

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
      # 모델의 출력을 얻기 위해 입력을 통과시킵니다.
      outputs = model(**inputs)
      
      # 모델의 logits를 가져옵니다.
      logits = outputs.logits
      # print("compute_loss(logits) :::: ", outputs.logits )
      # 레이블을 가져옵니다.
      labels = inputs.get("labels")

      # CrossEntropyLoss를 사용하여 손실을 계산합니다.
      loss_fct = torch.nn.CrossEntropyLoss()

      loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
      print("loss-customtrainer : ", loss)
      return (loss, outputs) if return_outputs else loss

## Define Metric

In [206]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print("Predictions::", predictions)  # 예측값 출력

    # print("Predictions shape:", prediction.shape)  # 예측값 출력
    # print("Labels shape:", labels.shape)            # 레이블 출력
    # print("Predictions[0] : ", predictions[0])
    print("Predictions[0][0] : ", predictions[0][0])
    print("Predictions[0][1] : ", predictions[0][1])
    print("type predictions[0][1] : ", type(predictions[0][1]))
    print("len(predictions[1]) : ", len(predictions[1]))
    # print("Predictions[1] : ", predictions[1])
    predictions = np.argmax(predictions[0], axis=1)
    print("Predictions argmax ::", predictions)  # 예측값 출력
    
    print("Labels::", labels)            # 레이블 출력
    metrics = f1.compute(predictions=predictions, references=labels, average='macro')
    print("Computed metrics:", metrics)  # 계산된 메트릭 출력

    return metrics


In [None]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print("Predictions::", predictions)  # 예측값 출력
    (1,)
    [[]]
    # print("Predictions shape:", prediction.shape)  # 예측값 출력
    # print("Labels shape:", labels.shape)            # 레이블 출력
    # print("Predictions[0] : ", predictions[0])
    # print("Predictions[0][0] : ", predictions[0][0])
    # print("Predictions[0][1] : ", predictions[0][1])
    # print("type predictions[0][1] : ", type(predictions[0][1]))
    # print("len(predictions[1]) : ", len(predictions[1]))
    # print("Predictions[1] : ", predictions[1])
    predictions = np.argmax(predictions, axis=1)
    # print("Predictions argmax ::", predictions)  # 예측값 출력
    print("Labels::", labels)            # 레이블 출력
    metrics = f1.compute(predictions=predictions, references=labels, average='macro')
    print("Computed metrics:", metrics)  # 계산된 메트릭 출력

    return metrics


## Train Model

In [193]:
### for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [227]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR +'/contrastive_ver1_step1',
    logging_dir='./logs',  
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=50,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [228]:
trainer = ContrastiveTrainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [229]:
trainer.train()

compute_loss(embeddings) :::: tensor([[ 0.1941, -0.3322, -0.0420,  ..., -0.1407, -0.0579,  0.2699],
        [ 0.2096, -0.3080, -0.0259,  ..., -0.1491, -0.0122,  0.2819],
        [ 0.2319, -0.3333, -0.0465,  ..., -0.1151, -0.1438,  0.2689],
        ...,
        [ 0.2519, -0.2791, -0.0172,  ..., -0.1796,  0.0138,  0.2855],
        [ 0.2484, -0.2684, -0.0223,  ..., -0.1657,  0.0181,  0.2921],
        [ 0.2539, -0.3408, -0.0402,  ..., -0.0893, -0.1582,  0.2762]],
       device='cuda:0', grad_fn=<SliceBackward0>)


NameError: name 'loss_clamp' is not defined

## Re-labeling Train dataset

In [137]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [138]:
ids = train_df['ID'].values
ids

array(['ynat-v1_train_00000', 'ynat-v1_train_00001',
       'ynat-v1_train_00002', ..., 'ynat-v1_train_02790',
       'ynat-v1_train_02792', 'ynat-v1_train_02797'], dtype=object)

In [139]:
model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+'/contrastive_ver1_step1' + "/checkpoint-400").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']
    if target_id in ids :
        cleaned_text = train_df.loc[train_df['ID'] == sample['ID']]['text'].values[0]
        new_train.loc[new_train['ID'] == target_id, 'text'] = cleaned_text
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

Evaluating: 100%|██████████| 2800/2800 [00:12<00:00, 227.35it/s]


In [140]:
new_train.to_csv(f'../data/self_train/contrastive_ver1_step1_multi.csv', index=False)

In [141]:
new_train.head()

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,정파 미사일 이용기간 단 1분종 1보,4
1,ynat-v1_train_00001,찰스 국 회장 ^로한^ 송,3
2,ynat-v1_train_00002,北 김정은 자주통일 새 시대 열어 나가야 1보,2
3,ynat-v1_train_00003,갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩,1
4,ynat-v1_train_00004,미국 대선 앞두고 중국 단발이 비해 감시 강화,6
