## Load Libraries

In [1]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split


# wandb 프로젝트 설정

# os.environ["WANDB_API_KEY"]="6ee5e8ab1b91bbcf8be3098caaec592b8a6682c4"
# os.environ["WANDB_PROJECT"] = "level2_data-centric-contrastive-relabeling"

# # # wandb 초기화
# # # wandb.finish()

# wandb.init()


## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [8]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

# labeled_data_path = os.path.join(DATA_DIR, "cleaning_step1")

## Load Tokenizer and Model

In [40]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Define Dataset

In [10]:

train_df = pd.read_csv(DATA_DIR+f"/Self_train/self_train_step2_cleaned.csv")
#train_df = df[df['is_noisy'] == 1]
dataset_train, dataset_valid = train_test_split(train_df, test_size=0.2, random_state=SEED)

In [11]:
dataset_train

Unnamed: 0,ID,text,target
1916,ynat-v1_train_01944,추미애 검언유착 책임론 정면돌파…윤석열 고립·이성윤 신임종합,2
2425,ynat-v1_train_02457,게시판 삼성카드·준오뷰티 업무제휴,5
1076,ynat-v1_train_01094,김학민 가5성 보인 8득점…KB손}보험 컵대회y0a경기g승리,1
909,ynat-v1_train_00923,"전국 천여 어촌마을, 대규모 인명 피해 발생",4
1006,ynat-v1_train_01022,0용 공고 보는XK직자들,3
...,...,...,...
2543,ynat-v1_train_02575,침수 공사 기준 마련...정부 부담 줄이기 위한 방안 발표,5
2090,ynat-v1_train_02120,벨·뮬로바·마이스키…클래식 스타 일주일 간격 줄내한,1
2649,ynat-v1_train_02683,"전 대통령, 협력업체와 함께 사업 영위",4
613,ynat-v1_train_00621,프란치스코 교황 오는 8일 난민·난민 조력자 위해 미사,6


In [12]:
dataset_valid

Unnamed: 0,ID,text,target
1299,ynat-v1_train_01318,"해경, 해상 안전 강화 위해 박 척 센 본 도입",6
1329,ynat-v1_train_01348,금의환향한 류현진 추신수와 같은 팀 특별할 것 같다,1
2674,ynat-v1_train_02708,"文, 軍 기동훈련 참관…종합 훈련도 ""접종",2
593,ynat-v1_train_00601,"강정호, 매츠 상대로 홈런 쏴올리며 별처럼 빛나다!",6
1738,ynat-v1_train_01764,"韓獨, 플랜트 부품 공급에 협력…독일 업체와 기본 계약 체결",3
...,...,...,...
1016,ynat-v1_train_01033,"삼성, 장년층을 위한 특별한 폴더형 스마트폰을 출시하다",4
1573,ynat-v1_train_01594,"리우올림픽 홀로그램 공연, 화려한 무대 선보여",4
194,ynat-v1_train_00197,"남북 정상회담 준비, 또다시 리허설에 돌입하다",0
2431,ynat-v1_train_02463,"게시터, 기적의 힘",3


In [13]:
train_counts = dataset_train['target'].value_counts().to_dict()
valid_counts = dataset_valid['target'].value_counts().to_dict()

# 결과 출력
print(train_counts)
print(valid_counts)

{3: 344, 6: 334, 2: 328, 4: 307, 1: 306, 0: 305, 5: 287}
{1: 89, 0: 84, 4: 83, 2: 79, 6: 77, 3: 77, 5: 64}


In [14]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [15]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Contrastive loss

In [16]:
# # 대조 손실 함수 정의
# def contrastive_loss(embeddings, labels, margin=1.0):
#     distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # 샘플 간 거리 계산
#     positive_pairs = (labels.unsqueeze(1) == labels.unsqueeze(0)).float()  # 긍정적 쌍
#     negative_pairs = 1 - positive_pairs  # 부정적 쌍

#     loss = (positive_pairs * (distance_matrix ** 2)).sum() + \
#            (negative_pairs * F.relu(margin - distance_matrix)).sum()

#     return loss / (embeddings.shape[0] ** 2)  # 평균 손실 반환

In [69]:
# 대조 손실 함수 정의
def contrastive_loss(embeddings, labels, margin=1.0):
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # 샘플 간 거리 계산
    positive_pairs = (labels.unsqueeze(1) == labels.unsqueeze(0)).float()  # 긍정적 쌍
    negative_pairs = 1 - positive_pairs  # 부정적 쌍

    # 손실 계산
    pos_loss = (positive_pairs * (distance_matrix ** 2)).sum()  # 긍정적 손실
    neg_loss = (negative_pairs * F.relu(margin - distance_matrix)).sum()  # 부정적 손실

    # 긍정적 쌍과 부정적 쌍의 수
    num_positive_pairs = positive_pairs.sum() + 1e-6  # 0으로 나누는 것을 방지
    num_negative_pairs = negative_pairs.sum() + 1e-6

    # 평균 손실 계산
    total_loss = (pos_loss / num_positive_pairs) + (neg_loss / num_negative_pairs)
    
    return total_loss

In [None]:
def multi_class_contrastive_loss(embeddings, labels, margin=1.0):
    # 거리 계산
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)
    
    # 각 클래스의 긍정적 쌍과 부정적 쌍을 계산
    pos_loss = 0.0
    neg_loss = 0.0
    num_classes = 7
    for class_id in range(num_classes):  # num_classes는 전체 클래스 수
        # 긍정적 쌍: 현재 클래스의 임베딩
        pos_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) == class_id)
        pos_pairs = distance_matrix[pos_mask]
        # print("class_id = ",class_id,"pos_pairs", pos_pairs)
        pos_loss += pos_pairs.sum()  # 긍정적 쌍에 대한 손실

        # 부정적 쌍: 현재 클래스의 임베딩이 아닌 것
        neg_mask = (labels.unsqueeze(1) == class_id) & (labels.unsqueeze(0) != class_id)
        neg_pairs = distance_matrix[neg_mask]
        neg_loss += F.relu(margin - neg_pairs).sum()  # 부정적 쌍에 대한 손실

    # 평균 손실 계산
    total_loss = (pos_loss / (pos_mask.1esum() + -6)) + (neg_loss / (neg_mask.sum() + 1e-6))
    
    return total_loss

# Trainer

In [None]:
class ContrastiveTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # 입력을 모델에 통과시키고 임베딩을 추출
        # outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], output_hidden_states=True)
        outputs = model(**inputs, output_hidden_states=True)
        logits = outputs.logits
        # embeddings = outputs.hidden_states[-1][:, 1:, :].mean(dim=1)  # [CLS]를 제외한 평균 임베딩
        # embeddings = outputs.hidden_states[-1].mean(dim=1)
        embeddings = outputs.hidden_states[-1][:, 0, :]  # [CLS] 임베딩
        # print("compute_loss(inputs) :::: ", inputs)
        print("compute_loss(embeddings) ::::", embeddings)
        # print("compute_loss(logits) :::: ", logits )
        labels = inputs.get("labels")

        # 대조 손실 계산
        # loss = contrastive_loss(embeddings, labels)
        loss = multi_class_contrastive_loss(embeddings, labels) 
        # print("loss-ContrastiveTrainer : ", loss)
        # loss_clamp = torch.clamp(loss, max=20)

        return (loss, outputs) if return_outputs else loss

In [None]:

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
      # 모델의 출력을 얻기 위해 입력을 통과시킵니다.
      outputs = model(**inputs, output_hidden_states = True)
      print("logits : ", outputs.logits)
      print("logit shape : ", outputs.logits.shape)
      print("hidden_states : ", outputs.hidden_states)
      print("hidden_states shape : ", type(outputs.hidden_states))
      # 모델의 logits를 가져옵니다.
      logits = outputs.logits
      # print("compute_loss(logits) :::: ", outputs.logits )
      # 레이블을 가져옵니다.
      labels = inputs.get("labels")
      
      # CrossEntropyLoss를 사용하여 손실을 계산합니다.
      loss_fct = torch.nn.CrossEntropyLoss()

      loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
      
      return (loss, outputs) if return_outputs else loss

## Define Metric

In [25]:
"""f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print("Predictions::", predictions)  # 예측값 출력

    # print("Predictions shape:", prediction.shape)  # 예측값 출력
    # print("Labels shape:", labels.shape)            # 레이블 출력
    # print("Predictions[0] : ", predictions[0])
    print("Predictions[0][0] : ", predictions[0][0])
    print("Predictions[0][1] : ", predictions[0][1])
    print("type predictions[0][1] : ", type(predictions[0][1]))
    print("len(predictions[1]) : ", len(predictions[1]))
    # print("Predictions[1] : ", predictions[1])
    predictions = np.argmax(predictions[0], axis=1)
    print("Predictions argmax ::", predictions)  # 예측값 출력
    
    print("Labels::", labels)            # 레이블 출력
    metrics = f1.compute(predictions=predictions, references=labels, average='macro')
    print("Computed metrics:", metrics)  # 계산된 메트릭 출력

    return metrics
"""
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


In [None]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print("Predictions::", predictions)  # 예측값 출력
    (1,)
    [[]]
    # print("Predictions shape:", prediction.shape)  # 예측값 출력
    # print("Labels shape:", labels.shape)            # 레이블 출력
    # print("Predictions[0] : ", predictions[0])
    # print("Predictions[0][0] : ", predictions[0][0])
    # print("Predictions[0][1] : ", predictions[0][1])
    # print("type predictions[0][1] : ", type(predictions[0][1]))
    # print("len(predictions[1]) : ", len(predictions[1]))
    # print("Predictions[1] : ", predictions[1])
    predictions = np.argmax(predictions, axis=1)
    # print("Predictions argmax ::", predictions)  # 예측값 출력
    print("Labels::", labels)            # 레이블 출력
    metrics = f1.compute(predictions=predictions, references=labels, average='macro')
    print("Computed metrics:", metrics)  # 계산된 메트릭 출력

    return metrics


## Train Model

In [20]:
### for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [26]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR +'/contrastive',
    logging_dir='./logs',  
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=50,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [38]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [39]:
trainer.train()

logits :  tensor([[ 0.0159,  4.0113,  0.0683, -0.8905, -1.1357, -0.2976, -0.9804],
        [-0.9285, -1.0467, -1.2017,  0.1001,  3.9215,  0.8452, -1.5283],
        [-0.5002,  4.3545, -0.4818, -0.3963, -0.7467, -0.0851, -0.6304],
        [ 0.2322, -1.1572,  0.6532,  0.5852,  1.0632, -0.2661, -1.7763],
        [-1.0700, -0.5900, -1.0758, -0.5997, -0.4257, -1.0013,  4.4360],
        [ 3.5953, -0.4885, -1.1677,  0.2331,  0.2215, -0.6664, -1.7606],
        [-0.3837, -0.9651, -0.7500,  3.6271, -0.7444,  0.6269, -1.4789],
        [ 0.0859, -1.4922,  3.2040, -0.1455, -0.5197,  0.4980, -1.8349],
        [-0.9912, -1.4352, -0.0387,  3.4754, -0.0097,  0.0353, -1.6788],
        [-0.3423,  4.2387, -0.3018, -1.2998, -0.1973, -0.8227, -0.3407],
        [-1.1745, -0.5888,  3.3184,  0.8857, -0.9793, -0.7048,  0.5192],
        [ 1.2230, -0.1801, -1.1030, -0.9449, -0.2124, -0.5126,  2.2820],
        [-0.1688,  3.1505, -0.4608, -0.8064,  0.2805, -0.7743, -1.2042],
        [-0.2191,  4.0080, -0.2304, -1.38

ValueError: only one element tensors can be converted to Python scalars

## Re-labeling Train dataset

In [137]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [138]:
ids = train_df['ID'].values
ids

array(['ynat-v1_train_00000', 'ynat-v1_train_00001',
       'ynat-v1_train_00002', ..., 'ynat-v1_train_02790',
       'ynat-v1_train_02792', 'ynat-v1_train_02797'], dtype=object)

In [139]:
model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+'/contrastive_ver1_step1' + "/checkpoint-400").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']
    if target_id in ids :
        cleaned_text = train_df.loc[train_df['ID'] == sample['ID']]['text'].values[0]
        new_train.loc[new_train['ID'] == target_id, 'text'] = cleaned_text
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

Evaluating: 100%|██████████| 2800/2800 [00:12<00:00, 227.35it/s]


In [140]:
new_train.to_csv(f'../data/self_train/contrastive_ver1_step1_multi.csv', index=False)

In [141]:
new_train.head()

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,정파 미사일 이용기간 단 1분종 1보,4
1,ynat-v1_train_00001,찰스 국 회장 ^로한^ 송,3
2,ynat-v1_train_00002,北 김정은 자주통일 새 시대 열어 나가야 1보,2
3,ynat-v1_train_00003,갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩,1
4,ynat-v1_train_00004,미국 대선 앞두고 중국 단발이 비해 감시 강화,6
