## Load Libraries

In [1]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split


# wandb 프로젝트 설정
os.environ["WANDB_PROJECT"] = "P3_relabeling"
# wandb 초기화
wandb.init()


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [7]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

labeled_data_path = os.path.join(DATA_DIR, "cleaning_step1")

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [9]:
df_list = []
for label in range(7) :
    df = pd.read_csv(labeled_data_path+f"/c1_label_dropped_{label}.csv")
    df_list.append(df)

merged_df = pd.concat(df_list, ignore_index=True)
dataset_train, dataset_valid = train_test_split(merged_df, test_size=0.2, random_state=SEED)

In [10]:
train_counts = dataset_train['target'].value_counts().to_dict()
valid_counts = dataset_valid['target'].value_counts().to_dict()

# 결과 출력
print(train_counts)
print(valid_counts)

{1: 171, 6: 169, 4: 169, 3: 161, 2: 156, 5: 154, 0: 153}
{4: 52, 0: 46, 5: 43, 6: 41, 1: 40, 2: 34, 3: 28}


In [11]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [12]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [14]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [None]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [15]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
trainer.train()



Step,Training Loss,Validation Loss,F1
50,No log,1.474269,0.56941
100,2.925400,1.183434,0.624261
150,2.925400,1.167015,0.627026
200,1.529700,1.208315,0.616534
250,1.529700,1.210955,0.629904
300,0.886000,1.262557,0.616242
350,0.886000,1.270817,0.617404


TrainOutput(global_step=350, training_loss=1.6231160191127232, metrics={'train_runtime': 354.4273, 'train_samples_per_second': 31.967, 'train_steps_per_second': 0.988, 'total_flos': 2939871795225600.0, 'train_loss': 1.6231160191127232, 'epoch': 9.859154929577464})

## Re-labeling Train dataset

In [18]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [None]:
ids = merged_df['ID'].values
ids

array(['ynat-v1_train_00015', 'ynat-v1_train_00066',
       'ynat-v1_train_00079', ..., 'ynat-v1_train_02742',
       'ynat-v1_train_02764', 'ynat-v1_train_02787'], dtype=object)

: 

In [58]:
model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+"/checkpoint-350").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']
    if target_id in ids :
        cleaned_text = merged_df.loc[merged_df['ID'] == sample['ID']]['text'].values[0]
        new_train.loc[new_train['ID'] == target_id, 'text'] = cleaned_text
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

Evaluating: 100%|██████████| 2800/2800 [00:23<00:00, 119.54it/s]


In [59]:
new_train.to_csv(f'./data/Self_train/self_train_step1.csv', index=False)

In [60]:
new_train.head()

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,KT 이용기간 2분 종료될 예정입니다.,4
1,ynat-v1_train_00001,K찰국대통령이 로L한 회장에게 2시간 동안 면담을 요청했다,3
2,ynat-v1_train_00002,"김정은, 자주통일 새해 메시지 발표",2
3,ynat-v1_train_00003,갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩,4
4,ynat-v1_train_00004,미 대선 앞두고 R2F 단 발 비해 감시 강화,6
