## Load Libraries

In [1]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split


# wandb 프로젝트 설정
os.environ["WANDB_PROJECT"] = "Level2_data-centric-label-estimate"
# wandb 초기화
wandb.init()


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjilp1598[0m ([33mjilp1598-dd[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '../data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

# labeled_data_path = os.path.join(DATA_DIR, "cleaning_step1")

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:

df = pd.read_csv(DATA_DIR+f"/recovery_data/converted_train_ver2_fewshot.csv")
train_df = df[df['is_noisy'] == 1]
dataset_train, dataset_valid = train_test_split(train_df, test_size=0.2, random_state=SEED)

In [7]:
dataset_train

Unnamed: 0,ID,text,target,ascii_ratio,is_noisy
766,ynat-v1_train_00766,"T타임즈, 최고의 음식 애호가들을 위한 스마트폰 앱",4,0.451613,1
1010,ynat-v1_train_01010,노키리아·피씨손민…겨울은 베트남 연휴,0,0.566667,1
950,ynat-v1_train_00950,부처님 상원 공식 표창 받,6,0.562500,1
592,ynat-v1_train_00592,에어프라이어 업력 높아져 이지로 바꿔 연결,4,0.560000,1
2676,ynat-v1_train_02676,클래식 공연 즐겨 낮도 자…영화의 변신,0,0.240000,1
...,...,...,...,...,...
1573,ynat-v1_train_01573,여경투톱 체제 반 평…민주적재 한국 우선,2,0.387097,1
846,ynat-v1_train_00846,슬기로운 해양 경에 난민 대응 군 배치,6,0.428571,1
65,ynat-v1_train_00065,북 리수이 국제사회 임시 일원 되는데 기대감,2,0.500000,1
1016,ynat-v1_train_01016,오바마 폭스마켓 사^^ 대규모 감^^...누적 ^자^^^,6,0.583333,1


In [8]:
dataset_valid

Unnamed: 0,ID,text,target,ascii_ratio,is_noisy
2001,ynat-v1_train_02001,노동자 한낮 초록 더 미세·오존 위험,0,0.458333,1
1098,ynat-v1_train_01098,주택금융^^카^^뱅크 택금융 개발 협력,5,0.375000,1
2046,ynat-v1_train_02046,책을 쓸 때 비어 있는 페이지는 ^맞아요,0,0.483871,1
958,ynat-v1_train_00958,러시아 정식 제안 온면 푸틴 만나게 될 것,6,0.392857,1
329,ynat-v1_train_00329,김정은 평양 제2래과학자거리 착공 지시,2,0.307692,1
...,...,...,...,...,...
231,ynat-v1_train_00231,드라마 쉽게 열리지 않는 멕시코 골목,1,0.400000,1
646,ynat-v1_train_00646,삼성전자 설계전문가,4,0.454545,1
475,ynat-v1_train_00475,콘텐츠 왜 나한텐 페이스북은 유명하지,4,0.321429,1
527,ynat-v1_train_00527,롯데건설·폐수 슬러지 건조기술 환경에서 녹색인증,5,0.250000,1


In [9]:
train_counts = dataset_train['target'].value_counts().to_dict()
valid_counts = dataset_valid['target'].value_counts().to_dict()

# 결과 출력
print(train_counts)
print(valid_counts)

{3: 196, 1: 188, 0: 186, 4: 185, 5: 182, 2: 181, 6: 178}
{4: 56, 6: 50, 0: 45, 1: 45, 2: 45, 5: 44, 3: 39}


In [10]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [11]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [13]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [14]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [15]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR +'/self_train_ver2_step1',
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()



Step,Training Loss,Validation Loss,F1
50,No log,1.468755,0.514511
100,2.904000,1.283719,0.578784
150,2.904000,1.240349,0.586364
200,1.544000,1.281804,0.590322
250,1.544000,1.329944,0.568266
300,0.919100,1.364188,0.580375
350,0.919100,1.409888,0.563966
400,0.659300,1.425642,0.565138


TrainOutput(global_step=400, training_loss=1.5065975761413575, metrics={'train_runtime': 211.859, 'train_samples_per_second': 61.173, 'train_steps_per_second': 1.888, 'total_flos': 3367972700160000.0, 'train_loss': 1.5065975761413575, 'epoch': 9.876543209876543})

## Re-labeling Train dataset

In [18]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [19]:
ids = train_df['ID'].values
ids

array(['ynat-v1_train_00000', 'ynat-v1_train_00001',
       'ynat-v1_train_00002', ..., 'ynat-v1_train_02790',
       'ynat-v1_train_02792', 'ynat-v1_train_02797'], dtype=object)

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+'/self_train_ver2_step1' + "/checkpoint-400").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']
    if target_id in ids :
        cleaned_text = train_df.loc[train_df['ID'] == sample['ID']]['text'].values[0]
        new_train.loc[new_train['ID'] == target_id, 'text'] = cleaned_text
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

Evaluating: 100%|██████████| 2800/2800 [00:06<00:00, 432.66it/s]


In [21]:
new_train.to_csv(f'../data/self_train/self_train_v2_step1.csv', index=False)

In [22]:
new_train.head()

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,정파 미사일 이용기간 단 1분종 1보,4
1,ynat-v1_train_00001,찰스 국 회장 ^로한^ 송,3
2,ynat-v1_train_00002,北 김정은 자주통일 새 시대 열어 나가야 1보,2
3,ynat-v1_train_00003,갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩,4
4,ynat-v1_train_00004,미국 대선 앞두고 중국 단발이 비해 감시 강화,6
