## Load Libraries

In [None]:
import os
import random
import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

# wandb 프로젝트 설정
os.environ["WANDB_PROJECT"] = "P3_relabeling"
# wandb 초기화
wandb.init()


## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:
data = pd.read_csv(os.path.join(DATA_DIR, 'Self_train/self_train_step1.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, random_state=SEED)

In [7]:
dataset_train

Unnamed: 0,ID,text,target
1040,ynat-v1_train_01040,2억 5천만 달러로 인프라 투자 확대,5
1599,ynat-v1_train_01599,민주일반연맹 비정규직 차별철폐 공동행동,3
194,ynat-v1_train_00194,서울자유시민대학 초대 학장에 정재권 전 한겨레신문 논설위원,3
2461,ynat-v1_train_02461,아베 개헌논의 안 해도 좋은지 선거서 물을 것…개헌 이슈화,6
1479,ynat-v1_train_01479,"신간 ""우미령의 정복"" 연재 9기",0
...,...,...,...
2543,ynat-v1_train_02543,박기원 감독 눈치 보지 말고…비예나 눈치 본 건 아닌데,1
2090,ynat-v1_train_02090,삼성 갤럭시 J5가 동남아 시장에 출시,4
2649,ynat-v1_train_02649,아이팩토리 상장폐지 이의신청서 제출,5
613,ynat-v1_train_00613,LG전자 미국서 G6 사면 구글 홈 증정,4


In [8]:
dataset_valid

Unnamed: 0,ID,text,target
2711,ynat-v1_train_02711,신동빈 경영복귀 첫 일성 적극적 투자로 국가 경제 이바지종합,3
2156,ynat-v1_train_02156,나스닥 c6M폭락!6개월만에I최C폭 하락종합,5
549,ynat-v1_train_00549,中공안 장쑤 화학공단 폭발참사 책임자들 구금,6
933,ynat-v1_train_00933,더위 식히는 장맛비…남부·제주도 밤에 대부분 그쳐,0
186,ynat-v1_train_00186,KB증권 농심 4분기 라면 부문 실적개선…목표주가↑,5
...,...,...,...
873,ynat-v1_train_00873,대한항공 63편 항공기가 9시간 지연됨,0
1144,ynat-v1_train_01144,"N 비료 생산업체, 권마다 복제되는 논 위키백과 책 2 없나요?",3
1775,ynat-v1_train_01775,시즌 첫 골 손흥민 모든 상황 준비해 좋은 결과로 이어졌다,1
394,ynat-v1_train_00394,8시 30분 시리얼 순찰 후,6


In [9]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt', return_token_type_ids=False)
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [10]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [12]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [13]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [14]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR+"/self_train_step1",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to="wandb",
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkangjeonhwi[0m ([33mkangjeonhwi1[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,F1
50,No log,0.931601,0.724267
100,4.328600,0.908999,0.740844
150,4.328600,0.880327,0.741697
200,2.194600,1.016557,0.709236
250,2.194600,1.006281,0.725404
300,1.170100,0.991075,0.741623
350,1.170100,1.000813,0.742937


TrainOutput(global_step=350, training_loss=2.318537434169224, metrics={'train_runtime': 1193.2862, 'train_samples_per_second': 9.386, 'train_steps_per_second': 0.293, 'total_flos': 1.04378076020736e+16, 'train_loss': 2.318537434169224, 'epoch': 5.0})

In [18]:
labeled_data_path = os.path.join(DATA_DIR, "cleaning_step1")

noise_list = []
for label in range(7) :
    df = pd.read_csv(labeled_data_path+f"/c1_label_dropped_{label}.csv")
    noise_list.append(df)

noise_df = pd.concat(noise_list, ignore_index=True)

In [22]:
ids = noise_df['ID'].values
ids

array(['ynat-v1_train_00015', 'ynat-v1_train_00066',
       'ynat-v1_train_00079', ..., 'ynat-v1_train_02742',
       'ynat-v1_train_02764', 'ynat-v1_train_02787'], dtype=object)

In [25]:
origin_train = pd.read_csv(os.path.join(DATA_DIR, 'Self_train/self_train_step1.csv'))

model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR+"/self_train_step1/checkpoint-350").to(DEVICE)
model.eval()

new_train = origin_train.copy()

for idx, sample in tqdm(origin_train.iterrows(), total=len(origin_train), desc="Evaluating"):
    target_id = sample['ID']

    if target_id in ids :
        continue

    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        new_train.loc[new_train['ID'] == target_id, 'target'] = pred[0]

new_train.to_csv(f'./data/Self_train/self_train_step2.csv', index=False)

Evaluating: 100%|██████████| 2800/2800 [00:39<00:00, 70.67it/s] 


## Evaluate Model