# Finetuing

## module import


In [1]:
import wandb
import time
import numpy as np
import pandas as pd

import tensorflow as tf
import torch

import re
import os

import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence, Dataset

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split as tts

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, AutoModelForMaskedLM
import matplotlib.pyplot as plt

print('torch version:', torch.__version__)
print('tf version:', tf.__version__)

2022-09-23 03:40:32.209816: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-23 03:40:32.385846: I tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: libtpu.so
I0923 03:40:32.496888569 1492377 ev_epoll1_linux.cc:121]     grpc epoll fd: 65
D0923 03:40:32.496906274 1492377 ev_posix.cc:141]            Using polling engine: epoll1
D0923 03:40:32.496934828 1492377 lb_policy_registry.cc:48]   registering LB policy factory for "grpclb"
D0923 03:40:32.496959696 1492377 lb_policy_registry.cc:48]   registering LB policy factory for "rls_experimental"
D0923 03:40:32.496968543 1492377 lb_policy_registry.cc:48]   registering LB policy factory for "priority_experimental"
D0923 03:40:32



[percpu.cc : 535] RAW: rseq syscall failed with errno 22 after membarrier sycall succeeded.


torch version: 1.12.0+cu102
tf version: 2.10.0


In [2]:
# using TPU through torch
import torch_xla
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.utils.serialization as xser
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

print(torch_xla.__version__)

1.12


In [3]:
# random seed fix
import random

random.seed(2022)
torch.manual_seed(2022)
np.random.seed(2022)

### TPU setting

In [4]:
## Google cloud project에서 TPU 셋팅

# .py로 실행할 때 TPU 셋팅 명령어
#!export XRT_TPU_CONFIG="localservice;0;localhost:51011"

# 주피터 노트 또는 주피터 랩에서 실행할 때, TPU 셋팅 명령어
import os
os.environ['XRT_TPU_CONFIG'] = "localservice;0;localhost:51011"
device = xm.xla_device()
device

device(type='xla', index=1)

## data load & EDA

In [9]:
#  file load
data_pro = pd.read_csv("data_pro", sep = ',', index_col = 0)
print(len(data_pro))
data_pro.head()

82610


Unnamed: 0,content,label
0,아내가 드디어 출산하게 되어서 정말 신이 나,0
1,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,1
2,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,1
3,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,0
4,이제 곧 은퇴할 시기가 되었어 내가 먼저 은퇴를 하고 육 개월 후에 남편도 은퇴를 ...,1


## data split

In [10]:
train_data, val_data = tts(data_pro, test_size=0.2)

In [11]:
print(len(train_data))
print(len(val_data))

66088
16522


## input data transform

In [12]:
# load DAPT model, tokenizer
num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained('JUNEYEOB/DAPT_batch512_lyric_con_sent', num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

tcmalloc: large alloc 1855389696 bytes == 0x9445e000 @  0x7f59cc76c680 0x7f59cc78d824 0x7f59cc78db8a 0x7f5803be732e 0x7f5803bd2da2 0x7f5836f96451 0x7f584d6b3409 0x7f584d35c8d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x56cc1f 0x569dba 0x6902a7
tcmalloc: large alloc 1855389696 bytes == 0x102dce000 @  0x7f59cc76c680 0x7f59cc78d824 0x5fb391 0x7f584d6b3422 0x7f584d35c8d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x56cc1f 0x569dba 0x6902a7 0x6023c4 0x5c6730 0x56bacd
Some weights of the model checkpoint at JUNEYEOB/DAPT_batch512_lyric_con_sent were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.b

In [13]:
# tokenizing test
tokenizer(train_data['content'][0])

{'input_ids': [0, 4582, 2116, 7310, 6831, 2205, 2318, 859, 2051, 2112, 3944, 1327, 2052, 717, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
# preprocessing function
def preprocess_function(data):
    return tokenizer(
        # tokenizing
        data['content'],

        max_length=512,

        # 최대 길이보다 긴 시퀀스는 최대 길이에 맞춰 자름
        truncation=True,

        # tokenizer가 token_type_ids를 return하지 않게 함
        # roberta는 필요없기 때문
        return_token_type_ids=False,
    )

In [15]:
# dataset transform
df_train = pd.DataFrame({'content':train_data['content'], 'label':train_data['label']})
dataset_train = Dataset.from_pandas(df_train)

df_val = pd.DataFrame({'content':val_data['content'], 'label':val_data['label']})
dataset_val = Dataset.from_pandas(df_val)

In [16]:
tokenized_train_datasets = dataset_train.map(preprocess_function, batched=True)
tokenized_val_datasets = dataset_val.map(preprocess_function, batched=True)

  0%|          | 0/67 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

In [17]:
tokenized_train_datasets

Dataset({
    features: ['content', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 66088
})

In [18]:
tokenized_val_datasets

Dataset({
    features: ['content', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 16522
})

## eval metric 정의

In [19]:
'''
multi sentiment analysis는 기본적으로 문장 분류 문제
KLUE task를 학습한 klue-roberta 중에서 TC(topic classification)와 유사(완전히 같지는 않음)
TC는 평가 지표로 macro f1 score를 사용

but klue에서는 다중감성분류를 학습하지 않았고,
glue에서는 sst2라는 긍부정 분류 task가 있는데 accuracy를 사용

but 우리는 데이터셋이 다소 imbalance하기 때문에 accuracy를 채택하면 오차가 발생할 수 있음

따라서 더 data imbalance에 robust한 metric를 만들기 위해
sklearn에서 제공하는 f1 score metric을 활용
'''

def eval_metric(pred, real):
    f1 = {'micro_f1':f1_score(real, pred, average='micro')}
    return f1

In [20]:
# metric test
np.random.seed(2022)
fake_preds = np.random.randint(0, 6, size=(64,))
fake_labels = np.random.randint(0, 6, size=(64,))
fake_preds, fake_labels

(array([5, 4, 5, 0, 1, 1, 0, 0, 2, 0, 0, 5, 1, 1, 3, 3, 3, 0, 3, 0, 3, 0,
        5, 0, 2, 2, 3, 1, 2, 0, 5, 5, 1, 5, 5, 4, 0, 2, 2, 2, 3, 4, 5, 1,
        2, 4, 0, 5, 5, 4, 2, 5, 0, 5, 5, 5, 5, 5, 0, 3, 3, 2, 4, 0]),
 array([1, 4, 1, 0, 5, 5, 0, 0, 3, 1, 5, 3, 2, 1, 5, 2, 1, 4, 1, 2, 2, 1,
        5, 1, 2, 1, 4, 3, 0, 4, 4, 4, 4, 0, 1, 2, 2, 4, 1, 5, 5, 2, 0, 0,
        4, 0, 0, 3, 4, 3, 1, 2, 3, 2, 3, 2, 3, 4, 5, 3, 3, 5, 3, 4]))

In [21]:
eval_metric(fake_preds, fake_labels)

{'micro_f1': 0.15625}

In [22]:
# eval metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = eval_metric(predictions, labels)
    return result

### Model to tpu

In [23]:
model = model.to(device)

## train arg

In [24]:
start = time.time()

# metric_name은 eval_metric에서 리턴받는 dict 형태의 키 이름
metric_name = "micro_f1"

# batch size 지정
batch_size = 32
num_train_epochs = 50

# path 설정
trained_model_path = f'batch{batch_size}_lcs_adafactor_ft_lr1e_6'

# Project name and run name
run_name = "adafactor_lr1e_6"
%env WANDB_PROJECT = fr_FT_lyric_con_sent


args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit=2,
    tpu_num_cores = 85,
    seed = 2022,
    data_seed = 2022,
    dataloader_pin_memory = True,
    report_to="wandb",
    run_name=run_name,
    optim = 'adafactor'
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()
print("time :", time.time() - start)
wandb.finish()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, content. If __index_level_0__, content are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 66088
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 103300
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


env: WANDB_PROJECT=fr_FT_lyric_con_sent


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


wandb: Currently logged in as: vazz (ethan_wyf). Use `wandb login --relogin` to force relogin


Step,Training Loss,Validation Loss,Micro F1
500,1.3408,0.969165,0.63709
1000,0.929,0.844929,0.681152
1500,0.86,0.793302,0.699915
2000,0.8081,0.76957,0.711173
2500,0.7837,0.758265,0.716499
3000,0.7636,0.75095,0.717347
3500,0.7612,0.734436,0.723883
4000,0.7509,0.740615,0.722794
4500,0.7232,0.736104,0.726304
5000,0.7173,0.741713,0.724186


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, content. If __index_level_0__, content are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16522
  Batch size = 32
Saving model checkpoint to batch32_lcs_adafactor_ft_lr1e_6/checkpoint-500
Configuration saved in batch32_lcs_adafactor_ft_lr1e_6/checkpoint-500/config.json
Model weights saved in batch32_lcs_adafactor_ft_lr1e_6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in batch32_lcs_adafactor_ft_lr1e_6/checkpoint-500/tokenizer_config.json
Special tokens file saved in batch32_lcs_adafactor_ft_lr1e_6/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [batch32_lcs_adafactor_ft_lr1e_6/checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding 

time : 19661.67204093933


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▄▃▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/micro_f1,▁▄▆▆▇▇▇▇▇▇▇██▇███████████
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁███▆█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
eval/steps_per_second,▁███▆█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁
train/loss,█▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/loss,0.72704
eval/micro_f1,0.73036
eval/runtime,70.4713
eval/samples_per_second,234.45
eval/steps_per_second,7.336
train/epoch,6.05
train/global_step,12500.0
train/learning_rate,0.0
train/loss,0.6128
train/total_flos,3.306494936146723e+16


In [25]:
trainer.save_model(trained_model_path)

Saving model checkpoint to batch32_lcs_adafactor_ft_lr1e_6
Configuration saved in batch32_lcs_adafactor_ft_lr1e_6/config.json
Model weights saved in batch32_lcs_adafactor_ft_lr1e_6/pytorch_model.bin
tokenizer config file saved in batch32_lcs_adafactor_ft_lr1e_6/tokenizer_config.json
Special tokens file saved in batch32_lcs_adafactor_ft_lr1e_6/special_tokens_map.json


### Upload hugging face

In [26]:
# load model, tokenizer
# batch_size = 32 
# model = AutoModelForSequenceClassification.from_pretrained('jungyong/FT_batch32_lyric')
# tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

MODEL_SAVE_REPO = f'FT_lcs_{run_name}'
HUGGINGFACE_AUTO_TOKEN = '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' # https://huggingface.co/settings/token
 
## Push to huggingface-hub
model.push_to_hub(
    MODEL_SAVE_REPO, 
    use_temp_dir=True, 
    use_auth_token=HUGGINGFACE_AUTO_TOKEN
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/JUNEYEOB/FT_lcs_adafactor_lr1e_6 into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Configuration saved in /tmp/tmpjnokmc9g/config.json
tcmalloc: large alloc 1469087744 bytes == 0x1d10084000 @  0x7f59cc76c680 0x7f59cc78cda2 0x5f8dfc 0x64f870 0x527012 0x5c64c0 0x5f4cc1 0x5f4f85 0x486664 0x539ccb 0x539bf9 0x66321b 0x53a821 0x53a01f 0x6632cc 0x53a164 0x53a01f 0x66321b 0x53a164 0x53a8d8 0x66134d 0x6615f0 0x505166 0x56bbfa 0x569dba 0x5f6eb3 0x56bacd 0x569dba 0x5f6eb3 0x56bacd 0x569dba
tcmalloc: large alloc 1836359680 bytes == 0x1c6c32c000 @  0x7f59cc76c680 0x7f59cc78cda2 0x5f8dfc 0x64f870 0x527012 0x5c64c0 0x5f4cc1 0x5f4f85 0x486664 0x539ccb 0x539bf9 0x66321b 0x53a821 0x53a01f 0x6632cc 0x53a164 0x53a01f 0x66321b 0x53a164 0x53a8d8 0x66134d 0x6615f0 0x505166 0x56bbfa 0x569dba 0x5f6eb3 0x56bacd 0x569dba 0x5f6eb3 0x56bacd 0x569dba
tcmalloc: large alloc 2295455744 bytes == 0x1d52c28000 @  0x7f59cc76c680 0x7f59cc78cda2 0x5f8dfc 0x64f870 0x527012 0x5c64c0 0x5f4cc1 0x5f4f85 0x486664 0x539ccb 0x539bf9 0x66321b 0x53a821 0x53a01f 0x6632cc 0x53a164 0x53a01f 0x66321b 0x53a164 0x53a8d8 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/1.88G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/JUNEYEOB/FT_lcs_adafactor_lr1e_6
   86e9f45..fb805fc  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/JUNEYEOB/FT_lcs_adafactor_lr1e_6/commit/fb805fcd88543d8e9b455a137be7fbd0867e249f'

In [5]:
!git clone https://huggingface.co/JUNEYEOB/FT_lcs_adafactor_lr1e_6

Cloning into 'FT_lcs_adafactor_lr1e_6'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (7/7), 1.29 KiB | 1.29 MiB/s, done.
tcmalloc: large alloc 1471086592 bytes == 0x560e8acde000 @  0x7f12d2724680 0x7f12d2744bdd 0x560e1c0f06b5 0x560e1c0bd840 0x560e1c05b6a1 0x560e1bfe496a 0x560e1bfe51a2 0x560e1bfe5bf2 0x560e1c009f3c 0x560e1c00a4e4 0x560e1c00ab32 0x560e1c0e4eac 0x560e1bf13f2c 0x560e1bef42f4 0x560e1bef53b4 0x560e1bef3e9e 0x7f12d2468083 0x560e1bef3f0e
tcmalloc: large alloc 2206621696 bytes == 0x560ee27ce000 @  0x7f12d2724680 0x7f12d2744bdd 0x560e1c0f06b5 0x560e1c0bd840 0x560e1c05b6a1 0x560e1bfe496a 0x560e1bfe51a2 0x560e1bfe5bf2 0x560e1c009f3c 0x560e1c00a4e4 0x560e1c00ab32 0x560e1c0e4eac 0x560e1bf13f2c 0x560e1bef42f4 0x560e1bef53b4 0x560e1bef3e9e 0x7f12d2468083 0x560e1bef3f0e


In [None]:
!cd FT_lcs_adafactor_lr1e_6
!cp -r ~/batch