In [29]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelWithLMHead
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
from sklearn.metrics import classification_report


In [2]:
print("PyTorch CUDA 버전:", torch.version.cuda)

PyTorch CUDA 버전: 12.1


In [3]:
torch.cuda.is_available()

True

In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3060 Ti


In [5]:
df=pd.read_excel('천재교육 연수원 라벨링 최종본.xlsx')

In [6]:
review = df[['후기', '라벨링']]

In [7]:
review

Unnamed: 0,후기,라벨링
0,파워포인트 주로 사용하는데 매일 하던거만 하다 새로운걸 알아가네요~감사합니다~,1
1,도형을 이용해 간단하게 손그림 그리는 방법을 배우는 유익한 연수였어요..그런데 프로...,0
2,저는 미술에 정말 소질이 없는 사람입니다.. 그래서 아이들과 미술을 하는 것도 힘들...,1
3,교직 생활이 연수가 늘어갈수록 다양한 독서 연수 강좌를 들어왔습니다. 이번 연수는 ...,1
4,"근래 들어 정말 기분좋게, 즐겁게 들었던 연수입니다.환경인가? 하며 들었는데 다시 ...",1
...,...,...
2893,학교 진로교육의 목표 (2015개정)학생 자신의 진로를 창의적으로 개발하고 지속적으...,1
2894,"선생님들의 수학 개념 설명을 듣고, 실제 수학 수업을 볼 수 있어서 너무 유익했습니...",1
2895,진로라는게 흥미는 있지만 너무 포괄적이네요.,0
2896,진로교육에 관해 많은 것을 알게 되었습니다.\n감사합니다.,1


In [8]:
# 결측치 제거, 라벨링 숫자형 변환
review_df = review[~review['라벨링'].str.contains('결측치').fillna(False)]
review_df['라벨링'] = pd.to_numeric(review_df['라벨링'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['라벨링'] = pd.to_numeric(review_df['라벨링'])


In [9]:
review_df

Unnamed: 0,후기,라벨링
0,파워포인트 주로 사용하는데 매일 하던거만 하다 새로운걸 알아가네요~감사합니다~,1
1,도형을 이용해 간단하게 손그림 그리는 방법을 배우는 유익한 연수였어요..그런데 프로...,0
2,저는 미술에 정말 소질이 없는 사람입니다.. 그래서 아이들과 미술을 하는 것도 힘들...,1
3,교직 생활이 연수가 늘어갈수록 다양한 독서 연수 강좌를 들어왔습니다. 이번 연수는 ...,1
4,"근래 들어 정말 기분좋게, 즐겁게 들었던 연수입니다.환경인가? 하며 들었는데 다시 ...",1
...,...,...
2893,학교 진로교육의 목표 (2015개정)학생 자신의 진로를 창의적으로 개발하고 지속적으...,1
2894,"선생님들의 수학 개념 설명을 듣고, 실제 수학 수업을 볼 수 있어서 너무 유익했습니...",1
2895,진로라는게 흥미는 있지만 너무 포괄적이네요.,0
2896,진로교육에 관해 많은 것을 알게 되었습니다.\n감사합니다.,1


In [10]:
# 전체 데이터 중 80%를 학습하고, 20%로 검증한다. 난수를 42로 주면서 재현율을 높게한다.
train_df, val_df = train_test_split(review_df, test_size=0.2, random_state=42)

In [11]:
# kcbert-large 모델 불러오기
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-large")
model = AutoModelForSequenceClassification.from_pretrained("beomi/kcbert-large", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# 텐서를 GPU로 옮기기
tensor_on_gpu = torch.randn((100, 100)).to(device)

# 모델을 GPU로 옮기기
model_on_gpu = model.to(device)

In [13]:
# 훈련, 확인 데이터 나누고 문자열 최대길이는 300, (넘어가면 오류 남) 하고 넘어가면 자르는 방식.
train_encodings = tokenizer(train_df['후기'].tolist(), truncation=True, padding=True, max_length=300)
val_encodings = tokenizer(val_df['후기'].tolist(), truncation=True, padding=True, max_length=300)

In [14]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_df['라벨링'].tolist())
val_dataset = SentimentDataset(val_encodings, val_df['라벨링'].tolist())


In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


  1%|          | 10/1737 [00:05<13:27,  2.14it/s]

{'loss': 0.5533, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.02}


                                                 
  1%|          | 10/1737 [00:22<13:27,  2.14it/s]

{'eval_loss': 0.4673064053058624, 'eval_runtime': 17.2778, 'eval_samples_per_second': 33.511, 'eval_steps_per_second': 8.392, 'epoch': 0.02}


  1%|          | 20/1737 [00:30<21:49,  1.31it/s]  

{'loss': 0.3889, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.03}


                                                 
  1%|          | 20/1737 [00:48<21:49,  1.31it/s]

{'eval_loss': 0.33135634660720825, 'eval_runtime': 17.8474, 'eval_samples_per_second': 32.442, 'eval_steps_per_second': 8.124, 'epoch': 0.03}


  2%|▏         | 30/1737 [00:55<21:14,  1.34it/s]  

{'loss': 0.2563, 'learning_rate': 3e-06, 'epoch': 0.05}


                                                 
  2%|▏         | 30/1737 [01:13<21:14,  1.34it/s]

{'eval_loss': 0.25817805528640747, 'eval_runtime': 18.0109, 'eval_samples_per_second': 32.147, 'eval_steps_per_second': 8.051, 'epoch': 0.05}


  2%|▏         | 40/1737 [01:21<21:32,  1.31it/s]  

{'loss': 0.3599, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.07}


                                                 
  2%|▏         | 40/1737 [01:39<21:32,  1.31it/s]

{'eval_loss': 0.27349868416786194, 'eval_runtime': 17.9587, 'eval_samples_per_second': 32.241, 'eval_steps_per_second': 8.074, 'epoch': 0.07}


  3%|▎         | 50/1737 [01:46<21:19,  1.32it/s]  

{'loss': 0.2122, 'learning_rate': 5e-06, 'epoch': 0.09}


                                                 
  3%|▎         | 50/1737 [02:04<21:19,  1.32it/s]

{'eval_loss': 0.34287717938423157, 'eval_runtime': 17.8585, 'eval_samples_per_second': 32.422, 'eval_steps_per_second': 8.119, 'epoch': 0.09}


  3%|▎         | 60/1737 [02:11<21:21,  1.31it/s]  

{'loss': 0.6648, 'learning_rate': 6e-06, 'epoch': 0.1}


                                                 
  3%|▎         | 60/1737 [02:29<21:21,  1.31it/s]

{'eval_loss': 0.3471040427684784, 'eval_runtime': 17.9709, 'eval_samples_per_second': 32.219, 'eval_steps_per_second': 8.069, 'epoch': 0.1}


  4%|▍         | 70/1737 [02:37<20:58,  1.32it/s]  

{'loss': 0.102, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.12}


                                                 
  4%|▍         | 70/1737 [02:55<20:58,  1.32it/s]

{'eval_loss': 0.3815677762031555, 'eval_runtime': 18.0189, 'eval_samples_per_second': 32.133, 'eval_steps_per_second': 8.047, 'epoch': 0.12}


  5%|▍         | 80/1737 [03:03<20:41,  1.33it/s]  

{'loss': 0.383, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.14}


                                                 
  5%|▍         | 80/1737 [03:21<20:41,  1.33it/s]

{'eval_loss': 0.44225266575813293, 'eval_runtime': 18.101, 'eval_samples_per_second': 31.987, 'eval_steps_per_second': 8.011, 'epoch': 0.14}


  5%|▌         | 90/1737 [03:28<20:30,  1.34it/s]  

{'loss': 0.74, 'learning_rate': 9e-06, 'epoch': 0.16}


                                                 
  5%|▌         | 90/1737 [03:46<20:30,  1.34it/s]

{'eval_loss': 0.42595499753952026, 'eval_runtime': 18.0563, 'eval_samples_per_second': 32.066, 'eval_steps_per_second': 8.03, 'epoch': 0.16}


  6%|▌         | 100/1737 [03:54<20:45,  1.31it/s] 

{'loss': 0.3008, 'learning_rate': 1e-05, 'epoch': 0.17}


                                                  
  6%|▌         | 100/1737 [04:12<20:45,  1.31it/s]

{'eval_loss': 0.3968231678009033, 'eval_runtime': 18.0668, 'eval_samples_per_second': 32.048, 'eval_steps_per_second': 8.026, 'epoch': 0.17}


  6%|▋         | 110/1737 [04:19<20:24,  1.33it/s]  

{'loss': 0.2791, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.19}


                                                  
  6%|▋         | 110/1737 [04:37<20:24,  1.33it/s]

{'eval_loss': 0.36834603548049927, 'eval_runtime': 18.0466, 'eval_samples_per_second': 32.084, 'eval_steps_per_second': 8.035, 'epoch': 0.19}


  7%|▋         | 120/1737 [04:45<20:03,  1.34it/s]  

{'loss': 0.6355, 'learning_rate': 1.2e-05, 'epoch': 0.21}


                                                  
  7%|▋         | 120/1737 [05:03<20:03,  1.34it/s]

{'eval_loss': 0.35828062891960144, 'eval_runtime': 17.8558, 'eval_samples_per_second': 32.426, 'eval_steps_per_second': 8.121, 'epoch': 0.21}


  7%|▋         | 130/1737 [05:10<19:57,  1.34it/s]  

{'loss': 0.4248, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.22}


                                                  
  7%|▋         | 130/1737 [05:28<19:57,  1.34it/s]

{'eval_loss': 0.37725988030433655, 'eval_runtime': 17.9638, 'eval_samples_per_second': 32.231, 'eval_steps_per_second': 8.072, 'epoch': 0.22}


  8%|▊         | 140/1737 [05:35<20:05,  1.32it/s]  

{'loss': 0.3821, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.24}


                                                  
  8%|▊         | 140/1737 [05:54<20:05,  1.32it/s]

{'eval_loss': 0.34588658809661865, 'eval_runtime': 18.0734, 'eval_samples_per_second': 32.036, 'eval_steps_per_second': 8.023, 'epoch': 0.24}


  9%|▊         | 150/1737 [06:01<19:59,  1.32it/s]  

{'loss': 0.4183, 'learning_rate': 1.5e-05, 'epoch': 0.26}


                                                  
  9%|▊         | 150/1737 [06:19<19:59,  1.32it/s]

{'eval_loss': 0.3124123513698578, 'eval_runtime': 17.9706, 'eval_samples_per_second': 32.219, 'eval_steps_per_second': 8.069, 'epoch': 0.26}


  9%|▉         | 160/1737 [06:26<19:44,  1.33it/s]  

{'loss': 0.2651, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.28}


                                                  
  9%|▉         | 160/1737 [06:45<19:44,  1.33it/s]

{'eval_loss': 0.39555665850639343, 'eval_runtime': 18.0962, 'eval_samples_per_second': 31.996, 'eval_steps_per_second': 8.013, 'epoch': 0.28}


 10%|▉         | 170/1737 [06:52<19:38,  1.33it/s]  

{'loss': 0.5884, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.29}


                                                  
 10%|▉         | 170/1737 [07:10<19:38,  1.33it/s]

{'eval_loss': 0.26633942127227783, 'eval_runtime': 18.0633, 'eval_samples_per_second': 32.054, 'eval_steps_per_second': 8.027, 'epoch': 0.29}


 10%|█         | 180/1737 [07:18<19:41,  1.32it/s]  

{'loss': 0.3412, 'learning_rate': 1.8e-05, 'epoch': 0.31}


                                                  
 10%|█         | 180/1737 [07:36<19:41,  1.32it/s]

{'eval_loss': 0.26482295989990234, 'eval_runtime': 18.1049, 'eval_samples_per_second': 31.98, 'eval_steps_per_second': 8.009, 'epoch': 0.31}


 11%|█         | 190/1737 [07:43<19:29,  1.32it/s]  

{'loss': 0.304, 'learning_rate': 1.9e-05, 'epoch': 0.33}


                                                  
 11%|█         | 190/1737 [08:01<19:29,  1.32it/s]

{'eval_loss': 0.42660894989967346, 'eval_runtime': 18.0719, 'eval_samples_per_second': 32.039, 'eval_steps_per_second': 8.023, 'epoch': 0.33}


 12%|█▏        | 200/1737 [08:09<19:13,  1.33it/s]  

{'loss': 0.5938, 'learning_rate': 2e-05, 'epoch': 0.35}


                                                  
 12%|█▏        | 200/1737 [08:27<19:13,  1.33it/s]

{'eval_loss': 0.3117554485797882, 'eval_runtime': 17.9617, 'eval_samples_per_second': 32.235, 'eval_steps_per_second': 8.073, 'epoch': 0.35}


 12%|█▏        | 210/1737 [08:34<19:15,  1.32it/s]  

{'loss': 0.4437, 'learning_rate': 2.1e-05, 'epoch': 0.36}


                                                  
 12%|█▏        | 210/1737 [08:52<19:15,  1.32it/s]

{'eval_loss': 0.2689191997051239, 'eval_runtime': 17.9875, 'eval_samples_per_second': 32.189, 'eval_steps_per_second': 8.061, 'epoch': 0.36}


 13%|█▎        | 220/1737 [09:00<19:08,  1.32it/s]  

{'loss': 0.4816, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.38}


                                                  
 13%|█▎        | 220/1737 [09:18<19:08,  1.32it/s]

{'eval_loss': 0.31548812985420227, 'eval_runtime': 18.0248, 'eval_samples_per_second': 32.122, 'eval_steps_per_second': 8.044, 'epoch': 0.38}


 13%|█▎        | 230/1737 [09:25<18:34,  1.35it/s]  

{'loss': 0.1967, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.4}


                                                  
 13%|█▎        | 230/1737 [09:43<18:34,  1.35it/s]

{'eval_loss': 0.3176862895488739, 'eval_runtime': 17.9316, 'eval_samples_per_second': 32.289, 'eval_steps_per_second': 8.086, 'epoch': 0.4}


 14%|█▍        | 240/1737 [09:50<18:30,  1.35it/s]  

{'loss': 0.4201, 'learning_rate': 2.4e-05, 'epoch': 0.41}


                                                  
 14%|█▍        | 240/1737 [10:08<18:30,  1.35it/s]

{'eval_loss': 0.31903064250946045, 'eval_runtime': 17.9784, 'eval_samples_per_second': 32.205, 'eval_steps_per_second': 8.065, 'epoch': 0.41}


 14%|█▍        | 250/1737 [10:16<18:42,  1.32it/s]  

{'loss': 0.3544, 'learning_rate': 2.5e-05, 'epoch': 0.43}


                                                  
 14%|█▍        | 250/1737 [10:34<18:42,  1.32it/s]

{'eval_loss': 0.21888770163059235, 'eval_runtime': 17.9663, 'eval_samples_per_second': 32.227, 'eval_steps_per_second': 8.071, 'epoch': 0.43}


 15%|█▍        | 260/1737 [10:41<18:25,  1.34it/s]  

{'loss': 0.1721, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.45}


                                                  
 15%|█▍        | 260/1737 [10:59<18:25,  1.34it/s]

{'eval_loss': 0.3034839928150177, 'eval_runtime': 17.9958, 'eval_samples_per_second': 32.174, 'eval_steps_per_second': 8.057, 'epoch': 0.45}


 16%|█▌        | 270/1737 [11:07<18:26,  1.33it/s]  

{'loss': 0.356, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.47}


                                                  
 16%|█▌        | 270/1737 [11:25<18:26,  1.33it/s]

{'eval_loss': 0.290776789188385, 'eval_runtime': 18.0524, 'eval_samples_per_second': 32.073, 'eval_steps_per_second': 8.032, 'epoch': 0.47}


 16%|█▌        | 280/1737 [11:32<18:02,  1.35it/s]  

{'loss': 0.5662, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.48}


                                                  
 16%|█▌        | 280/1737 [11:50<18:02,  1.35it/s]

{'eval_loss': 0.1540752649307251, 'eval_runtime': 17.937, 'eval_samples_per_second': 32.28, 'eval_steps_per_second': 8.084, 'epoch': 0.48}


 17%|█▋        | 290/1737 [11:58<18:00,  1.34it/s]  

{'loss': 0.3129, 'learning_rate': 2.9e-05, 'epoch': 0.5}


                                                  
 17%|█▋        | 290/1737 [12:16<18:00,  1.34it/s]

{'eval_loss': 0.2961157262325287, 'eval_runtime': 17.987, 'eval_samples_per_second': 32.19, 'eval_steps_per_second': 8.061, 'epoch': 0.5}


 17%|█▋        | 300/1737 [12:23<18:06,  1.32it/s]  

{'loss': 0.1363, 'learning_rate': 3e-05, 'epoch': 0.52}


                                                  
 17%|█▋        | 300/1737 [12:41<18:06,  1.32it/s]

{'eval_loss': 0.38814911246299744, 'eval_runtime': 17.8911, 'eval_samples_per_second': 32.362, 'eval_steps_per_second': 8.105, 'epoch': 0.52}


 18%|█▊        | 310/1737 [12:48<17:53,  1.33it/s]  

{'loss': 0.4926, 'learning_rate': 3.1e-05, 'epoch': 0.54}


                                                  
 18%|█▊        | 310/1737 [13:07<17:53,  1.33it/s]

{'eval_loss': 0.15351572632789612, 'eval_runtime': 18.086, 'eval_samples_per_second': 32.014, 'eval_steps_per_second': 8.017, 'epoch': 0.54}


 18%|█▊        | 320/1737 [13:14<17:47,  1.33it/s]  

{'loss': 0.2708, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.55}


                                                  
 18%|█▊        | 320/1737 [13:32<17:47,  1.33it/s]

{'eval_loss': 0.13834166526794434, 'eval_runtime': 18.0764, 'eval_samples_per_second': 32.031, 'eval_steps_per_second': 8.021, 'epoch': 0.55}


 19%|█▉        | 330/1737 [13:40<17:49,  1.31it/s]  

{'loss': 0.0283, 'learning_rate': 3.3e-05, 'epoch': 0.57}


                                                  
 19%|█▉        | 330/1737 [13:58<17:49,  1.31it/s]

{'eval_loss': 0.1792161613702774, 'eval_runtime': 18.1053, 'eval_samples_per_second': 31.98, 'eval_steps_per_second': 8.009, 'epoch': 0.57}


 20%|█▉        | 340/1737 [14:05<17:32,  1.33it/s]  

{'loss': 0.2279, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.59}


                                                  
 20%|█▉        | 340/1737 [14:23<17:32,  1.33it/s]

{'eval_loss': 0.18696683645248413, 'eval_runtime': 17.929, 'eval_samples_per_second': 32.294, 'eval_steps_per_second': 8.087, 'epoch': 0.59}


 20%|██        | 350/1737 [14:31<17:21,  1.33it/s]  

{'loss': 0.0039, 'learning_rate': 3.5e-05, 'epoch': 0.6}


                                                  
 20%|██        | 350/1737 [14:48<17:21,  1.33it/s]

{'eval_loss': 0.18464286625385284, 'eval_runtime': 17.9548, 'eval_samples_per_second': 32.248, 'eval_steps_per_second': 8.076, 'epoch': 0.6}


 21%|██        | 360/1737 [14:56<17:05,  1.34it/s]  

{'loss': 0.1574, 'learning_rate': 3.6e-05, 'epoch': 0.62}


                                                  
 21%|██        | 360/1737 [15:14<17:05,  1.34it/s]

{'eval_loss': 0.1649155616760254, 'eval_runtime': 18.0526, 'eval_samples_per_second': 32.073, 'eval_steps_per_second': 8.032, 'epoch': 0.62}


 21%|██▏       | 370/1737 [15:21<17:08,  1.33it/s]  

{'loss': 0.1719, 'learning_rate': 3.7e-05, 'epoch': 0.64}


                                                  
 21%|██▏       | 370/1737 [15:40<17:08,  1.33it/s]

{'eval_loss': 0.2388560026884079, 'eval_runtime': 18.1205, 'eval_samples_per_second': 31.953, 'eval_steps_per_second': 8.002, 'epoch': 0.64}


 22%|██▏       | 380/1737 [15:47<17:10,  1.32it/s]  

{'loss': 0.3822, 'learning_rate': 3.8e-05, 'epoch': 0.66}


                                                  
 22%|██▏       | 380/1737 [16:05<17:10,  1.32it/s]

{'eval_loss': 0.16066429018974304, 'eval_runtime': 18.1677, 'eval_samples_per_second': 31.87, 'eval_steps_per_second': 7.981, 'epoch': 0.66}


 22%|██▏       | 390/1737 [16:13<17:03,  1.32it/s]  

{'loss': 0.1982, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.67}


                                                  
 22%|██▏       | 390/1737 [16:31<17:03,  1.32it/s]

{'eval_loss': 0.3097267150878906, 'eval_runtime': 17.8561, 'eval_samples_per_second': 32.426, 'eval_steps_per_second': 8.12, 'epoch': 0.67}


 23%|██▎       | 400/1737 [16:38<16:49,  1.32it/s]  

{'loss': 0.2998, 'learning_rate': 4e-05, 'epoch': 0.69}


                                                  
 23%|██▎       | 400/1737 [16:56<16:49,  1.32it/s]

{'eval_loss': 0.23817472159862518, 'eval_runtime': 17.9733, 'eval_samples_per_second': 32.215, 'eval_steps_per_second': 8.068, 'epoch': 0.69}


 24%|██▎       | 410/1737 [17:03<16:27,  1.34it/s]  

{'loss': 0.1493, 'learning_rate': 4.1e-05, 'epoch': 0.71}


                                                  
 24%|██▎       | 410/1737 [17:22<16:27,  1.34it/s]

{'eval_loss': 0.28684183955192566, 'eval_runtime': 18.0983, 'eval_samples_per_second': 31.992, 'eval_steps_per_second': 8.012, 'epoch': 0.71}


 24%|██▍       | 420/1737 [17:29<16:30,  1.33it/s]  

{'loss': 0.1848, 'learning_rate': 4.2e-05, 'epoch': 0.73}


                                                  
 24%|██▍       | 420/1737 [17:47<16:30,  1.33it/s]

{'eval_loss': 0.16958867013454437, 'eval_runtime': 18.0207, 'eval_samples_per_second': 32.13, 'eval_steps_per_second': 8.046, 'epoch': 0.73}


 25%|██▍       | 430/1737 [17:55<16:29,  1.32it/s]  

{'loss': 0.536, 'learning_rate': 4.3e-05, 'epoch': 0.74}


                                                  
 25%|██▍       | 430/1737 [18:12<16:29,  1.32it/s]

{'eval_loss': 0.19280262291431427, 'eval_runtime': 17.8443, 'eval_samples_per_second': 32.447, 'eval_steps_per_second': 8.126, 'epoch': 0.74}


 25%|██▌       | 440/1737 [18:20<16:08,  1.34it/s]  

{'loss': 0.3401, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.76}


                                                  
 25%|██▌       | 440/1737 [18:38<16:08,  1.34it/s]

{'eval_loss': 0.37343207001686096, 'eval_runtime': 18.0357, 'eval_samples_per_second': 32.103, 'eval_steps_per_second': 8.04, 'epoch': 0.76}


 26%|██▌       | 450/1737 [18:46<16:10,  1.33it/s]  

{'loss': 0.4672, 'learning_rate': 4.5e-05, 'epoch': 0.78}


                                                  
 26%|██▌       | 450/1737 [19:03<16:10,  1.33it/s]

{'eval_loss': 0.3358761668205261, 'eval_runtime': 17.9496, 'eval_samples_per_second': 32.257, 'eval_steps_per_second': 8.078, 'epoch': 0.78}


 26%|██▋       | 460/1737 [19:11<15:53,  1.34it/s]  

{'loss': 0.5407, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.79}


                                                  
 26%|██▋       | 460/1737 [19:29<15:53,  1.34it/s]

{'eval_loss': 0.16011811792850494, 'eval_runtime': 18.0342, 'eval_samples_per_second': 32.106, 'eval_steps_per_second': 8.04, 'epoch': 0.79}


 27%|██▋       | 470/1737 [19:37<15:54,  1.33it/s]  

{'loss': 0.6393, 'learning_rate': 4.7e-05, 'epoch': 0.81}


                                                  
 27%|██▋       | 470/1737 [19:55<15:54,  1.33it/s]

{'eval_loss': 0.345825731754303, 'eval_runtime': 18.0461, 'eval_samples_per_second': 32.084, 'eval_steps_per_second': 8.035, 'epoch': 0.81}


 28%|██▊       | 480/1737 [20:02<15:41,  1.34it/s]  

{'loss': 0.2767, 'learning_rate': 4.8e-05, 'epoch': 0.83}


                                                  
 28%|██▊       | 480/1737 [20:20<15:41,  1.34it/s]

{'eval_loss': 0.29111161828041077, 'eval_runtime': 17.9043, 'eval_samples_per_second': 32.339, 'eval_steps_per_second': 8.099, 'epoch': 0.83}


 28%|██▊       | 490/1737 [20:27<15:43,  1.32it/s]  

{'loss': 0.2037, 'learning_rate': 4.9e-05, 'epoch': 0.85}


                                                  
 28%|██▊       | 490/1737 [20:46<15:43,  1.32it/s]

{'eval_loss': 0.27512791752815247, 'eval_runtime': 18.0994, 'eval_samples_per_second': 31.99, 'eval_steps_per_second': 8.011, 'epoch': 0.85}


 29%|██▉       | 500/1737 [20:53<15:30,  1.33it/s]  

{'loss': 0.6813, 'learning_rate': 5e-05, 'epoch': 0.86}


                                                  
 29%|██▉       | 500/1737 [21:11<15:30,  1.33it/s]

{'eval_loss': 0.16660524904727936, 'eval_runtime': 17.9687, 'eval_samples_per_second': 32.223, 'eval_steps_per_second': 8.07, 'epoch': 0.86}


 29%|██▉       | 510/1737 [21:18<15:21,  1.33it/s]  

{'loss': 0.1411, 'learning_rate': 4.959579628132579e-05, 'epoch': 0.88}


                                                  
 29%|██▉       | 510/1737 [21:36<15:21,  1.33it/s]

{'eval_loss': 0.43188372254371643, 'eval_runtime': 18.0174, 'eval_samples_per_second': 32.136, 'eval_steps_per_second': 8.048, 'epoch': 0.88}


 30%|██▉       | 520/1737 [21:44<15:07,  1.34it/s]  

{'loss': 0.3024, 'learning_rate': 4.919159256265158e-05, 'epoch': 0.9}


                                                  
 30%|██▉       | 520/1737 [22:02<15:07,  1.34it/s]

{'eval_loss': 0.40103477239608765, 'eval_runtime': 17.9574, 'eval_samples_per_second': 32.243, 'eval_steps_per_second': 8.075, 'epoch': 0.9}


 31%|███       | 530/1737 [22:09<15:10,  1.33it/s]  

{'loss': 0.31, 'learning_rate': 4.878738884397737e-05, 'epoch': 0.92}


                                                  
 31%|███       | 530/1737 [22:27<15:10,  1.33it/s]

{'eval_loss': 0.36708804965019226, 'eval_runtime': 18.1021, 'eval_samples_per_second': 31.985, 'eval_steps_per_second': 8.01, 'epoch': 0.92}


 31%|███       | 540/1737 [22:35<15:12,  1.31it/s]  

{'loss': 0.1494, 'learning_rate': 4.8383185125303156e-05, 'epoch': 0.93}


                                                  
 31%|███       | 540/1737 [22:53<15:12,  1.31it/s]

{'eval_loss': 0.41124477982521057, 'eval_runtime': 18.045, 'eval_samples_per_second': 32.086, 'eval_steps_per_second': 8.035, 'epoch': 0.93}


 32%|███▏      | 550/1737 [23:01<14:52,  1.33it/s]  

{'loss': 0.4563, 'learning_rate': 4.7978981406628945e-05, 'epoch': 0.95}


                                                  
 32%|███▏      | 550/1737 [23:19<14:52,  1.33it/s]

{'eval_loss': 0.3902939260005951, 'eval_runtime': 18.0896, 'eval_samples_per_second': 32.007, 'eval_steps_per_second': 8.016, 'epoch': 0.95}


 32%|███▏      | 560/1737 [23:26<14:44,  1.33it/s]  

{'loss': 0.2588, 'learning_rate': 4.757477768795473e-05, 'epoch': 0.97}


                                                  
 32%|███▏      | 560/1737 [23:44<14:44,  1.33it/s]

{'eval_loss': 0.3142003118991852, 'eval_runtime': 18.0778, 'eval_samples_per_second': 32.028, 'eval_steps_per_second': 8.021, 'epoch': 0.97}


 33%|███▎      | 570/1737 [23:52<14:35,  1.33it/s]  

{'loss': 0.1494, 'learning_rate': 4.717057396928052e-05, 'epoch': 0.98}


                                                  
 33%|███▎      | 570/1737 [24:10<14:35,  1.33it/s]

{'eval_loss': 0.41119813919067383, 'eval_runtime': 18.0023, 'eval_samples_per_second': 32.163, 'eval_steps_per_second': 8.055, 'epoch': 0.98}


 33%|███▎      | 580/1737 [24:17<13:10,  1.46it/s]  

{'loss': 0.2877, 'learning_rate': 4.67663702506063e-05, 'epoch': 1.0}


                                                  
 33%|███▎      | 580/1737 [24:35<13:10,  1.46it/s]

{'eval_loss': 0.3665107488632202, 'eval_runtime': 18.0028, 'eval_samples_per_second': 32.162, 'eval_steps_per_second': 8.054, 'epoch': 1.0}


 34%|███▍      | 590/1737 [24:42<14:11,  1.35it/s]  

{'loss': 0.374, 'learning_rate': 4.636216653193209e-05, 'epoch': 1.02}


                                                  
 34%|███▍      | 590/1737 [25:00<14:11,  1.35it/s]

{'eval_loss': 0.32434526085853577, 'eval_runtime': 18.0993, 'eval_samples_per_second': 31.99, 'eval_steps_per_second': 8.011, 'epoch': 1.02}


 35%|███▍      | 600/1737 [25:08<14:21,  1.32it/s]  

{'loss': 0.2418, 'learning_rate': 4.595796281325788e-05, 'epoch': 1.04}


                                                  
 35%|███▍      | 600/1737 [25:26<14:21,  1.32it/s]

{'eval_loss': 0.34966936707496643, 'eval_runtime': 18.0105, 'eval_samples_per_second': 32.148, 'eval_steps_per_second': 8.051, 'epoch': 1.04}


 35%|███▌      | 610/1737 [25:33<14:13,  1.32it/s]  

{'loss': 0.4264, 'learning_rate': 4.555375909458367e-05, 'epoch': 1.05}


                                                  
 35%|███▌      | 610/1737 [25:52<14:13,  1.32it/s]

{'eval_loss': 0.3957551419734955, 'eval_runtime': 18.0591, 'eval_samples_per_second': 32.061, 'eval_steps_per_second': 8.029, 'epoch': 1.05}


 36%|███▌      | 620/1737 [25:59<14:03,  1.32it/s]  

{'loss': 0.1491, 'learning_rate': 4.514955537590946e-05, 'epoch': 1.07}


                                                  
 36%|███▌      | 620/1737 [26:17<14:03,  1.32it/s]

{'eval_loss': 0.27376607060432434, 'eval_runtime': 17.9687, 'eval_samples_per_second': 32.223, 'eval_steps_per_second': 8.07, 'epoch': 1.07}


 36%|███▋      | 630/1737 [26:25<14:01,  1.32it/s]  

{'loss': 0.186, 'learning_rate': 4.4745351657235245e-05, 'epoch': 1.09}


                                                  
 36%|███▋      | 630/1737 [26:43<14:01,  1.32it/s]

{'eval_loss': 0.207396999001503, 'eval_runtime': 18.0274, 'eval_samples_per_second': 32.118, 'eval_steps_per_second': 8.043, 'epoch': 1.09}


 37%|███▋      | 640/1737 [26:50<13:38,  1.34it/s]  

{'loss': 0.0769, 'learning_rate': 4.434114793856104e-05, 'epoch': 1.11}


                                                  
 37%|███▋      | 640/1737 [27:08<13:38,  1.34it/s]

{'eval_loss': 0.3001473844051361, 'eval_runtime': 17.9333, 'eval_samples_per_second': 32.286, 'eval_steps_per_second': 8.086, 'epoch': 1.11}


 37%|███▋      | 650/1737 [27:15<13:46,  1.32it/s]  

{'loss': 0.1518, 'learning_rate': 4.393694421988683e-05, 'epoch': 1.12}


                                                  
 37%|███▋      | 650/1737 [27:34<13:46,  1.32it/s]

{'eval_loss': 0.3071546256542206, 'eval_runtime': 18.0706, 'eval_samples_per_second': 32.041, 'eval_steps_per_second': 8.024, 'epoch': 1.12}


 38%|███▊      | 660/1737 [27:41<13:47,  1.30it/s]  

{'loss': 0.2729, 'learning_rate': 4.353274050121262e-05, 'epoch': 1.14}


                                                  
 38%|███▊      | 660/1737 [27:59<13:47,  1.30it/s]

{'eval_loss': 0.9010087847709656, 'eval_runtime': 18.0499, 'eval_samples_per_second': 32.078, 'eval_steps_per_second': 8.033, 'epoch': 1.14}


 39%|███▊      | 670/1737 [28:07<13:13,  1.34it/s]  

{'loss': 0.3699, 'learning_rate': 4.3128536782538406e-05, 'epoch': 1.16}


                                                  
 39%|███▊      | 670/1737 [28:25<13:13,  1.34it/s]

{'eval_loss': 0.3557395935058594, 'eval_runtime': 18.0523, 'eval_samples_per_second': 32.074, 'eval_steps_per_second': 8.032, 'epoch': 1.16}


 39%|███▉      | 680/1737 [28:32<13:25,  1.31it/s]  

{'loss': 0.2926, 'learning_rate': 4.2724333063864194e-05, 'epoch': 1.17}


                                                  
 39%|███▉      | 680/1737 [28:50<13:25,  1.31it/s]

{'eval_loss': 0.2326962649822235, 'eval_runtime': 17.9438, 'eval_samples_per_second': 32.267, 'eval_steps_per_second': 8.081, 'epoch': 1.17}


 40%|███▉      | 690/1737 [28:58<13:10,  1.32it/s]  

{'loss': 0.1051, 'learning_rate': 4.232012934518998e-05, 'epoch': 1.19}


                                                  
 40%|███▉      | 690/1737 [29:16<13:10,  1.32it/s]

{'eval_loss': 0.22069664299488068, 'eval_runtime': 17.9894, 'eval_samples_per_second': 32.186, 'eval_steps_per_second': 8.06, 'epoch': 1.19}


 40%|████      | 700/1737 [29:23<12:57,  1.33it/s]  

{'loss': 0.1264, 'learning_rate': 4.1915925626515764e-05, 'epoch': 1.21}


                                                  
 40%|████      | 700/1737 [29:41<12:57,  1.33it/s]

{'eval_loss': 0.3216777443885803, 'eval_runtime': 18.0556, 'eval_samples_per_second': 32.068, 'eval_steps_per_second': 8.031, 'epoch': 1.21}


 41%|████      | 710/1737 [29:49<12:46,  1.34it/s]  

{'loss': 0.1797, 'learning_rate': 4.151172190784155e-05, 'epoch': 1.23}


                                                  
 41%|████      | 710/1737 [30:07<12:46,  1.34it/s]

{'eval_loss': 0.30619072914123535, 'eval_runtime': 18.0017, 'eval_samples_per_second': 32.164, 'eval_steps_per_second': 8.055, 'epoch': 1.23}


 41%|████▏     | 720/1737 [30:14<12:34,  1.35it/s]  

{'loss': 0.0835, 'learning_rate': 4.110751818916734e-05, 'epoch': 1.24}


                                                  
 41%|████▏     | 720/1737 [30:32<12:34,  1.35it/s]

{'eval_loss': 0.24536825716495514, 'eval_runtime': 17.9979, 'eval_samples_per_second': 32.17, 'eval_steps_per_second': 8.056, 'epoch': 1.24}


 42%|████▏     | 730/1737 [30:40<12:38,  1.33it/s]  

{'loss': 0.202, 'learning_rate': 4.070331447049313e-05, 'epoch': 1.26}


                                                  
 42%|████▏     | 730/1737 [30:58<12:38,  1.33it/s]

{'eval_loss': 0.21758726239204407, 'eval_runtime': 18.0849, 'eval_samples_per_second': 32.016, 'eval_steps_per_second': 8.018, 'epoch': 1.26}


 43%|████▎     | 740/1737 [31:05<12:24,  1.34it/s]  

{'loss': 0.353, 'learning_rate': 4.029911075181892e-05, 'epoch': 1.28}


                                                  
 43%|████▎     | 740/1737 [31:23<12:24,  1.34it/s]

{'eval_loss': 0.2546902596950531, 'eval_runtime': 18.1932, 'eval_samples_per_second': 31.825, 'eval_steps_per_second': 7.97, 'epoch': 1.28}


 43%|████▎     | 750/1737 [31:31<12:24,  1.33it/s]  

{'loss': 0.1222, 'learning_rate': 3.9894907033144707e-05, 'epoch': 1.3}


                                                  
 43%|████▎     | 750/1737 [31:49<12:24,  1.33it/s]

{'eval_loss': 0.21308211982250214, 'eval_runtime': 18.077, 'eval_samples_per_second': 32.03, 'eval_steps_per_second': 8.021, 'epoch': 1.3}


 44%|████▍     | 760/1737 [31:56<12:15,  1.33it/s]  

{'loss': 0.3902, 'learning_rate': 3.9490703314470495e-05, 'epoch': 1.31}


                                                  
 44%|████▍     | 760/1737 [32:14<12:15,  1.33it/s]

{'eval_loss': 0.36087509989738464, 'eval_runtime': 17.9884, 'eval_samples_per_second': 32.187, 'eval_steps_per_second': 8.061, 'epoch': 1.31}


 44%|████▍     | 770/1737 [32:22<12:04,  1.33it/s]  

{'loss': 0.263, 'learning_rate': 3.908649959579628e-05, 'epoch': 1.33}


                                                  
 44%|████▍     | 770/1737 [32:40<12:04,  1.33it/s]

{'eval_loss': 0.3446989059448242, 'eval_runtime': 17.9055, 'eval_samples_per_second': 32.336, 'eval_steps_per_second': 8.098, 'epoch': 1.33}


 45%|████▍     | 780/1737 [32:47<11:56,  1.34it/s]  

{'loss': 0.0874, 'learning_rate': 3.868229587712207e-05, 'epoch': 1.35}


                                                  
 45%|████▍     | 780/1737 [33:05<11:56,  1.34it/s]

{'eval_loss': 0.34212955832481384, 'eval_runtime': 18.0473, 'eval_samples_per_second': 32.082, 'eval_steps_per_second': 8.034, 'epoch': 1.35}


 45%|████▌     | 790/1737 [33:13<12:04,  1.31it/s]  

{'loss': 0.4862, 'learning_rate': 3.827809215844786e-05, 'epoch': 1.36}


                                                  
 45%|████▌     | 790/1737 [33:31<12:04,  1.31it/s]

{'eval_loss': 0.1570235788822174, 'eval_runtime': 18.0859, 'eval_samples_per_second': 32.014, 'eval_steps_per_second': 8.017, 'epoch': 1.36}


 46%|████▌     | 800/1737 [33:39<11:50,  1.32it/s]  

{'loss': 0.2363, 'learning_rate': 3.787388843977365e-05, 'epoch': 1.38}


                                                  
 46%|████▌     | 800/1737 [33:57<11:50,  1.32it/s]

{'eval_loss': 0.14982794225215912, 'eval_runtime': 18.1042, 'eval_samples_per_second': 31.981, 'eval_steps_per_second': 8.009, 'epoch': 1.38}


 47%|████▋     | 810/1737 [34:04<11:34,  1.33it/s]  

{'loss': 0.1076, 'learning_rate': 3.746968472109944e-05, 'epoch': 1.4}


                                                  
 47%|████▋     | 810/1737 [34:22<11:34,  1.33it/s]

{'eval_loss': 0.19403739273548126, 'eval_runtime': 17.985, 'eval_samples_per_second': 32.194, 'eval_steps_per_second': 8.062, 'epoch': 1.4}


 47%|████▋     | 820/1737 [34:30<11:39,  1.31it/s]  

{'loss': 0.4926, 'learning_rate': 3.7065481002425226e-05, 'epoch': 1.42}


                                                  
 47%|████▋     | 820/1737 [34:48<11:39,  1.31it/s]

{'eval_loss': 0.23423603177070618, 'eval_runtime': 18.162, 'eval_samples_per_second': 31.88, 'eval_steps_per_second': 7.984, 'epoch': 1.42}


 48%|████▊     | 830/1737 [34:56<11:25,  1.32it/s]  

{'loss': 0.0095, 'learning_rate': 3.6661277283751014e-05, 'epoch': 1.43}


                                                  
 48%|████▊     | 830/1737 [35:14<11:25,  1.32it/s]

{'eval_loss': 0.19850410521030426, 'eval_runtime': 18.1175, 'eval_samples_per_second': 31.958, 'eval_steps_per_second': 8.003, 'epoch': 1.43}


 48%|████▊     | 840/1737 [35:21<11:16,  1.33it/s]  

{'loss': 0.1544, 'learning_rate': 3.6257073565076796e-05, 'epoch': 1.45}


                                                  
 48%|████▊     | 840/1737 [35:39<11:16,  1.33it/s]

{'eval_loss': 0.21834777295589447, 'eval_runtime': 17.9261, 'eval_samples_per_second': 32.299, 'eval_steps_per_second': 8.089, 'epoch': 1.45}


 49%|████▉     | 850/1737 [35:47<11:05,  1.33it/s]  

{'loss': 0.0329, 'learning_rate': 3.5852869846402584e-05, 'epoch': 1.47}


                                                  
 49%|████▉     | 850/1737 [36:05<11:05,  1.33it/s]

{'eval_loss': 0.19024768471717834, 'eval_runtime': 18.019, 'eval_samples_per_second': 32.133, 'eval_steps_per_second': 8.047, 'epoch': 1.47}


 50%|████▉     | 860/1737 [36:12<11:01,  1.33it/s]  

{'loss': 0.3415, 'learning_rate': 3.544866612772837e-05, 'epoch': 1.49}


                                                  
 50%|████▉     | 860/1737 [36:30<11:01,  1.33it/s]

{'eval_loss': 0.19530613720417023, 'eval_runtime': 18.0816, 'eval_samples_per_second': 32.022, 'eval_steps_per_second': 8.019, 'epoch': 1.49}


 50%|█████     | 870/1737 [36:38<11:03,  1.31it/s]  

{'loss': 0.1357, 'learning_rate': 3.504446240905416e-05, 'epoch': 1.5}


                                                  
 50%|█████     | 870/1737 [36:56<11:03,  1.31it/s]

{'eval_loss': 0.35961034893989563, 'eval_runtime': 18.0484, 'eval_samples_per_second': 32.08, 'eval_steps_per_second': 8.034, 'epoch': 1.5}


 51%|█████     | 880/1737 [37:04<10:48,  1.32it/s]  

{'loss': 0.2596, 'learning_rate': 3.464025869037995e-05, 'epoch': 1.52}


                                                  
 51%|█████     | 880/1737 [37:22<10:48,  1.32it/s]

{'eval_loss': 0.3151387572288513, 'eval_runtime': 18.0032, 'eval_samples_per_second': 32.161, 'eval_steps_per_second': 8.054, 'epoch': 1.52}


 51%|█████     | 890/1737 [37:29<10:28,  1.35it/s]  

{'loss': 0.2057, 'learning_rate': 3.423605497170574e-05, 'epoch': 1.54}


                                                  
 51%|█████     | 890/1737 [37:47<10:28,  1.35it/s]

{'eval_loss': 0.2751215398311615, 'eval_runtime': 18.028, 'eval_samples_per_second': 32.117, 'eval_steps_per_second': 8.043, 'epoch': 1.54}


 52%|█████▏    | 900/1737 [37:54<10:35,  1.32it/s]  

{'loss': 0.1457, 'learning_rate': 3.3831851253031526e-05, 'epoch': 1.55}


                                                  
 52%|█████▏    | 900/1737 [38:13<10:35,  1.32it/s]

{'eval_loss': 0.24840915203094482, 'eval_runtime': 18.0451, 'eval_samples_per_second': 32.086, 'eval_steps_per_second': 8.035, 'epoch': 1.55}


 52%|█████▏    | 910/1737 [38:20<10:19,  1.33it/s]  

{'loss': 0.001, 'learning_rate': 3.342764753435732e-05, 'epoch': 1.57}


                                                  
 52%|█████▏    | 910/1737 [38:38<10:19,  1.33it/s]

{'eval_loss': 0.2610012888908386, 'eval_runtime': 17.9608, 'eval_samples_per_second': 32.237, 'eval_steps_per_second': 8.073, 'epoch': 1.57}


 53%|█████▎    | 920/1737 [38:45<10:14,  1.33it/s]  

{'loss': 0.3149, 'learning_rate': 3.302344381568311e-05, 'epoch': 1.59}


                                                  
 53%|█████▎    | 920/1737 [39:03<10:14,  1.33it/s]

{'eval_loss': 0.2777611017227173, 'eval_runtime': 17.898, 'eval_samples_per_second': 32.35, 'eval_steps_per_second': 8.101, 'epoch': 1.59}


 54%|█████▎    | 930/1737 [39:11<10:03,  1.34it/s]  

{'loss': 0.54, 'learning_rate': 3.26192400970089e-05, 'epoch': 1.61}


                                                  
 54%|█████▎    | 930/1737 [39:29<10:03,  1.34it/s]

{'eval_loss': 0.25751954317092896, 'eval_runtime': 17.9538, 'eval_samples_per_second': 32.249, 'eval_steps_per_second': 8.076, 'epoch': 1.61}


 54%|█████▍    | 940/1737 [39:36<09:58,  1.33it/s]  

{'loss': 0.3254, 'learning_rate': 3.221503637833469e-05, 'epoch': 1.62}


                                                  
 54%|█████▍    | 940/1737 [39:54<09:58,  1.33it/s]

{'eval_loss': 0.14106641709804535, 'eval_runtime': 17.998, 'eval_samples_per_second': 32.17, 'eval_steps_per_second': 8.056, 'epoch': 1.62}


 55%|█████▍    | 950/1737 [40:03<09:57,  1.32it/s]  

{'loss': 0.2256, 'learning_rate': 3.1810832659660475e-05, 'epoch': 1.64}


                                                  
 55%|█████▍    | 950/1737 [40:21<09:57,  1.32it/s]

{'eval_loss': 0.1841306984424591, 'eval_runtime': 17.9969, 'eval_samples_per_second': 32.172, 'eval_steps_per_second': 8.057, 'epoch': 1.64}


 55%|█████▌    | 960/1737 [40:28<09:52,  1.31it/s]  

{'loss': 0.5291, 'learning_rate': 3.140662894098626e-05, 'epoch': 1.66}


                                                  
 55%|█████▌    | 960/1737 [40:47<09:52,  1.31it/s]

{'eval_loss': 0.21453149616718292, 'eval_runtime': 18.1306, 'eval_samples_per_second': 31.935, 'eval_steps_per_second': 7.998, 'epoch': 1.66}


 56%|█████▌    | 970/1737 [40:54<09:44,  1.31it/s]  

{'loss': 0.2891, 'learning_rate': 3.1002425222312045e-05, 'epoch': 1.68}


                                                  
 56%|█████▌    | 970/1737 [41:12<09:44,  1.31it/s]

{'eval_loss': 0.14918369054794312, 'eval_runtime': 18.0989, 'eval_samples_per_second': 31.991, 'eval_steps_per_second': 8.012, 'epoch': 1.68}


 56%|█████▋    | 980/1737 [41:20<09:27,  1.33it/s]  

{'loss': 0.236, 'learning_rate': 3.0598221503637834e-05, 'epoch': 1.69}


                                                  
 56%|█████▋    | 980/1737 [41:38<09:27,  1.33it/s]

{'eval_loss': 0.1682620793581009, 'eval_runtime': 18.0291, 'eval_samples_per_second': 32.115, 'eval_steps_per_second': 8.043, 'epoch': 1.69}


 57%|█████▋    | 990/1737 [41:45<09:35,  1.30it/s]  

{'loss': 0.0021, 'learning_rate': 3.0194017784963626e-05, 'epoch': 1.71}


                                                  
 57%|█████▋    | 990/1737 [42:03<09:35,  1.30it/s]

{'eval_loss': 0.315786212682724, 'eval_runtime': 17.888, 'eval_samples_per_second': 32.368, 'eval_steps_per_second': 8.106, 'epoch': 1.71}


 58%|█████▊    | 1000/1737 [42:11<09:08,  1.34it/s] 

{'loss': 0.6062, 'learning_rate': 2.978981406628941e-05, 'epoch': 1.73}


                                                   
 58%|█████▊    | 1000/1737 [42:29<09:08,  1.34it/s]

{'eval_loss': 0.29851946234703064, 'eval_runtime': 17.868, 'eval_samples_per_second': 32.404, 'eval_steps_per_second': 8.115, 'epoch': 1.73}


 58%|█████▊    | 1010/1737 [42:36<09:22,  1.29it/s]  

{'loss': 0.1154, 'learning_rate': 2.93856103476152e-05, 'epoch': 1.74}


                                                   
 58%|█████▊    | 1010/1737 [42:55<09:22,  1.29it/s]

{'eval_loss': 0.20104919373989105, 'eval_runtime': 18.6004, 'eval_samples_per_second': 31.128, 'eval_steps_per_second': 7.796, 'epoch': 1.74}


 59%|█████▊    | 1020/1737 [43:02<09:06,  1.31it/s]  

{'loss': 0.1074, 'learning_rate': 2.8981406628940987e-05, 'epoch': 1.76}


                                                   
 59%|█████▊    | 1020/1737 [43:20<09:06,  1.31it/s]

{'eval_loss': 0.21744506061077118, 'eval_runtime': 17.9999, 'eval_samples_per_second': 32.167, 'eval_steps_per_second': 8.056, 'epoch': 1.76}


 59%|█████▉    | 1030/1737 [43:28<09:00,  1.31it/s]  

{'loss': 0.1889, 'learning_rate': 2.8577202910266776e-05, 'epoch': 1.78}


                                                   
 59%|█████▉    | 1030/1737 [43:47<09:00,  1.31it/s]

{'eval_loss': 0.18093334138393402, 'eval_runtime': 18.1905, 'eval_samples_per_second': 31.83, 'eval_steps_per_second': 7.971, 'epoch': 1.78}


 60%|█████▉    | 1040/1737 [43:54<09:01,  1.29it/s]  

{'loss': 0.1646, 'learning_rate': 2.8172999191592564e-05, 'epoch': 1.8}


                                                   
 60%|█████▉    | 1040/1737 [44:12<09:01,  1.29it/s]

{'eval_loss': 0.2663845717906952, 'eval_runtime': 18.044, 'eval_samples_per_second': 32.088, 'eval_steps_per_second': 8.036, 'epoch': 1.8}


 60%|██████    | 1050/1737 [44:20<08:43,  1.31it/s]  

{'loss': 0.2674, 'learning_rate': 2.7768795472918353e-05, 'epoch': 1.81}


                                                   
 60%|██████    | 1050/1737 [44:38<08:43,  1.31it/s]

{'eval_loss': 0.25368425250053406, 'eval_runtime': 18.2535, 'eval_samples_per_second': 31.72, 'eval_steps_per_second': 7.944, 'epoch': 1.81}


 61%|██████    | 1060/1737 [44:46<08:32,  1.32it/s]  

{'loss': 0.0201, 'learning_rate': 2.736459175424414e-05, 'epoch': 1.83}


                                                   
 61%|██████    | 1060/1737 [45:04<08:32,  1.32it/s]

{'eval_loss': 0.2086157351732254, 'eval_runtime': 17.9573, 'eval_samples_per_second': 32.243, 'eval_steps_per_second': 8.075, 'epoch': 1.83}


 62%|██████▏   | 1070/1737 [45:12<08:22,  1.33it/s]  

{'loss': 0.2961, 'learning_rate': 2.6960388035569926e-05, 'epoch': 1.85}


                                                   
 62%|██████▏   | 1070/1737 [45:30<08:22,  1.33it/s]

{'eval_loss': 0.23232264816761017, 'eval_runtime': 18.0908, 'eval_samples_per_second': 32.005, 'eval_steps_per_second': 8.015, 'epoch': 1.85}


 62%|██████▏   | 1080/1737 [45:37<08:21,  1.31it/s]  

{'loss': 0.0005, 'learning_rate': 2.6556184316895715e-05, 'epoch': 1.87}


                                                   
 62%|██████▏   | 1080/1737 [45:55<08:21,  1.31it/s]

{'eval_loss': 0.20271563529968262, 'eval_runtime': 17.8665, 'eval_samples_per_second': 32.407, 'eval_steps_per_second': 8.116, 'epoch': 1.87}


 63%|██████▎   | 1090/1737 [46:03<08:09,  1.32it/s]  

{'loss': 0.341, 'learning_rate': 2.6151980598221503e-05, 'epoch': 1.88}


                                                   
 63%|██████▎   | 1090/1737 [46:21<08:09,  1.32it/s]

{'eval_loss': 0.18707998096942902, 'eval_runtime': 17.8389, 'eval_samples_per_second': 32.457, 'eval_steps_per_second': 8.128, 'epoch': 1.88}


 63%|██████▎   | 1100/1737 [46:28<08:04,  1.31it/s]  

{'loss': 0.4322, 'learning_rate': 2.574777687954729e-05, 'epoch': 1.9}


                                                   
 63%|██████▎   | 1100/1737 [46:46<08:04,  1.31it/s]

{'eval_loss': 0.1862645149230957, 'eval_runtime': 18.0427, 'eval_samples_per_second': 32.09, 'eval_steps_per_second': 8.036, 'epoch': 1.9}


 64%|██████▍   | 1110/1737 [46:55<07:56,  1.32it/s]  

{'loss': 0.285, 'learning_rate': 2.534357316087308e-05, 'epoch': 1.92}


                                                   
 64%|██████▍   | 1110/1737 [47:12<07:56,  1.32it/s]

{'eval_loss': 0.1768137812614441, 'eval_runtime': 17.7504, 'eval_samples_per_second': 32.619, 'eval_steps_per_second': 8.169, 'epoch': 1.92}


 64%|██████▍   | 1120/1737 [47:20<07:44,  1.33it/s]  

{'loss': 0.1938, 'learning_rate': 2.4939369442198872e-05, 'epoch': 1.93}


                                                   
 64%|██████▍   | 1120/1737 [47:38<07:44,  1.33it/s]

{'eval_loss': 0.16402478516101837, 'eval_runtime': 17.7371, 'eval_samples_per_second': 32.644, 'eval_steps_per_second': 8.175, 'epoch': 1.93}


 65%|██████▌   | 1130/1737 [47:45<07:39,  1.32it/s]  

{'loss': 0.0024, 'learning_rate': 2.4535165723524657e-05, 'epoch': 1.95}


                                                   
 65%|██████▌   | 1130/1737 [48:03<07:39,  1.32it/s]

{'eval_loss': 0.16581685841083527, 'eval_runtime': 17.7532, 'eval_samples_per_second': 32.614, 'eval_steps_per_second': 8.168, 'epoch': 1.95}


 66%|██████▌   | 1140/1737 [48:12<07:38,  1.30it/s]  

{'loss': 0.2638, 'learning_rate': 2.4130962004850445e-05, 'epoch': 1.97}


                                                   
 66%|██████▌   | 1140/1737 [48:29<07:38,  1.30it/s]

{'eval_loss': 0.1799985021352768, 'eval_runtime': 17.7783, 'eval_samples_per_second': 32.568, 'eval_steps_per_second': 8.156, 'epoch': 1.97}


 66%|██████▌   | 1150/1737 [48:37<07:21,  1.33it/s]  

{'loss': 0.1362, 'learning_rate': 2.3726758286176234e-05, 'epoch': 1.99}


                                                   
 66%|██████▌   | 1150/1737 [48:55<07:21,  1.33it/s]

{'eval_loss': 0.17405956983566284, 'eval_runtime': 17.7205, 'eval_samples_per_second': 32.674, 'eval_steps_per_second': 8.183, 'epoch': 1.99}


 67%|██████▋   | 1160/1737 [49:02<06:47,  1.41it/s]  

{'loss': 0.0976, 'learning_rate': 2.3322554567502022e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1160/1737 [49:20<06:47,  1.41it/s]

{'eval_loss': 0.18370619416236877, 'eval_runtime': 17.7593, 'eval_samples_per_second': 32.603, 'eval_steps_per_second': 8.165, 'epoch': 2.0}


 67%|██████▋   | 1170/1737 [49:27<07:03,  1.34it/s]  

{'loss': 0.1761, 'learning_rate': 2.291835084882781e-05, 'epoch': 2.02}


                                                   
 67%|██████▋   | 1170/1737 [49:45<07:03,  1.34it/s]

{'eval_loss': 0.19378185272216797, 'eval_runtime': 17.7288, 'eval_samples_per_second': 32.659, 'eval_steps_per_second': 8.179, 'epoch': 2.02}


 68%|██████▊   | 1180/1737 [49:53<06:57,  1.33it/s]  

{'loss': 0.002, 'learning_rate': 2.25141471301536e-05, 'epoch': 2.04}


                                                   
 68%|██████▊   | 1180/1737 [50:10<06:57,  1.33it/s]

{'eval_loss': 0.21915338933467865, 'eval_runtime': 17.764, 'eval_samples_per_second': 32.594, 'eval_steps_per_second': 8.163, 'epoch': 2.04}


 69%|██████▊   | 1190/1737 [50:18<06:54,  1.32it/s]  

{'loss': 0.0004, 'learning_rate': 2.2109943411479387e-05, 'epoch': 2.06}


                                                   
 69%|██████▊   | 1190/1737 [50:36<06:54,  1.32it/s]

{'eval_loss': 0.2545751929283142, 'eval_runtime': 17.6471, 'eval_samples_per_second': 32.81, 'eval_steps_per_second': 8.217, 'epoch': 2.06}


 69%|██████▉   | 1200/1737 [50:43<06:48,  1.32it/s]  

{'loss': 0.0004, 'learning_rate': 2.1705739692805176e-05, 'epoch': 2.07}


                                                   
 69%|██████▉   | 1200/1737 [51:01<06:48,  1.32it/s]

{'eval_loss': 0.27021950483322144, 'eval_runtime': 17.8474, 'eval_samples_per_second': 32.442, 'eval_steps_per_second': 8.124, 'epoch': 2.07}


 70%|██████▉   | 1210/1737 [51:09<06:35,  1.33it/s]  

{'loss': 0.5281, 'learning_rate': 2.130153597413096e-05, 'epoch': 2.09}


                                                   
 70%|██████▉   | 1210/1737 [51:27<06:35,  1.33it/s]

{'eval_loss': 0.18410997092723846, 'eval_runtime': 17.8575, 'eval_samples_per_second': 32.423, 'eval_steps_per_second': 8.12, 'epoch': 2.09}


 70%|███████   | 1220/1737 [51:34<06:22,  1.35it/s]

{'loss': 0.001, 'learning_rate': 2.089733225545675e-05, 'epoch': 2.11}


                                                   
 70%|███████   | 1220/1737 [51:52<06:22,  1.35it/s]

{'eval_loss': 0.18430793285369873, 'eval_runtime': 18.1448, 'eval_samples_per_second': 31.91, 'eval_steps_per_second': 7.991, 'epoch': 2.11}


 71%|███████   | 1230/1737 [52:00<06:34,  1.29it/s]

{'loss': 0.0059, 'learning_rate': 2.0493128536782538e-05, 'epoch': 2.12}


                                                   
 71%|███████   | 1230/1737 [52:18<06:34,  1.29it/s]

{'eval_loss': 0.22290655970573425, 'eval_runtime': 17.8521, 'eval_samples_per_second': 32.433, 'eval_steps_per_second': 8.122, 'epoch': 2.12}


 71%|███████▏  | 1240/1737 [52:25<06:03,  1.37it/s]

{'loss': 0.1796, 'learning_rate': 2.008892481810833e-05, 'epoch': 2.14}


                                                   
 71%|███████▏  | 1240/1737 [52:43<06:03,  1.37it/s]

{'eval_loss': 0.2115580290555954, 'eval_runtime': 17.5095, 'eval_samples_per_second': 33.068, 'eval_steps_per_second': 8.281, 'epoch': 2.14}


 72%|███████▏  | 1250/1737 [52:50<06:03,  1.34it/s]

{'loss': 0.1454, 'learning_rate': 1.9684721099434118e-05, 'epoch': 2.16}


                                                   
 72%|███████▏  | 1250/1737 [53:08<06:03,  1.34it/s]

{'eval_loss': 0.18270516395568848, 'eval_runtime': 17.3713, 'eval_samples_per_second': 33.331, 'eval_steps_per_second': 8.347, 'epoch': 2.16}


 73%|███████▎  | 1260/1737 [53:15<05:54,  1.34it/s]

{'loss': 0.0135, 'learning_rate': 1.9280517380759907e-05, 'epoch': 2.18}


                                                   
 73%|███████▎  | 1260/1737 [53:32<05:54,  1.34it/s]

{'eval_loss': 0.18114694952964783, 'eval_runtime': 17.2387, 'eval_samples_per_second': 33.587, 'eval_steps_per_second': 8.411, 'epoch': 2.18}


 73%|███████▎  | 1270/1737 [53:39<05:29,  1.42it/s]

{'loss': 0.065, 'learning_rate': 1.887631366208569e-05, 'epoch': 2.19}


                                                   
 73%|███████▎  | 1270/1737 [53:56<05:29,  1.42it/s]

{'eval_loss': 0.19451257586479187, 'eval_runtime': 16.9488, 'eval_samples_per_second': 34.162, 'eval_steps_per_second': 8.555, 'epoch': 2.19}


 74%|███████▎  | 1280/1737 [54:04<05:18,  1.43it/s]

{'loss': 0.0015, 'learning_rate': 1.847210994341148e-05, 'epoch': 2.21}


                                                   
 74%|███████▎  | 1280/1737 [54:21<05:18,  1.43it/s]

{'eval_loss': 0.1889997124671936, 'eval_runtime': 16.9535, 'eval_samples_per_second': 34.152, 'eval_steps_per_second': 8.553, 'epoch': 2.21}


 74%|███████▍  | 1290/1737 [54:28<05:11,  1.43it/s]

{'loss': 0.0012, 'learning_rate': 1.806790622473727e-05, 'epoch': 2.23}


                                                   
 74%|███████▍  | 1290/1737 [54:45<05:11,  1.43it/s]

{'eval_loss': 0.2190818190574646, 'eval_runtime': 17.0211, 'eval_samples_per_second': 34.017, 'eval_steps_per_second': 8.519, 'epoch': 2.23}


 75%|███████▍  | 1300/1737 [54:52<05:06,  1.43it/s]

{'loss': 0.0035, 'learning_rate': 1.7663702506063057e-05, 'epoch': 2.25}


                                                   
 75%|███████▍  | 1300/1737 [55:09<05:06,  1.43it/s]

{'eval_loss': 0.2348630428314209, 'eval_runtime': 17.1, 'eval_samples_per_second': 33.86, 'eval_steps_per_second': 8.48, 'epoch': 2.25}


 75%|███████▌  | 1310/1737 [55:16<05:02,  1.41it/s]

{'loss': 0.2885, 'learning_rate': 1.7259498787388845e-05, 'epoch': 2.26}


                                                   
 75%|███████▌  | 1310/1737 [55:33<05:02,  1.41it/s]

{'eval_loss': 0.21242846548557281, 'eval_runtime': 17.0649, 'eval_samples_per_second': 33.929, 'eval_steps_per_second': 8.497, 'epoch': 2.26}


 76%|███████▌  | 1320/1737 [55:41<04:55,  1.41it/s]

{'loss': 0.0008, 'learning_rate': 1.6855295068714634e-05, 'epoch': 2.28}


                                                   
 76%|███████▌  | 1320/1737 [55:58<04:55,  1.41it/s]

{'eval_loss': 0.1961270123720169, 'eval_runtime': 16.9329, 'eval_samples_per_second': 34.194, 'eval_steps_per_second': 8.563, 'epoch': 2.28}


 77%|███████▋  | 1330/1737 [56:05<04:42,  1.44it/s]

{'loss': 0.0012, 'learning_rate': 1.6451091350040422e-05, 'epoch': 2.3}


                                                   
 77%|███████▋  | 1330/1737 [56:22<04:42,  1.44it/s]

{'eval_loss': 0.2122092992067337, 'eval_runtime': 17.0132, 'eval_samples_per_second': 34.032, 'eval_steps_per_second': 8.523, 'epoch': 2.3}


 77%|███████▋  | 1340/1737 [56:29<04:40,  1.42it/s]

{'loss': 0.1247, 'learning_rate': 1.6046887631366207e-05, 'epoch': 2.31}


                                                   
 77%|███████▋  | 1340/1737 [56:46<04:40,  1.42it/s]

{'eval_loss': 0.20984363555908203, 'eval_runtime': 16.9827, 'eval_samples_per_second': 34.094, 'eval_steps_per_second': 8.538, 'epoch': 2.31}


 78%|███████▊  | 1350/1737 [56:53<04:32,  1.42it/s]

{'loss': 0.0003, 'learning_rate': 1.5642683912691996e-05, 'epoch': 2.33}


                                                   
 78%|███████▊  | 1350/1737 [57:10<04:32,  1.42it/s]

{'eval_loss': 0.20481285452842712, 'eval_runtime': 17.0224, 'eval_samples_per_second': 34.014, 'eval_steps_per_second': 8.518, 'epoch': 2.33}


 78%|███████▊  | 1360/1737 [57:17<04:24,  1.42it/s]

{'loss': 0.0003, 'learning_rate': 1.5238480194017784e-05, 'epoch': 2.35}


                                                   
 78%|███████▊  | 1360/1737 [57:34<04:24,  1.42it/s]

{'eval_loss': 0.20710040628910065, 'eval_runtime': 16.8734, 'eval_samples_per_second': 34.314, 'eval_steps_per_second': 8.593, 'epoch': 2.35}


 79%|███████▉  | 1370/1737 [57:41<04:16,  1.43it/s]

{'loss': 0.0763, 'learning_rate': 1.4834276475343573e-05, 'epoch': 2.37}


                                                   
 79%|███████▉  | 1370/1737 [57:58<04:16,  1.43it/s]

{'eval_loss': 0.2076493799686432, 'eval_runtime': 17.0564, 'eval_samples_per_second': 33.946, 'eval_steps_per_second': 8.501, 'epoch': 2.37}


 79%|███████▉  | 1380/1737 [58:07<04:17,  1.39it/s]

{'loss': 0.0004, 'learning_rate': 1.4430072756669363e-05, 'epoch': 2.38}


                                                   
 79%|███████▉  | 1380/1737 [58:24<04:17,  1.39it/s]

{'eval_loss': 0.19773231446743011, 'eval_runtime': 16.9333, 'eval_samples_per_second': 34.193, 'eval_steps_per_second': 8.563, 'epoch': 2.38}


 80%|████████  | 1390/1737 [58:31<04:01,  1.43it/s]

{'loss': 0.0003, 'learning_rate': 1.4025869037995151e-05, 'epoch': 2.4}


                                                   
 80%|████████  | 1390/1737 [58:48<04:01,  1.43it/s]

{'eval_loss': 0.19862815737724304, 'eval_runtime': 17.5881, 'eval_samples_per_second': 32.92, 'eval_steps_per_second': 8.244, 'epoch': 2.4}


 81%|████████  | 1400/1737 [58:56<04:01,  1.39it/s]

{'loss': 0.0005, 'learning_rate': 1.362166531932094e-05, 'epoch': 2.42}


                                                   
 81%|████████  | 1400/1737 [59:13<04:01,  1.39it/s]

{'eval_loss': 0.20536202192306519, 'eval_runtime': 16.8733, 'eval_samples_per_second': 34.315, 'eval_steps_per_second': 8.593, 'epoch': 2.42}


 81%|████████  | 1410/1737 [59:20<03:49,  1.42it/s]

{'loss': 0.0006, 'learning_rate': 1.3217461600646728e-05, 'epoch': 2.44}


                                                   
 81%|████████  | 1410/1737 [59:37<03:49,  1.42it/s]

{'eval_loss': 0.22055812180042267, 'eval_runtime': 17.2682, 'eval_samples_per_second': 33.53, 'eval_steps_per_second': 8.397, 'epoch': 2.44}


 82%|████████▏ | 1420/1737 [59:44<03:40,  1.44it/s]

{'loss': 0.0004, 'learning_rate': 1.2813257881972515e-05, 'epoch': 2.45}


                                                   
 82%|████████▏ | 1420/1737 [1:00:01<03:40,  1.44it/s]

{'eval_loss': 0.22861692309379578, 'eval_runtime': 16.6571, 'eval_samples_per_second': 34.76, 'eval_steps_per_second': 8.705, 'epoch': 2.45}


 82%|████████▏ | 1430/1737 [1:00:08<03:31,  1.45it/s]

{'loss': 0.215, 'learning_rate': 1.2409054163298303e-05, 'epoch': 2.47}


                                                     
 82%|████████▏ | 1430/1737 [1:00:25<03:31,  1.45it/s]

{'eval_loss': 0.21091917157173157, 'eval_runtime': 16.6779, 'eval_samples_per_second': 34.717, 'eval_steps_per_second': 8.694, 'epoch': 2.47}


 83%|████████▎ | 1440/1737 [1:00:32<03:22,  1.47it/s]

{'loss': 0.0707, 'learning_rate': 1.2004850444624092e-05, 'epoch': 2.49}


                                                     
 83%|████████▎ | 1440/1737 [1:00:48<03:22,  1.47it/s]

{'eval_loss': 0.2152888923883438, 'eval_runtime': 16.6742, 'eval_samples_per_second': 34.724, 'eval_steps_per_second': 8.696, 'epoch': 2.49}


 83%|████████▎ | 1450/1737 [1:00:55<03:17,  1.45it/s]

{'loss': 0.2103, 'learning_rate': 1.1600646725949878e-05, 'epoch': 2.5}


                                                     
 83%|████████▎ | 1450/1737 [1:01:12<03:17,  1.45it/s]

{'eval_loss': 0.2219618856906891, 'eval_runtime': 16.666, 'eval_samples_per_second': 34.741, 'eval_steps_per_second': 8.7, 'epoch': 2.5}


 84%|████████▍ | 1460/1737 [1:01:19<03:08,  1.47it/s]

{'loss': 0.0004, 'learning_rate': 1.1196443007275667e-05, 'epoch': 2.52}


                                                     
 84%|████████▍ | 1460/1737 [1:01:36<03:08,  1.47it/s]

{'eval_loss': 0.2288050353527069, 'eval_runtime': 16.6877, 'eval_samples_per_second': 34.696, 'eval_steps_per_second': 8.689, 'epoch': 2.52}


 85%|████████▍ | 1470/1737 [1:01:43<03:03,  1.45it/s]

{'loss': 0.0428, 'learning_rate': 1.0792239288601455e-05, 'epoch': 2.54}


                                                     
 85%|████████▍ | 1470/1737 [1:01:59<03:03,  1.45it/s]

{'eval_loss': 0.2623893618583679, 'eval_runtime': 16.662, 'eval_samples_per_second': 34.75, 'eval_steps_per_second': 8.702, 'epoch': 2.54}


 85%|████████▌ | 1480/1737 [1:02:06<02:57,  1.45it/s]

{'loss': 0.1308, 'learning_rate': 1.0388035569927244e-05, 'epoch': 2.56}


                                                     
 85%|████████▌ | 1480/1737 [1:02:23<02:57,  1.45it/s]

{'eval_loss': 0.242574080824852, 'eval_runtime': 16.6124, 'eval_samples_per_second': 34.854, 'eval_steps_per_second': 8.728, 'epoch': 2.56}


 86%|████████▌ | 1490/1737 [1:02:30<02:49,  1.46it/s]

{'loss': 0.0022, 'learning_rate': 9.983831851253032e-06, 'epoch': 2.57}


                                                     
 86%|████████▌ | 1490/1737 [1:02:47<02:49,  1.46it/s]

{'eval_loss': 0.19954954087734222, 'eval_runtime': 16.6362, 'eval_samples_per_second': 34.804, 'eval_steps_per_second': 8.716, 'epoch': 2.57}


 86%|████████▋ | 1500/1737 [1:02:54<02:42,  1.46it/s]

{'loss': 0.0007, 'learning_rate': 9.57962813257882e-06, 'epoch': 2.59}


                                                     
 86%|████████▋ | 1500/1737 [1:03:11<02:42,  1.46it/s]

{'eval_loss': 0.219426229596138, 'eval_runtime': 16.903, 'eval_samples_per_second': 34.254, 'eval_steps_per_second': 8.578, 'epoch': 2.59}


 87%|████████▋ | 1510/1737 [1:03:18<02:36,  1.45it/s]

{'loss': 0.0003, 'learning_rate': 9.175424413904609e-06, 'epoch': 2.61}


                                                     
 87%|████████▋ | 1510/1737 [1:03:35<02:36,  1.45it/s]

{'eval_loss': 0.2279873639345169, 'eval_runtime': 16.79, 'eval_samples_per_second': 34.485, 'eval_steps_per_second': 8.636, 'epoch': 2.61}


 88%|████████▊ | 1520/1737 [1:03:42<02:31,  1.43it/s]

{'loss': 0.0003, 'learning_rate': 8.771220695230396e-06, 'epoch': 2.63}


                                                     
 88%|████████▊ | 1520/1737 [1:03:58<02:31,  1.43it/s]

{'eval_loss': 0.23289671540260315, 'eval_runtime': 16.7067, 'eval_samples_per_second': 34.657, 'eval_steps_per_second': 8.679, 'epoch': 2.63}


 88%|████████▊ | 1530/1737 [1:04:05<02:21,  1.46it/s]

{'loss': 0.2154, 'learning_rate': 8.367016976556184e-06, 'epoch': 2.64}


                                                     
 88%|████████▊ | 1530/1737 [1:04:22<02:21,  1.46it/s]

{'eval_loss': 0.21895818412303925, 'eval_runtime': 16.6471, 'eval_samples_per_second': 34.781, 'eval_steps_per_second': 8.71, 'epoch': 2.64}


 89%|████████▊ | 1540/1737 [1:04:29<02:15,  1.45it/s]

{'loss': 0.0174, 'learning_rate': 7.962813257881973e-06, 'epoch': 2.66}


                                                     
 89%|████████▊ | 1540/1737 [1:04:46<02:15,  1.45it/s]

{'eval_loss': 0.20078252255916595, 'eval_runtime': 16.6284, 'eval_samples_per_second': 34.82, 'eval_steps_per_second': 8.72, 'epoch': 2.66}


 89%|████████▉ | 1550/1737 [1:04:53<02:07,  1.46it/s]

{'loss': 0.0004, 'learning_rate': 7.558609539207762e-06, 'epoch': 2.68}


                                                     
 89%|████████▉ | 1550/1737 [1:05:09<02:07,  1.46it/s]

{'eval_loss': 0.19300371408462524, 'eval_runtime': 16.6252, 'eval_samples_per_second': 34.827, 'eval_steps_per_second': 8.722, 'epoch': 2.68}


 90%|████████▉ | 1560/1737 [1:05:16<02:01,  1.45it/s]

{'loss': 0.1802, 'learning_rate': 7.1544058205335494e-06, 'epoch': 2.69}


                                                     
 90%|████████▉ | 1560/1737 [1:05:33<02:01,  1.45it/s]

{'eval_loss': 0.19147495925426483, 'eval_runtime': 16.5996, 'eval_samples_per_second': 34.88, 'eval_steps_per_second': 8.735, 'epoch': 2.69}


 90%|█████████ | 1570/1737 [1:05:40<01:53,  1.47it/s]

{'loss': 0.0048, 'learning_rate': 6.750202101859338e-06, 'epoch': 2.71}


                                                     
 90%|█████████ | 1570/1737 [1:05:56<01:53,  1.47it/s]

{'eval_loss': 0.19070138037204742, 'eval_runtime': 16.6184, 'eval_samples_per_second': 34.841, 'eval_steps_per_second': 8.725, 'epoch': 2.71}


 91%|█████████ | 1580/1737 [1:06:03<01:47,  1.46it/s]

{'loss': 0.1104, 'learning_rate': 6.3459983831851255e-06, 'epoch': 2.73}


                                                     
 91%|█████████ | 1580/1737 [1:06:20<01:47,  1.46it/s]

{'eval_loss': 0.21223053336143494, 'eval_runtime': 16.6184, 'eval_samples_per_second': 34.841, 'eval_steps_per_second': 8.725, 'epoch': 2.73}


 92%|█████████▏| 1590/1737 [1:06:27<01:40,  1.46it/s]

{'loss': 0.0004, 'learning_rate': 5.941794664510914e-06, 'epoch': 2.75}


                                                     
 92%|█████████▏| 1590/1737 [1:06:44<01:40,  1.46it/s]

{'eval_loss': 0.22028526663780212, 'eval_runtime': 16.6148, 'eval_samples_per_second': 34.848, 'eval_steps_per_second': 8.727, 'epoch': 2.75}


 92%|█████████▏| 1600/1737 [1:06:51<01:33,  1.47it/s]

{'loss': 0.0003, 'learning_rate': 5.537590945836702e-06, 'epoch': 2.76}


                                                     
 92%|█████████▏| 1600/1737 [1:07:07<01:33,  1.47it/s]

{'eval_loss': 0.22196651995182037, 'eval_runtime': 16.6084, 'eval_samples_per_second': 34.862, 'eval_steps_per_second': 8.731, 'epoch': 2.76}


 93%|█████████▎| 1610/1737 [1:07:14<01:26,  1.47it/s]

{'loss': 0.3298, 'learning_rate': 5.13338722716249e-06, 'epoch': 2.78}


                                                     
 93%|█████████▎| 1610/1737 [1:07:31<01:26,  1.47it/s]

{'eval_loss': 0.20438311994075775, 'eval_runtime': 16.6908, 'eval_samples_per_second': 34.69, 'eval_steps_per_second': 8.687, 'epoch': 2.78}


 93%|█████████▎| 1620/1737 [1:07:39<01:21,  1.44it/s]

{'loss': 0.0003, 'learning_rate': 4.729183508488278e-06, 'epoch': 2.8}


                                                     
 93%|█████████▎| 1620/1737 [1:07:55<01:21,  1.44it/s]

{'eval_loss': 0.20459656417369843, 'eval_runtime': 16.6431, 'eval_samples_per_second': 34.789, 'eval_steps_per_second': 8.712, 'epoch': 2.8}


 94%|█████████▍| 1630/1737 [1:08:02<01:13,  1.46it/s]

{'loss': 0.1479, 'learning_rate': 4.324979789814067e-06, 'epoch': 2.82}


                                                     
 94%|█████████▍| 1630/1737 [1:08:19<01:13,  1.46it/s]

{'eval_loss': 0.2059028595685959, 'eval_runtime': 16.6492, 'eval_samples_per_second': 34.776, 'eval_steps_per_second': 8.709, 'epoch': 2.82}


 94%|█████████▍| 1640/1737 [1:08:26<01:06,  1.46it/s]

{'loss': 0.0023, 'learning_rate': 3.920776071139854e-06, 'epoch': 2.83}


                                                     
 94%|█████████▍| 1640/1737 [1:08:42<01:06,  1.46it/s]

{'eval_loss': 0.20994265377521515, 'eval_runtime': 16.5777, 'eval_samples_per_second': 34.926, 'eval_steps_per_second': 8.747, 'epoch': 2.83}


 95%|█████████▍| 1650/1737 [1:08:49<00:59,  1.47it/s]

{'loss': 0.0002, 'learning_rate': 3.516572352465643e-06, 'epoch': 2.85}


                                                     
 95%|█████████▍| 1650/1737 [1:09:06<00:59,  1.47it/s]

{'eval_loss': 0.21361838281154633, 'eval_runtime': 16.6062, 'eval_samples_per_second': 34.867, 'eval_steps_per_second': 8.732, 'epoch': 2.85}


 96%|█████████▌| 1660/1737 [1:09:13<00:52,  1.46it/s]

{'loss': 0.0003, 'learning_rate': 3.112368633791431e-06, 'epoch': 2.87}


                                                     
 96%|█████████▌| 1660/1737 [1:09:30<00:52,  1.46it/s]

{'eval_loss': 0.2154587358236313, 'eval_runtime': 16.569, 'eval_samples_per_second': 34.945, 'eval_steps_per_second': 8.751, 'epoch': 2.87}


 96%|█████████▌| 1670/1737 [1:09:37<00:45,  1.46it/s]

{'loss': 0.0002, 'learning_rate': 2.7081649151172193e-06, 'epoch': 2.88}


                                                     
 96%|█████████▌| 1670/1737 [1:09:53<00:45,  1.46it/s]

{'eval_loss': 0.2164653241634369, 'eval_runtime': 16.6024, 'eval_samples_per_second': 34.874, 'eval_steps_per_second': 8.734, 'epoch': 2.88}


 97%|█████████▋| 1680/1737 [1:10:00<00:39,  1.45it/s]

{'loss': 0.0002, 'learning_rate': 2.3039611964430073e-06, 'epoch': 2.9}


                                                     
 97%|█████████▋| 1680/1737 [1:10:17<00:39,  1.45it/s]

{'eval_loss': 0.21711452305316925, 'eval_runtime': 16.6591, 'eval_samples_per_second': 34.756, 'eval_steps_per_second': 8.704, 'epoch': 2.9}


 97%|█████████▋| 1690/1737 [1:10:25<00:32,  1.46it/s]

{'loss': 0.084, 'learning_rate': 1.8997574777687957e-06, 'epoch': 2.92}


                                                     
 97%|█████████▋| 1690/1737 [1:10:41<00:32,  1.46it/s]

{'eval_loss': 0.21439732611179352, 'eval_runtime': 16.6274, 'eval_samples_per_second': 34.822, 'eval_steps_per_second': 8.721, 'epoch': 2.92}


 98%|█████████▊| 1700/1737 [1:10:48<00:25,  1.46it/s]

{'loss': 0.0003, 'learning_rate': 1.4955537590945837e-06, 'epoch': 2.94}


                                                     
 98%|█████████▊| 1700/1737 [1:11:05<00:25,  1.46it/s]

{'eval_loss': 0.21411290764808655, 'eval_runtime': 16.6223, 'eval_samples_per_second': 34.833, 'eval_steps_per_second': 8.723, 'epoch': 2.94}


 98%|█████████▊| 1710/1737 [1:11:12<00:18,  1.46it/s]

{'loss': 0.1661, 'learning_rate': 1.091350040420372e-06, 'epoch': 2.95}


                                                     
 98%|█████████▊| 1710/1737 [1:11:28<00:18,  1.46it/s]

{'eval_loss': 0.2140273153781891, 'eval_runtime': 16.6342, 'eval_samples_per_second': 34.808, 'eval_steps_per_second': 8.717, 'epoch': 2.95}


 99%|█████████▉| 1720/1737 [1:11:35<00:11,  1.46it/s]

{'loss': 0.1962, 'learning_rate': 6.871463217461601e-07, 'epoch': 2.97}


                                                     
 99%|█████████▉| 1720/1737 [1:11:52<00:11,  1.46it/s]

{'eval_loss': 0.21340599656105042, 'eval_runtime': 16.5688, 'eval_samples_per_second': 34.945, 'eval_steps_per_second': 8.751, 'epoch': 2.97}


100%|█████████▉| 1730/1737 [1:11:59<00:04,  1.46it/s]

{'loss': 0.1021, 'learning_rate': 2.8294260307194823e-07, 'epoch': 2.99}


                                                     
100%|█████████▉| 1730/1737 [1:12:16<00:04,  1.46it/s]

{'eval_loss': 0.2132006585597992, 'eval_runtime': 16.7118, 'eval_samples_per_second': 34.646, 'eval_steps_per_second': 8.677, 'epoch': 2.99}


100%|██████████| 1737/1737 [1:12:21<00:00,  2.50s/it]

{'train_runtime': 4341.5926, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.4, 'train_loss': 0.2120598661430474, 'epoch': 3.0}





TrainOutput(global_step=1737, training_loss=0.2120598661430474, metrics={'train_runtime': 4341.5926, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.4, 'train_loss': 0.2120598661430474, 'epoch': 3.0})

In [16]:
trainer.save_model("./finetuned_kcbert_large")

In [21]:
model_path = "./finetuned_kcbert_large"

In [19]:
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-large")

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [24]:
texts = ["유익해요", "어려워요."]

In [25]:
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
outputs = model(**inputs)

In [27]:
probs = F.softmax(outputs.logits, dim=-1)
print(probs)


tensor([[1.0643e-04, 9.9989e-01],
        [9.9825e-01, 1.7488e-03]], grad_fn=<SoftmaxBackward0>)


In [28]:
predictions = torch.argmax(probs, dim=-1)
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Predicted label: {predictions[i].item()}, Probability: {probs[i][predictions[i]].item()}")
    print()

Text: 유익해요
Predicted label: 1, Probability: 0.9998935461044312

Text: 어려워요.
Predicted label: 0, Probability: 0.9982511401176453



In [43]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

# 모델을 평가 모드로 설정
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_preds = []
all_labels = []

# DataLoader 설정
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for batch in val_loader:
    # 배치에서 입력 데이터와 라벨을 분리
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)
        
        # 모델 예측
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)
        
        # 결과 저장
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# 성능 지표 계산 및 출력
print(classification_report(all_labels, all_preds, target_names=['Negative', 'Positive']))


              precision    recall  f1-score   support

    Negative       0.84      0.65      0.73        40
    Positive       0.97      0.99      0.98       539

    accuracy                           0.97       579
   macro avg       0.91      0.82      0.86       579
weighted avg       0.97      0.97      0.97       579



In [None]:
### 클래스 별 성능 지표
# Negative (부정 감정)

# Precision: 84% (부정이라고 예측한 것 중 실제로 부정인 것의 비율)
# Recall: 65% (실제 부정인 것 중 모델이 부정으로 예측한 것의 비율)
# F1-score: 73% (Precision과 Recall의 조화 평균)
# Positive (긍정 감정)

# Precision: 97% (긍정이라고 예측한 것 중 실제로 긍정인 것의 비율)
# Recall: 99% (실제 긍정인 것 중 모델이 긍정으로 예측한 것의 비율)
# F1-score: 98% (Precision과 Recall의 조화 평균)
# 전체 성능 지표
# Accuracy: 97% (전체 데이터 중 모델이 올바르게 예측한 비율)
# Macro Avg:
# Precision: 91%
# Recall: 82%
# F1-score: 86%
# Weighted Avg:
# Precision: 97%
# Recall: 97%
# F1-score: 97%
# 해석
# 모델은 긍정 감정에 대해서는 매우 높은 성능을 보이지만, 부정 감정에 대해서는 상대적으로 낮은 성능을 보입니다.
# 이는 데이터셋에 긍정 감정의 샘플이 부정 감정의 샘플보다 훨씬 많기 때문일 수 있습니다 (데이터 불균형).
# 모델의 성능을 향상시키기 위해서는 부정 감정의 샘플을 추가하거나, 데이터 불균형을 해결하기 위한 방법을 고려해볼 필요가 있습니다.
# 전체적으로 모델의 성능은 매우 좋지만, 부정 감정을 더 잘 예측할 수 있도록 모델을 개선할 여지가 있습니다.