### Setting.py

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 14.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 56.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [None]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [None]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [None]:
def load_data(path):
  TRAIN = os.path.join(path, 'benchmark_train_data.csv')
  AUG = os.path.join(path, 'aug_train_data.csv')
  TEST = os.path.join(path, 'test_data.csv')
  SS = os.path.join(path, 'sample_submission.csv')
  label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  train = pd.read_csv(TRAIN)
  aug = pd.read_csv(AUG)[:10000]
  
  test = pd.read_csv(TEST)
  sample_submission = pd.read_csv(SS)
  train['label'] = train['label'].map(label_dict)
  aug['label'] = aug['label'].map(label_dict)
  
  train = train.append(aug)
  train.reset_index(inplace=True)
  #train.set_index()
  display(train)
  return train,test,sample_submission

def text_clean(df):
  #=df["premise_"] = "<s>" + df["premise"].astype(str) + "[SEP]"
  df["premise_"] = "[CLS]"+df["premise"].astype(str)
  #df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["text_sum"] = df.premise_ + "[SEP]" + df.hypothesis_
  df = df[['text_sum','label']]
  return df 

def random_deletion(sentence, p=0.2):
    words = sentence.split ()
    n = len (words)
    if n == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    #print (remaining) 
    if len(remaining) == 0: # if not left, sample a random word
        return ' '.join ([random.choice(words)])
    else:
        return ' '.join (remaining)

def random_swap(sentence, n=2):
    sentence = sentence.split () 
    length = range(len(sentence))
    swapped = []
    if len(sentence) >2:
      for _ in range(n):
          idx1, idx2 = random.sample(length, 2)
          swapped.append ([sentence[idx1], sentence[idx2]])
          sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return ' '.join (sentence)

def eda_aug(df):

    cache = {'premise':[], 'hypothesis':[], 'label':[]}
    for idx in tqdm(range(len(df))):
        premise = df.iloc[idx]['premise']
        hypothesis = df.iloc[idx]['hypothesis']
        label = df.iloc[idx]['label']
        cache['premise'].append(premise)
        cache['hypothesis'].append(hypothesis)
        cache['label'].append(label)
        flag = random.randrange(10)
        if flag < 2:
          cache['premise'].append(random_deletion(premise))
          cache['hypothesis'].append(random_deletion(hypothesis))
          cache['label'].append(label)
          cache['premise'].append(random_swap(premise))
          cache['hypothesis'].append(random_swap(hypothesis))
          cache['label'].append(label)
    
    return pd.DataFrame(cache)

### Dataset.py

In [None]:
#ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
DATA = '/content/drive/Shareddrives/Dacon/data'
train,test,sample_submission = load_data(DATA)
###### AUGMENTATION ######
#train = eda_aug(train)
###### AUGMENTATION ######

clean_train,clean_test  = text_clean(train),text_clean(test)
display(clean_train)

Unnamed: 0,level_0,index,premise,hypothesis,label
0,0,0.0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,1
1,1,1.0,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,1
2,2,2.0,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,0
3,3,3.0,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,2
4,4,4.0,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,2
...,...,...,...,...,...
37993,9995,9995.0,2000년 집권한 로랑 그바그보 대통령은 지난해 11월 대선에서 패배해 재선에 실패했다.,재선에 실패한 대통령은 다시 대통령에 출마할 수 없다.,2
37994,9996,9996.0,시부야와 롯데로 이사하는 것이 좋습니다.,시부야나 롯데로만 이동할 수 있어요.,2
37995,9997,9997.0,"다만 지리산, 설악산, 덕유산 등 전국 국립공원 내 쉼터 14곳은 탐방객의 안전과 ...",전국 14개 국립공원이 모두 개장 대상에 포함됐다.,1
37996,9998,9998.0,이것은 현대 세계에서 고군분투하는 한 청소년에 관한 영화입니다.,그것은 현대 사회의 청소년들에 관한 영화였습니다.,0


Unnamed: 0,text_sum,label
0,"[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나...",1
1,[CLS]삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 ...,1
2,[CLS]이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.[SEP]예측적 범...,0
3,[CLS]광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민...,2
4,"[CLS]진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 ...",2
...,...,...
37993,[CLS]2000년 집권한 로랑 그바그보 대통령은 지난해 11월 대선에서 패배해 재...,2
37994,[CLS]시부야와 롯데로 이사하는 것이 좋습니다.[SEP]시부야나 롯데로만 이동할 ...,2
37995,"[CLS]다만 지리산, 설악산, 덕유산 등 전국 국립공원 내 쉼터 14곳은 탐방객의...",1
37996,[CLS]이것은 현대 세계에서 고군분투하는 한 청소년에 관한 영화입니다.[SEP]그...,0


In [None]:
model_roberta = AutoModel.from_pretrained("klue/roberta-large")
tokenizer_roberta = AutoTokenizer.from_pretrained("klue/roberta-large")

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
def roberta_transform(text):
  transform = tokenizer_roberta(text,
                                pad_to_max_length=True,
                               truncation=True,
                               max_length=256,
                               return_tensors='pt',
                                add_special_tokens=False)
  return transform

In [None]:
class customDataset(Dataset):
  def __init__(self,dataset,mode='train',transform=roberta_transform):
    super(customDataset, self).__init__()
    self.mode = mode
    self.dataset = dataset.reset_index(drop=True)
    self.transform = transform

  def __getitem__(self, idx):
    
    text = self.dataset['text_sum'][idx]
    tokens = self.transform(text)
    token_ids = tokens['input_ids'][0]  # tensor of token ids
    attn_masks = tokens['attention_mask'][0]  # binary tensor with "0" for padded values and "1" for the other values
    token_type_ids = tokens['token_type_ids'][0]  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

    if self.mode == 'test':
      return token_ids,attn_masks,token_type_ids
    else: 
      labels = self.dataset['label'][idx]
      return token_ids,attn_masks,token_type_ids, labels
  
  def __len__(self):
    return(len(self.dataset))

### Model.py

In [None]:
class ROBERTaClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 1024,
                 num_classes=3,   ##클래스 수 조정##
                 params=None,
                 freeze_bert=False):
        super(ROBERTaClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

                 
        self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer2 = nn.Linear(128,num_classes)
    

    def forward(self, input_ids, attn_masks):
        
        _,pooler = self.bert(input_ids, attn_masks, return_dict=False)
        output1 = self.classifier(pooler)
        output2 = self.fc_layer1(output1)
        output3 = self.fc_layer2(self.dropout(output2))
        return (output3)

### Train.py

In [None]:
device = torch.device("cuda")
!nvidia-smi

Sun Feb 27 00:19:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
###### HYPERPARMS ######
lr = 2e-5

batch_size=8
warmup_ratio = 0.06
num_epochs = 10

log_interval = 500
#learning_rate = 5e-6

In [None]:
############### CV ################
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5,shuffle=True,random_state=42)
folds=[]
for trn_idx,val_idx in skf.split(clean_train['text_sum'], train['label']):
    folds.append((trn_idx,val_idx))

In [None]:


############### CV Training ###############
best_models = []


for i,fold in enumerate(range(4,5)):
    print('===============',i+1,'fold start===============')
    model = ROBERTaClassifier(model_roberta).to(device)
    model=nn.DataParallel(model).to(device)
    
    optimizer = AdamW(model.parameters(), lr=lr)

    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]
    train_data = train.loc[train_idx]
    val_data = train.loc[valid_idx]
    train_dataset = customDataset(train_data,'train')
    valid_dataset = customDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=5, num_training_steps=total_steps)

    
    for epoch in range( num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()
        
        for input_ids_batch, attention_masks_batch, token_type_ids_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device),attn_masks= attention_masks_batch.to(device))
            loss1 = F.cross_entropy(y_pred, y_batch)
            one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
            loss2 = F.mse_loss(y_pred, one_hot)
            loss3= F.hinge_embedding_loss(y_pred, one_hot)
            loss = 0.7*loss1+0.15*loss2+0.15*loss3
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)
            batches += 1
            if batches % log_interval == 0:
                print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct.float() / total)
        
        val_loss = []
        val_acc = []
        
        for input_ids_batch, attention_masks_batch, token_type_ids, y_batch in tqdm(valid_loader):
            
            model.eval()
            with torch.no_grad():
                
                y_pred = model(input_ids_batch.to(device),attn_masks= attention_masks_batch.to(device))
                valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)
                
        
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)
        
        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        print(optimizer.param_groups[0]["lr"])
        if epoch % 3 == 0:
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model.state_dict(), f'/content/drive/Shareddrives/Dacon/hongsun/ROBERTa_large_fold_{fold}_{epoch}.pth') 
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))



 13%|█▎        | 500/3800 [06:12<41:13,  1.33it/s]

Batch Loss:  0.9216977579593658 Accuracy:  tensor(0.3473, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:47,  1.34it/s]

Batch Loss:  0.9198011909723282 Accuracy:  tensor(0.3544, device='cuda:0')


 39%|███▉      | 1500/3800 [18:38<28:38,  1.34it/s]

Batch Loss:  0.920171161254247 Accuracy:  tensor(0.3470, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9199990788400173 Accuracy:  tensor(0.3423, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:10,  1.34it/s]

Batch Loss:  0.9198803700685501 Accuracy:  tensor(0.3416, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:57,  1.34it/s]

Batch Loss:  0.9198085249265034 Accuracy:  tensor(0.3396, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.9197137882198606 Accuracy:  tensor(0.3373, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [03:59<00:00,  3.96it/s]


Epoch: 0 - valid Loss: 1.098176 - valid_acc : 0.342538
1.9511714293991217e-05
model save, model val acc :  0.3425375939849624
best_models size :  1


 13%|█▎        | 500/3800 [06:12<41:00,  1.34it/s]

Batch Loss:  0.9194196540117264 Accuracy:  tensor(0.3255, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:48,  1.34it/s]

Batch Loss:  0.9189809842705726 Accuracy:  tensor(0.3305, device='cuda:0')


 39%|███▉      | 1500/3800 [18:37<28:36,  1.34it/s]

Batch Loss:  0.9183636392752329 Accuracy:  tensor(0.3332, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:24,  1.34it/s]

Batch Loss:  0.9181727999448777 Accuracy:  tensor(0.3339, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:09,  1.34it/s]

Batch Loss:  0.9179832074642181 Accuracy:  tensor(0.3353, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:56,  1.34it/s]

Batch Loss:  0.9178590579430262 Accuracy:  tensor(0.3353, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.9177494425603322 Accuracy:  tensor(0.3355, device='cuda:0')


100%|██████████| 3800/3800 [47:11<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.94it/s]


Epoch: 1 - valid Loss: 1.098611 - valid_acc : 0.342538
1.8092113527373987e-05


 13%|█▎        | 500/3800 [06:12<41:01,  1.34it/s]

Batch Loss:  0.9175937502384186 Accuracy:  tensor(0.3378, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:52,  1.34it/s]

Batch Loss:  0.9163469630479812 Accuracy:  tensor(0.3486, device='cuda:0')


 39%|███▉      | 1500/3800 [18:38<28:35,  1.34it/s]

Batch Loss:  0.9166411354939142 Accuracy:  tensor(0.3466, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:21,  1.34it/s]

Batch Loss:  0.9167067572176456 Accuracy:  tensor(0.3437, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:08,  1.34it/s]

Batch Loss:  0.9167746053934097 Accuracy:  tensor(0.3433, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:56,  1.34it/s]

Batch Loss:  0.9168381319840749 Accuracy:  tensor(0.3431, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.9169056904315949 Accuracy:  tensor(0.3421, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.95it/s]


Epoch: 2 - valid Loss: 1.098814 - valid_acc : 0.319380
1.5880193533887023e-05


 13%|█▎        | 500/3800 [06:12<40:59,  1.34it/s]

Batch Loss:  0.9162483100891113 Accuracy:  tensor(0.3523, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:49,  1.34it/s]

Batch Loss:  0.9164621985554695 Accuracy:  tensor(0.3484, device='cuda:0')


 39%|███▉      | 1500/3800 [18:37<28:35,  1.34it/s]

Batch Loss:  0.9165676903327306 Accuracy:  tensor(0.3461, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9167726722657681 Accuracy:  tensor(0.3436, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:09,  1.34it/s]

Batch Loss:  0.9165817977428437 Accuracy:  tensor(0.3432, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:57,  1.34it/s]

Batch Loss:  0.9163967281778653 Accuracy:  tensor(0.3442, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.9164272657803126 Accuracy:  tensor(0.3432, device='cuda:0')


100%|██████████| 3800/3800 [47:11<00:00,  1.34it/s]
100%|██████████| 950/950 [03:59<00:00,  3.96it/s]


Epoch: 3 - valid Loss: 1.098203 - valid_acc : 0.342538
1.309252897393379e-05
model save, model val acc :  0.3425375939849624
best_models size :  2


 13%|█▎        | 500/3800 [06:12<41:00,  1.34it/s]

Batch Loss:  0.9157886086702347 Accuracy:  tensor(0.3368, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:49,  1.34it/s]

Batch Loss:  0.9156170400381088 Accuracy:  tensor(0.3350, device='cuda:0')


 39%|███▉      | 1500/3800 [18:38<28:36,  1.34it/s]

Batch Loss:  0.9160131194194158 Accuracy:  tensor(0.3351, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9161024796068669 Accuracy:  tensor(0.3351, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:08,  1.34it/s]

Batch Loss:  0.9161710443496704 Accuracy:  tensor(0.3348, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:16<09:56,  1.34it/s]

Batch Loss:  0.9161657460530599 Accuracy:  tensor(0.3351, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.916071194427354 Accuracy:  tensor(0.3369, device='cuda:0')


100%|██████████| 3800/3800 [47:11<00:00,  1.34it/s]
100%|██████████| 950/950 [03:59<00:00,  3.96it/s]


Epoch: 4 - valid Loss: 1.098195 - valid_acc : 0.342538
1.0002067109245441e-05


 13%|█▎        | 500/3800 [06:12<41:00,  1.34it/s]

Batch Loss:  0.9166970031261444 Accuracy:  tensor(0.3210, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:47,  1.34it/s]

Batch Loss:  0.9162932348847389 Accuracy:  tensor(0.3306, device='cuda:0')


 39%|███▉      | 1500/3800 [18:37<28:34,  1.34it/s]

Batch Loss:  0.9162282061179479 Accuracy:  tensor(0.3307, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9159996773302556 Accuracy:  tensor(0.3336, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:02<16:09,  1.34it/s]

Batch Loss:  0.915863318490982 Accuracy:  tensor(0.3342, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:56,  1.34it/s]

Batch Loss:  0.9157683650851249 Accuracy:  tensor(0.3353, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.915660280755588 Accuracy:  tensor(0.3370, device='cuda:0')


100%|██████████| 3800/3800 [47:11<00:00,  1.34it/s]
100%|██████████| 950/950 [03:59<00:00,  3.96it/s]


Epoch: 5 - valid Loss: 1.098157 - valid_acc : 0.342538
6.911402848681934e-06


 13%|█▎        | 500/3800 [06:12<40:58,  1.34it/s]

Batch Loss:  0.9153591719865799 Accuracy:  tensor(0.3333, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:47,  1.34it/s]

Batch Loss:  0.915657383620739 Accuracy:  tensor(0.3295, device='cuda:0')


 39%|███▉      | 1500/3800 [18:37<28:35,  1.34it/s]

Batch Loss:  0.915679368019104 Accuracy:  tensor(0.3330, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9155983111262321 Accuracy:  tensor(0.3356, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:10,  1.34it/s]

Batch Loss:  0.9154852715492249 Accuracy:  tensor(0.3371, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:15<09:56,  1.34it/s]

Batch Loss:  0.9154648790359498 Accuracy:  tensor(0.3380, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:28<03:43,  1.34it/s]

Batch Loss:  0.9154887601477759 Accuracy:  tensor(0.3381, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.95it/s]


Epoch: 6 - valid Loss: 1.098159 - valid_acc : 0.342538
4.1231509181934045e-06
model save, model val acc :  0.3425375939849624
best_models size :  3


 13%|█▎        | 500/3800 [06:12<41:04,  1.34it/s]

Batch Loss:  0.9150429039001465 Accuracy:  tensor(0.3438, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:50,  1.34it/s]

Batch Loss:  0.915354495882988 Accuracy:  tensor(0.3396, device='cuda:0')


 39%|███▉      | 1500/3800 [18:38<28:35,  1.34it/s]

Batch Loss:  0.9154304110209147 Accuracy:  tensor(0.3388, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:23,  1.34it/s]

Batch Loss:  0.9152509293556214 Accuracy:  tensor(0.3422, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:09,  1.34it/s]

Batch Loss:  0.9152272896289826 Accuracy:  tensor(0.3415, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:16<09:56,  1.34it/s]

Batch Loss:  0.9152241521080335 Accuracy:  tensor(0.3417, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:29<03:43,  1.34it/s]

Batch Loss:  0.9152404301507132 Accuracy:  tensor(0.3393, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.96it/s]


Epoch: 7 - valid Loss: 1.098207 - valid_acc : 0.342538
1.910316090440226e-06


 13%|█▎        | 500/3800 [06:12<40:59,  1.34it/s]

Batch Loss:  0.9146191902160644 Accuracy:  tensor(0.3473, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:48,  1.34it/s]

Batch Loss:  0.9146729299426078 Accuracy:  tensor(0.3460, device='cuda:0')


 39%|███▉      | 1500/3800 [18:37<28:34,  1.34it/s]

Batch Loss:  0.9147026085456212 Accuracy:  tensor(0.3439, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:22,  1.34it/s]

Batch Loss:  0.914687517285347 Accuracy:  tensor(0.3438, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:10,  1.34it/s]

Batch Loss:  0.9148332039833069 Accuracy:  tensor(0.3409, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:16<09:56,  1.34it/s]

Batch Loss:  0.9148482124010722 Accuracy:  tensor(0.3406, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:29<03:43,  1.34it/s]

Batch Loss:  0.9149144597564425 Accuracy:  tensor(0.3385, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.95it/s]


Epoch: 8 - valid Loss: 1.098184 - valid_acc : 0.342538
4.895625995541687e-07


 13%|█▎        | 500/3800 [06:12<41:03,  1.34it/s]

Batch Loss:  0.9148637340068817 Accuracy:  tensor(0.3395, device='cuda:0')


 26%|██▋       | 1000/3800 [12:25<34:52,  1.34it/s]

Batch Loss:  0.9150417333841324 Accuracy:  tensor(0.3383, device='cuda:0')


 39%|███▉      | 1500/3800 [18:38<28:35,  1.34it/s]

Batch Loss:  0.9150765441258748 Accuracy:  tensor(0.3415, device='cuda:0')


 53%|█████▎    | 2000/3800 [24:50<22:22,  1.34it/s]

Batch Loss:  0.9149330267310143 Accuracy:  tensor(0.3423, device='cuda:0')


 66%|██████▌   | 2500/3800 [31:03<16:10,  1.34it/s]

Batch Loss:  0.9148294288873673 Accuracy:  tensor(0.3436, device='cuda:0')


 79%|███████▉  | 3000/3800 [37:16<09:56,  1.34it/s]

Batch Loss:  0.914828966041406 Accuracy:  tensor(0.3427, device='cuda:0')


 92%|█████████▏| 3500/3800 [43:29<03:43,  1.34it/s]

Batch Loss:  0.9149523602553776 Accuracy:  tensor(0.3417, device='cuda:0')


100%|██████████| 3800/3800 [47:12<00:00,  1.34it/s]
100%|██████████| 950/950 [04:00<00:00,  3.95it/s]


Epoch: 9 - valid Loss: 1.098189 - valid_acc : 0.342538
0.0
model save, model val acc :  0.3425375939849624
best_models size :  4
