In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
from einops import rearrange, reduce, repeat

from tqdm import tqdm

import time
import copy
from collections import defaultdict
import joblib
import gc
import os

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
N = 2
HIDDEN_DIM = 256
NUM_HEAD = 8 
INNER_DIM = 512

PAD_IDX = 0
EOS_IDX = 3

In [4]:
ddir = './'

src_train_path = os.path.join(ddir,'src_train.pkl')
src_valid_path = os.path.join(ddir,'src_valid.pkl')
trg_train_path = os.path.join(ddir,'trg_train.pkl')
trg_valid_path = os.path.join(ddir,'trg_valid.pkl')

src_train_path2 = os.path.join(ddir,'src_train2.pkl')
src_valid_path2 = os.path.join(ddir,'src_valid2.pkl')
trg_train_path2 = os.path.join(ddir,'trg_train2.pkl')
trg_valid_path2 = os.path.join(ddir,'trg_valid2.pkl')

In [5]:
src_train = joblib.load(src_train_path)
src_valid = joblib.load(src_valid_path)
trg_train = joblib.load(trg_train_path)
trg_valid = joblib.load(trg_valid_path)

src_train2 = joblib.load(src_train_path2)
src_valid2 = joblib.load(src_valid_path2)
trg_train2 = joblib.load(trg_train_path2)
trg_valid2 = joblib.load(trg_valid_path2)

In [6]:
labels = list(set(trg_train))
labels

['entertain',
 'society',
 'international',
 'economy',
 'culture',
 'politics',
 'it',
 'sport']

In [7]:
labels_dict = {}
for i in range(len(labels)) :
    labels_dict[labels[i]] = i
labels_dict

{'entertain': 0,
 'society': 1,
 'international': 2,
 'economy': 3,
 'culture': 4,
 'politics': 5,
 'it': 6,
 'sport': 7}

In [8]:
def multiLabelEncoder(labels_dict, target) :
    tmp = np.zeros((len(target), len(labels_dict)))
    for t in range(len(target)) :
        tmp[t][labels_dict[target[t]]] = 1
    return tmp

In [9]:
trg_valid2[:5]

['politics', 'politics', 'politics', 'society', 'economy']

In [10]:
test = multiLabelEncoder(labels_dict, trg_valid2)
test[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.]])

In [11]:
trg_train = multiLabelEncoder(labels_dict, trg_train)
trg_valid = multiLabelEncoder(labels_dict, trg_valid)
trg_train2 = multiLabelEncoder(labels_dict, trg_train2)
trg_valid2 = multiLabelEncoder(labels_dict, trg_valid2)
trg_valid2[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.]])

In [12]:
VOCAB_SIZE = 15*100*8
SEQ_LEN = 60*2

VOCAB_SIZE2 = 1108*8
SEQ_LEN2 = 4674*2

BATCH_SIZE = 64

In [13]:
class TrainDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg = self.trg_data[idx]

        return torch.Tensor(src).long(), torch.Tensor(trg).long()

train_dataset = TrainDataset(src_train, trg_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle= True, pin_memory=True)

In [14]:
class ValidDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg = self.trg_data[idx]

        return torch.Tensor(src).long(), torch.Tensor(trg).long()

valid_dataset = ValidDataset(src_valid, trg_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle= False, pin_memory=True)

In [15]:
class FFN(nn.Module):
    def __init__ (self, hidden_dim, inner_dim):
        super().__init__()
 
        self.hidden_dim = hidden_dim

        self.inner_dim = inner_dim 

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(0.1)
   
    def forward(self, input):
        output = input
        output = self.fc1(output)
        output2 = self.relu(output)
        output2 = self.dropout(output)
        output3 = self.fc2(output2)

        return output3

In [16]:

def makeMask(tensor, option: str) -> torch.Tensor:
  
    if option == 'padding':
        tmp = torch.full_like(tensor, fill_value=PAD_IDX).to(device)
       
        mask = (tensor != tmp).float()
        
        mask = rearrange(mask, 'bs seq_len -> bs 1 1 seq_len ')

    elif option == 'lookahead':

        padding_mask = makeMask(tensor, 'padding')
        padding_mask = repeat(
            padding_mask, 'bs 1 1 k_len -> bs 1 new k_len', new=padding_mask.shape[3])
        
        mask = torch.ones_like(padding_mask)
        mask = torch.tril(mask)

        mask = mask * padding_mask
        
    return mask

In [17]:
class Multiheadattention(nn.Module):
    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        self.scale = torch.sqrt(torch.FloatTensor()).to(device)

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(0.1)


    def forward(self, srcQ, srcK, srcV, mask=None):

        ##### SCALED DOT PRODUCT ATTENTION ######

        Q = self.fcQ(srcQ)
        K = self.fcK(srcK)
        V = self.fcV(srcV)

        Q = rearrange(
            Q, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        K_T = rearrange(
            K, 'bs seq_len (num_head head_dim) -> bs num_head head_dim seq_len', num_head=self.num_head)
        V = rearrange(
            V, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        
        attention_energy = torch.matmul(Q, K_T)

        if mask is not None :
 
            attention_energy = torch.masked_fill(attention_energy, (mask == 0), -1e+4)
            
        attention_energy = torch.softmax(attention_energy, dim = -1)

        result = torch.matmul(self.dropout(attention_energy),V)

        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT
        result = rearrange(result, 'bs num_head seq_len head_dim -> bs seq_len (num_head head_dim)')

        result = self.fcOut(result)

        return result

In [18]:
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim
        
        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)


        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)


    def forward(self, input, mask = None):

        output = self.multiheadattention(srcQ= input, srcK = input, srcV = input, mask = mask)
        output = self.dropout1(output)
        output = input + output
        output = self.layerNorm1(output)

        output_ = self.ffn(output)
        output_ = self.dropout2(output_)
        output = output + output_
        output = self.layerNorm2(output)

        return output

In [19]:
class Encoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim,max_length=1000):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=-1)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.enc_layers = nn.ModuleList([EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input):
        
        batch_size = input.shape[0]
        seq_len = input.shape[1]


        mask = makeMask(input, option='padding')

        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(device)

        output = self.dropout(self.embedding(input) + self.pos_embedding(pos))

        # Dropout
        output = self.dropout(output)

        # N encoder layer
        for layer in self.enc_layers:
            output = layer(output, mask)


        return output

In [20]:
class Transformer(nn.Module):
    def __init__(self, N = 2, hidden_dim = 256, num_head = 8, inner_dim = 512):
        super().__init__()
        self.encoder = Encoder(N, hidden_dim, num_head, inner_dim)
        self.mlp = nn.Sequential(nn.Linear(256, 64),
                                 nn.Linear(64,16),
                                 nn.GELU(),
                                 nn.Linear(16,8)
        )

    def forward(self, enc_src):

        enc_output = self.encoder(enc_src)
        pred = self.mlp(enc_output)

        return pred

In [21]:
model = Transformer(N, HIDDEN_DIM, NUM_HEAD, INNER_DIM).to(device)
model.eval()

Transformer(
  (encoder): Encoder(
    (embedding): Embedding(12000, 256, padding_idx=11999)
    (pos_embedding): Embedding(1000, 256)
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (multiheadattention): Multiheadattention(
          (fcQ): Linear(in_features=256, out_features=256, bias=True)
          (fcK): Linear(in_features=256, out_features=256, bias=True)
          (fcV): Linear(in_features=256, out_features=256, bias=True)
          (fcOut): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): FFN(
          (fc1): Linear(in_features=256, out_features=512, bias=True)
          (fc2): Linear(in_features=512, out_features=256, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layerNorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (layerNorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1):

In [22]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-4, weight_decay = 0)

criterion = nn.CrossEntropyLoss()

In [23]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):

    model.train()

    dataset_size = 0
    running_loss = 0
    running_accuracy = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg) in bar:
        src = src.to(device)
        trg = trg.to(device)

        batch_size = src.shape[0]

        pred = model(enc_src=src)
        
        loss = criterion(pred, trg)

        loss.backward()
    
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  
     
        optimizer.step()

        # zero the parameter gradients
        optimizer.zero_grad()

        # change learning rate by Scheduler
        if scheduler is not None:
            scheduler.step()

        running_loss += loss.item() * batch_size
        running_accuracy = np.mean(pred.detach().cpu().numpy() == trg.detach().cpu().numpy())

        accuracy += running_accuracy

        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"], accuracy=accuracy / np.float(
                step+1)
        )

    accuracy /= len(dataloader)

    gc.collect()

    return epoch_loss, accuracy

In [24]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg) in bar:
        src = src.to(device)
        trg = trg.to(device)

        batch_size = src.shape[0]

        pred = model(enc_src = src)
        loss = criterion(pred, trg)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

     
        val_loss = running_loss / dataset_size
        running_accuracy = np.mean(pred.view(-1).detach().cpu().numpy() == trg.view(-1).detach().cpu().numpy())
        
        accuracy += running_accuracy

        bar.set_postfix(
            Epoch=epoch, Valid_Loss=val_loss, LR=optimizer.param_groups[0]["lr"], accuracy = accuracy / np.float(step + 1)
        )

    accuracy /= len(dataloader)

    gc.collect()

    return val_loss, accuracy

In [25]:
def run_training(
    model,
    optimizer,
    scheduler,
    device,
    num_epochs,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
):

    if torch.cuda.is_available():
        print("[INFO] Using GPU:{}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    history = defaultdict(list)
    early_stop_counter = 0

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss, train_accuracy = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader= train_dataloader,
            device=device,
            epoch=epoch,
        )

        val_loss, val_accuracy = valid_one_epoch(
            model, valid_dataloader, device=device, epoch=epoch
        )

        history[f"{metric_prefix}Train Loss"].append(train_epoch_loss)
        history[f"{metric_prefix}Train Accuracy"].append(train_accuracy)
        history[f"{metric_prefix}Valid Loss"].append(val_loss)
        history[f"{metric_prefix}Valid Accuracy"].append(val_accuracy)


        print(f"Valid Loss : {val_loss}")

        if val_loss <= best_loss:
            early_stop_counter = 0

            print(
                f"Validation Loss improved( {best_loss} ---> {val_loss}  )"
            )

            # Update Best Loss
            best_loss = val_loss
            
            best_model_wts = copy.deepcopy(model.state_dict())

            PATH = "{}epoch{:.0f}_Loss{:.4f}.bin".format(file_prefix, epoch, best_loss)
            torch.save(model.state_dict(), PATH)
            torch.save(model.state_dict(), f"{file_prefix}best_{epoch}epoch.bin")

            print(f"Model Saved")

        elif early_stopping:
            early_stop_counter += 1
            if early_stop_counter > early_stopping_step:
                break
        
    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print("Best Loss: {:.4f}".format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history


In [26]:
run_training(
    model = model,
    optimizer = optimizer,
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=100, eta_min=1e-5),
    device = device,
    num_epochs = 2000,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
)

[INFO] Using GPU:NVIDIA A100-PCIE-40GB



  running_accuracy = np.mean(pred.detach().cpu().numpy() == trg.detach().cpu().numpy())
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"], accuracy=accuracy / np.float(
100%|██████████| 40/40 [00:00<00:00, 58.74it/s, Epoch=1, LR=6.89e-5, Train_Loss=3.71, accuracy=0]
  running_accuracy = np.mean(pred.view(-1).detach().cpu().numpy() == trg.view(-1).detach().cpu().numpy())
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Epoch=epoch, Valid_Loss=val_loss, LR=optimizer.param_groups[0]["lr"], accuracy = accuracy / np.float(step + 1)
100%|██████████| 10/10 [00:00<00:00, 155.57it/s, Epoch=1, LR=6.89e-5, Valid_Loss=2.64, accuracy=0]


Valid Loss : 2.6440712555198913
Validation Loss improved( inf ---> 2.6440712555198913  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 58.22it/s, Epoch=2, LR=1.86e-5, Train_Loss=2.33, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 201.53it/s, Epoch=2, LR=1.86e-5, Valid_Loss=1.92, accuracy=0]


Valid Loss : 1.9240868767355657
Validation Loss improved( 2.6440712555198913 ---> 1.9240868767355657  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 69.18it/s, Epoch=3, LR=1.86e-5, Train_Loss=1.96, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 204.40it/s, Epoch=3, LR=1.86e-5, Valid_Loss=1.76, accuracy=0]


Valid Loss : 1.7618302539655357
Validation Loss improved( 1.9240868767355657 ---> 1.7618302539655357  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 67.69it/s, Epoch=4, LR=6.89e-5, Train_Loss=1.7, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 201.34it/s, Epoch=4, LR=6.89e-5, Valid_Loss=1.33, accuracy=0]


Valid Loss : 1.3331764982004835
Validation Loss improved( 1.7618302539655357 ---> 1.3331764982004835  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.25it/s, Epoch=5, LR=0.0001, Train_Loss=1.08, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 194.90it/s, Epoch=5, LR=0.0001, Valid_Loss=0.686, accuracy=0]


Valid Loss : 0.685522800418222
Validation Loss improved( 1.3331764982004835 ---> 0.685522800418222  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 71.34it/s, Epoch=6, LR=6.89e-5, Train_Loss=0.554, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 226.24it/s, Epoch=6, LR=6.89e-5, Valid_Loss=0.425, accuracy=0]


Valid Loss : 0.4250426349366546
Validation Loss improved( 0.685522800418222 ---> 0.4250426349366546  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 65.16it/s, Epoch=7, LR=1.86e-5, Train_Loss=0.412, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 163.90it/s, Epoch=7, LR=1.86e-5, Valid_Loss=0.392, accuracy=0]


Valid Loss : 0.39159469277995407
Validation Loss improved( 0.4250426349366546 ---> 0.39159469277995407  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 50.38it/s, Epoch=8, LR=1.86e-5, Train_Loss=0.395, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 140.83it/s, Epoch=8, LR=1.86e-5, Valid_Loss=0.386, accuracy=0]


Valid Loss : 0.3863049141920296
Validation Loss improved( 0.39159469277995407 ---> 0.3863049141920296  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 58.91it/s, Epoch=9, LR=6.89e-5, Train_Loss=0.387, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 184.60it/s, Epoch=9, LR=6.89e-5, Valid_Loss=0.375, accuracy=0]


Valid Loss : 0.3745458615813286
Validation Loss improved( 0.3863049141920296 ---> 0.3745458615813286  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 61.76it/s, Epoch=10, LR=0.0001, Train_Loss=0.371, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 207.89it/s, Epoch=10, LR=0.0001, Valid_Loss=0.362, accuracy=0]


Valid Loss : 0.3615802325260867
Validation Loss improved( 0.3745458615813286 ---> 0.3615802325260867  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.43it/s, Epoch=11, LR=6.89e-5, Train_Loss=0.359, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 202.16it/s, Epoch=11, LR=6.89e-5, Valid_Loss=0.354, accuracy=0]


Valid Loss : 0.3544577400016177
Validation Loss improved( 0.3615802325260867 ---> 0.3544577400016177  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 70.25it/s, Epoch=12, LR=1.86e-5, Train_Loss=0.353, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 189.79it/s, Epoch=12, LR=1.86e-5, Valid_Loss=0.352, accuracy=0]


Valid Loss : 0.35239705860994425
Validation Loss improved( 0.3544577400016177 ---> 0.35239705860994425  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 69.28it/s, Epoch=13, LR=1.86e-5, Train_Loss=0.351, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 222.10it/s, Epoch=13, LR=1.86e-5, Valid_Loss=0.352, accuracy=0]


Valid Loss : 0.35163886133272937
Validation Loss improved( 0.35239705860994425 ---> 0.35163886133272937  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 66.99it/s, Epoch=14, LR=6.89e-5, Train_Loss=0.349, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 168.56it/s, Epoch=14, LR=6.89e-5, Valid_Loss=0.35, accuracy=0]


Valid Loss : 0.3496918951629833
Validation Loss improved( 0.35163886133272937 ---> 0.3496918951629833  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 72.10it/s, Epoch=15, LR=0.0001, Train_Loss=0.345, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 205.26it/s, Epoch=15, LR=0.0001, Valid_Loss=0.347, accuracy=0]


Valid Loss : 0.34679678993619933
Validation Loss improved( 0.3496918951629833 ---> 0.34679678993619933  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 66.27it/s, Epoch=16, LR=6.89e-5, Train_Loss=0.342, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 211.41it/s, Epoch=16, LR=6.89e-5, Valid_Loss=0.344, accuracy=0]


Valid Loss : 0.3441055775827663
Validation Loss improved( 0.34679678993619933 ---> 0.3441055775827663  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 70.77it/s, Epoch=17, LR=1.86e-5, Train_Loss=0.338, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 170.02it/s, Epoch=17, LR=1.86e-5, Valid_Loss=0.342, accuracy=0]


Valid Loss : 0.3417198788967861
Validation Loss improved( 0.3441055775827663 ---> 0.3417198788967861  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.97it/s, Epoch=18, LR=1.86e-5, Train_Loss=0.337, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 229.75it/s, Epoch=18, LR=1.86e-5, Valid_Loss=0.341, accuracy=0]


Valid Loss : 0.3411487625662688
Validation Loss improved( 0.3417198788967861 ---> 0.3411487625662688  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.57it/s, Epoch=19, LR=6.89e-5, Train_Loss=0.337, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 181.81it/s, Epoch=19, LR=6.89e-5, Valid_Loss=0.34, accuracy=0]


Valid Loss : 0.3395765144733866
Validation Loss improved( 0.3411487625662688 ---> 0.3395765144733866  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 73.09it/s, Epoch=20, LR=0.0001, Train_Loss=0.335, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 226.05it/s, Epoch=20, LR=0.0001, Valid_Loss=0.337, accuracy=0]


Valid Loss : 0.3366292563213664
Validation Loss improved( 0.3395765144733866 ---> 0.3366292563213664  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.27it/s, Epoch=21, LR=6.89e-5, Train_Loss=0.33, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 238.45it/s, Epoch=21, LR=6.89e-5, Valid_Loss=0.338, accuracy=0]


Valid Loss : 0.33840304651078146


100%|██████████| 40/40 [00:00<00:00, 67.12it/s, Epoch=22, LR=1.86e-5, Train_Loss=0.324, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 204.41it/s, Epoch=22, LR=1.86e-5, Valid_Loss=0.333, accuracy=0]


Valid Loss : 0.3327225450497524
Validation Loss improved( 0.3366292563213664 ---> 0.3327225450497524  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 72.41it/s, Epoch=23, LR=1.86e-5, Train_Loss=0.322, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 223.60it/s, Epoch=23, LR=1.86e-5, Valid_Loss=0.333, accuracy=0]


Valid Loss : 0.33260982621247603
Validation Loss improved( 0.3327225450497524 ---> 0.33260982621247603  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 69.30it/s, Epoch=24, LR=6.89e-5, Train_Loss=0.32, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 208.19it/s, Epoch=24, LR=6.89e-5, Valid_Loss=0.33, accuracy=0]


Valid Loss : 0.3303360198713412
Validation Loss improved( 0.33260982621247603 ---> 0.3303360198713412  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 74.29it/s, Epoch=25, LR=0.0001, Train_Loss=0.318, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 231.38it/s, Epoch=25, LR=0.0001, Valid_Loss=0.328, accuracy=0]


Valid Loss : 0.3282006502531137
Validation Loss improved( 0.3303360198713412 ---> 0.3282006502531137  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 69.51it/s, Epoch=26, LR=6.89e-5, Train_Loss=0.311, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 239.40it/s, Epoch=26, LR=6.89e-5, Valid_Loss=0.325, accuracy=0]


Valid Loss : 0.3250592278826768
Validation Loss improved( 0.3282006502531137 ---> 0.3250592278826768  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 70.30it/s, Epoch=27, LR=1.86e-5, Train_Loss=0.304, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 183.80it/s, Epoch=27, LR=1.86e-5, Valid_Loss=0.323, accuracy=0]


Valid Loss : 0.32303788593620253
Validation Loss improved( 0.3250592278826768 ---> 0.32303788593620253  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 54.58it/s, Epoch=28, LR=1.86e-5, Train_Loss=0.3, accuracy=0]  
100%|██████████| 10/10 [00:00<00:00, 165.22it/s, Epoch=28, LR=1.86e-5, Valid_Loss=0.323, accuracy=0]


Valid Loss : 0.3225363371478524
Validation Loss improved( 0.32303788593620253 ---> 0.3225363371478524  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 59.91it/s, Epoch=29, LR=6.89e-5, Train_Loss=0.298, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 209.64it/s, Epoch=29, LR=6.89e-5, Valid_Loss=0.324, accuracy=0]


Valid Loss : 0.32387557948470874


100%|██████████| 40/40 [00:00<00:00, 60.76it/s, Epoch=30, LR=0.0001, Train_Loss=0.299, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 191.99it/s, Epoch=30, LR=0.0001, Valid_Loss=0.321, accuracy=0]


Valid Loss : 0.3212295083483313
Validation Loss improved( 0.3225363371478524 ---> 0.3212295083483313  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 70.83it/s, Epoch=31, LR=6.89e-5, Train_Loss=0.292, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 228.10it/s, Epoch=31, LR=6.89e-5, Valid_Loss=0.319, accuracy=0]


Valid Loss : 0.3185038655806499
Validation Loss improved( 0.3212295083483313 ---> 0.3185038655806499  )
Model Saved


100%|██████████| 40/40 [00:00<00:00, 68.58it/s, Epoch=32, LR=1.86e-5, Train_Loss=0.28, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 217.09it/s, Epoch=32, LR=1.86e-5, Valid_Loss=0.321, accuracy=0]


Valid Loss : 0.32071771249649633


100%|██████████| 40/40 [00:00<00:00, 74.30it/s, Epoch=33, LR=1.86e-5, Train_Loss=0.279, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 196.44it/s, Epoch=33, LR=1.86e-5, Valid_Loss=0.32, accuracy=0]


Valid Loss : 0.32015845749028926


100%|██████████| 40/40 [00:00<00:00, 65.79it/s, Epoch=34, LR=6.89e-5, Train_Loss=0.277, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 194.64it/s, Epoch=34, LR=6.89e-5, Valid_Loss=0.322, accuracy=0]


Valid Loss : 0.3222711196370945


100%|██████████| 40/40 [00:00<00:00, 60.93it/s, Epoch=35, LR=0.0001, Train_Loss=0.277, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 184.69it/s, Epoch=35, LR=0.0001, Valid_Loss=0.324, accuracy=0]


Valid Loss : 0.323979553523337


100%|██████████| 40/40 [00:00<00:00, 69.61it/s, Epoch=36, LR=6.89e-5, Train_Loss=0.268, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 218.92it/s, Epoch=36, LR=6.89e-5, Valid_Loss=0.328, accuracy=0]


Valid Loss : 0.3277472534756752


100%|██████████| 40/40 [00:00<00:00, 64.05it/s, Epoch=37, LR=1.86e-5, Train_Loss=0.263, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 170.74it/s, Epoch=37, LR=1.86e-5, Valid_Loss=0.327, accuracy=0]


Valid Loss : 0.32672906700213245


100%|██████████| 40/40 [00:00<00:00, 67.54it/s, Epoch=38, LR=1.86e-5, Train_Loss=0.255, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 203.92it/s, Epoch=38, LR=1.86e-5, Valid_Loss=0.325, accuracy=0]


Valid Loss : 0.32507601133577385


100%|██████████| 40/40 [00:00<00:00, 65.96it/s, Epoch=39, LR=6.89e-5, Train_Loss=0.257, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 227.76it/s, Epoch=39, LR=6.89e-5, Valid_Loss=0.331, accuracy=0]


Valid Loss : 0.33080442354177975


100%|██████████| 40/40 [00:00<00:00, 69.55it/s, Epoch=40, LR=0.0001, Train_Loss=0.258, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 208.87it/s, Epoch=40, LR=0.0001, Valid_Loss=0.326, accuracy=0]


Valid Loss : 0.3262888005205021


100%|██████████| 40/40 [00:00<00:00, 74.82it/s, Epoch=41, LR=6.89e-5, Train_Loss=0.25, accuracy=0] 
100%|██████████| 10/10 [00:00<00:00, 220.61it/s, Epoch=41, LR=6.89e-5, Valid_Loss=0.328, accuracy=0]


Valid Loss : 0.3280114288542681


100%|██████████| 40/40 [00:00<00:00, 66.00it/s, Epoch=42, LR=1.86e-5, Train_Loss=0.246, accuracy=0]
100%|██████████| 10/10 [00:00<00:00, 198.22it/s, Epoch=42, LR=1.86e-5, Valid_Loss=0.328, accuracy=0]


Valid Loss : 0.3282806338018672
Training complete in 0h 0m 40s
Best Loss: 0.3185


(Transformer(
   (encoder): Encoder(
     (embedding): Embedding(12000, 256, padding_idx=11999)
     (pos_embedding): Embedding(1000, 256)
     (enc_layers): ModuleList(
       (0): EncoderLayer(
         (multiheadattention): Multiheadattention(
           (fcQ): Linear(in_features=256, out_features=256, bias=True)
           (fcK): Linear(in_features=256, out_features=256, bias=True)
           (fcV): Linear(in_features=256, out_features=256, bias=True)
           (fcOut): Linear(in_features=256, out_features=256, bias=True)
           (dropout): Dropout(p=0.1, inplace=False)
         )
         (ffn): FFN(
           (fc1): Linear(in_features=256, out_features=512, bias=True)
           (fc2): Linear(in_features=512, out_features=256, bias=True)
           (relu): ReLU()
           (dropout): Dropout(p=0.1, inplace=False)
         )
         (layerNorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
         (layerNorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True