# Encoder - Decoder (seq2seq) from scratch in pytorch

In [1]:
import torch
import pandas as pd

from torch.utils.data import Dataset,DataLoader
import random
from dataclasses import dataclass
import requests
import gzip
import shutil
# from torchtext.utils import download_from_url,extract_archive

In [2]:
def set_seed(seed:int = 42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [3]:
@dataclass
class ModelArgs():
    epochs = 500
    batch_size = 32
    no_of_neurons = 64
    embedding_dim = 32
    max_lr = 1e-4
    ht_size = no_of_neurons
    ct_size = ht_size
    device = "cuda" if torch.cuda.is_available() else "cpu"
    block_size = 32 # seq length
    
    no_of_lstm_layes = 4
    de_vocab_size = None
    en_vocab_size = None

In [4]:
base_url = "https://github.com/multi30k/dataset/raw/refs/heads/master/data/task1/raw/"

train_url = ("train.de.gz","train.en.gz")
val_url = ("val.de.gz","val.en.gz",)
test_url = ("test_2016_flickr.de.gz","test_2016_flickr.en.gz",)

import requests
from time import sleep

def download_file(url, file_name, retries=3):
    for attempt in range(retries):
        try:
            with requests.get(url, stream=True, timeout=10) as r:
                r.raise_for_status()
                with open(file_name, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print(f"Downloaded: {file_name}")
            break
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            sleep(2)  # wait before retrying
            if attempt == retries - 1:
                print(f"Failed to download {file_name} after {retries} attempts.")

        
download_file(base_url+train_url[0],train_url[0])
download_file(base_url+train_url[1],train_url[1])
download_file(base_url+val_url[0],val_url[0])
download_file(base_url+val_url[1],val_url[1])
download_file(base_url+test_url[0],test_url[0])
download_file(base_url+test_url[1],test_url[1])

Attempt 1 failed: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Downloaded: train.de.gz
Downloaded: train.en.gz
Downloaded: val.de.gz
Downloaded: val.en.gz
Downloaded: test_2016_flickr.de.gz
Downloaded: test_2016_flickr.en.gz


In [5]:
def extract_file(in_file,out_file):
    with gzip.open(in_file,"rb") as f_in:
        with open(out_file,"wb") as f_out:
            shutil.copyfileobj(f_in,f_out)
    
    return out_file
            
    
            

train_paths = [extract_file(i,i[:-3]) for i in train_url]
val_paths = [extract_file(i,i[:-3]) for i in val_url]
test_paths = [extract_file(i,i[:-3]) for i in test_url]


In [6]:
train_paths,val_paths,test_paths

(['train.de', 'train.en'],
 ['val.de', 'val.en'],
 ['test_2016_flickr.de', 'test_2016_flickr.en'])

In [7]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     - -------------------------------------- 0.5/14.6 MB 4.2 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/14.6 MB 5.2 MB/s eta 0:00:03
     ------ --------------------------------- 2.4/14.6 MB 4.6 MB/s eta 0:00:03
     --------- ------------------------------ 3.4/14.6 MB 4.4 MB/s eta 0:00:03
     ----------- ---------------------------- 4.2/14.6 MB 4.3 MB/s eta 0:00:03
     ------------- -------------------------- 5.0/14.6 MB 4.2 MB/s eta 0:00:03
     --------------- ------------------------ 5.8/14.6 MB 4.1 MB/s eta 0:00:03
     ----------------- ---------------------- 6.6/14.6 MB 4.1 MB/s eta 0:00:02
     -------------------- ------------------- 7.6/14.6 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------

In [8]:
import spacy
de_tokenizer = spacy.load("de_core_news_sm")
en_tokenizer = spacy.load("en_core_web_sm")

In [9]:
from collections import Counter,defaultdict
import io

def tokenize(text,tokenizer):
    doc = tokenizer(text)
    return [token.text.lower() for token in doc if not token.is_space]

def build_vocab(filepath,tokenizer,min_freq=1,specials=['<unk>','<bos>','<eos>','<pad>']):
    
    counter = Counter()
    with io.open(filepath,encoding='utf-8') as f:
        for string_ in f:
            counter.update(tokenize(string_,tokenizer))
            
    tokens = [tok for tok,freq in counter.items() if freq>=min_freq]
    print("completed collecting tokens")
    
    vocab = {tok:idx for idx,tok in enumerate(specials+tokens)}
    
    unk_idx = vocab["<unk>"]
    vocab = defaultdict(lambda : unk_idx,vocab)
        
    return vocab

de_vocab = build_vocab(train_paths[0],de_tokenizer)
ModelArgs.de_vocab_size = len(de_vocab)+1
en_vocab = build_vocab(train_paths[1],en_tokenizer)
ModelArgs.en_vocab_size = len(en_vocab)+1

completed collecting tokens
completed collecting tokens


In [10]:
ModelArgs.de_vocab_size,ModelArgs.en_vocab_size


(18667, 9797)

In [11]:
def create_data(filepaths):
    raw_de_iter = iter(io.open(filepaths[0],encoding='utf-8'))
    raw_en_iter = iter(io.open(filepaths[1],encoding="utf-8"))
    
    de_bos_idx = de_vocab["<bos>"]
    de_eos_idx = de_vocab["<eos>"]
    en_bos_idx = en_vocab["<bos>"]
    en_eos_idx = en_vocab["<eos>"]
    
    data = []
    
    for raw_de,raw_en in zip(raw_de_iter,raw_en_iter):
        de_tensor = torch.tensor([de_vocab[token] for token in raw_de],dtype=torch.long)
        en_tensor = torch.tensor([en_vocab[token] for token in raw_en],dtype=torch.long)
        
        en_tensor = torch.cat([torch.tensor([en_bos_idx]),en_tensor,torch.tensor([en_eos_idx])])
        
        de_tensor = torch.flip(de_tensor,dims=[0])
        
        data.append((de_tensor,en_tensor))
        
    return data

train_data = create_data(train_paths)
val_data = create_data(val_paths)
test_data = create_data(test_paths)



In [12]:
class TranslationDataset(Dataset):
    def __init__(self,data):
        self.data = data
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
train_dataset = TranslationDataset(train_data)
val_dataset = TranslationDataset(val_data)
test_dataset = TranslationDataset(test_data)


In [13]:
from torch.nn.utils.rnn import pad_sequence

def collate_function(batch,block_size=ModelArgs.block_size):
    """
    Collate function to pad or truncate sequences to a fixed block size.

    Args:
        batch: A list of tuples (de_tensor, en_tensor).
        block_size: The fixed length to pad or truncate sequences to.

    Returns:
        de_batch: Padded/truncated German sequences (batch_size, block_size).
        en_batch: Padded/truncated English sequences (batch_size, block_size).
    """
    
    de_batch , en_batch = zip(*batch)
    
    def pad_or_truncate(sequence,block_size,pad_value):
        if len(sequence) > block_size: # truncate
            return sequence[:block_size]
        
        else: # pad
            padding_len = block_size - len(sequence)
            return torch.cat([sequence,torch.full(size=(padding_len,),fill_value=pad_value,dtype=sequence.dtype)])
    
    de_batch = [pad_or_truncate(sample,block_size=block_size,pad_value=de_vocab["<pad>"]) for sample in de_batch ]
    en_batch = [pad_or_truncate(sample,block_size=block_size,pad_value=en_vocab["<pad>"]) for sample in en_batch ]
    
    de_batch = torch.stack(de_batch)
    en_batch = torch.stack(en_batch)
    
    return de_batch,en_batch
            

In [14]:
# Note 

# When you use a DataLoader, PyTorch does this internally:
# It draws batch_size samples from your dataset using __getitem__
# It puts those samples in a list (e.g., batch = [sample1, sample2, ..., sampleN])
# It calls your collate_fn(batch) to process that list into a proper batch



In [15]:
train_dataloader= DataLoader(dataset=train_dataset,
                             batch_size=ModelArgs.batch_size,
                             shuffle=True,
                             collate_fn=collate_function,
                             drop_last=True)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=ModelArgs.batch_size,
                            shuffle=True,
                            collate_fn=collate_function,
                            drop_last=True)

test_dataloader = DataLoader(dataset=test_dataset,
                            batch_size=ModelArgs.block_size,
                            shuffle=True,
                            collate_fn=collate_function,
                            drop_last=True)



In [16]:
for batch in train_dataloader:
    print(len(batch))
    break

for deb,enb in train_dataloader:
    print(deb.shape)
    print(enb.shape)
    break

2
torch.Size([32, 32])
torch.Size([32, 32])


## LSTM

In [17]:
import torch.nn as nn

class ForgetGate(nn.Module):
    def __init__(self,h_t_size,embedding_dim):
        super(ForgetGate,self).__init__()
        
        self.sigma_nn = nn.Sequential(
            nn.Linear(in_features=h_t_size+embedding_dim,out_features=h_t_size),
            nn.Sigmoid()
        )
        
    def forward(self,h_t,X_t):
        # print(f"h_t : {h_t.shape}")
        # print(f"X_t : {X_t.shape}")
        combined = torch.cat([h_t,X_t],dim=1)
        
        # print(f"combined : {combined.shape}")
        
        # print(self.sigma_nn[0].in_features)
        
        f_t = self.sigma_nn(combined)
        
        return f_t

In [18]:
class InputGate(nn.Module):
    def __init__(self,h_t_size,embedding_dim):
        super(InputGate,self).__init__()
        
        self.sigma_nn = nn.Sequential(
            nn.Linear(in_features=h_t_size+embedding_dim,out_features=h_t_size),
            nn.Sigmoid()
        )
        
        self.tanh_nn = nn.Sequential(
            nn.Linear(in_features=h_t_size+embedding_dim,out_features=h_t_size),
            nn.Tanh()
        )
        
    def forward(self,h_t,X_t):
        combined = torch.cat([h_t,X_t],dim=1)
        
        i_t = self.sigma_nn(combined)
        
        candidate_hidden_state = self.tanh_nn(combined)
        
        c_t_dash = i_t * candidate_hidden_state
        
        return c_t_dash

In [19]:
class OutputGate(nn.Module):
    def __init__(self,h_t_size,embedding_dim):
        super(OutputGate,self).__init__()
        
        self.sigma_nn = nn.Sequential(
            nn.Linear(in_features=h_t_size+embedding_dim,out_features=h_t_size),
            nn.Sigmoid()
        )
        
    def forward(self,h_t,X_t):
        combined = torch.cat([h_t,X_t],dim=1)
        
        o_t = self.sigma_nn(combined)
        
        return o_t

In [20]:
class LSTMCell(nn.Module):
    def __init__(self,h_t_size,embedding_dim):
        super(LSTMCell,self).__init__()
        
        self.forget_gate = ForgetGate(h_t_size=h_t_size,embedding_dim=embedding_dim)
        
        self.input_gate = InputGate(h_t_size=h_t_size,embedding_dim=embedding_dim)
        
        self.output_gate = ForgetGate(h_t_size=h_t_size,embedding_dim=embedding_dim)
        
    def forward(self,h_t,c_t,X_t):
        f_t = self.forget_gate(h_t,X_t)
        
        c_t = c_t * f_t
        
        c_t_dash = self.input_gate(h_t,X_t)
        
        c_t = c_t + c_t_dash
        
        o_t = self.output_gate(h_t,X_t)
        
        h_t = torch.tanh(c_t) * o_t
        
        return h_t,c_t

In [21]:
class LSTMModel(nn.Module):
    def __init__(self,h_t_size,embedding_dim):
        # embedding dim -> input dim
        super(LSTMModel,self).__init__()
        
        self.lstm_cell = LSTMCell(h_t_size=h_t_size,embedding_dim=embedding_dim)
        
    def forward(self,X,h_t=None,c_t=None):
        """
        Returns the final hidden state vector & cell state vector after performaing forward pass for all time steps through the single LSTM cell 
        
        Args:
            X : input
            y : target
            h_t : hidden state vector , default = None
            c_t : cell state vector , default = None
        Return:
            h_t : hidden state vector after all time steps
            c_t : cell satet vector after all time steps
        """
        if h_t == None:
            h_t =torch.zeros(size=[ModelArgs.batch_size,ModelArgs.ht_size],device=ModelArgs.device) # if we facilitate automatic device conversion while converting the model device , we have to register them to model parameters(i.e, in __ini__())
        if c_t == None:
            c_t = torch.zeros(size=[ModelArgs.batch_size,ModelArgs.ht_size],device=ModelArgs.device) # if we facilitate automatic device conversion while converting the model device , we have to register them to model parameters(i.e, in __ini__())
    
        # print(f"X shape : {X.shape}")
        # for i in range(X.shape[0]):
        #     X_i = X[:,i].unsqueeze(1)
        #     print(f"X_i : {X_i.shape}")
        #     h_t,c_t = self.lstm_cell(h_t,c_t,X_i)
        
        h_t,c_t = self.lstm_cell(h_t,c_t,X)
        
        return h_t,c_t
        
        

In [22]:
import torch.nn.functional as F
class Embeddingtable(nn.Module):
    def __init__(self,vocab_size,embedding_dim):
        super(Embeddingtable,self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        
    def forward(self,X):
        # print(self.embedding_layer.num_embeddings)
        # print(X.device)
        return self.embedding_layer(X)
    
    def find_index_from_embedding(self,embedding):
        all_embeddings = self.embedding_layer.weight
        similarities = F.cosine_similarity(all_embeddings,embedding,dim=1)
        
        best_index = torch.argmax(similarities).item()
        return best_index

In [23]:
from collections import deque
class Encoder(nn.Module):
    def __init__(self,h_t_size,embedding_dim,no_of_lstm_layers,vocab_size):
        super(Encoder,self).__init__()
        # embedding dim -> noting but input dim
        self.embedding_layer= Embeddingtable(vocab_size=vocab_size,embedding_dim=embedding_dim)
        self.layer_of_lstms = nn.ModuleList([LSTMModel(h_t_size=h_t_size,embedding_dim=embedding_dim)])
        for i in range(no_of_lstm_layers-1):
            self.layer_of_lstms.append(LSTMModel(h_t_size=h_t_size,embedding_dim=h_t_size))
        
    def forward(self,X):
        queue = deque()
        for time_step in range(X.shape[0]):
            X_i = X[:,time_step]
            e_i = self.embedding_layer(X_i)
            # print(f"e_i : {e_i.shape}")
            for layer in range(len(self.layer_of_lstms)):
                if time_step == 0:
                    h_t,c_t = self.layer_of_lstms[layer](e_i)
                else:
                    h_t,c_t = self.layer_of_lstms[layer](e_i,*queue.popleft())
                queue.append((h_t,c_t)) 
                e_i = h_t # for the upper layers the h_t becomes input
                
        return list(queue)
        

In [24]:
class Decoder(nn.Module):
    def __init__(self,h_t_size,embedding_dim,no_of_lstm_layers,vocab_size):
        super(Decoder,self).__init__()
        
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        self.layer_of_lstms = nn.ModuleList([LSTMModel(h_t_size=h_t_size,embedding_dim=embedding_dim)])
        for i in range(no_of_lstm_layers-1):
            self.layer_of_lstms.append(LSTMModel(h_t_size=h_t_size,embedding_dim=h_t_size))
            # for the upper layer we send h_t as input so the embedding_dim (input_dim) is equal to h_t_size
        self.classification_head = nn.Linear(in_features=h_t_size,out_features=vocab_size)
        
    def forward(self,states,X=None):
        # print("Decoder")
        all_logits = []
        queue = deque(states)
        for time_step in range(ModelArgs.block_size):
            if X is not None:
                X_i = X[:,time_step]
                e_i = self.embedding_layer(X_i)
            else:
                if time_step==0:
                    X_i = torch.full(size=[ModelArgs.batch_size],fill_value=en_vocab["<bos>"],device=ModelArgs.device)
                    e_i = self.embedding_layer(X_i)
                else:
                    X_i = preds
                    e_i = self.embedding_layer(X_i)
            for layer in range(len(self.layer_of_lstms)):
                # t = queue.popleft()
                # print(f"Layer : {layer}")
                h_t,c_t = self.layer_of_lstms[layer](e_i,*queue.popleft())
                e_i = h_t
                queue.append((h_t,c_t))
            logit_curr_time_step = self.classification_head(h_t)
            preds = torch.softmax(logit_curr_time_step,dim=1)
            preds = torch.argmax(preds,dim=1)
            
            all_logits.append(logit_curr_time_step)
            
        return torch.stack(all_logits)

In [25]:
class Seq2Seq(nn.Module):
    def __init__(self,h_t_size,embedding_dim,no_of_lstm_layers,src_vocab_size,dest_vocab_size):
        super().__init__()
        self.enocder = Encoder(h_t_size=h_t_size,embedding_dim=embedding_dim,no_of_lstm_layers=no_of_lstm_layers,vocab_size=src_vocab_size)
        self.decoder = Decoder(h_t_size=h_t_size,embedding_dim=embedding_dim,no_of_lstm_layers=no_of_lstm_layers,vocab_size=dest_vocab_size)
        

    def forward(self,X,y=None):
        states = self.enocder(X)
        # print(len(states))
        if y is not None:
            all_logits = self.decoder(states,y)
        else:
            all_logits = self.decoder(states)
        
        return all_logits
        
        
        

In [26]:
model = Seq2Seq(h_t_size=ModelArgs.ht_size,embedding_dim=ModelArgs.embedding_dim,no_of_lstm_layers=ModelArgs.no_of_lstm_layes,src_vocab_size=ModelArgs.de_vocab_size,dest_vocab_size=ModelArgs.en_vocab_size)
model = model.to(ModelArgs.device)

In [27]:
sample_batch = next(iter(train_dataloader))

In [28]:
res = model(sample_batch[0].to(ModelArgs.device),sample_batch[1].to(ModelArgs.device))

In [29]:
print(res.shape)

torch.Size([32, 32, 9797])


In [30]:
res = model(sample_batch[0].to(ModelArgs.device))
print(res.shape)

torch.Size([32, 32, 9797])


In [31]:
def train(model,model_name,criterion,optimizer,epochs,min_val_loss,train_dataloader,val_dataloader,device,patience,lr_scheduler):
    from tqdm import tqdm
    best_val_loss = float('inf')
    model = model.to(device)
    
    train_losses,val_losses,train_accs,val_accs = [],[],[],[]
    for epoch in range(epochs):
        model.train()
        train_loss,correct,total = 0.0,0,0
        train_progress = tqdm(train_dataloader,desc="Training")
        
        for de_batch,en_batch in train_progress:
            
            de_batch = de_batch.to(device)
            en_batch = en_batch.to(device)
            
            optimizer.zero_grad()
            all_logits = model(de_batch,en_batch)
            all_logits = all_logits.view(-1,ModelArgs.en_vocab_size) # [number of samples , number of classes]
            en_batch = en_batch.view(-1) # [number of samples , ]
            
            loss = criterion(all_logits,en_batch)
            loss.backward()
            optimizer.step()
            
            preds = torch.softmax(all_logits,dim=1)
            preds = torch.argmax(preds,dim=1)
            
            train_loss += loss.item()
            correct += (preds == en_batch).sum()
            total += en_batch.shape[0]
            
            train_progress.set_postfix({"loss":f"{loss.item():.4f}"})
        
        train_loss /= len(train_dataloader)
        train_acc = correct/total
        
        with torch.inference_mode():
            model.eval()

            val_loss,correct,total = 0.0,0,0
            val_progress = tqdm(val_dataloader,desc="Evaluation")
            for de_batch,en_batch in val_progress:
                de_batch = de_batch.to(device)
                en_batch = en_batch.to(device)
                
                all_logits = model(de_batch,en_batch)
                all_logits = all_logits.view(-1,ModelArgs.en_vocab_size)
                en_batch = en_batch.view(-1)
                
                loss = criterion(all_logits,en_batch)
                
                preds = torch.softmax(all_logits,dim=1)
                preds = torch.argmax(preds,dim=1)
                
                val_loss += loss.item()
                correct += (preds == en_batch).sum()
                total += en_batch.shape[0]
                
                val_progress.set_postfix({"loss":f"{loss.item():.4f}"})
            
            val_loss /= len(val_dataloader)
            val_acc = correct/total
            
        print(f"EPOCH : {epoch}/{epochs} \n Train Loss : {train_loss:.4f} \n Val Loss : {val_loss:.4f} \n Train Acc : {train_acc:.4f} \n Val Acc : {val_acc:.4f}\n\n")
        
        lr_scheduler.step(val_loss)
        
        train_losses.append(train_loss)
        val_losses.append(val_losses)
        train_accs.append(train_acc)
        val_accs.append(val_acc)    
        
        if val_loss < min_val_loss:
            print("[SUCCESS] model trained successfully")
            break
        
        if best_val_loss > val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(),model_name)
    
    return train_losses,val_losses,train_accs,val_accs
            
        
        

In [32]:
from torchinfo import summary

summary(model=model,
        input_data=sample_batch,
        col_names=["input_size","output_size","num_params","trainable"],
        row_settings=["var_names"],
        col_width=20,
        device=ModelArgs.device)

Layer (type (var_name))                                 Input Shape          Output Shape         Param #              Trainable
Seq2Seq (Seq2Seq)                                       [32, 32]             [32, 32, 9797]       --                   True
├─Encoder (enocder)                                     [32, 32]             [32, 64]             --                   True
│    └─Embeddingtable (embedding_layer)                 [32]                 [32, 32]             --                   True
│    │    └─Embedding (embedding_layer)                 [32]                 [32, 32]             597,344              True
│    └─ModuleList (layer_of_lstms)                      --                   --                   (recursive)          True
│    │    └─LSTMModel (0)                               [32, 32]             [32, 64]             24,832               True
│    │    └─LSTMModel (1)                               [32, 64]             [32, 64]             33,024               True
│  

In [33]:
optimizer = torch.optim.Adam(model.parameters(),lr=ModelArgs.max_lr)
criterion = nn.CrossEntropyLoss()
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                          mode="min",
                                                          factor=0.1,
                                                          patience=2)

In [None]:
train_losses,val_losses,train_accs,val_accs = train(model=model,
      model_name="encoder-decoder.pth",
      criterion=criterion,
      optimizer=optimizer,
      epochs=ModelArgs.epochs,
      min_val_loss=1e-3,
      train_dataloader=train_dataloader,
      val_dataloader=val_dataloader,
      device=ModelArgs.device,
      patience=2,
      lr_scheduler=lr_scheduler)

Training: 100%|██████████| 906/906 [06:10<00:00,  2.45it/s, loss=2.4678]
Evaluation: 100%|██████████| 31/31 [00:06<00:00,  4.96it/s, loss=2.4686]


EPOCH : 0/500 
 Train Loss : 3.9190 
 Val Loss : 2.4799 
 Train Acc : 0.3320 
 Val Acc : 0.3482




Training: 100%|██████████| 906/906 [1:02:55<00:00,  4.17s/it, loss=2.4177]   
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.81it/s, loss=2.4421]


EPOCH : 1/500 
 Train Loss : 2.4435 
 Val Loss : 2.4253 
 Train Acc : 0.3514 
 Val Acc : 0.3488




Training: 100%|██████████| 906/906 [04:46<00:00,  3.16it/s, loss=2.4089]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.45it/s, loss=2.4316]


EPOCH : 2/500 
 Train Loss : 2.4198 
 Val Loss : 2.4146 
 Train Acc : 0.3514 
 Val Acc : 0.3486




Training: 100%|██████████| 906/906 [06:29<00:00,  2.33it/s, loss=2.4191]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.29it/s, loss=2.3971]


EPOCH : 3/500 
 Train Loss : 2.4138 
 Val Loss : 2.4112 
 Train Acc : 0.3514 
 Val Acc : 0.3486




Training: 100%|██████████| 906/906 [04:28<00:00,  3.37it/s, loss=2.4002]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.00it/s, loss=2.4195]


EPOCH : 4/500 
 Train Loss : 2.4112 
 Val Loss : 2.4090 
 Train Acc : 0.3514 
 Val Acc : 0.3491




Training: 100%|██████████| 906/906 [04:41<00:00,  3.22it/s, loss=2.4570]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.64it/s, loss=2.4185]


EPOCH : 5/500 
 Train Loss : 2.4099 
 Val Loss : 2.4084 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [04:36<00:00,  3.27it/s, loss=2.3789]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.56it/s, loss=2.4365]


EPOCH : 6/500 
 Train Loss : 2.4092 
 Val Loss : 2.4094 
 Train Acc : 0.3514 
 Val Acc : 0.3482




Training: 100%|██████████| 906/906 [04:34<00:00,  3.30it/s, loss=2.4795]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.29it/s, loss=2.4015]


EPOCH : 7/500 
 Train Loss : 2.4087 
 Val Loss : 2.4079 
 Train Acc : 0.3514 
 Val Acc : 0.3485




Training: 100%|██████████| 906/906 [04:34<00:00,  3.30it/s, loss=2.3928]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.14it/s, loss=2.4184]


EPOCH : 8/500 
 Train Loss : 2.4083 
 Val Loss : 2.4082 
 Train Acc : 0.3514 
 Val Acc : 0.3485




Training: 100%|██████████| 906/906 [04:41<00:00,  3.22it/s, loss=2.4115]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.95it/s, loss=2.4054]


EPOCH : 9/500 
 Train Loss : 2.4081 
 Val Loss : 2.4080 
 Train Acc : 0.3514 
 Val Acc : 0.3486




Training: 100%|██████████| 906/906 [04:32<00:00,  3.33it/s, loss=2.3766]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.19it/s, loss=2.4060]


EPOCH : 10/500 
 Train Loss : 2.4080 
 Val Loss : 2.4070 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [04:26<00:00,  3.40it/s, loss=2.4152]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.17it/s, loss=2.4042]


EPOCH : 11/500 
 Train Loss : 2.4079 
 Val Loss : 2.4074 
 Train Acc : 0.3514 
 Val Acc : 0.3485




Training: 100%|██████████| 906/906 [04:35<00:00,  3.29it/s, loss=2.3944]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  7.87it/s, loss=2.3907]


EPOCH : 12/500 
 Train Loss : 2.4079 
 Val Loss : 2.4086 
 Train Acc : 0.3514 
 Val Acc : 0.3481




Training: 100%|██████████| 906/906 [04:41<00:00,  3.22it/s, loss=2.4041]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.43it/s, loss=2.4054]


EPOCH : 13/500 
 Train Loss : 2.4078 
 Val Loss : 2.4079 
 Train Acc : 0.3514 
 Val Acc : 0.3485




Training: 100%|██████████| 906/906 [04:33<00:00,  3.31it/s, loss=2.4047]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.37it/s, loss=2.3450]


EPOCH : 14/500 
 Train Loss : 2.4076 
 Val Loss : 2.4060 
 Train Acc : 0.3514 
 Val Acc : 0.3491




Training: 100%|██████████| 906/906 [04:44<00:00,  3.18it/s, loss=2.3832]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.72it/s, loss=2.3906]


EPOCH : 15/500 
 Train Loss : 2.4076 
 Val Loss : 2.4059 
 Train Acc : 0.3514 
 Val Acc : 0.3492




Training: 100%|██████████| 906/906 [05:37<00:00,  2.68it/s, loss=2.4236]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.29it/s, loss=2.4084]


EPOCH : 16/500 
 Train Loss : 2.4076 
 Val Loss : 2.4073 
 Train Acc : 0.3514 
 Val Acc : 0.3488




Training: 100%|██████████| 906/906 [04:32<00:00,  3.32it/s, loss=2.3695]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.16it/s, loss=2.4041]


EPOCH : 17/500 
 Train Loss : 2.4076 
 Val Loss : 2.4063 
 Train Acc : 0.3514 
 Val Acc : 0.3489




Training: 100%|██████████| 906/906 [04:27<00:00,  3.39it/s, loss=2.4220]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.91it/s, loss=2.4001]


EPOCH : 18/500 
 Train Loss : 2.4076 
 Val Loss : 2.4070 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [05:34<00:00,  2.71it/s, loss=2.3852]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.97it/s, loss=2.4200]


EPOCH : 19/500 
 Train Loss : 2.4076 
 Val Loss : 2.4061 
 Train Acc : 0.3514 
 Val Acc : 0.3490




Training: 100%|██████████| 906/906 [04:26<00:00,  3.40it/s, loss=2.3704]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.36it/s, loss=2.4329]


EPOCH : 20/500 
 Train Loss : 2.4076 
 Val Loss : 2.4074 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [06:22<00:00,  2.37it/s, loss=2.4316]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.31it/s, loss=2.4343]


EPOCH : 21/500 
 Train Loss : 2.4076 
 Val Loss : 2.4059 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [05:32<00:00,  2.73it/s, loss=2.4431]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.45it/s, loss=2.4249]


EPOCH : 22/500 
 Train Loss : 2.4076 
 Val Loss : 2.4065 
 Train Acc : 0.3514 
 Val Acc : 0.3488




Training: 100%|██████████| 906/906 [04:28<00:00,  3.38it/s, loss=2.4147]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  7.78it/s, loss=2.3956]


EPOCH : 23/500 
 Train Loss : 2.4076 
 Val Loss : 2.4093 
 Train Acc : 0.3514 
 Val Acc : 0.3479




Training: 100%|██████████| 906/906 [04:33<00:00,  3.31it/s, loss=2.4283]
Evaluation: 100%|██████████| 31/31 [00:03<00:00, 10.00it/s, loss=2.3923]


EPOCH : 24/500 
 Train Loss : 2.4076 
 Val Loss : 2.4087 
 Train Acc : 0.3514 
 Val Acc : 0.3482




Training: 100%|██████████| 906/906 [04:29<00:00,  3.36it/s, loss=2.4479]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.87it/s, loss=2.3801]


EPOCH : 25/500 
 Train Loss : 2.4076 
 Val Loss : 2.4082 
 Train Acc : 0.3514 
 Val Acc : 0.3483




Training: 100%|██████████| 906/906 [04:33<00:00,  3.32it/s, loss=2.4211]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.24it/s, loss=2.4380]


EPOCH : 26/500 
 Train Loss : 2.4076 
 Val Loss : 2.4079 
 Train Acc : 0.3514 
 Val Acc : 0.3483




Training: 100%|██████████| 906/906 [04:31<00:00,  3.34it/s, loss=2.3741]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  7.89it/s, loss=2.3789]


EPOCH : 27/500 
 Train Loss : 2.4076 
 Val Loss : 2.4068 
 Train Acc : 0.3514 
 Val Acc : 0.3489




Training: 100%|██████████| 906/906 [04:31<00:00,  3.34it/s, loss=2.3805]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.60it/s, loss=2.4202]


EPOCH : 28/500 
 Train Loss : 2.4076 
 Val Loss : 2.4081 
 Train Acc : 0.3514 
 Val Acc : 0.3481




Training: 100%|██████████| 906/906 [04:33<00:00,  3.31it/s, loss=2.3811]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.83it/s, loss=2.3846]


EPOCH : 29/500 
 Train Loss : 2.4076 
 Val Loss : 2.4062 
 Train Acc : 0.3514 
 Val Acc : 0.3490




Training: 100%|██████████| 906/906 [04:35<00:00,  3.29it/s, loss=2.4077]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.95it/s, loss=2.3731]


EPOCH : 30/500 
 Train Loss : 2.4076 
 Val Loss : 2.4072 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [35:37<00:00,  2.36s/it, loss=2.3981]    
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.72it/s, loss=2.4219]


EPOCH : 31/500 
 Train Loss : 2.4075 
 Val Loss : 2.4072 
 Train Acc : 0.3514 
 Val Acc : 0.3488




Training: 100%|██████████| 906/906 [07:40<00:00,  1.97it/s, loss=2.4437]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.28it/s, loss=2.4238]


EPOCH : 32/500 
 Train Loss : 2.4076 
 Val Loss : 2.4074 
 Train Acc : 0.3514 
 Val Acc : 0.3484




Training: 100%|██████████| 906/906 [09:40<00:00,  1.56it/s, loss=2.4356]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.67it/s, loss=2.4221]


EPOCH : 33/500 
 Train Loss : 2.4076 
 Val Loss : 2.4082 
 Train Acc : 0.3514 
 Val Acc : 0.3482




Training: 100%|██████████| 906/906 [04:47<00:00,  3.16it/s, loss=2.3640]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.43it/s, loss=2.4332]


EPOCH : 34/500 
 Train Loss : 2.4076 
 Val Loss : 2.4081 
 Train Acc : 0.3514 
 Val Acc : 0.3482




Training: 100%|██████████| 906/906 [08:04<00:00,  1.87it/s, loss=2.4190]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.94it/s, loss=2.3353]


EPOCH : 35/500 
 Train Loss : 2.4076 
 Val Loss : 2.4073 
 Train Acc : 0.3514 
 Val Acc : 0.3486




Training: 100%|██████████| 906/906 [05:01<00:00,  3.00it/s, loss=2.3787]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.92it/s, loss=2.4220]


EPOCH : 36/500 
 Train Loss : 2.4076 
 Val Loss : 2.4087 
 Train Acc : 0.3514 
 Val Acc : 0.3480




Training: 100%|██████████| 906/906 [04:21<00:00,  3.47it/s, loss=2.4369]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.68it/s, loss=2.4023]


EPOCH : 37/500 
 Train Loss : 2.4076 
 Val Loss : 2.4069 
 Train Acc : 0.3514 
 Val Acc : 0.3486




Training: 100%|██████████| 906/906 [05:30<00:00,  2.74it/s, loss=2.4008]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.11it/s, loss=2.3896]


EPOCH : 38/500 
 Train Loss : 2.4076 
 Val Loss : 2.4081 
 Train Acc : 0.3514 
 Val Acc : 0.3483




Training: 100%|██████████| 906/906 [04:22<00:00,  3.46it/s, loss=2.4109]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.73it/s, loss=2.3927]


EPOCH : 39/500 
 Train Loss : 2.4076 
 Val Loss : 2.4051 
 Train Acc : 0.3514 
 Val Acc : 0.3487




Training: 100%|██████████| 906/906 [04:21<00:00,  3.47it/s, loss=2.4160]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  8.94it/s, loss=2.3958]


EPOCH : 40/500 
 Train Loss : 2.4076 
 Val Loss : 2.4068 
 Train Acc : 0.3514 
 Val Acc : 0.3489




Training: 100%|██████████| 906/906 [05:30<00:00,  2.74it/s, loss=2.3926]
Evaluation: 100%|██████████| 31/31 [00:03<00:00,  9.41it/s, loss=2.4008]


EPOCH : 41/500 
 Train Loss : 2.4076 
 Val Loss : 2.4069 
 Train Acc : 0.3514 
 Val Acc : 0.3488




Training:  90%|████████▉ | 813/906 [08:55<02:24,  1.55s/it, loss=2.4212]

In [None]:
import matplotlib