In [1]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter,OrderedDict 
import kaldi_io
from datetime import datetime

#ngrams
import nltk,re
import nltk.corpus
from nltk.corpus import switchboard
from nltk.util import ngrams

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels,paired_distances
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns

#BigPhoney
from big_phoney import BigPhoney


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

#Import User defined classes
from data_helpers import DataHelper
from sfba4.utils import alignSequences
from models import SimpleNet, SiameseNet, OrthographicNet
from siamese_dataset import SiameseTriplets
from ami_dataset import AMI_dataset

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

Using TensorFlow backend.


In [2]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
#Load source model
source_net = SimpleNet(9974)
source_net = source_net.to(dev)
source_net_save_path = "./Models/awe_best_model.pth"
source_net.load_state_dict(torch.load(source_net_save_path))

<All keys matched successfully>

In [4]:
num_examples = np.Inf
frequency_bounds = (0,155)
train_sm_dataset = SiameseTriplets(num_examples = num_examples, split_set = "train", frequency_bounds = frequency_bounds)
val_sm_dataset = SiameseTriplets(num_examples = num_examples, split_set = "val", frequency_bounds = frequency_bounds)
test_sm_dataset = SiameseTriplets(num_examples = num_examples, split_set = "test", frequency_bounds = frequency_bounds)

Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 125006
Finished Loading the Data, 125006 examples
Number of Unique words  9974
torch.Size([59844, 3, 40, 100])
Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 125006
Finished Loading the Data, 125006 examples
Number of Unique words  9974
torch.Size([19948, 3, 40, 100])
Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 125006
Finished Loading the Data, 125006 examples
Number of Unique words  9974
torch.Size([19948, 3, 40, 100])


In [5]:
train_dl = torch.utils.data.DataLoader(train_sm_dataset, shuffle = True, batch_size = 64, pin_memory = True)
val_dl = torch.utils.data.DataLoader(val_sm_dataset, shuffle = True, batch_size = 64, pin_memory = True)
test_dl = torch.utils.data.DataLoader(test_sm_dataset, shuffle = True, batch_size = 64, pin_memory = True)

In [6]:
word_to_num,num_to_word = train_sm_dataset.word_to_num,train_sm_dataset.num_to_word

In [3]:
def process_words(word):
    #Remove punctuation
    word = word.translate(str.maketrans('', '', string.punctuation))
    return "["+word.lower()+"]"

In [4]:
def give_common_ngrams(num = 50000):
    switchboard.ensure_loaded()
    words = switchboard.words()
    #Add start and end of word markers and make words lower case
    words = list(map(process_words,words))
    #Filter empty words
    words = list(filter(lambda x: x!="[]", words))

    #get all n_grams up to n=10
    n = 8
    ngrams_list = []

    for word in words:
        ngrams_list.append(list(filter(lambda x: x!=tuple('[') and x!= tuple(']'),list(ngrams(list(word),1)))))
        for i in range(2,n+1):
            ngrams_list.append(list(ngrams(list(word),i)))

    flatten = lambda l: [item for sublist in l for item in sublist]
    #Unroll the list
    ngrams_list = flatten(ngrams_list)

    ngrams_counter = Counter(ngrams_list)
    print(len(ngrams_counter.keys()))

    common_ngrams = []
    for index,(key,value) in enumerate(ngrams_counter.most_common(num)):
        common_ngrams.append(key)
    
    return common_ngrams



In [5]:
num_ngrams = 10000
common_ngrams = give_common_ngrams(num_ngrams)

51794


In [6]:
common_ngrams

[('e',),
 ('t',),
 ('o',),
 ('a',),
 ('h',),
 ('i',),
 ('n',),
 ('s',),
 ('u',),
 ('r',),
 ('e', ']'),
 ('[', 't'),
 ('t', ']'),
 ('l',),
 ('d',),
 ('y',),
 ('t', 'h'),
 ('[', 't', 'h'),
 ('[', 'i'),
 ('w',),
 ('[', 'a'),
 ('s', ']'),
 ('m',),
 ('h', 'e'),
 ('d', ']'),
 ('g',),
 ('[', 's'),
 ('c',),
 ('h', ']'),
 ('i', 'n'),
 ('t', 'h', 'e'),
 ('[', 'w'),
 ('y', ']'),
 ('f',),
 ('a', 'n'),
 ('[', 't', 'h', 'e'),
 ('o', 'u'),
 ('n', ']'),
 ('h', 'a'),
 ('o', ']'),
 ('b',),
 ('[', 'o'),
 ('k',),
 ('r', 'e'),
 ('[', 'y'),
 ('p',),
 ('a', 't'),
 ('i', ']'),
 ('e', 'r'),
 ('[', 'i', ']'),
 ('n', 'd'),
 ('u', 'h'),
 ('[', 'u'),
 ('i', 't'),
 ('n', 'd', ']'),
 ('v',),
 ('r', ']'),
 ('[', 'a', 'n'),
 ('e', 'a'),
 ('a', 't', ']'),
 ('[', 'b'),
 ('a', 'n', 'd'),
 ('[', 'h'),
 ('a', 'n', 'd', ']'),
 ('h', 'e', ']'),
 ('h', 'a', 't'),
 ('[', 'm'),
 ('h', 'a', 't', ']'),
 ('[', 'a', 'n', 'd'),
 ('[', 'a', 'n', 'd', ']'),
 ('v', 'e'),
 ('u', 'h', ']'),
 ('y', 'o'),
 ('n', 'g'),
 ('[', 'u', 'h'),
 ('

In [7]:
#Map common ngrams to index values for one hot encoding
ngram_to_index = {}
#ngram_to_index
for index,ngram in enumerate(common_ngrams):
    ngram_to_index[ngram] = index

In [8]:
def give_letter_ngram(word):
    
    n=10
    word_list = list(word)
    letter_ngram = np.zeros(len(common_ngrams))
    
    #Extract ngrams from the word
    ngrams_list = []
    
    ngrams_list.append(list(filter(lambda x: x!=tuple('[') and x!= tuple(']'),list(ngrams(list(word),1)))))
    for i in range(2,n+1):
        ngrams_list.append(list(ngrams(list(word),i)))
    
    #Flatten
    flatten = lambda l: [item for sublist in l for item in sublist]
    #Unroll the list
    ngrams_list = flatten(ngrams_list)
    
    for ngram in ngrams_list:
        if ngram in ngram_to_index.keys():
            letter_ngram[ngram_to_index[ngram]] += 1
        
    return letter_ngram

In [9]:
def batch_letter_ngrams(words):
    letter_ngrams = []
    for word in words:
        letter_ngrams.append(give_letter_ngram(word))
    
    return np.stack(letter_ngrams)

In [10]:
def d_cos(x1,x2,cos):
    
    return (1-cos(x1,x2))/2

In [11]:
def triplet_loss(word_embedding,same_word_embedding,diff_word_embedding):
    m = torch.tensor(1.0, dtype = torch.float).to(dev, non_blocking = True)
    lower_bound = torch.tensor(0.0, dtype = torch.float).to(dev, non_blocking = True)
    a = torch.max(lower_bound,m - cos(word_embedding ,same_word_embedding) + cos(word_embedding ,diff_word_embedding))

    
    return torch.mean(a)

In [12]:
class OrthographicNet(nn.Module):
    def __init__(self,num_input,num_output):
        super(OrthographicNet, self).__init__()
        self.fc1 = nn.Linear(num_input, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, num_output)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        x = F.relu(self.fc3(x))

        return x
    
    def give_embeddings(self,x,dev):
        x = F.relu(self.fc1(x))
        #print(x.shape)
        x = F.relu(self.fc2(x))
        #print(x.shape)
        x = F.relu(self.fc3(x))
        #print(x.shape)
        #print("Done")
        return x.cpu().detach().numpy() if dev.type == 'cuda' else x.detach().numpy()

In [None]:
num_input,num_output = 50000,9974
orthographic_net = OrthographicNet(num_input,num_output)
orthographic_net = orthographic_net.float()
orthographic_net.to(dev)
optimizer = optim.SGD(orthographic_net.parameters(), lr=0.001, momentum=0.9)
#optimizer = torch.optim.Adadelta(orthographic_net.parameters(), lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
cos = nn.CosineSimilarity(dim=1, eps=1e-12)

In [66]:
import pdb

In [None]:

num_epochs = 50
verbose = True
model_save_path = "./Models/best_orthographic_model2.pth"
best_val_loss = np.Inf

batch_limit = 10

train_loss_list = []
val_loss_list = []

for epoch in range(0,num_epochs):
    if verbose:
            print('epoch %d '%(epoch))

    train_loss = 0
    orthographic_net.train()
    for batch_idx, (train_data,train_labels) in enumerate(train_dl):

        #print(train_data.shape)
        #Move to GPU
        optimizer.zero_grad()
        train_data = train_data.to(dev, non_blocking=True)
        #Get word mfcc features
        word = train_data[:,0,:]
        #Get labels
        word_labels = [num_to_word[int(train_labels[i,0])] for i in range(train_labels.shape[0])]
        diff_word_labels = [num_to_word[int(train_labels[i,1])] for i in range(train_labels.shape[0])]
            
            
        #Get letter_ngrams
        word_letter_ngrams = torch.tensor(batch_letter_ngrams(word_labels), dtype =torch.float, device = dev)
        diff_letter_ngrams = torch.tensor(batch_letter_ngrams(diff_word_labels), dtype =torch.float,device = dev)

        #Get the word embedding and letter_ngram embeddings
        with torch.no_grad():
            word_embedding = source_net(word)
        
        #word_embedding = np.stack([saved_word_embedding_dict.item().get(word).squeeze() for word in word_labels ])
        #word_embedding = torch.tensor(word_embedding,dtype =torch.float, device = dev)
        
        word_ngram_embedding = orthographic_net(word_letter_ngrams)
        diff_word_ngram_embedding = orthographic_net(diff_letter_ngrams)
        
        
        #Calculate the triplet loss
        
        loss = triplet_loss(word_embedding,word_ngram_embedding,diff_word_ngram_embedding)
        pdb.set_trace()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
        if batch_idx == batch_limit:
            break
        

        
        


    '''
    orthographic_net.eval()
    with torch.no_grad():
        val_loss = 0
        for batch_idx, (val_data,val_labels) in enumerate(val_dl):

            val_data = val_data.to(dev, non_blocking=True)
            #Get word mfcc features
            word = train_data[:,0,:]
            #Get labels
            word_labels = [num_to_word[int(val_labels[i,0])] for i in range(train_labels.shape[0])]
            diff_word_labels = [num_to_word[int(val_labels[i,1])] for i in range(train_labels.shape[0])]
            
            #Get letter_ngrams
            word_letter_ngrams = torch.tensor(batch_letter_ngrams(word_labels), dtype =torch.float, device = dev)
            diff_letter_ngrams = torch.tensor(batch_letter_ngrams(diff_word_labels), dtype =torch.float,device = dev)

            #Get the word embedding and letter_ngram embeddings
            word_embedding = source_net(word)
            
            #word_embedding = np.stack([saved_word_embedding_dict.item().get(word).squeeze() for word in word_labels ])
            #word_embedding = torch.tensor(word_embedding,dtype =torch.float, device = dev)
            
            word_ngram_embedding = orthographic_net(word_letter_ngrams)
            diff_word_ngram_embedding = orthographic_net(diff_letter_ngrams)
        
            

            #Calculate the triplet loss
            val_loss += triplet_loss(word_embedding,word_ngram_embedding,diff_word_ngram_embedding)

            if batch_idx == batch_limit:
                break
            

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print("Best val loss %.3f Saving Model..."%(val_loss/len(val_dl)))
            torch.save(orthographic_net.state_dict(),model_save_path)
        
        '''


    if verbose:
        print("train loss: %.8f"%(train_loss/len(train_dl)))
        #print("val loss: %.5f"%(val_loss/len(val_dl)))
        
    train_loss_list.append(train_loss/len(train_dl))
    #val_loss_list.append(val_loss/len(val_dl))


In [None]:
#Plot the learning curves

plt.title('Learning Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.plot(range(len(train_loss_list)),train_loss_list, label = 'train')
plt.plot(range(len(train_loss_list)), val_loss_list, label = 'val')
plt.legend()
plt.savefig('orthographic_lc.png')

In [None]:
#Load the best orthographic model
model_save_path = "./Models/best_orthographic_model2.pth"
orthographic_net.load_state_dict(torch.load(model_save_path))

In [None]:
load_list = ['Data/feats_cmvn.ark']
num_examples = np.Inf
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
c,word_to_num,num_to_word = dh.generate_key_dicts()

In [None]:
#Save orthographic model word embeddings

#Load Words
words = list(c.keys())

In [None]:
#Generate word embedding dict
word_embedding_dict = {}

for word in words:
    
    with torch.no_grad():
        word_features = give_letter_ngram(word)
        #print(word_features)
        word_embedding = orthographic_net(torch.tensor(word_features, dtype =torch.float, device = dev))

        word_embedding_dict[word] = word_embedding.detach().cpu().numpy()
    

In [None]:
word_embedding_dict

In [None]:
np.save("Data/orthographic_word_embedding_dict2.npy",word_embedding_dict)

# Rough

## Check how similar are the word embeddings to the saved one

In [18]:
load_list = ['Data/feats_cmvn.ark']
num_examples = np.Inf
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
c,word_to_num_cl,num_to_word_cl = dh.generate_key_dicts()
inputs,labels = dh.give_inputs_and_labels()
del dh
words = list(c.keys())

Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 173657
Finished Loading the Data, 173657 examples
Number of Unique words  9974


In [19]:
def generate_word_embedding_dict(words,net):
    word_embedding_dict = OrderedDict()
    #Calculate embeddings
    for word in words:
        #Find the mfcc features of the acoustic representation of the word in the data
        word_features = inputs[np.where(np.isin(labels,word_to_num[word]))]
        
        #Calculate embeddings for the feature
        word_embedding = net.give_embeddings(torch.tensor(word_features, device = dev, dtype=torch.float),dev)
        
        #If the number of representation is more than one, take the average embedding
        word_embedding_dict[word] = np.mean(word_embedding, axis = 0).reshape(1,-1)
    
    return word_embedding_dict

In [20]:
word_embedding_dict = generate_word_embedding_dict(words,source_net)

In [21]:
#Load the word embeddings
saved_word_embedding_dict = np.load('Data/word_embedding_dict.npy', allow_pickle = True)

In [21]:
d = []
for word in words:
    calc_em = word_embedding_dict[word].reshape(1,-1)
    saved_em = saved_word_embedding_dict.item().get(word).squeeze().reshape(1,-1)
    a = pairwise_kernels(calc_em,saved_em, metric = "cosine")
    #print(word,a)
    d.append(a)
print(np.mean(d))

0.95097256


## Check if the siamese dataset is working correctly

In [24]:
#Check if the words are mapped to same mfcc or not
matches = 0
total_data = 0
word_index = 1
#Loop through training examples
for batch_idx, (train_data,train_labels) in enumerate(train_dl):

    #Loop through individual examples
    for i in range(train_data.shape[0]):
        
        #Extract a word and it's mfcc
        mfcc_index = 0 if word_index ==0 else 2
        mfcc = train_data[i,mfcc_index]
        label = train_labels[i,word_index]
        word = num_to_word[int(label.numpy())]
        
        
        #Get all mfccs for this word from the classic dataloader
        label_cl_num = word_to_num_cl[word]
        #print(label_cl_num)
        ids = np.where(np.isin(labels,label_cl_num))
        
        #mfccs
        mfccs = inputs[ids]
        
        for j in range(mfccs.shape[0]):
            if np.array_equal(mfcc,mfccs[j]):
                matches+=1
    
    total_data += train_data.shape[0]
    break
    
print(total_data,matches)

64 64


## Check if n-gram vector is rich enough to predict words

In [12]:
def labels_to_one_hot(words):
    
    one_hot = np.zeros((len(words),len(word_to_num.keys())))
    
    for i,word in enumerate(words):
        one_hot[i,word_to_num[word]] = 1
    
    return one_hot
    
    

In [13]:
from matplotlib.lines import Line2D
def plot_grad_flow(named_parameters):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.
    
    Usage: Plug this function in Trainer class after loss.backwards() as 
    "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
    ave_grads = []
    max_grads= []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
            max_grads.append(p.grad.abs().max())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])

In [14]:
class NgramNet(nn.Module):
    def __init__(self,num_input,num_output):
        super(NgramNet, self).__init__()
        
    
        self.fc1 = nn.Linear(num_input, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, num_output)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        #print(x.shape)
        x = F.relu(self.fc2(x))
        #print(x.shape)
        x = F.relu(self.fc3(x))
        #print(x.shape)
        x = F.log_softmax(x,dim=1)
        return x
        

In [15]:
from train_test_helpers import accuracy

In [16]:
bs = 64
num_examples = np.Inf
train_ds = AMI_dataset(num_examples = num_examples, split_set = "train", data_filepath = "Data/feats_cmvn.ark", char_threshold = 5, frequency_bounds = (0,np.Inf))
train_dl = DataLoader(train_ds, batch_size=bs, pin_memory = True, shuffle = True, drop_last = True)

val_ds = AMI_dataset(num_examples = num_examples, split_set = "val", data_filepath = "Data/feats_cmvn.ark", char_threshold = 5, frequency_bounds = (0,np.Inf))
val_dl = DataLoader(val_ds, batch_size=bs, pin_memory = True, shuffle = True, drop_last = True)

Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 173657
Finished Loading the Data, 173657 examples
Number of Unique words  9974
Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 173657
Finished Loading the Data, 173657 examples
Number of Unique words  9974


In [17]:
num_to_word,word_to_num = train_ds.num_to_word,train_ds.word_to_num

In [18]:
num_words = len(word_to_num.keys())
ngram_net = NgramNet(num_ngrams,num_words)
ngram_net = ngram_net.float()
ngram_net = ngram_net.to(dev)

In [19]:
#Defining training criterion
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngram_net.parameters(), lr=0.001, momentum=0.9)
num_epochs = 150

In [None]:
#Loop through words
num_epochs = 100
verbose = True
model_save_path = "./Models/best_ngram_model.pth"
best_val_loss = np.Inf

batch_limit = np.Inf

train_loss_list = []
train_acc_list = []
val_loss_list = []
val_acc_list = []

for epoch in range(0,num_epochs):
    if verbose:
            print('epoch %d '%(epoch))

    train_loss = 0
    train_acc = 0
    ngram_net.train()
    for batch_idx, (train_data,train_labels) in enumerate(train_dl):

        #print(train_data.shape)
        #Move to GPU
        optimizer.zero_grad()
        
        
        #Get labels
        word_labels = [num_to_word[int(train_labels[i])] for i in range(train_labels.shape[0])]
        
        
        #Get letter_ngrams
        letter_ngrams = torch.tensor(batch_letter_ngrams(word_labels), dtype =torch.float, device = dev)

        
        #Get labels as one hot
        labels = train_labels.to(dev)
        
        #print(letter_ngrams.shape)
        #print(labels)
        
        #Predict words using the model
        predicted_labels = ngram_net(letter_ngrams)
        
        #Calculate loss
        loss = criterion(predicted_labels,labels.long())
        loss.backward()
        plot_grad_flow(ngram_net.named_parameters())
        
        #for n,p in ngram_net.named_parameters():
        #    print(n)
        #    #print(p)
        #    print(p.grad.abs().mean())
            
        optimizer.step()
        
        train_loss += loss.item()
        train_acc += accuracy(predicted_labels,labels)

        if batch_idx == batch_limit:
            break
            
    
    ngram_net.eval()
    with torch.no_grad():
        val_loss = 0
        val_acc = 0
        for batch_idx, (val_data,val_labels) in enumerate(val_dl):
            


            #Get labels
            word_labels = [num_to_word[int(val_labels[i])] for i in range(train_labels.shape[0])]
            
            
            #Get letter_ngrams
            letter_ngrams = torch.tensor(batch_letter_ngrams(word_labels), dtype =torch.float, device = dev)

            
            #Get labels as one hot
            labels = val_labels.to(dev)
            
            #Predict words using the model
            predicted_labels = ngram_net(letter_ngrams)
            
            #Calculate loss
            batch_val_loss = criterion(predicted_labels,labels.long())
        
            

            #Calculate the triplet loss
            val_loss += batch_val_loss.item()
            val_acc += accuracy(predicted_labels,labels)

            
            if batch_idx == batch_limit:
                break
            

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print("Best val loss %.3f Saving Model..."%(val_loss/len(val_dl)))
            torch.save(ngram_net.state_dict(),model_save_path)
        
    
    
    if verbose:
        print("train loss: %.3f train acc: %.3f"%(train_loss/len(train_dl),train_acc/len(train_dl)))
        print("val loss: %.3f val acc: %.3f"%(val_loss/len(val_dl),val_acc/len(val_dl)))
    
    
    
    train_loss_list.append(train_loss/len(train_dl))
    train_acc_list.append(train_acc/len(train_dl))
    val_loss_list.append(val_loss/len(val_dl))
    val_acc_list.append(val_acc/len(val_dl))
        

        

epoch 0 
Best val loss 9.183 Saving Model...
train loss: 9.196 train acc: 0.018
val loss: 9.183 val acc: 0.020
epoch 1 
Best val loss 9.134 Saving Model...
train loss: 9.163 train acc: 0.027
val loss: 9.134 val acc: 0.026
epoch 2 
Best val loss 8.708 Saving Model...
train loss: 9.016 train acc: 0.015
val loss: 8.708 val acc: 0.016
epoch 3 
Best val loss 8.110 Saving Model...
train loss: 8.341 train acc: 0.039
val loss: 8.110 val acc: 0.039
epoch 4 
Best val loss 7.843 Saving Model...
train loss: 7.980 train acc: 0.061
val loss: 7.843 val acc: 0.078
epoch 5 
Best val loss 7.561 Saving Model...
train loss: 7.712 train acc: 0.091
val loss: 7.561 val acc: 0.121
epoch 6 
Best val loss 7.219 Saving Model...
train loss: 7.401 train acc: 0.141
val loss: 7.219 val acc: 0.173
epoch 7 
Best val loss 6.820 Saving Model...
train loss: 7.022 train acc: 0.192
val loss: 6.820 val acc: 0.212
epoch 8 
Best val loss 6.374 Saving Model...
train loss: 6.596 train acc: 0.242
val loss: 6.374 val acc: 0.280
e

val loss: 3.465 val acc: 0.629
epoch 75 
Best val loss 3.464 Saving Model...
train loss: 3.389 train acc: 0.636
val loss: 3.464 val acc: 0.630
epoch 76 
