# Top K Word AWEs

Notebook to train and run word embeddings (AWEs) for the top-K most common words in the dataset.

## 1. Import Libraries

In [2]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter
import random
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns



#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset
import torch.cuda as cutorch

#Import User defined classes
from data_helpers import DataHelper
from models import SimpleNet, SiameseNet
from train_test_helpers import accuracy,evaluate_model,evaluate_model_paper,test_model,plot_learning_curves
from train_test_helpers import plot_learning_curves,siamese_train_loop,train_loop 

#Datasets
from datasets import CNN_dataset, SiameseTriplets, Siamese_top_k, CNN_top_k
from datasets import BalancedBatchSampler,TopK_WordsSampler


################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



## 2. Train Models

Here we train various AWE models on the top-K most common words.

In [2]:
#Shared training configuration: probe for CUDA once, then derive the device from that flag
cuda = torch.cuda.is_available()
dev = torch.device("cuda" if cuda else "cpu")

### 2.1 CNN Model

#Load Data
#Trains one SimpleNet classifier per value of k and saves the weights and
#learning curves under ./Models/.

#Dataset Parameters
snr = np.inf  #np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
cluster = False

#Datasets
#NOTE(review): AMI_dataset is not among the names imported in the import cell
#(only CNN_dataset / CNN_top_k are) -- confirm the intended dataset class before running.
train_ds = AMI_dataset(split_set = "train", char_threshold = 5, frequency_bounds = (0,np.inf), snr = snr, cluster = cluster)
val_ds = AMI_dataset(split_set = "val", char_threshold = 5, frequency_bounds = (0,np.inf), snr = snr, cluster = cluster)
#test_ds = AMI_dataset(split_set = "test", char_threshold = 5, frequency_bounds = (0,np.inf), snr = snr, cluster = False)

#Define Model and training Parameters
#One output unit per key of train_ds.c (presumably the word-count table -- TODO confirm)
num_output = len(train_ds.c.keys())
#Defining training criterion
criterion = nn.NLLLoss()

#optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
num_epochs = 150
batch_size = 64

#K values
k_values = [100,500,1000,5000]

#Loop over various values of k
for k in k_values:

    #Create the sampler (built from the training labels, parameterised by k and the batch size)
    train_batch_sampler = TopK_WordsSampler(train_ds.labels, k = k, batch_size = batch_size)

    #Dataloaders
    #Worker / pinned-memory options are only passed to the training loader when CUDA is available
    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
    train_dl = torch.utils.data.DataLoader(train_ds, batch_sampler=train_batch_sampler, **kwargs)
    #Fixed: the original passed the undefined name `bs` here, raising NameError
    val_dl = DataLoader(val_ds, batch_size=batch_size, pin_memory = True, shuffle = True, drop_last = True)

    #Create Model (a fresh network for every k)
    net = SimpleNet(num_output)
    net = net.float()
    net.to(dev)

    #Optimizer
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    #Unique save path
    model_save_path = "./Models/cnn_top_%d_words.pth"%(k)
    lc_save_path = "./Models/cnn_top_%d_words.png"%(k)

    #Train the Model
    hist = train_loop(net,num_epochs,train_dl,val_dl,optimizer,criterion,dev,save_path=model_save_path,verbose = True)

    #Save learning curves
    plot_learning_curves(hist,lc_save_path, show = False)



#Free memory from datasets
del train_ds,val_ds,train_dl,val_dl,train_batch_sampler

### 2.2 Siamese Model

In [3]:
#Training schedule for the Siamese runs: number of epochs and mini-batch size
#(the optimizer itself is created inside the training loop, once per model)
num_epochs, batch_size = 150, 64

In [4]:
#Train one SiameseNet per value of k and save its weights and learning curves under ./Models/.

#K values
#k_values = [100,500,1000,5000]
k_values = [100]
#Dataset Parameters
snr = np.inf  #np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
cluster = False

#Loop over various values of k
for k in k_values:

    #Datasets for this k (rebuilt for every k because the dataset itself takes k)
    train_ds = Siamese_top_k(k = k, split_set = "train", frequency_bounds = (0,np.inf), snr = snr, cluster = cluster)
    val_ds = Siamese_top_k(k = k, split_set = "val", frequency_bounds = (0,np.inf), snr = snr, cluster = cluster)


    #DataLoaders (both shuffled, incomplete final batch dropped)
    train_dl = DataLoader(train_ds, batch_size=batch_size, pin_memory = True, shuffle = True, drop_last = True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, pin_memory = True, shuffle = True, drop_last = True)

    #Create Model (a fresh network for every k)
    net = SiameseNet()
    net = net.float()
    net.to(dev)

    #Optimizer
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    #Unique save path
    model_save_path = "./Models/Vanilla_Siamese_top_%d_words.pth"%(k)
    lc_save_path = "./Models/Vanilla_Siamese_top_%d_words.png"%(k)

    #Train the Model (no criterion argument is passed to the Siamese loop)
    hist = siamese_train_loop(net,num_epochs,train_dl,val_dl,optimizer,dev,save_path=model_save_path,verbose = True)

    #Save learning curves
    plot_learning_curves(hist,lc_save_path, show = False)
    
    

Length before filtering on char length 169383
Length after filtering on char length 80821
Finished Loading the Data, 80821 examples
Number of Unique words  8607
Length before filtering for top 100 words 80821
Length after filtering for top 100 words 26971
(16182, 40, 100) (5394, 40, 100) (5395, 40, 100)


  self.data_class[self.word_to_num[key]] = torch.tensor(self.inputs[ids], dtype = torch.float).cpu()


Triplet Shape
torch.Size([1500, 3, 40, 100]) torch.Size([1500, 2])
Length before filtering on char length 169383
Length after filtering on char length 80821
Finished Loading the Data, 80821 examples
Number of Unique words  8607
Length before filtering for top 100 words 80821
Length after filtering for top 100 words 26971
(16182, 40, 100) (5394, 40, 100) (5395, 40, 100)
Triplet Shape
torch.Size([1500, 3, 40, 100]) torch.Size([1500, 2])
epoch 0 
Best val loss 0.141 Saving Model...
train loss: 0.142
val loss: 0.141
epoch 1 
Best val loss 0.139 Saving Model...
train loss: 0.139
val loss: 0.139
epoch 2 
Best val loss 0.135 Saving Model...
train loss: 0.135
val loss: 0.135
epoch 3 
Best val loss 0.131 Saving Model...
train loss: 0.131
val loss: 0.131
epoch 4 
Best val loss 0.127 Saving Model...
train loss: 0.126
val loss: 0.127
epoch 5 
Best val loss 0.125 Saving Model...
train loss: 0.121
val loss: 0.125
epoch 6 
Best val loss 0.121 Saving Model...
train loss: 0.117
val loss: 0.121
epoch 7 

In [5]:
#Build the top-100-word CNN training split
snr = np.inf  #np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
cluster = False
train_ds = CNN_top_k(k = 100, split_set = "train", char_threshold = 5, frequency_bounds = (0,np.inf), snr = snr, cluster = cluster)

Length before filtering on char length 169383
Length after filtering on char length 80821
Finished Loading the Data, 80821 examples
Number of Unique words  8607
Length before filtering for top 100 words 80821
Length after filtering for top 100 words 26971
(16182, 40, 100) (5394, 40, 100) (5395, 40, 100)


### 2.3 Siamese Model with Triplet Mining

## 3. Visualize Results

In [10]:
#Toy example: a short list of strings in which "l" occurs twice
a = [
    "word",
    "blabla",
    "l",
    "a",
    "l",
]

#Convert to a NumPy array so the list can be compared element-wise
b = np.array(a)

print(b)

['word' 'blabla' 'l' 'a' 'l']


In [13]:
#Positions in the 1-D array b whose entry equals "l"
np.flatnonzero(b == "l")

array([2, 4], dtype=int64)

In [None]:
#Scratch cell: rebind a to an empty string
a = str()