In [2]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# torch.utils.data exports `Dataset` (lower-case "s"); the previous spelling
# `DataSet` does not exist there and made this whole import line raise ImportError.
from torch.utils.data import TensorDataset,Dataset,DataLoader,random_split,ConcatDataset

In [None]:
def filter_on_character_length(matrices,mat_lengths,keys, char_threshold = 5):
    '''Normalise the keys and drop every sample whose key is too short.

    Each key has its ASCII punctuation removed and is lower-cased. A sample
    (matrix, length, key) is kept only when its normalised key has at least
    `char_threshold` characters.

    Arguments:
    matrices : sequence of feature matrices, one per sample
    mat_lengths : sequence of lengths, aligned with matrices
    keys : sequence of word strings, aligned with matrices
    char_threshold : minimum number of characters of a normalised key
    -------------
    Returns:
    matrices, mat_lengths, keys : three new lists with the surviving samples
    (keys are the normalised ones). All three are empty when nothing survives.'''

    print('Length before filtering %d'%(len(keys)))

    # Build the translation table once instead of once per key.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised_keys = [key.translate(strip_punctuation).lower() for key in keys]

    # Keep the samples whose normalised key is long enough.
    kept = [sample for sample in zip(matrices, mat_lengths, normalised_keys)
            if len(sample[2]) >= char_threshold]

    # zip(*[]) yields nothing, so unpacking it into three names would raise
    # ValueError; handle the "nothing survived" case explicitly.
    if kept:
        matrices, mat_lengths, keys = (list(column) for column in zip(*kept))
    else:
        matrices, mat_lengths, keys = [], [], []

    print('Length after filtering %d'%(len(keys)))

    return matrices,mat_lengths,keys
    

In [None]:
def filter_on_character_frequency(matrices,mat_lengths,keys,frequency_bounds = (0,np.inf)):
    '''Drop every sample whose word occurs too rarely or too often.

    A word is removed when its number of occurrences in `keys` is below
    frequency_bounds[0] or above frequency_bounds[1]; both bounds are inclusive
    for the words that are kept.

    Arguments:
    matrices : sequence of feature matrices, one per sample
    mat_lengths : sequence of lengths, aligned with matrices
    keys : sequence of word strings, aligned with matrices
    frequency_bounds : (lower, upper) occurrence bounds
    -------------
    Returns:
    matrices, mat_lengths, keys : a tuple of three new lists with the surviving
    samples. All three are empty when nothing survives.'''

    print('Length before filtering %d'%(len(keys)))

    lower_bound, upper_bound = frequency_bounds[0], frequency_bounds[1]

    # Words whose frequency falls outside the bounds. A set keeps the membership
    # test in the filter below O(1) per sample.
    rejected_words = {word for word, count in Counter(keys).items()
                      if count < lower_bound or count > upper_bound}

    kept = [sample for sample in zip(matrices, mat_lengths, keys)
            if sample[2] not in rejected_words]

    # zip(*[]) yields nothing, so unpacking it into three names would raise
    # ValueError; handle the "nothing survived" case explicitly.
    if kept:
        matrices, mat_lengths, keys = (list(column) for column in zip(*kept))
    else:
        matrices, mat_lengths, keys = [], [], []

    print('Length after filtering %d'%(len(keys)))

    # Concrete lists (not a lazy, single-use map object) so the result matches
    # filter_on_character_length and can be inspected more than once.
    return matrices,mat_lengths,keys

In [None]:
# Function to truncate and limit dimensionality
def truncate_shapes(matrices,mat_lengths,max_length = 100,num_mfcc_features = 40):
    '''Clip every matrix to at most max_length rows and num_mfcc_features columns.

    Both lists are modified in place: matrices[i] is replaced by its clipped
    slice and mat_lengths[i] is capped at max_length. The same two list objects
    are returned for convenience.'''
    for position, features in enumerate(matrices):
        clipped = features[:max_length, :num_mfcc_features]
        matrices[position] = clipped
        mat_lengths[position] = min(mat_lengths[position], max_length)

    return matrices,mat_lengths

In [None]:
#Zero-pad (or crop) variable-length sequences to a common length
def pad_sequences(x, n_padded, center_padded=True):
    """Return the sequences as one zero-padded array plus their kept lengths.

    x is a sequence of 2-D arrays that share their second dimension. The result
    has shape (len(x), n_padded, x[0].shape[1]). With center_padded the data is
    centred inside the padded window, and a sequence longer than n_padded is
    cropped around its middle; otherwise data is left-aligned and cropped at the
    end. Each returned length is min(original length, n_padded).
    """
    n_features = x[0].shape[1]
    padded_x = np.zeros((len(x), n_padded, n_features))
    lengths = []
    for row, sequence in enumerate(x):
        n_frames = sequence.shape[0]
        kept = min(n_frames, n_padded)
        if not center_padded:
            padded_x[row, :kept, :] = sequence[:kept, :]
        else:
            # Positive offset: left margin of the padding.
            # Negative offset: number of leading frames to skip when cropping.
            offset = int(np.round((n_padded - n_frames) / 2.))
            if n_frames > n_padded:
                start = -offset
                padded_x[row, :, :] = sequence[start:start + n_padded]
            else:
                padded_x[row, offset:offset + n_frames, :] = sequence
        lengths.append(kept)
    return padded_x, lengths

In [None]:
def generate_key_dicts_and_labels(keys):
    '''Arguments:
    keys : A list of words, one per sample
    -------------
    Returns:
    c : Counter with the number of occurrences of every word
    word_to_num : dict mapping each word to an integer id (ids follow the order
                  of first appearance in keys, starting at 0)
    num_to_word : the inverse mapping, id -> word
    label_list : the ids of the words in keys, in the same order as keys'''
    word_counts = Counter(keys)

    # Counter iterates in first-appearance order, so enumerate() hands out the ids.
    num_to_word = dict(enumerate(word_counts))
    word_to_num = {word: number for number, word in num_to_word.items()}

    label_list = [word_to_num[key] for key in keys]

    print('Number of Unique words ',len(word_counts))
    return word_counts,word_to_num,num_to_word,label_list

In [None]:
#Load the data
# Accumulators over all loaded files: word per sample, feature matrix per
# sample and number of rows (matrix.shape[0]) per sample.
keys = []
matrices = []
mat_lengths = []

# Alternative (disabled) list of file numbers:
#number_list = [9,12,14,18,21,25,27,28]
number_list = [9]
load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]
# Alternative (disabled) input, read with read_mat_ark instead of read_mat_scp:
#load_list = ['Data/word_mfcc_features.ark']

for load_file in load_list:
    file_keys,file_matrices,file_mat_lengths = [],[],[]
    for key,matrix in kaldi_io.read_mat_scp(load_file):
    #for key,matrix in kaldi_io.read_mat_ark(load_file):
        # The word is taken to be the second '_'-separated field of the key.
        # NOTE(review): this raises IndexError for a key without '_' - confirm the key format.
        file_keys.append(key.split('_')[1])
        file_matrices.append(matrix)
        file_mat_lengths.append(matrix.shape[0])
    #Filter the data
    # Keys come back lower-cased and without punctuation; samples whose word
    # has fewer than 5 characters are dropped.
    file_matrices,file_mat_lengths,file_keys = filter_on_character_length(file_matrices,file_mat_lengths,file_keys,char_threshold = 5)
    
    #Add to the main list
    keys.extend(file_keys)
    matrices.extend(file_matrices)
    mat_lengths.extend(file_mat_lengths)
# Total number of samples kept over all files
print(len(keys))

In [None]:
# Occurrence count of every word in keys. The name c is rebound a few cells
# below by the first return value of generate_key_dicts_and_labels(keys).
c = Counter(keys)

In [None]:
#Truncate the dimensions of the data
# At most 200 rows and 40 columns per matrix (the lists are modified in place).
matrices,mat_lengths = truncate_shapes(matrices,mat_lengths,max_length=200,num_mfcc_features=40)
#Pad the matrices
# Centre-pad with zeros to 100 rows; matrices longer than 100 rows are cropped
# around their middle. The result is one array of shape (n_samples, 100, n_columns).
matrices,mat_lengths = pad_sequences(matrices,n_padded = 100,center_padded = True)
# Swap the last two axes -> (n_samples, n_columns, 100)
matrices = np.transpose(matrices,(0,2,1))
#Generate keys and labels
c,word_to_num,num_to_word,label_list = generate_key_dicts_and_labels(keys)
# Drop the names that are no longer needed. This makes the cell fail with
# NameError if it is run a second time without re-running the loading cell.
del keys,mat_lengths

In [None]:
# Device used for the model and its inputs: the GPU when CUDA is available, the CPU otherwise.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Model inputs as one array. matrices is already a single array after the
# padding/transpose cell, so np.stack rebuilds it with the same shape.
inputs = np.stack(matrices)
# Releases the name; the cell cannot be re-run without re-creating matrices.
del matrices
# Disabled: would add a singleton axis at position 1.
#inputs = np.expand_dims(inputs,1)
# Integer word ids, one per sample, aligned with inputs
labels = np.array(label_list)
del label_list
print(inputs.shape)
print(labels.shape)

In [None]:
# Hold out 20% of the samples as the test set; random_state fixes the shuffle so the split is reproducible.
x_trainval,x_test,y_trainval,y_test = train_test_split(inputs, labels, test_size=0.2, random_state=32)

In [None]:
# Split the remaining samples 75/25 into train and validation, i.e. roughly 60/20/20 train/val/test overall.
x_train,x_val,y_train,y_val = train_test_split(x_trainval,y_trainval,test_size =0.25, random_state = 32)

In [None]:
# Features are float32 for the network. Labels are class indices, so they are
# stored as int64 (torch.long): float32 cannot represent large ids exactly, and
# the classification losses matching the log-softmax output (e.g. nn.NLLLoss)
# expect integer class targets. Dictionary lookups through .item() keep working
# because they now receive plain Python ints.
x_train,y_train = torch.tensor(x_train,dtype= torch.float),torch.tensor(y_train, dtype= torch.long)
x_val,y_val = torch.tensor(x_val, dtype= torch.float),torch.tensor(y_val, dtype= torch.long)
x_test,y_test = torch.tensor(x_test, dtype= torch.float),torch.tensor(y_test, dtype= torch.long)

In [None]:
# Sanity check: feature and label tensors of each split must agree on their first dimension.
print(x_train.shape,y_train.shape)
print(x_val.shape,y_val.shape)
print(x_test.shape,y_test.shape)

In [None]:
# Batch size shared by the three loaders
bs = 64
# None of the loaders shuffles (shuffle is left at its default).
# drop_last = True discards the final batch when it has fewer than bs samples.
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs, pin_memory = True, drop_last = True)

# NOTE(review): drop_last = True also discards the tail of the validation and
# test sets, so metrics computed through these loaders would not cover every
# sample - confirm this is intended.
val_ds = TensorDataset(x_val, y_val)
val_dl = DataLoader(val_ds, batch_size=bs, pin_memory = True, drop_last = True)

test_ds = TensorDataset(x_test, y_test)
test_dl = DataLoader(test_ds, batch_size=bs, pin_memory = True, drop_last = True)

In [None]:
class SimpleNet(nn.Module):
    """Two 1-D convolution blocks followed by two fully connected layers.

    Input: float tensor of shape (batch, 40, frames). fc1 expects 672 = 96 * 7
    flattened features, which is what the two conv/pool stages produce for
    100 frames (100 -> 91 -> 30 -> 23 -> 7).

    forward() returns log-probabilities over the word classes;
    give_embeddings() returns the 1024-dimensional activations after fc1.

    Layer attribute names are part of the checkpoint format (state_dict keys)
    and must not be renamed.
    """

    def __init__(self, num_classes=None):
        """Build the layers.

        num_classes : number of output classes. When omitted, the number of
            distinct words in the notebook-level Counter `c` is used, which
            matches the behaviour of calling SimpleNet() with no argument.
        """
        super(SimpleNet, self).__init__()
        if num_classes is None:
            num_classes = len(c.keys())
        self.conv1 = nn.Conv1d(40, 96, 10)
        self.pool = nn.MaxPool1d(3)
        self.conv2 = nn.Conv1d(96, 96, 8)
        self.fc1 = nn.Linear(672, 1024)
        self.fc2 = nn.Linear(1024, num_classes)
        # Not used by forward(); kept so existing code that accesses `sm` still works.
        self.sm = nn.Softmax(dim = 1)

    def _embed(self, x):
        """Shared feature stack: conv/pool twice, flatten, fc1 + ReLU."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.shape[0], -1)
        return F.relu(self.fc1(x))

    def forward(self, x):
        """Return log-softmax class scores of shape (batch, num_classes)."""
        # The ReLU on the fc2 output is part of the trained model and is kept
        # so that loaded checkpoints behave exactly as before.
        x = F.relu(self.fc2(self._embed(x)))
        return F.log_softmax(x,dim=1)

    def give_embeddings(self,x):
        """Return the fc1 activations as a NumPy array of shape (batch, 1024)."""
        # Pure inference: no autograd graph is needed for a NumPy result.
        with torch.no_grad():
            embeddings = self._embed(x)
        # .cpu() is a no-op for CPU tensors, so the tensor's own device decides
        # and no global device variable is needed.
        return embeddings.detach().cpu().numpy()




In [None]:
# Instantiate the classifier, make sure its parameters are float32 and move it to the selected device.
net = SimpleNet()
net = net.float()
net.to(dev)

In [None]:
#Load the best model
best_model_path = "./Models/l2_best_model.pth"
# map_location remaps the stored tensors onto the device in use, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
net.load_state_dict(torch.load(best_model_path, map_location=dev))

In [None]:
# Distinct words that occur in each split (used to tag words in the plots)
train_words = {num_to_word[label] for label in y_train.tolist()}
val_words = {num_to_word[label] for label in y_val.tolist()}
test_words = {num_to_word[label] for label in y_test.tolist()}

In [None]:
def add_membership(word):
    '''Append the splits a word occurs in, e.g. "word (Tr|Val|Ts)".

    Tags: "Tr" for train_words, "Val" for val_words, "Ts" for test_words.
    Only the tags that apply are listed, joined by "|", so a word that is
    missing from the training set yields "word (Val|Ts)" rather than a
    result with a stray leading separator.'''
    splits = (("Tr", train_words), ("Val", val_words), ("Ts", test_words))
    tags = [tag for tag, vocabulary in splits if word in vocabulary]
    return word+" (%s)"%("|".join(tags))

In [None]:
def visualize_words(word_list,tsne_vectors,labels,rigid_limits = False):
    '''Scatter-plot the 2-D t-SNE points of the samples labelled with the given words.

    Arguments:
    word_list : words to show. A word that is not a vocabulary key as given is
                retried with punctuation removed and lower-cased, the same
                normalisation filter_on_character_length applies to the keys.
    tsne_vectors : array indexed like labels, with the two plot coordinates in
                   columns 0 and 1
    labels : array of integer word ids, one per row of tsne_vectors
    rigid_limits : when True the axes span all of tsne_vectors (plus a margin
                   of 5) instead of auto-scaling to the selected points, so
                   different plots are directly comparable
    -------------
    Raises:
    KeyError : if a word is not in the vocabulary (word_to_num)'''
    # Resolve every requested word to its id, collecting the unknown ones so the
    # error names all of them at once.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    num_list, unknown_words = [], []
    for word in word_list:
        if word not in word_to_num:
            normalised = word.translate(strip_punctuation).lower()
            if normalised not in word_to_num:
                unknown_words.append(word)
                continue
            word = normalised
        num_list.append(word_to_num[word])
    if unknown_words:
        raise KeyError('Words not in the vocabulary: %s'%(', '.join(map(repr, unknown_words))))

    #Get ids of words from labels
    ids = np.where(np.isin(labels,np.array(num_list)))
    selected_points = tsne_vectors[ids]

    # Column names and the legend title are the dictionary keys below.
    # Each word gets its split-membership tag from add_membership.
    df_subset = pd.DataFrame({
        'tsne-2d-one': selected_points[:,0],
        'tsne-2d-two': selected_points[:,1],
        'Word': [add_membership(num_to_word[label.item()]) for label in labels[ids]],
    })

    g = sns.scatterplot(
        x="tsne-2d-one", y="tsne-2d-two",
        hue="Word",
        data=df_subset,
        legend="full",
        alpha=0.5)
    # Legend outside the axes, to the right
    g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

    if rigid_limits:
        x_coordinate = tsne_vectors[:,0]
        y_coordinate = tsne_vectors[:,1]
        epsilon = 5
        g.set_xlim(np.min(x_coordinate)-epsilon,np.max(x_coordinate)+epsilon)
        g.set_ylim(np.min(y_coordinate)-epsilon,np.max(y_coordinate)+epsilon)
    

In [None]:
def nearest_neighbors(net,inputs,labels,n_neighbors = 3):
    '''Embed all inputs and find each embedding's nearest neighbours (cosine distance).

    Arguments:
    net : model exposing give_embeddings(inputs) -> 2-D NumPy array
    inputs : tensor holding all samples; it is embedded in a single forward pass
    labels : unused, accepted for call compatibility
    n_neighbors : number of neighbours returned per sample
    -------------
    Returns:
    distances, indices : arrays of shape (n_samples, n_neighbors) as returned by
    NearestNeighbors.kneighbors. The queries are the fitted points themselves,
    so every sample is among its own candidate neighbours (normally as the
    first one, at distance 0).'''
    if dev.type == 'cuda':
        inputs = inputs.to(dev, non_blocking = True)

    # Inference only: without no_grad the forward pass over the whole dataset
    # would also record an autograd graph and hold on to its activations.
    with torch.no_grad():
        embeddings = net.give_embeddings(inputs)

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute',metric = 'cosine').fit(embeddings)
    distances,indices = nbrs.kneighbors(embeddings)

    return distances,indices
    

In [None]:
def nearest_neighbors_for_word(word,indices,labels):
    '''Print, for every sample labelled `word`, the words of its nearest neighbours.

    Arguments:
    word : word to look up. If it is not a vocabulary key as given it is retried
           with punctuation removed and lower-cased, the same normalisation
           filter_on_character_length applies to the keys.
    indices : (n_samples, n_neighbors) neighbour indices, as returned by
              nearest_neighbors
    labels : integer word ids, one per sample, aligned with the rows of indices
    -------------
    Raises:
    KeyError : if the word is not in the vocabulary (word_to_num)'''
    if word not in word_to_num:
        normalised = word.translate(str.maketrans('', '', string.punctuation)).lower()
        if normalised not in word_to_num:
            raise KeyError('Word not in the vocabulary: %r'%(word,))
        word = normalised

    # Rows of `indices` that belong to samples of the requested word
    ids = np.where(np.isin(labels,np.array([word_to_num[word]])))
    neighbor_indices = indices[ids]

    for neighbors_of_sample in neighbor_indices:
        # int() turns NumPy scalars and 0-d tensors into plain ints so the
        # dictionary lookup works for either kind of label container.
        neighboring_words = [num_to_word[int(labels[neighbor])] for neighbor in neighbors_of_sample]
        print(neighboring_words)
    

In [None]:
# 2-D t-SNE coordinates for all samples (columns 0 and 1 are read by the cells below).
# NOTE(review): give_tsne is not defined in any cell visible here - confirm it is
# defined elsewhere, otherwise this raises NameError on a fresh kernel.
tsne_vectors = give_tsne(net,torch.tensor(inputs,dtype=torch.float))

In [None]:
# Print the axis ranges that visualize_words applies when rigid_limits is True
# (global min/max of each coordinate, widened by a margin of 5).
x_coordinate = tsne_vectors[:,0]
y_coordinate = tsne_vectors[:,1]
epsilon = 5
print(min(x_coordinate)-epsilon,max(x_coordinate)+epsilon)
print(min(y_coordinate)-epsilon,max(y_coordinate)+epsilon)

In [None]:
# Plot all samples of a single word (auto-scaled axes).
# NOTE(review): the loading cell keeps only lower-cased keys with at least 5
# characters (char_threshold = 5) - confirm this 3-letter word exists in word_to_num.
word_list = ["THE"]
visualize_words(word_list,tsne_vectors,labels)

In [None]:
# Plot four short words together (auto-scaled axes).
# NOTE(review): all four have fewer than 5 characters, the char_threshold used
# when loading - confirm they exist in word_to_num.
word_list = ["AS","ARE","TWO","SIX"]
visualize_words(word_list,tsne_vectors,labels)

In [None]:
# Plot these words on fixed axes (rigid_limits = True) so the figure is comparable with the following ones.
# NOTE(review): vocabulary keys are lower-cased at load time and "BACK" is
# shorter than char_threshold = 5 - confirm these words exist in word_to_num.
word_list = ["DEALERS","DEALS","BACK"]
visualize_words(word_list,tsne_vectors,labels,True)

In [None]:
# Three pairs of words that share a prefix, on fixed axes.
# NOTE(review): vocabulary keys are lower-cased at load time - confirm these upper-case words resolve.
word_list = ["ACCEPTED","ACCEPTS","RADIO","RAIDER","OWNER","OWNERS"]
visualize_words(word_list,tsne_vectors,labels,True)

In [None]:
# Three more prefix-sharing pairs, on fixed axes.
# NOTE(review): "NINE" is shorter than char_threshold = 5 and keys are lower-cased - confirm these words resolve.
word_list = ["NINE","NINETY","ACCEPTED","ACCEPTS","AIMED","AIMING"]
visualize_words(word_list,tsne_vectors,labels,True)

In [None]:
# Word/inflection pairs plus one other word ("SALE"), on fixed axes.
# NOTE(review): "SALE" is shorter than char_threshold = 5 and keys are lower-cased - confirm these words resolve.
word_list = ["COUNT","COUNTS","SALE","SUPPORT","SUPPORTED"]
visualize_words(word_list,tsne_vectors,labels,True)

In [None]:
# Two families of prefix-sharing words (LATE-, CLOS-), on fixed axes.
# NOTE(review): vocabulary keys are lower-cased at load time - confirm these upper-case words resolve.
word_list = ["LATER","LATEST","CLOSE","CLOSING","CLOSED","CLOSELY"]
visualize_words(word_list,tsne_vectors,labels,True)

In [None]:
# 5 nearest neighbours (cosine distance between embeddings) of every sample.
# Each sample is queried against a set that contains itself, so it is
# normally its own first neighbour.
distances,indices = nearest_neighbors(net,torch.tensor(inputs,dtype=torch.float),labels,n_neighbors=5)

In [None]:
# Print one list of neighbouring words per sample of this word.
# NOTE(review): vocabulary keys are lower-cased at load time - confirm this upper-case word resolves.
word = "SUPPORT"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "LATER" (same case caveat as for the other upper-case lookups).
word = "LATER"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "CLOSING".
word = "CLOSING"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "ACCEPTED".
word = "ACCEPTED"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "DEALS".
word = "DEALS"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "RANGE".
word = "RANGE"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "DAY".
# NOTE(review): 3 characters is below the char_threshold = 5 used when loading - confirm the word is in word_to_num.
word = "DAY"
nearest_neighbors_for_word(word,indices,labels)

In [None]:
# Neighbouring words for every sample of "CARE".
# NOTE(review): 4 characters is below the char_threshold = 5 used when loading - confirm the word is in word_to_num.
word = "CARE"
nearest_neighbors_for_word(word,indices,labels)