In [None]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels
from scipy import stats

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

In [None]:
def filter_on_character_length(matrices,mat_lengths,keys, char_threshold = 5):
    '''Normalise the keys and drop every example whose key is too short.

    Each key is lowercased and stripped of ASCII punctuation
    (string.punctuation). An example is kept only when its normalised key has
    at least char_threshold characters.

    Arguments:
    matrices : sequence of per-example feature matrices
    mat_lengths : sequence of lengths, parallel to matrices
    keys : sequence of word strings, parallel to matrices
    char_threshold : minimum number of characters a normalised key must have
    -------------
    Returns:
    matrices, mat_lengths, keys : three new lists holding the surviving
    examples, in their original order. The returned keys are the normalised
    ones. All three lists are empty when no example survives.'''

    print('Length before filtering %d'%(len(keys)))

    #Lowercase all keys and remove punctuation (the table is built once, not per key)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    keys = [key.translate(strip_punctuation).lower() for key in keys]

    #Keep only the examples whose key is at least char_threshold characters long
    kept = [example for example in zip(matrices,mat_lengths,keys)
            if len(example[2]) >= char_threshold]

    #zip(*[]) produces nothing to unpack, so the no-survivor case is handled explicitly
    if kept:
        matrices,mat_lengths,keys = map(list, zip(*kept))
    else:
        matrices,mat_lengths,keys = [],[],[]

    print('Length after filtering %d'%(len(keys)))

    return matrices,mat_lengths,keys
    

In [None]:
def filter_on_character_frequency(matrices,mat_lengths,keys,frequency_bounds = (0,np.inf)):
    '''Drop every example whose key occurs too rarely or too often.

    Despite the name, the frequency that is counted is that of whole keys
    (words), not of individual characters.

    Arguments:
    matrices : sequence of per-example feature matrices
    mat_lengths : sequence of lengths, parallel to matrices
    keys : sequence of hashable word keys, parallel to matrices
    frequency_bounds : (lower, upper) inclusive bounds on how many times a key
    may occur. Keys occurring fewer than lower or more than upper times are
    removed together with all of their examples.
    -------------
    Returns:
    matrices, mat_lengths, keys : three new lists holding the surviving
    examples, in their original order. All three lists are empty when no
    example survives.'''

    print('Length before filtering %d'%(len(keys)))

    lower_bound = frequency_bounds[0]
    upper_bound = frequency_bounds[1]

    #Count how often every key occurs
    counts = Counter(keys)

    #Keys outside the bounds; a set keeps the membership test below O(1) per example
    remove_set = {word for word, count in counts.items()
                  if count < lower_bound or count > upper_bound}

    #Remove the words from data
    kept = [example for example in zip(matrices,mat_lengths,keys)
            if example[2] not in remove_set]

    #zip(*[]) produces nothing to unpack, so the no-survivor case is handled explicitly
    if kept:
        matrices,mat_lengths,keys = map(list, zip(*kept))
    else:
        matrices,mat_lengths,keys = [],[],[]

    print('Length after filtering %d'%(len(keys)))

    return matrices,mat_lengths,keys

In [None]:
# Function to truncate and limit dimensionality
def truncate_shapes(matrices,mat_lengths,max_length = 100,num_mfcc_features = 40):
    '''Clip every matrix to a maximum number of rows and columns, in place.

    Arguments:
    matrices : list of 2-D arrays; entry i is replaced by its first
    max_length rows and first num_mfcc_features columns
    mat_lengths : list of lengths, parallel to matrices; entry i is replaced
    by min(mat_lengths[i], max_length)
    -------------
    Returns:
    matrices, mat_lengths : the same two list objects that were passed in,
    after being modified'''
    keep_rows = slice(None, max_length)
    keep_cols = slice(None, num_mfcc_features)

    for idx in range(len(matrices)):
        matrices[idx] = matrices[idx][keep_rows, keep_cols]
        recorded_length = mat_lengths[idx]
        mat_lengths[idx] = min(recorded_length, max_length)

    return matrices,mat_lengths

In [None]:
#Function for padding
def pad_sequences(x, n_padded, center_padded=True):
    """Stack variable-length sequences into one zero-padded array.

    Every sequence in x is a 2-D array; the column count of x[0] is used for
    the whole output. Each sequence is fitted into n_padded rows:

    * center_padded=True: shorter sequences are placed in the middle with
      zeros on both sides, longer ones are cut down to a window of n_padded
      rows taken from the middle.
    * center_padded=False: sequences start at row 0; shorter ones are
      zero-padded at the end, longer ones keep only their first n_padded rows.

    Returns the array of shape (len(x), n_padded, n_columns) and a list with
    the number of rows kept from each sequence, i.e. min(rows, n_padded).
    """
    n_columns = x[0].shape[1]
    padded_x = np.zeros((len(x), n_padded, n_columns))
    lengths = []

    for row, sequence in enumerate(x):
        n_rows = sequence.shape[0]
        n_kept = min(n_rows, n_padded)

        if not center_padded:
            # Left-aligned: copy the leading rows, the rest stays zero
            padded_x[row, :n_kept, :] = sequence[:n_kept, :]
        elif n_rows <= n_padded:
            # Fits: shift by half of the free space
            offset = int(np.round((n_padded - n_rows) / 2.))
            padded_x[row, offset:offset + n_rows, :] = sequence
        else:
            # Too long: half of the surplus is skipped at the front
            start = -int(np.round((n_padded - n_rows) / 2.))
            padded_x[row, :, :] = sequence[start:start + n_padded]

        lengths.append(n_kept)

    return padded_x, lengths

In [None]:
def generate_key_dicts_and_labels(keys):
    '''Arguments:
    keys : A list of words corresponding to the mfcc feature matrices
    -------------
    Returns:
    c : A Counter holding the number of occurrences of every word in keys
    word_to_num : A dict mapping each unique word to an integer id. Ids are
    assigned 0, 1, 2, ... in order of first appearance in keys
    num_to_word : The inverse dict, mapping each integer id back to its word
    label_list : A list of integer ids, one per entry of keys, in the same order'''
    c = Counter(keys)

    # Counter iterates its words in first-seen order, so enumerate hands out the ids
    num_to_word = dict(enumerate(c))
    word_to_num = {word: number for number, word in num_to_word.items()}

    label_list = [word_to_num[word] for word in keys]

    print('Number of Unique words ',len(c))
    return c,word_to_num,num_to_word,label_list

In [None]:
#Load the data
#Accumulators shared across every file in load_list: the word of each example,
#its feature matrix, and the row count of that matrix
keys = []
matrices = []
mat_lengths = []

#Archive files to read, and the cap on how many entries are read from each file
load_list = ['Data/feats_cmvn.ark']
num_examples = 1000

for load_file in load_list:
    #Per-file buffers, filtered before being merged into the accumulators above
    file_keys,file_matrices,file_mat_lengths = [],[],[]
    for i,(key,matrix) in enumerate(kaldi_io.read_mat_ark(load_file)):
        #The word is taken as the second '_'-separated field of the archive key
        #NOTE(review): assumes every key contains at least one '_' - confirm against the ark file
        file_keys.append(key.split('_')[1])
        file_matrices.append(matrix)
        #Row count of the matrix (used as the sequence length by truncate_shapes)
        file_mat_lengths.append(matrix.shape[0])
        #Stop once num_examples entries have been read from this file
        if i == num_examples-1:
            break
            
    #Filter the data
    #Keys come back lowercased with punctuation stripped; examples whose key
    #has fewer than 5 characters are dropped
    file_matrices,file_mat_lengths,file_keys = filter_on_character_length(file_matrices,file_mat_lengths,file_keys,char_threshold = 5)
    
    #Add to the main list
    keys.extend(file_keys)
    matrices.extend(file_matrices)
    mat_lengths.extend(file_mat_lengths)
#Total number of examples kept across all files
print(len(keys))

In [None]:
#Count how many times each word occurs in the filtered keys
#(c is rebound to the Counter returned by generate_key_dicts_and_labels in a later cell)
c = Counter(keys)

In [None]:
#NOTE: this cell rebinds matrices from a list to an array and deletes keys and
#mat_lengths, so it cannot be re-run on its own; re-run the loading cell first
#Truncate the dimensions of the data
#Keeps at most the first 200 rows and the first 40 columns of every matrix (the lists are modified in place)
matrices,mat_lengths = truncate_shapes(matrices,mat_lengths,max_length=200,num_mfcc_features=40)
#Pad the matrices
#Result has shape (num_examples, 100, num_columns): shorter matrices are zero-padded
#around the centre, and matrices with more than 100 rows are cut to a 100-row window from the middle
matrices,mat_lengths = pad_sequences(matrices,n_padded = 100,center_padded = True)
#Swap the last two axes: (num_examples, 100, num_columns) -> (num_examples, num_columns, 100)
matrices = np.transpose(matrices,(0,2,1))
#Generate keys and labels
#c: word counts, word_to_num / num_to_word: word <-> integer id, label_list: one id per example
c,word_to_num,num_to_word,label_list = generate_key_dicts_and_labels(keys)
#delete keys and mat_lengths
del keys,mat_lengths