In [2]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter,OrderedDict 
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels,paired_distances
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns

#BigPhoney
from big_phoney import BigPhoney


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

#Import User defined classes
from data_helpers import DataHelper
from models import SimpleNet
from train_test_helpers import accuracy,train_model,evaluate_model,evaluate_model_paper,test_model,plot_learning_curves

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

Using TensorFlow backend.


Load Data and Models

In [None]:
# Feature archives to load. The default is the single CMVN-normalised archive.
load_list = ['Data/feats_cmvn.ark']
# Alternative: load selected raw-MFCC segment lists instead.
#number_list = [9,12,14,18,21,25,27,28]
#load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]
# Upper bound on the number of examples handed to DataHelper; infinity means no cap.
# `np.inf` is the supported spelling (the `np.Inf` alias was removed in NumPy 2.0).
num_examples = np.inf

In [None]:
# Build the data helper from the configured feature files and example cap,
# then load and pre-process the data. DataHelper comes from data_helpers.py;
# its internals are not visible in this notebook.
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
# presumably `c` holds per-word counts and the two dicts map word <-> integer id
# (inferred from the names and from later use of c.keys() / word_to_num[word]) — TODO confirm
c,word_to_num,num_to_word = dh.generate_key_dicts()

In [None]:
# Pull the model inputs and their labels out of the helper; both are read as
# module-level globals by generate_word_embedding_dict further down.
inputs,labels = dh.give_inputs_and_labels()
# The helper is not referenced again in the visible cells, so drop the name.
del dh

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Optional train/val/test split; disabled by default because the notebook only
# evaluates an already-trained model.
split = False
if split:
    # Hold out 20% for test, then 25% of the remaining 80% (= 20% overall) for validation.
    x_trainval, x_test, y_trainval, y_test = train_test_split(
        inputs, labels, test_size=0.2, random_state=32)
    x_train, x_val, y_train, y_val = train_test_split(
        x_trainval, y_trainval, test_size=0.25, random_state=32)

    # Convert every partition to a float tensor.
    x_train, y_train, x_val, y_val, x_test, y_test = (
        torch.tensor(partition, dtype=torch.float)
        for partition in (x_train, y_train, x_val, y_val, x_test, y_test)
    )

    print(x_train.shape, y_train.shape)
    print(x_val.shape, y_val.shape)
    print(x_test.shape, y_test.shape)

In [None]:
#net = SimpleNet()
# Size the network's output from the number of keys in `c`
# (presumably one output per distinct word — TODO confirm against models.SimpleNet).
num_output = len(c.keys())
net = SimpleNet(num_output)
# Cast to float and move to the selected device
# (assumes SimpleNet is a torch.nn.Module — TODO confirm).
net = net.float()
net.to(dev)

In [None]:
#Load the best model
best_model_path = "./Models/awe_best_model.pth"
# map_location remaps the stored tensors onto `dev`, so a checkpoint saved on a
# GPU can still be opened when the notebook falls back to the CPU.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
net.load_state_dict(torch.load(best_model_path, map_location=dev))

Embedding Evaluation

In [4]:
# Word pairs together with their pre-computed edit distances (comma-separated file).
wordpairs_df = pd.read_csv('Data/wordpairs_test.txt')

In [5]:
# Display the loaded word-pair table.
wordpairs_df

Unnamed: 0,word_1,word_2,orthographic_edit_distance,raw_phonetic_edit_distance,filtered_phonetic_edit_distance
0,could,required,6,5,5
1,meeting,system,6,5,5
2,grippy,submission,9,7,7
3,doing,forth,4,4,4
4,quite,regular,6,7,7
...,...,...,...,...,...
70120,connection,liking,9,7,7
70121,connect,zapping,7,6,6
70122,colours,proper,6,5,5
70123,light,mention,7,6,6


In [20]:
#Calculate all the unique words
def words_from_dataframe(dataframe):
    """Return the set of unique words mentioned in a word-pair DataFrame.

    Two layouts are accepted:

    * a ``word_pairs`` column holding tuple-like strings such as
      ``"('could', 'required')"`` (takes precedence when present);
    * separate ``word_1`` / ``word_2`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table of word pairs in either layout.

    Returns
    -------
    set of str
        Every distinct word found in the pairs.

    Raises
    ------
    KeyError
        If neither layout is present.
    """
    if "word_pairs" in dataframe.columns:
        # "('could', 'required')" -> ["'could'", " 'required'"] -> "could", "required"
        wordpairs_list = dataframe["word_pairs"].apply(lambda x: x.strip('()').split(','))
        return {word.strip(' \'') for wordpair in wordpairs_list for word in wordpair}

    if {"word_1", "word_2"}.issubset(dataframe.columns):
        words = set()
        for column in ("word_1", "word_2"):
            # Missing cells are skipped; stray quotes/spaces are stripped as in the tuple layout.
            words.update(dataframe[column].dropna().astype(str).str.strip(' \''))
        return words

    raise KeyError("expected a 'word_pairs' column or 'word_1'/'word_2' columns")

In [None]:
# `words` is consumed by later cells but was not assigned anywhere, so this cell
# failed on a fresh kernel. Build it from both word columns of the word-pair table.
words = set(wordpairs_df["word_1"].dropna()) | set(wordpairs_df["word_2"].dropna())
print(words)

In [None]:
# Number of unique words; requires `words` to have been defined by an earlier cell.
print(len(words))

In [None]:
def generate_word_embedding_dict(words):
    """Map each word to the mean embedding of its examples.

    Reads the notebook-level globals ``inputs``, ``labels``, ``word_to_num``,
    ``net`` and ``dev``.

    Parameters
    ----------
    words : iterable of str
        Words to embed; each must be a key of ``word_to_num``.

    Returns
    -------
    collections.OrderedDict
        ``word -> mean embedding`` reshaped to ``(1, -1)``, in the iteration
        order of ``words``.
    """
    embeddings_by_word = OrderedDict()

    for word in words:
        # Select the rows of `inputs` whose label equals this word's numeric id.
        label_matches = np.isin(labels, word_to_num[word])
        features = inputs[np.where(label_matches)]

        # Embed all examples of the word in one call.
        # assumes give_embeddings returns something np.mean accepts (e.g. a NumPy array) — TODO confirm
        feature_tensor = torch.tensor(features, device=dev, dtype=torch.float)
        example_embeddings = net.give_embeddings(feature_tensor, dev)

        # Average over the examples and keep the result two-dimensional.
        mean_embedding = np.mean(example_embeddings, axis=0)
        embeddings_by_word[word] = mean_embedding.reshape(1, -1)

    return embeddings_by_word

In [None]:
def _split_word_pairs(pair_df):
    """Return two parallel lists (first words, second words) for the rows of ``pair_df``."""
    if "word_pairs" in pair_df.columns:
        first_words, second_words = [], []
        for pair_text in pair_df["word_pairs"]:
            # "('could', 'required')" -> "could", "required"
            word1, word2 = (part.strip(' \'') for part in pair_text.strip('()').split(','))
            first_words.append(word1)
            second_words.append(word2)
        return first_words, second_words

    if {"word_1", "word_2"}.issubset(pair_df.columns):
        return list(pair_df["word_1"]), list(pair_df["word_2"])

    raise KeyError("expected a 'word_pairs' column or 'word_1'/'word_2' columns")


def calculate_embedding_distance(homophone_df,word_embedding_dict,metrics = ['cosine']):
    """Add one ``<metric>_distance`` column per metric to a word-pair DataFrame.

    Parameters
    ----------
    homophone_df : pandas.DataFrame
        Word pairs, either as a ``word_pairs`` column of tuple-like strings
        (``"('a', 'b')"``) or as separate ``word_1`` / ``word_2`` columns.
    word_embedding_dict : mapping
        ``word -> embedding`` where each embedding is a ``(1, dim)`` array.
    metrics : list of str
        Metric names understood by ``sklearn.metrics.pairwise.paired_distances``.
        The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``homophone_df`` object, modified in place. A column that
        already exists is overwritten, so re-running a cell does not create
        duplicate columns.

    Raises
    ------
    KeyError
        If the pair columns are missing or a word has no embedding.
    """
    first_words, second_words = _split_word_pairs(homophone_df)
    has_rows = len(first_words) > 0

    if has_rows:
        # Stack once so each metric needs a single vectorised call instead of one call per row.
        word1_embeddings = np.vstack([word_embedding_dict[word] for word in first_words])
        word2_embeddings = np.vstack([word_embedding_dict[word] for word in second_words])

    for metric in metrics:
        if has_rows:
            distances = paired_distances(word1_embeddings, word2_embeddings, metric = metric)
        else:
            # An empty table still gets the (empty) distance columns.
            distances = np.empty(0)
        homophone_df["%s_distance"%(metric)] = distances

    return homophone_df
    
    

In [None]:
def give_nearest_neighbours_on_embeddings(word_embedding_dict, n_neighbours = 10, metric = 'cosine', split = False):
    """Find each word's nearest neighbours in embedding space.

    Parameters
    ----------
    word_embedding_dict : mapping
        ``word -> embedding``; each embedding is flattened to one row.
    n_neighbours : int
        Number of neighbours requested from ``NearestNeighbors``. The query
        word itself is removed from its own list, so a word usually ends up
        with ``n_neighbours - 1`` entries.
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    split : bool
        If False, return columns ``word`` and ``neighbours`` (comma-joined).
        If True, replace ``neighbours`` with ``neighbour_0`` ...
        ``neighbour_{n_neighbours-1}``; missing trailing slots are NaN.

    Returns
    -------
    pandas.DataFrame
        One row per word, indexed 0..n-1, in the order of ``word_embedding_dict``.
    """
    words = list(word_embedding_dict.keys())

    # One row per word. reshape (rather than squeeze) keeps the matrix 2-D even
    # for a single word or a 1-dimensional embedding.
    embeddings = np.stack(list(word_embedding_dict.values())).reshape(len(words), -1)

    print('Calculating Nearest Neighbours')
    nbrs = NearestNeighbors(n_neighbors=n_neighbours, algorithm='brute',metric = metric, n_jobs = 4).fit(embeddings)
    _, indices = nbrs.kneighbors(embeddings)

    columns = ["word","neighbours"]
    print('num of words %d'%(len(words)))

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for word, neighbour_ids in zip(words, indices):
        neighbours = [words[j] for j in neighbour_ids if words[j] != word]
        records.append({"word": word, "neighbours": ','.join(neighbours)})
    nearest_neighbours_df = pd.DataFrame(records, columns = columns)

    if split:
        neighbour_col_names = ["neighbour_%d"%(i) for i in range(n_neighbours)]
        expanded = nearest_neighbours_df["neighbours"].str.split(',', expand = True)
        # Dropping the query word leaves fewer than n_neighbours entries, so pad
        # to the promised width before naming the columns.
        expanded = expanded.reindex(columns = range(n_neighbours))
        expanded.columns = neighbour_col_names
        nearest_neighbours_df = pd.concat([nearest_neighbours_df[["word"]], expanded], axis = 1)

    return nearest_neighbours_df.reset_index(drop=True)

In [None]:
# Embed only the words in `words`; requires `words` to have been defined by an earlier cell.
word_embedding_dict = generate_word_embedding_dict(words)

In [None]:
# Embed every key of `c` instead; this rebinds `word_embedding_dict`, replacing
# any result produced by a previous cell.
word_embedding_dict = generate_word_embedding_dict(c.keys())

In [None]:
# word_embedding_dict is a dict, so np.save stores it as a pickled 0-d object
# array; read it back with np.load(path, allow_pickle=True).item().
np.save("Data/word_embedding_dict.npy",word_embedding_dict)

In [None]:
# Ten cosine nearest neighbours per word, kept as one comma-joined column.
em_nearest_neighbours = give_nearest_neighbours_on_embeddings(
    word_embedding_dict,
    n_neighbours=10,
    metric='cosine',
    split=False,
)

In [None]:
# Distance metrics to evaluate. The plotting cells below index into `metrics`,
# so it must exist as a notebook-level name (previously it was only an inline
# argument, and those cells raised NameError).
metrics = ['cosine', 'euclidean']
df = calculate_embedding_distance(wordpairs_df,word_embedding_dict,metrics = metrics)

In [None]:
# Display the word pairs with their added <metric>_distance columns.
df

In [None]:
# Per-pair embedding distance (first metric) against phonetic edit distance.
# NOTE(review): the word-pair table displayed earlier has raw_/filtered_
# phonetic_edit_distance columns but no "phonetic_edit_distance" — confirm the
# x column exists in `df`.
g = sns.scatterplot(
    data=df,
    x="phonetic_edit_distance",
    y="%s_distance"%(metrics[0]),
    legend="full",
    alpha=0.5,
)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
# Per-pair embedding distance (second metric) against phonetic edit distance.
# NOTE(review): the word-pair table displayed earlier has raw_/filtered_
# phonetic_edit_distance columns but no "phonetic_edit_distance" — confirm the
# x column exists in `df`.
g = sns.scatterplot(
    data=df,
    x="phonetic_edit_distance",
    y="%s_distance"%(metrics[1]),
    legend="full",
    alpha=0.5,
)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
# Mean / count / std of every numeric column per phonetic edit distance.
# Only numeric columns are aggregated: the word columns are strings, and
# mean/std over them raises on pandas >= 2. The former `index=False` argument
# was dropped because `agg` has no such parameter.
# NOTE(review): confirm `df` really has a "phonetic_edit_distance" column.
(
    df.select_dtypes(include="number")
    .groupby('phonetic_edit_distance', as_index = False)
    .agg(['mean', 'count', 'std'])
)

In [None]:
# Average cosine distance per phonetic edit distance.
# numeric_only=True skips the string word columns, which otherwise make
# groupby().mean() raise on pandas >= 2.
# NOTE(review): confirm `df` really has a "phonetic_edit_distance" column.
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(numeric_only=True),
    legend="full",
    alpha=0.5)
plt.ylabel('average cosine distance')

In [None]:
# Average euclidean distance per phonetic edit distance.
# numeric_only=True skips the string word columns, which otherwise make
# groupby().mean() raise on pandas >= 2.
# NOTE(review): confirm `df` really has a "phonetic_edit_distance" column.
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(numeric_only=True),
    legend="full",
    alpha=0.5)
plt.ylabel('average euclidean distance')

In [None]:
# Read the homophones table and split its `word_pairs` strings into two
# individual word columns, then write the expanded table back to disk.
homophones = pd.read_csv('Data/homophones.txt')
column_names = ['word_1', 'word_2']

# "('a', 'b')" -> "'a'" and " 'b'" in two new columns
homophones[column_names] = (
    homophones["word_pairs"].str.strip('()').str.split(',', expand=True)
)
# Remove the surrounding spaces and quotes from each word
homophones["word_1"] = homophones["word_1"].str.strip(' \'\'')
homophones["word_2"] = homophones["word_2"].str.strip(' \'')
homophones = homophones.drop(columns="word_pairs")

# Put the two word columns first and keep the remaining columns in their order
cols = column_names + [name for name in homophones.columns if name not in column_names]
homophones = homophones.loc[:, cols]
homophones.to_csv('Data/homophones_expanded.txt', index=False)

Start of Nearest Neighbour Analysis

In [10]:
# Load the pre-computed nearest-neighbour tables (embedding-based and
# orthographic/phonetic) together with the homophones list, in file order.
em_cosine_nn, edit_distance_nn, sim_distance_nn, homophones = (
    pd.read_csv(table_path)
    for table_path in (
        'Data/em_nearest_neighbours.txt',
        'Data/edit_nearest_neighbours.txt',
        'Data/sim_nearest_neighbours.txt',
        'Data/homophones.txt',
    )
)

In [12]:
# Display the embedding-based nearest neighbours of each word.
em_cosine_nn

Unnamed: 0,word,neighbours
0,mmhmm,"regional,mhhmm,hmmmm,parallel,bumps,mmmmmm,mmm..."
1,thank,"thinked,think,thing,thingll,pinned,seemed,hang..."
2,uhhuh,"avril,avocado,crack,liger,addon,mmhmm,whatnot,..."
3,already,"roller,cloak,coordinate,laundry,figleaf,orally..."
4,analyse,"penlight,anonymous,fabulous,analysed,dialects,..."
...,...,...
9969,ponnen,"problem,scrollbutton,probabl,profitmargin,trun..."
9970,vanna,"dimensional,banana,bananabando,bananarama,frui..."
9971,origi,"exhibit,misplace,hourish,upstairs,azerty,robin..."
9972,refresh,"wordperfect,imagination,demonstration,weixuns,..."


In [13]:
# Display the edit-distance nearest neighbours (orthographic, raw and filtered phonetic).
edit_distance_nn

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,cheapie,"('cheaply', 'cheapy', 'cheaps', 'cheaper', 'ch...","('cheapy', 'cheap', 'cheaply', 'chippy', 'chee...","('cheapy', 'cheap', 'chippy', 'cheaper', 'chea..."
1,conjunction,"('connection', 'consumption', 'conjunct', 'con...","('conjunctural', 'connection', 'consumption', ...","('consumption', 'connection', 'conjunctural', ..."
2,nicer,"('univer', 'tiger', 'nicked', 'timer', 'ticker...","('minor', 'guyss', 'night', 'wiper', 'wider', ...","('guyss', 'night', 'dicier', 'lesser', 'nines'..."
3,ourselves,"('yourselves', 'ourself', 'observed', 'solves'...","('ourself', 'cells', 'yourselves', 'yourself',...","('ourself', 'sells', 'yourself', 'themselves',..."
4,temporarily,"('temporary', 'temporal', 'separately', 'tempe...","('temporal', 'necessarily', 'temporary', 'simi...","('temporary', 'temporal', 'temperature', 'nece..."
...,...,...,...,...
9969,global,"('globally', 'globe', 'globby', 'loyal', 'loca...","('globally', 'local', 'mobile', 'label', 'glob...","('globally', 'mobile', 'globe', 'local', 'labe..."
9970,hearing,"('bearing', 'wearing', 'healing', 'gearing', '...","('healing', 'keyring', 'heating', 'raring', 'h...","('heating', 'keyring', 'healing', 'seeking', '..."
9971,slidebar,"('slider', 'slides', 'linear', 'slideaway', 's...","('sliding', 'slide', 'sliders', 'slides', 'sli...","('sliders', 'slide', 'sliding', 'slider', 'sli..."
9972,dedicated,"('dedicate', 'educated', 'delicate', 'indicate...","('indicated', 'edited', 'dedicate', 'dominated...","('indicated', 'dedicate', 'edited', 'educated'..."
