In [1]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter,OrderedDict 
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels,paired_distances
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns

#BigPhoney
from big_phoney import BigPhoney


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

#Import User defined classes
from data_helpers import DataHelper
from models import SimpleNet
from train_test_helpers import accuracy,train_model,evaluate_model,evaluate_model_paper,test_model,plot_learning_curves

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

Using TensorFlow backend.


Load Data and Models

In [None]:
load_list = ['Data/feats_cmvn.ark']
#number_list = [9,12,14,18,21,25,27,28]
#load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]
num_examples = np.Inf

In [None]:
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
c,word_to_num,num_to_word = dh.generate_key_dicts()

In [None]:
inputs,labels = dh.give_inputs_and_labels()
del dh

In [None]:
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
split = False
if split:
    x_trainval,x_test,y_trainval,y_test = train_test_split(inputs, labels, test_size=0.2, random_state=32)
    x_train,x_val,y_train,y_val = train_test_split(x_trainval,y_trainval,test_size =0.25, random_state = 32)
    x_train,y_train = torch.tensor(x_train,dtype= torch.float),torch.tensor(y_train, dtype= torch.float)
    x_val,y_val = torch.tensor(x_val, dtype= torch.float),torch.tensor(y_val, dtype= torch.float)
    x_test,y_test = torch.tensor(x_test, dtype= torch.float),torch.tensor(y_test, dtype= torch.float)
    print(x_train.shape,y_train.shape)
    print(x_val.shape,y_val.shape)
    print(x_test.shape,y_test.shape)

In [None]:
#net = SimpleNet()
num_output = len(c.keys())
net = SimpleNet(num_output)
net = net.float()
net.to(dev)

In [None]:
#Load the best model
best_model_path = "./Models/awe_best_model.pth"
net.load_state_dict(torch.load(best_model_path))

Embedding Evaluation

In [None]:
#Load the word_pairs DataFrame
wordpairs_df = pd.read_csv('Data/wordpairs_test.txt', sep = ',')

In [None]:
wordpairs_df

In [None]:
#filtered_homophones = wordpairs_df[wordpairs_df["phonetic_edit_distance"]<2]
#filtered_homophones

In [None]:
#Calculate all the unique words
def words_from_dataframe(dataframe):
    wordpairs_list = dataframe["word_pairs"].apply(lambda x: x.strip('()').split(','))
    words = [word.strip(' \'') for wordpair in wordpairs_list for word in wordpair]
    words = set(words)
    return words

In [None]:
print(words)

In [None]:
print(len(words))

In [None]:
def generate_word_embedding_dict(words):
    word_embedding_dict = OrderedDict()
    #Calculate embeddings
    for word in words:
        #Find the mfcc features of the acoustic representation of the word in the data
        word_features = inputs[np.where(np.isin(labels,word_to_num[word]))]
        
        #Calculate embeddings for the feature
        word_embedding = net.give_embeddings(torch.tensor(word_features, device = dev, dtype=torch.float),dev)
        
        #If the number of representation is more than one, take the average embedding
        word_embedding_dict[word] = np.mean(word_embedding, axis = 0).reshape(1,-1)
    
    return word_embedding_dict

In [None]:
def calculate_embedding_distance(homophone_df,word_embedding_dict,metrics = ['cosine']):

    word1_embeddings = None
    word2_embeddings = None
    
    metric_distance_dict = {}
    for metric in metrics:
        metric_distance_dict[metric] = []
        
    for row in homophone_df.itertuples():
        word1, word2 = map(lambda x: x.strip(' \''),row.word_pairs.strip('()').split(','))
        
        for metric in metrics:
            metric_distance_dict[metric].append(paired_distances(word_embedding_dict[word1],word_embedding_dict[word2], metric = metric)[0])
        
        
        #if word1_embeddings is None and word2_embeddings is None:
        #    word1_embeddings = word_embedding_dict[word1]
        #    word2_embeddings = word_embedding_dict[word2]
        #else:
        #    word1_embeddings = np.vstack((word1_embeddings, word_embedding_dict[word1]))
        #    word2_embeddings = np.vstack((word2_embeddings, word_embedding_dict[word2]))
            
        

    #Calculate the distance
    #print(word1_embeddings.shape)
    for metric in metrics:
        #metric_distance = paired_distances(word1_embeddings,word2_embeddings, metric = metric)
        homophone_df.insert(len(homophone_df.columns),"%s_distance"%(metric), metric_distance_dict[metric], True)
    
    return homophone_df
    
    

In [None]:
def give_nearest_neighbours_on_embeddings(word_embedding_dict, n_neighbours = 10, metric = 'cosine', split = False):
    
    embeddings = None
    
    embeddings = np.stack(list(word_embedding_dict.values())).squeeze()
    
    print('Calculating Nearest Neighbours')
    nbrs = NearestNeighbors(n_neighbors=n_neighbours, algorithm='brute',metric = metric, n_jobs = 4).fit(embeddings)
    distances,indices = nbrs.kneighbors(embeddings)
    
    columns = ["word","neighbours"]
    #nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    words = list(word_embedding_dict.keys())
    print('num of words %d'%(len(words)))
    
    
    nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    for i,word in enumerate(word_embedding_dict.keys()):
        
        neighbours = ','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!= word])
        #print(neighbours)
        row = pd.DataFrame(np.array([[word],[neighbours]]).T, columns = columns)
        nearest_neighbours_df = nearest_neighbours_df.append(row)
        
    
    #pd.concat([pd.DataFrame(np.array([[word],[','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!=word ])]]).T, columns = columns) for i,word in enumerate(word_embedding_dict.keys())])
    
    if split:
        neighbour_col_names = ["neighbour_%d"%(i) for i in range(n_neighbours)]
        nearest_neighbours_df[neighbour_col_names] = nearest_neighbours_df.neighbours.str.split(',', expand = True )
        nearest_neighbours_df.drop(columns = ["neighbours"],inplace = True)
    
    
    #Reset index
    nearest_neighbours_df = nearest_neighbours_df.reset_index(drop=True)
    
    
    
    return nearest_neighbours_df

In [None]:
word_embedding_dict = generate_word_embedding_dict(words)

In [None]:
word_embedding_dict = generate_word_embedding_dict(c.keys())

In [None]:
np.save("Data/word_embedding_dict.npy",word_embedding_dict)

In [None]:
em_nearest_neighbours = give_nearest_neighbours_on_embeddings(word_embedding_dict, 10,'cosine', False)

In [None]:
em_nearest_neighbours

In [None]:
em_nearest_neighbours.to_csv('Data/em_nearest_neighbours.txt')

In [None]:
nearest_neighbours_df[nearest_neighbours_df["word"]=="cameras"]

In [None]:
df = calculate_embedding_distance(wordpairs_df,word_embedding_dict,metrics = ['cosine', 'euclidean'])

In [None]:
df

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
df.groupby('phonetic_edit_distance', as_index = False).agg(['mean', 'count', 'std'], index = False)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average cosine distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average euclidean distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
#Read the homophones_df and split it word pairs into indiviudal columns
homophones = pd.read_csv('Data/homophones.txt')
column_names = ['word_1','word_2']
homophones[column_names] = homophones.word_pairs.str.strip('()').str.split(',', expand = True)
homophones["word_1"] = homophones.word_1.str.strip(' \'\'')
homophones["word_2"] = homophones.word_2.str.strip(' \'')
del homophones["word_pairs"]
cols = list(homophones)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('word_2')))
cols.insert(0, cols.pop(cols.index('word_1')))
homophones = homophones.loc[:, cols]
homophones.to_csv('Data/homophones_expanded.txt', index = False)

Start of Nearest Neighbour Calculations

In [2]:
homophones = pd.read_csv('Data/wordpairs_test.txt')

In [3]:
homophones

Unnamed: 0,word_1,word_2,orthographic_edit_distance,raw_phonetic_edit_distance,filtered_phonetic_edit_distance
0,important,probability,10,10,10
1,metre,proper,5,4,4
2,buying,class,6,4,4
3,mention,question,3,4,4
4,functions,promoting,7,8,8
...,...,...,...,...,...
70120,ready,series,5,4,4
70121,please,tongue,5,4,4
70122,choice,search,5,3,3
70123,displayed,separate,7,6,5


In [4]:
#Get list of words
words = set(homophones["word_1"].to_list()).union(set(homophones["word_2"].to_list()))
print(len(words))

375


In [13]:
def swap_columns(query):
    #Swap columns
    cols = list(query)
    cols.insert(0, cols.pop(cols.index('word_2')))
    query = query.loc[:, cols]
    #Change column names
    query.columns = ["word_1","word_2","orthographic_edit_distance","raw_phonetic_edit_distance","filtered_phonetic_edit_distance"]
    return query

In [14]:
def generate_word_phoneme_dict(words, phoney):
    '''Given a list of words generate a dictionary with their phonetic expansions'''
    word_phoneme_dict = {}
    for word in words:
        word_phoneme_dict[word] = phoney.phonize(word)
    
    return word_phoneme_dict

In [7]:
#Create a phonetic expansion dict
phoney = BigPhoney()

In [8]:
word_phoneme_dict = generate_word_phoneme_dict(words,phoney)

In [9]:
word_phoneme_dict

{'another': 'AH0 N AH1 DH ER0',
 'interpretation': 'IH2 N T ER2 P R IH0 T EY1 SH AH0 N',
 'instructions': 'IH2 N S T R AH1 K SH AH0 N Z',
 'corpus': 'K AO1 R P AH0 S',
 'preservatives': 'P R AH0 Z ER1 V AH0 T IH0 V Z',
 'leadership': 'L IY1 D ER0 SH IH2 P',
 'heavily': 'HH EH1 V AH0 L IY0',
 'expensive': 'IH0 K S P EH1 N S IH0 V',
 'definition': 'D EH2 F AH0 N IH1 SH AH0 N',
 'probability': 'P R AA2 B AH0 B IH1 L AH0 T IY2',
 'across': 'AH0 K R AO1 S',
 'really': 'R IH1 L IY0',
 'language': 'L AE1 NG G W AH0 JH',
 'connect': 'K AH0 N EH1 K T',
 'point': 'P OY1 N T',
 'incorporate': 'IH2 N K AO1 R P ER0 EY2 T',
 'beeper': 'B IY1 P ER0',
 'sound': 'S AW1 N D',
 'guess': 'G EH1 S',
 'class': 'K L AE1 S',
 'working': 'W ER1 K IH0 NG',
 'thursday': 'TH ER1 Z D EY2',
 'ionising': 'AY1 AH0 N AY2 Z IH0 NG',
 'whole': 'HH OW1 L',
 'achieve': 'AH0 CH IY1 V',
 'shifted': 'SH IH1 F T AH0 D',
 'thatd': 'TH AE1 T',
 'whats': 'W AH0 T S',
 'interface': 'IH1 N T ER0 F EY2 S',
 'exchange': 'IH0 K S CH 

In [44]:
def sim_score(ser):
    return 10 - min(10,ser/10)

In [81]:
#Similarity based NNs
sim_nn_dict = {}
sim_nn_dict["word"] = []
sim_nn_dict["orthographic"] = []
sim_nn_dict["raw_phonetic"] = []
sim_nn_dict["filtered_phonetic"] = []


#Edit Distance based NNs
edit_nn_dict = {}
edit_nn_dict["word"] = []
edit_nn_dict["orthographic"] = []
edit_nn_dict["raw_phonetic"] = []
edit_nn_dict["filtered_phonetic"] = []


test_words = ["seven","sheet"]
for word in test_words:

    query = pd.concat([homophones.query("word_1 == '%s'"%(word)),swap_columns(homophones.query("word_2 == '%s'"%(word)))])

    query["orthographic_sim"] = query.apply(lambda row: sim_score(100*row["orthographic_edit_distance"]/len(row["word_1"])), axis = 1)
    query["raw_phonetic_sim"] = query.apply(lambda row: sim_score(100*row["raw_phonetic_edit_distance"]/len(word_phoneme_dict[row["word_1"]])), axis = 1)
    query["filtered_phonetic_sim"] = query.apply(lambda row: sim_score(100*row["filtered_phonetic_edit_distance"]/len(word_phoneme_dict[row["word_1"]])), axis = 1)

    sim_orthographic_nn = tuple(query.sort_values( "orthographic_sim", ascending =False).iloc[:10]["word_2"].to_list())
    sim_raw_phonetic_nn = tuple(query.sort_values( "raw_phonetic_sim", ascending =False).iloc[:10]["word_2"].to_list())
    sim_filtered_phonetic_nn = tuple(query.sort_values( "filtered_phonetic_sim", ascending =False).iloc[:10]["word_2"].to_list())


    edit_orthographic_nn = tuple(query.sort_values( "orthographic_edit_distance", ascending = True).iloc[:10]["word_2"].to_list())
    edit_raw_phonetic_nn = tuple(query.sort_values( "raw_phonetic_edit_distance", ascending = True).iloc[:10]["word_2"].to_list())
    edit_filtered_phonetic_nn = tuple(query.sort_values( "filtered_phonetic_edit_distance", ascending = True).iloc[:10]["word_2"].to_list())

    sim_nn_dict["word"].append(word)
    sim_nn_dict["orthographic"].append(sim_orthographic_nn)
    sim_nn_dict["raw_phonetic"].append(sim_raw_phonetic_nn)
    sim_nn_dict["filtered_phonetic"].append(sim_filtered_phonetic_nn)


    edit_nn_dict["word"].append(word)
    edit_nn_dict["orthographic"].append(edit_orthographic_nn)
    edit_nn_dict["raw_phonetic"].append(edit_raw_phonetic_nn)
    edit_nn_dict["filtered_phonetic"].append(edit_filtered_phonetic_nn)

    del query

sim_nn_df = pd.DataFrame(sim_nn_dict)
edit_nn_df = pd.DataFrame(edit_nn_dict)






In [82]:
sim_nn_df

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,seven,"(given, seems, sheet, cover, series, screen, h...","(given, screen, version, station, button, sens...","(given, station, heavily, strain, mention, hav..."
1,sheet,"(short, wheat, screen, theme, cheap, shall, th...","(wheat, right, shall, thought, reached, might,...","(wheat, shape, shall, thought, right, cheap, s..."


In [83]:
edit_nn_df

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,seven,"(given, sheet, screen, cover, series, seems, h...","(given, screen, mention, heavily, station, opt...","(given, stuff, havent, heavily, button, screen..."
1,sheet,"(short, wheat, cheap, shall, theme, their, sta...","(wheat, metre, thatd, reached, shape, thought,...","(wheat, shall, teach, might, metre, these, sho..."


In [70]:
query.sort_values("orthographic_sim", ascending = True)[:11]

Unnamed: 0,word_1,word_2,orthographic_edit_distance,raw_phonetic_edit_distance,filtered_phonetic_edit_distance,orthographic_sim,raw_phonetic_sim,filtered_phonetic_sim
846,sheet,things,5,4,4,0.0,5.0,5.0
40939,sheet,definition,9,8,8,0.0,0.0,0.0
40895,sheet,advanced,7,6,6,0.0,2.5,2.5
40615,sheet,project,5,6,6,0.0,2.5,2.5
40451,sheet,pushbuttons,8,7,7,0.0,1.25,1.25
39987,sheet,corpus,6,6,6,0.0,2.5,2.5
39667,sheet,decided,6,7,7,0.0,1.25,1.25
39532,sheet,definitely,9,8,8,0.0,0.0,0.0
39002,sheet,liking,6,5,5,0.0,3.75,3.75
38951,sheet,please,5,3,3,0.0,6.25,6.25


In [58]:
for i,word in enumerate(test_words):
    print(word)
    print(nn_df.iloc[i]["orthographic"])
    print(nn_df.iloc[i]["raw_phonetic"])
    print(nn_df.iloc[i]["filtered_phonetic"])

seven
('given', 'seems', 'sheet', 'cover', 'series', 'screen', 'havent', 'sense', 'smoke', 'remit')
('given', 'screen', 'version', 'station', 'button', 'sense', 'heavily', 'option', 'havent', 'strain')
('given', 'station', 'heavily', 'strain', 'mention', 'havent', 'screen', 'stuff', 'version', 'option')
beeper
('better', 'proper', 'paper', 'seems', 'shapes', 'series', 'cover', 'sense', 'seven', 'other')
('better', 'metre', 'paper', 'either', 'cheap', 'theme', 'bring', 'battery', 'order', 'means')
('paper', 'cheap', 'metre', 'either', 'better', 'sheet', 'proper', 'happy', 'wheat', 'battery')


In [19]:
homophones

Unnamed: 0,word_1,word_2,orthographic_edit_distance,phonetic_edit_distance
0,amusement,discoveries,10,9
1,avril,effect,6,5
2,biomorphic,serialize,10,9
3,meeting,trendiness,7,9
4,formatting,hierarch,9,7
...,...,...,...,...
49735346,alastair,autumn,6,6
49735347,articulation,definite,10,10
49735348,suppression,surprise,6,5
49735349,keyword,realising,8,7


In [None]:
#Find 10 nearest neighbours for each word
for word in words:
    #For each word query all the row containing that word
    homophones.query("word_1 == '%s'"%(word))
    homophones.query("word_2 == '%s'"%(word))
    

In [None]:
print(len(words))

In [21]:
nn = pd.read_csv('Data/nearest_neighbours.txt')

In [22]:
nn

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,quick,"('quite', 'trick', 'stick', 'which', 'guide', ...","('clear', 'switch', 'which', 'trick', 'quite',...","('stick', 'clear', 'switch', 'which', 'quite',..."
1,system,"('master', 'mister', 'scores', 'issue', 'shift...","('pistol', 'whistle', 'autumn', 'shifted', 'si...","('pistol', 'mister', 'little', 'shifted', 'whi..."
2,material,"('general', 'master', 'matter', 'metre', 'term...","('control', 'table', 'little', 'mirror', 'stil...","('careful', 'control', 'table', 'mirror', 'whi..."
3,trends,"('twenty', 'things', 'thanks', 'fronts', 'ther...","('ready', 'twenty', 'refers', 'trying', 'keywo...","('ready', 'twenty', 'refers', 'trying', 'keywo..."
4,sorry,"('sorta', 'scores', 'spongy', 'start', 'forth'...","('start', 'coffee', 'series', 'saying', 'parts...","('series', 'coffee', 'screen', 'stream', 'star..."
...,...,...,...,...
370,above,"('about', 'those', 'adopt', 'curve', 'close', ...","('about', 'achieve', 'touch', 'receive', 'tong...","('about', 'achieve', 'touch', 'table', 'other'..."
371,edible,"('table', 'flexible', 'child', 'guide', 'eithe...","('table', 'possible', 'flexible', 'definite', ...","('possible', 'table', 'above', 'people', 'opti..."
372,flexible,"('edible', 'possible', 'please', 'table', 'exc...","('possible', 'edible', 'portable', 'digital', ...","('edible', 'possible', 'portable', 'general', ..."
373,places,"('plates', 'pages', 'please', 'process', 'pape...","('pages', 'please', 'purposes', 'plates', 'pla...","('pages', 'purposes', 'plates', 'please', 'pla..."
