In [10]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter,OrderedDict 
import kaldi_io
from datetime import datetime

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels,paired_distances
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns

#BigPhoney
from big_phoney import BigPhoney


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

#Import User defined classes
from data_helpers import DataHelper
from models import SimpleNet
from train_test_helpers import accuracy,train_model,evaluate_model,evaluate_model_paper,test_model,plot_learning_curves
from sfba4.utils import alignSequences

Load Data and Models

In [None]:
load_list = ['Data/feats_cmvn.ark']
#number_list = [9,12,14,18,21,25,27,28]
#load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]
num_examples = np.Inf

In [None]:
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
c,word_to_num,num_to_word = dh.generate_key_dicts()

In [None]:
inputs,labels = dh.give_inputs_and_labels()
del dh

In [None]:
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
split = False
if split:
    x_trainval,x_test,y_trainval,y_test = train_test_split(inputs, labels, test_size=0.2, random_state=32)
    x_train,x_val,y_train,y_val = train_test_split(x_trainval,y_trainval,test_size =0.25, random_state = 32)
    x_train,y_train = torch.tensor(x_train,dtype= torch.float),torch.tensor(y_train, dtype= torch.float)
    x_val,y_val = torch.tensor(x_val, dtype= torch.float),torch.tensor(y_val, dtype= torch.float)
    x_test,y_test = torch.tensor(x_test, dtype= torch.float),torch.tensor(y_test, dtype= torch.float)
    print(x_train.shape,y_train.shape)
    print(x_val.shape,y_val.shape)
    print(x_test.shape,y_test.shape)

In [None]:
#net = SimpleNet()
num_output = len(c.keys())
net = SimpleNet(num_output)
net = net.float()
net.to(dev)

In [None]:
#Load the best model
best_model_path = "./Models/awe_best_model.pth"
net.load_state_dict(torch.load(best_model_path))

In [19]:
def show_time():
	now = datetime.now()
	current_time = now.strftime("%d/%m/%Y %H:%M:%S")
	print("Current Date and Time =", current_time)

Embedding Evaluation

In [90]:
#Load the word_pairs DataFrame
wordpairs_df = pd.read_csv('Data/wordpairs.txt', sep = ',')
#del wordpairs_df["raw_phonetic_edit_distance"]

In [20]:
#Calculate all the unique words
def words_from_dataframe(dataframe):
    wordpairs_list = dataframe["word_pairs"].apply(lambda x: x.strip('()').split(','))
    words = [word.strip(' \'') for wordpair in wordpairs_list for word in wordpair]
    words = set(words)
    return words

In [None]:
print(words)

In [None]:
print(len(words))

In [33]:
def generate_word_embedding_dict(words):
    word_embedding_dict = OrderedDict()
    #Calculate embeddings
    for word in words:
        #Find the mfcc features of the acoustic representation of the word in the data
        word_features = inputs[np.where(np.isin(labels,word_to_num[word]))]
        
        #Calculate embeddings for the feature
        word_embedding = net.give_embeddings(torch.tensor(word_features, device = dev, dtype=torch.float),dev)
        
        #If the number of representation is more than one, take the average embedding
        word_embedding_dict[word] = np.mean(word_embedding, axis = 0).reshape(1,-1)
    
    return word_embedding_dict

In [None]:
def calculate_embedding_distance(homophone_df,word_embedding_dict,metrics = ['cosine']):

    word1_embeddings = None
    word2_embeddings = None
    
    metric_distance_dict = {}
    for metric in metrics:
        metric_distance_dict[metric] = []
        
    for row in homophone_df.itertuples():
        word1, word2 = map(lambda x: x.strip(' \''),row.word_pairs.strip('()').split(','))
        
        for metric in metrics:
            metric_distance_dict[metric].append(paired_distances(word_embedding_dict[word1],word_embedding_dict[word2], metric = metric)[0])
        
        
        #if word1_embeddings is None and word2_embeddings is None:
        #    word1_embeddings = word_embedding_dict[word1]
        #    word2_embeddings = word_embedding_dict[word2]
        #else:
        #    word1_embeddings = np.vstack((word1_embeddings, word_embedding_dict[word1]))
        #    word2_embeddings = np.vstack((word2_embeddings, word_embedding_dict[word2]))
            
        

    #Calculate the distance
    #print(word1_embeddings.shape)
    for metric in metrics:
        #metric_distance = paired_distances(word1_embeddings,word2_embeddings, metric = metric)
        homophone_df.insert(len(homophone_df.columns),"%s_distance"%(metric), metric_distance_dict[metric], True)
    
    return homophone_df
    
    

In [None]:
def give_nearest_neighbours_on_embeddings(word_embedding_dict, n_neighbours = 10, metric = 'cosine', split = False):
    
    embeddings = None
    
    embeddings = np.stack(list(word_embedding_dict.values())).squeeze()
    
    print('Calculating Nearest Neighbours')
    nbrs = NearestNeighbors(n_neighbors=n_neighbours, algorithm='brute',metric = metric, n_jobs = 4).fit(embeddings)
    distances,indices = nbrs.kneighbors(embeddings)
    
    columns = ["word","neighbours"]
    #nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    words = list(word_embedding_dict.keys())
    print('num of words %d'%(len(words)))
    
    
    nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    for i,word in enumerate(word_embedding_dict.keys()):
        
        neighbours = ','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!= word])
        #print(neighbours)
        row = pd.DataFrame(np.array([[word],[neighbours]]).T, columns = columns)
        nearest_neighbours_df = nearest_neighbours_df.append(row)
        
    
    #pd.concat([pd.DataFrame(np.array([[word],[','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!=word ])]]).T, columns = columns) for i,word in enumerate(word_embedding_dict.keys())])
    
    if split:
        neighbour_col_names = ["neighbour_%d"%(i) for i in range(n_neighbours)]
        nearest_neighbours_df[neighbour_col_names] = nearest_neighbours_df.neighbours.str.split(',', expand = True )
        nearest_neighbours_df.drop(columns = ["neighbours"],inplace = True)
    
    
    #Reset index
    nearest_neighbours_df = nearest_neighbours_df.reset_index(drop=True)
    
    
    
    return nearest_neighbours_df

In [None]:
word_embedding_dict = generate_word_embedding_dict(words)

In [None]:
word_embedding_dict = generate_word_embedding_dict(c.keys())

In [None]:
np.save("Data/word_embedding_dict.npy",word_embedding_dict)

In [None]:
em_nearest_neighbours = give_nearest_neighbours_on_embeddings(word_embedding_dict, 10,'cosine', False)

In [None]:
df = calculate_embedding_distance(wordpairs_df,word_embedding_dict,metrics = ['cosine', 'euclidean'])

In [None]:
df

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
df.groupby('phonetic_edit_distance', as_index = False).agg(['mean', 'count', 'std'], index = False)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average cosine distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average euclidean distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
#Read the homophones_df and split it word pairs into indiviudal columns
homophones = pd.read_csv('Data/homophones.txt')
column_names = ['word_1','word_2']
homophones[column_names] = homophones.word_pairs.str.strip('()').str.split(',', expand = True)
homophones["word_1"] = homophones.word_1.str.strip(' \'\'')
homophones["word_2"] = homophones.word_2.str.strip(' \'')
del homophones["word_pairs"]
cols = list(homophones)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('word_2')))
cols.insert(0, cols.pop(cols.index('word_1')))
homophones = homophones.loc[:, cols]
homophones.to_csv('Data/homophones_expanded.txt', index = False)

Start of Nearest Neighbour Analysis

In [15]:
def alphabet_commas(string):
    '''Takes a string and returns a filtered string with only alphabet and commas'''
    
    return ''.join(e for e in string if (e.isalpha() or e == ","))
    

In [77]:
def calc_spearman_with_awe(word,nn_words,word_embedding_dict):
    '''Takes a  word and it's list of nearest neighbour words, 
    calculates their awe and calculates the spearman rank coefficient'''
    
    nn_words_ranks = np.array((np.arange(1,len(nn_words)+1))).reshape(1,-1)
    
    word_embedding = word_embedding_dict.item().get(word).squeeze()
    nn_words_embeddings = np.stack([word_embedding_dict.item().get(word).squeeze() for word in nn_words])
    #print(word_embedding.shape,nn_words_embeddings.shape)
    similarity = pairwise_kernels(word_embedding.reshape(1,-1),nn_words_embeddings, metric = 'cosine')
    awe_words_ranks = np.argsort(-similarity)+1
    
    #print(nn_words_ranks, awe_words_ranks)
    rho,p_value = stats.spearmanr(nn_words_ranks.ravel(), awe_words_ranks.ravel())
    return rho,p_value

In [11]:
#Load the nearest neighbours based on embeddings and orthographic/phonetic representation
#em_cosine_nn = pd.read_csv('Data/em_nearest_neighbours.txt')
edit_distance_nn = pd.read_csv('Data/edit_nearest_neighbours.txt')
sim_distance_nn = pd.read_csv('Data/sim_nearest_neighbours.txt')
homophones = pd.read_csv('Data/raw_ph_homophones.txt')

In [101]:
word_embedding_dict = np.load('Data/word_embedding_dict.npy', allow_pickle = True)

In [102]:
word_embedding_dict.item().get("thank").squeeze()

array([2.0415986e-01, 2.4921808e+01, 0.0000000e+00, ..., 0.0000000e+00,
       0.0000000e+00, 3.6583733e-04], dtype=float32)

In [93]:
em_cosine_nn

Unnamed: 0,word,neighbours
0,mmhmm,"regional,mhhmm,hmmmm,parallel,bumps,mmmmmm,mmm..."
1,thank,"thinked,think,thing,thingll,pinned,seemed,hang..."
2,uhhuh,"avril,avocado,crack,liger,addon,mmhmm,whatnot,..."
3,already,"roller,cloak,coordinate,laundry,figleaf,orally..."
4,analyse,"penlight,anonymous,fabulous,analysed,dialects,..."
...,...,...
9969,ponnen,"problem,scrollbutton,probabl,profitmargin,trun..."
9970,vanna,"dimensional,banana,bananabando,bananarama,frui..."
9971,origi,"exhibit,misplace,hourish,upstairs,azerty,robin..."
9972,refresh,"wordperfect,imagination,demonstration,weixuns,..."


In [13]:
sim_distance_nn

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,cheapie,"('cheaper', 'cheaply', 'cheap', 'cheapy', 'che...","('cheapy', 'chippy', 'cheaps', 'cheaper', 'che...","('cheapy', 'cheap', 'cheaply', 'chippy', 'chea..."
1,conjunction,"('connection', 'conjunct', 'convention', 'cons...","('connection', 'consumption', 'conjunctural', ...","('conjunctural', 'connection', 'consumption', ..."
2,nicer,"('niche', 'never', 'nicety', 'timer', 'univer'...","('lesser', 'night', 'fibre', 'dicier', 'nines'...","('nigger', 'wiper', 'answer', 'never', 'buyer'..."
3,ourselves,"('yourselves', 'ourself', 'relies', 'courses',...","('ourself', 'yourselves', 'yourself', 'solves'...","('ourself', 'sells', 'cells', 'solves', 'thems..."
4,temporarily,"('temporary', 'temporal', 'arbitrarily', 'terr...","('necessarily', 'temporal', 'temporary', 'temp...","('temporary', 'temporal', 'necessarily', 'temp..."
...,...,...,...,...
9969,global,"('globe', 'local', 'loyal', 'globs', 'globby',...","('globally', 'globe', 'mobile', 'label', 'loca...","('globally', 'local', 'mobile', 'label', 'glob..."
9970,hearing,"('healing', 'heating', 'wearing', 'gearing', '...","('heating', 'healing', 'keyring', 'hitting', '...","('healing', 'heating', 'keyring', 'string', 'h..."
9971,slidebar,"('slider', 'slides', 'slidey', 'spider', 'line...","('slideshow', 'slider', 'sliders', 'sliding', ...","('sliders', 'slideshow', 'slides', 'slide', 's..."
9972,dedicated,"('dedicate', 'educated', 'indicated', 'delicat...","('dedicate', 'edited', 'indicated', 'dissected...","('indicated', 'dedicate', 'educated', 'edited'..."


In [83]:
def similarity_task(nn_df,column_name = "raw_phonetic"):
    words = set(sim_distance_nn["word"].to_list())
    #words = ["cheapie"]
    avg_rho = 0
    for word in words:
        query = nn_df.query("word == '%s'"%(word))[column_name].item()
        nn_words = alphabet_commas(query).split(",")
        rho,p_value = calc_spearman_with_awe(word,nn_words, word_embedding_dict)
        avg_rho += rho


    avg_rho = avg_rho/len(words)
    return avg_rho
    

In [82]:
print(similarity_task(sim_distance_nn,"raw_phonetic"))
print(similarity_task(sim_distance_nn,"filtered_phonetic"))
print(similarity_task(sim_distance_nn,"orthographic"))

0.4954177832060372
0.487934083161677
0.49291430446433687


In [84]:
print(similarity_task(edit_distance_nn,"raw_phonetic"))
print(similarity_task(edit_distance_nn,"filtered_phonetic"))
print(similarity_task(edit_distance_nn,"orthographic"))

0.4885781820612404
0.4897861713181568
0.4951018101609663


In [133]:
phoney = BigPhoney()

In [135]:
words = set(homophones["word"].to_list())
phoneme_dict = {}
for word in words:
    phoneme_dict[word] = phoney.phonize(word)

In [132]:
homophones[homophones["word"] == "slidey"]

Unnamed: 0,word,homophone_words
24,slidey,"slidy,"


In [139]:
def filter_alphabets(string):
	return ''.join(e for e in string if (e.isalpha() or e.isspace()))

In [148]:
def homophone_task(homophones):
    words = set(homophones["word"].to_list())
    homophones["homophone_words"] = homophones["homophone_words"].apply(alphabet_commas) 

    avg_precision = 0
    
    phoneme_eDistance_list = []
    for word in words:
        homophone_query = homophones.query("word == '%s'"%(word))
        awe_nn_words_query = em_cosine_nn.query("word == '%s'"%(word))

        homophone_words = list(filter(lambda x: x.isalpha(), homophone_query["homophone_words"].item().split(",")))
        awe_nn_words = list(awe_nn_words_query["neighbours"].item().split(","))[:len(homophone_words)]

        #print(awe_nn_words)

        #Set of homophone words
        set_homophone_words = set(homophone_words)

        #Set of Nearest neighbours based on cosine_similarity of embeddings
        set_awe_nn_words = set(awe_nn_words)
        
        
        #print(word,set_homophone_words,set_awe_nn_words)
        
        for word_1,word_2 in zip(list(set_homophone_words),list(set_awe_nn_words)):
            
            aligned_seq1, aligned_seq2, eDistance = alignSequences.align(filter_alphabets(phoney.phonize(word_1)),filter_alphabets(phoney.phonize(word_2)))
            
            print('{%s , %s , %s } Phoneme eDistance %d'%(word,word_1,word_2,eDistance))
            
            phoneme_eDistance_list.append(eDistance)
        
        
        #Calculate precision score
        word_precision = len(set_homophone_words.intersection(set_awe_nn_words))/len(set_homophone_words)

        avg_precision += word_precision

    avg_precision = avg_precision/len(words)
    print(avg_precision)
    
    return phoneme_eDistance_list

In [149]:
a = homophone_task(homophones)

{through , threw , carrier } Phoneme eDistance 4
{commitment , committment , equipments } Phoneme eDistance 5
{weights , waits , weight } Phoneme eDistance 1
{handy , handi , napki } Phoneme eDistance 3
{simpl , simple , should } Phoneme eDistance 6
{strawberrys , strawberries , forgo } Phoneme eDistance 7
{corinne , corrine , pretty } Phoneme eDistance 5
{prioritise , prioritize , prioritized } Phoneme eDistance 1
{pares , pairs , players } Phoneme eDistance 3
{plain , plane , playing } Phoneme eDistance 2
{parti , party , their } Phoneme eDistance 4
{mails , males , names } Phoneme eDistance 2
{acquaintance , aquaintance , gorgeous } Phoneme eDistance 7
{peoplell , peopl , poodle } Phoneme eDistance 2
{peoplell , people , people } Phoneme eDistance 0
{disks , discs , these } Phoneme eDistance 5
{franc , frank , throwing } Phoneme eDistance 4
{weighted , waited , wasting } Phoneme eDistance 2
{frank , franc , print } Phoneme eDistance 4
{minimalised , minimalized , encouragement } Pho

{characterization , characterisation , particularity } Phoneme eDistance 12
{shuts , schutz , schutz } Phoneme eDistance 0
{knowing , knoing , ellen } Phoneme eDistance 4
{custome , custom , customers } Phoneme eDistance 2
{stylized , stylised , sacrificed } Phoneme eDistance 7
{emphasise , emphasize , nondescript } Phoneme eDistance 10
{channelll , channel , transcribed } Phoneme eDistance 8
{usable , useable , usersll } Phoneme eDistance 3
{course , coarse , coolest } Phoneme eDistance 4
{hairy , harry , battery } Phoneme eDistance 4
{moldable , mouldable , animal } Phoneme eDistance 5
{theres , theirs , luxurious } Phoneme eDistance 8
{centr , centre , fruit } Phoneme eDistance 4
{board , bored , avoidable } Phoneme eDistance 7
{claire , clare , foldout } Phoneme eDistance 5
{waits , weights , lights } Phoneme eDistance 2
{pains , panes , change } Phoneme eDistance 2
{programmes , programs , provenance } Phoneme eDistance 7
{wouldn , wooden , wouldnt } Phoneme eDistance 1
{waited , 

In [150]:
c = Counter(a)

In [154]:
c

Counter({4: 38,
         5: 38,
         1: 18,
         3: 29,
         6: 28,
         7: 25,
         2: 25,
         0: 29,
         10: 4,
         8: 16,
         9: 5,
         11: 3,
         12: 2})

Rough

In [94]:
phoney = BigPhoney()

In [95]:
def filter_alphabets(string):
	return ''.join(e for e in string if (e.isalpha() or e.isspace()))

In [91]:
words = set(wordpairs_df["word_1"].to_list()).union(set(wordpairs_df["word_2"].to_list()))

In [92]:
print(len(words))

9974


In [96]:

word_phoneme_dict = {}

#Calculate the word phonemes
for word in words:
    phonemes = phoney.phonize(word)
    word_phoneme_dict[word] = filter_alphabets(phonemes)

print('Finished Calculating Phonemic Expansion')

Finished Calculating Phonemic Expansion


In [97]:
word_phoneme_dict

{'pixel': 'P IH K S AH L',
 'finds': 'F AY N D Z',
 'echelons': 'EH SH AH L AA N Z',
 'suppo': 'S UW P OW',
 'brick': 'B R IH K',
 'smallish': 'S M AO L IH SH',
 'execute': 'EH K S AH K Y UW T',
 'ignore': 'IH G N AO R',
 'doesnt': 'D OW S AH N T',
 'injury': 'IH N JH ER IY',
 'lowering': 'L OW ER IH NG',
 'innova': 'IH N OW V AH',
 'horses': 'HH AO R S AH Z',
 'atmosphere': 'AE T M AH S F IH R',
 'kangaroo': 'K AE NG G ER UW',
 'preparing': 'P R IY P EH R IH NG',
 'refining': 'R AH F AY N IH NG',
 'hypothetically': 'HH AY P AH TH EH T IH K L IY',
 'beams': 'B IY M Z',
 'sucking': 'S AH K IH NG',
 'arbitrarily': 'AA R B IH T R EH R AH L IY',
 'teambuilding': 'T IY M B IH L D IH NG',
 'semisuperv': 'S EH M IY S UW P ER V',
 'lefthander': 'L EH F T HH AE N D ER',
 'assignments': 'AH S AY N M AH N T S',
 'script': 'S K R IH P T',
 'decision': 'D IH S IH ZH AH N',
 'candidates': 'K AE N D AH D EY T S',
 'fruitful': 'F R UW T F AH L',
 'redundancy': 'R IH D AH N D AH N S IY',
 'conformable'

In [98]:
np.save("Data/word_phoneme_dict.npy",word_phoneme_dict)

In [100]:
a = np.load("Data/word_phoneme_dict.npy", allow_pickle = True)

In [102]:
a.item().get("pixel")

'P IH K S AH L'

In [86]:
def sim_score(ser):
	return 10 - min(10,ser/10)

In [87]:
wordpairs_ser = pd.read_csv('Data/wordpairs_test_with_ser.txt')

In [88]:
wordpairs_embedding_similarity = pd.read_csv('Data/wordpairs_test_embedding_similarity.txt')

In [89]:
wordpairs_ser.head()

Unnamed: 0,word_1,word_2,orthographic_edit_distance,phonetic_edit_distance,word_1_phonetic_ser,word_2_phonetic_ser,word_1_orthographic_ser,word_2_orthographic_ser
0,marketing,share,7,7,0.777778,1.4,0.777778,1.4
1,complexity,simple,6,8,0.8,1.333333,0.6,1.0
2,chance,probably,7,8,1.333333,1.0,1.166667,0.875
3,covered,sheet,5,5,0.714286,1.0,0.714286,1.0
4,shifting,topics,7,6,0.75,1.0,0.875,1.166667


In [70]:
del wordpairs_ser["orthographic_edit_distance"]
del wordpairs_ser["phonetic_edit_distance"]

Unnamed: 0,word_1,word_2,word_1_phonetic_ser,word_2_phonetic_ser,word_1_orthographic_ser,word_2_orthographic_ser
0,marketing,share,0.777778,1.400000,0.777778,1.400000
1,complexity,simple,0.800000,1.333333,0.600000,1.000000
2,chance,probably,1.333333,1.000000,1.166667,0.875000
3,covered,sheet,0.714286,1.000000,0.714286,1.000000
4,shifting,topics,0.750000,1.000000,0.875000,1.166667
...,...,...,...,...,...,...
1547915,pilots,producing,1.000000,0.666667,1.333333,0.888889
1547916,devices,forcing,0.857143,0.857143,1.000000,1.000000
1547917,group,smartboards,1.800000,0.818182,1.800000,0.818182
1547918,agreed,compress,1.000000,0.750000,1.000000,0.750000


In [None]:
wordpairs_ser.head()

In [71]:
wordpairs_embedding_similarity

Unnamed: 0,word_1,word_2,cosine_similarity
0,marketing,share,0.000090
1,complexity,simple,0.000142
2,chance,probably,0.000292
3,covered,sheet,0.005748
4,shifting,topics,0.000115
...,...,...,...
1547915,pilots,producing,0.000000
1547916,devices,forcing,0.000000
1547917,group,smartboards,0.001058
1547918,agreed,compress,0.001191


In [72]:
wordpairs_distance = pd.merge(wordpairs_ser,wordpairs_embedding_similarity,on = ["word_1","word_2"])

In [73]:
wordpairs_distance

Unnamed: 0,word_1,word_2,word_1_phonetic_ser,word_2_phonetic_ser,word_1_orthographic_ser,word_2_orthographic_ser,cosine_similarity
0,marketing,share,0.777778,1.400000,0.777778,1.400000,0.000090
1,complexity,simple,0.800000,1.333333,0.600000,1.000000,0.000142
2,chance,probably,1.333333,1.000000,1.166667,0.875000,0.000292
3,covered,sheet,0.714286,1.000000,0.714286,1.000000,0.005748
4,shifting,topics,0.750000,1.000000,0.875000,1.166667,0.000115
...,...,...,...,...,...,...,...
1547915,pilots,producing,1.000000,0.666667,1.333333,0.888889,0.000000
1547916,devices,forcing,0.857143,0.857143,1.000000,1.000000,0.000000
1547917,group,smartboards,1.800000,0.818182,1.800000,0.818182,0.001058
1547918,agreed,compress,1.000000,0.750000,1.000000,0.750000,0.001191


In [80]:
wordpairs_distance["word_1_phonetic_ser"].apply(lambda x: sim_score(x*100)).to_list()

[2.2222222222222214,
 2.0,
 0.0,
 2.8571428571428568,
 2.5,
 0.0,
 0.0,
 2.5,
 1.1111111111111107,
 3.333333333333334,
 0.0,
 0.9090909090909083,
 0.0,
 0.0,
 0.0,
 1.666666666666666,
 2.8571428571428568,
 2.8571428571428568,
 1.666666666666666,
 2.7272727272727266,
 2.8571428571428568,
 2.0,
 0.0,
 1.666666666666666,
 4.0,
 1.4285714285714288,
 0.0,
 0.0,
 2.2222222222222214,
 0.0,
 4.444444444444445,
 1.1111111111111107,
 0.0,
 2.0,
 0.0,
 0.0,
 2.3076923076923066,
 0.0,
 0.0,
 2.5,
 2.0,
 0.0,
 1.0,
 0.0,
 3.333333333333334,
 0.0,
 0.0,
 1.4285714285714288,
 1.666666666666666,
 3.0,
 0.0,
 0.0,
 2.5,
 1.5384615384615383,
 0.0,
 0.0,
 4.2857142857142865,
 3.0,
 2.0,
 1.4285714285714288,
 3.75,
 0.0,
 0.9090909090909083,
 2.0,
 2.0,
 0.0,
 4.2857142857142865,
 3.333333333333334,
 1.4285714285714288,
 2.8571428571428568,
 0.0,
 1.25,
 0.0,
 2.0,
 1.25,
 1.4285714285714288,
 2.5,
 0.0,
 2.8571428571428568,
 0.0,
 2.0,
 1.666666666666666,
 0.0,
 0.0,
 2.5,
 0.0,
 0.0,
 1.666666666666666,

In [85]:
stats.spearmanr(wordpairs_distance["word_1_phonetic_ser"].to_list(),wordpairs_distance["cosine_similarity"].apply(lambda x: 1-x).to_list())

SpearmanrResult(correlation=0.15835975897834867, pvalue=0.0)