In [8]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter,OrderedDict 
import kaldi_io
from datetime import datetime

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels,paired_distances
from scipy import stats
from scipy.spatial.distance import pdist

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns

#BigPhoney
from big_phoney import BigPhoney


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

#Import User defined classes
from AweNoise.models.torch_models import SimpleNet, SimpleNet_with_dropout, SiameseNet
from AweNoise.torch_utils import evaluate_model,test_classifier, baseline, accuracy
from AweNoise.datasets import CNN_dataset, SiameseTriplets
from sfba4.utils import alignSequences

Load Data and Models

In [9]:
def load_model(num_output):
    '''Function to create a model given number of outputs, and load weights'''
    
    #CNN
    net = SimpleNet(num_output)
    net = net.float()
    net.to(dev)
    
    model_save_path = "C:/Users/jayes/Downloads/LSV/Models/cnn_clean.pth"
    #Load model weights
    net.load_state_dict(torch.load(model_save_path))
    
    return net

In [10]:
#Load data
clean_ds = CNN_dataset(split_set = None, char_threshold = 5, frequency_bounds = (0,np.Inf), snr = np.Inf, k = np.Inf, cluster = False)
inputs, labels = clean_ds.inputs,clean_ds.labels
c,word_to_num,num_to_word = clean_ds.c, clean_ds.word_to_num, clean_ds.num_to_word

Length before filtering on char length 169383
Length after filtering on char length 80821
Finished Loading the Data, 80821 examples
Number of Unique words  8607
Length before filtering on frequency_bounds  (80821,)
Not filtering


In [11]:
#Old data loading
'''
load_list = ['Data/feats_cmvn.ark']
#number_list = [9,12,14,18,21,25,27,28]
#load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]
num_examples = np.Inf
dh = DataHelper(load_list,num_examples)
dh.load_data()
dh.process_data()
c,word_to_num,num_to_word = dh.generate_key_dicts()
inputs,labels = dh.give_inputs_and_labels()
del dh
split = False
if split:
    x_trainval,x_test,y_trainval,y_test = train_test_split(inputs, labels, test_size=0.2, random_state=32)
    x_train,x_val,y_train,y_val = train_test_split(x_trainval,y_trainval,test_size =0.25, random_state = 32)
    x_train,y_train = torch.tensor(x_train,dtype= torch.float),torch.tensor(y_train, dtype= torch.float)
    x_val,y_val = torch.tensor(x_val, dtype= torch.float),torch.tensor(y_val, dtype= torch.float)
    x_test,y_test = torch.tensor(x_test, dtype= torch.float),torch.tensor(y_test, dtype= torch.float)
    print(x_train.shape,y_train.shape)
    print(x_val.shape,y_val.shape)
    print(x_test.shape,y_test.shape)
'''

"\nload_list = ['Data/feats_cmvn.ark']\n#number_list = [9,12,14,18,21,25,27,28]\n#load_list = ['Data/raw_mfcc_AMI_Segments.%d.scp'%(number) for number in number_list]\nnum_examples = np.Inf\ndh = DataHelper(load_list,num_examples)\ndh.load_data()\ndh.process_data()\nc,word_to_num,num_to_word = dh.generate_key_dicts()\ninputs,labels = dh.give_inputs_and_labels()\ndel dh\nsplit = False\nif split:\n    x_trainval,x_test,y_trainval,y_test = train_test_split(inputs, labels, test_size=0.2, random_state=32)\n    x_train,x_val,y_train,y_val = train_test_split(x_trainval,y_trainval,test_size =0.25, random_state = 32)\n    x_train,y_train = torch.tensor(x_train,dtype= torch.float),torch.tensor(y_train, dtype= torch.float)\n    x_val,y_val = torch.tensor(x_val, dtype= torch.float),torch.tensor(y_val, dtype= torch.float)\n    x_test,y_test = torch.tensor(x_test, dtype= torch.float),torch.tensor(y_test, dtype= torch.float)\n    print(x_train.shape,y_train.shape)\n    print(x_val.shape,y_val.shape)\

In [12]:
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

In [13]:
#Load Best Clean Model
num_output = len(num_to_word.keys()) #Num output is same as clean dataset
net = load_model(num_output)

In [14]:
def show_time():
	now = datetime.now()
	current_time = now.strftime("%d/%m/%Y %H:%M:%S")
	print("Current Date and Time =", current_time)

Embedding Evaluation

In [25]:
words = list(c.keys())

In [26]:
print(words)



In [17]:
print(len(words))

8607


In [18]:
def generate_word_embedding_dict(words):
    word_embedding_dict = OrderedDict()
    #Calculate embeddings
    for word in words:
        #Find the mfcc features of the acoustic representation of the word in the data
        word_features = inputs[np.where(np.isin(labels,word_to_num[word]))]
        
        #Calculate embeddings for the feature
        word_embedding = net.give_embeddings(torch.tensor(word_features, device = dev, dtype=torch.float),dev)
        
        #If the number of representation is more than one, take the average embedding
        word_embedding_dict[word] = np.mean(word_embedding, axis = 0).reshape(1,-1)
    
    return word_embedding_dict

In [19]:
def calculate_embedding_distance(homophone_df,word_embedding_dict,metrics = ['cosine']):

    word1_embeddings = None
    word2_embeddings = None
    
    metric_distance_dict = {}
    for metric in metrics:
        metric_distance_dict[metric] = []
        
    for row in homophone_df.itertuples():
        word1, word2 = map(lambda x: x.strip(' \''),row.word_pairs.strip('()').split(','))
        
        for metric in metrics:
            metric_distance_dict[metric].append(paired_distances(word_embedding_dict[word1],word_embedding_dict[word2], metric = metric)[0])
        
        
        #if word1_embeddings is None and word2_embeddings is None:
        #    word1_embeddings = word_embedding_dict[word1]
        #    word2_embeddings = word_embedding_dict[word2]
        #else:
        #    word1_embeddings = np.vstack((word1_embeddings, word_embedding_dict[word1]))
        #    word2_embeddings = np.vstack((word2_embeddings, word_embedding_dict[word2]))
            
        

    #Calculate the distance
    #print(word1_embeddings.shape)
    for metric in metrics:
        #metric_distance = paired_distances(word1_embeddings,word2_embeddings, metric = metric)
        homophone_df.insert(len(homophone_df.columns),"%s_distance"%(metric), metric_distance_dict[metric], True)
    
    return homophone_df
    
    

In [20]:
def give_nearest_neighbours_on_embeddings(word_embedding_dict, n_neighbours = 10, metric = 'cosine', split = False):
    
    embeddings = None
    
    embeddings = np.stack(list(word_embedding_dict.values())).squeeze()
    
    print('Calculating Nearest Neighbours')
    nbrs = NearestNeighbors(n_neighbors=n_neighbours, algorithm='brute',metric = metric, n_jobs = 4).fit(embeddings)
    distances,indices = nbrs.kneighbors(embeddings)
    
    columns = ["word","neighbours"]
    #nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    words = list(word_embedding_dict.keys())
    print('num of words %d'%(len(words)))
    
    
    nearest_neighbours_df = pd.DataFrame(columns = columns)
    
    for i,word in enumerate(word_embedding_dict.keys()):
        
        neighbours = ','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!= word])
        #print(neighbours)
        row = pd.DataFrame(np.array([[word],[neighbours]]).T, columns = columns)
        nearest_neighbours_df = nearest_neighbours_df.append(row)
        
    
    #pd.concat([pd.DataFrame(np.array([[word],[','.join([words[indices[i,j]] for j in range(indices.shape[1]) if words[indices[i,j]]!=word ])]]).T, columns = columns) for i,word in enumerate(word_embedding_dict.keys())])
    
    if split:
        neighbour_col_names = ["neighbour_%d"%(i) for i in range(n_neighbours)]
        nearest_neighbours_df[neighbour_col_names] = nearest_neighbours_df.neighbours.str.split(',', expand = True )
        nearest_neighbours_df.drop(columns = ["neighbours"],inplace = True)
    
    
    #Reset index
    nearest_neighbours_df = nearest_neighbours_df.reset_index(drop=True)
    
    
    
    return nearest_neighbours_df

In [21]:
words = list(c.keys())

In [17]:
word_embedding_dict = generate_word_embedding_dict(words)

In [22]:
word_embedding_dict = generate_word_embedding_dict(c.keys())

In [24]:
np.save("C:/Users/jayes/Downloads/LSV/Data/word_embedding_dict_latest.npy",word_embedding_dict)

In [19]:
em_nearest_neighbours = give_nearest_neighbours_on_embeddings(word_embedding_dict, 10,'cosine', False)

Calculating Nearest Neighbours
num of words 9974


In [22]:
em_nearest_neighbours.to_csv("Data/em_nearest_neighbours_cosine_freq_5.txt")

In [None]:
df = calculate_embedding_distance(wordpairs_df,word_embedding_dict,metrics = ['cosine', 'euclidean'])

In [None]:
df

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df,
    legend="full",
    alpha=0.5)
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
df.groupby('phonetic_edit_distance', as_index = False).agg(['mean', 'count', 'std'], index = False)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[0]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average cosine distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
g = sns.scatterplot(
    x="phonetic_edit_distance", y="%s_distance"%(metrics[1]),
    #hue="Word",
    data=df.groupby('phonetic_edit_distance', as_index = False).mean(),
    legend="full",
    alpha=0.5)
plt.ylabel('average euclidean distance')
#g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

In [None]:
#Read the homophones_df and split it word pairs into indiviudal columns
homophones = pd.read_csv('Data/homophones.txt')
column_names = ['word_1','word_2']
homophones[column_names] = homophones.word_pairs.str.strip('()').str.split(',', expand = True)
homophones["word_1"] = homophones.word_1.str.strip(' \'\'')
homophones["word_2"] = homophones.word_2.str.strip(' \'')
del homophones["word_pairs"]
cols = list(homophones)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('word_2')))
cols.insert(0, cols.pop(cols.index('word_1')))
homophones = homophones.loc[:, cols]
homophones.to_csv('Data/homophones_expanded.txt', index = False)

Start of Nearest Neighbour Analysis

In [15]:
def alphabet_commas(string):
    '''Takes a string and returns a filtered string with only alphabet and commas'''
    
    return ''.join(e for e in string if (e.isalpha() or e == ","))
    

In [77]:
def calc_spearman_with_awe(word,nn_words,word_embedding_dict):
    '''Takes a  word and it's list of nearest neighbour words, 
    calculates their awe and calculates the spearman rank coefficient'''
    
    nn_words_ranks = np.array((np.arange(1,len(nn_words)+1))).reshape(1,-1)
    
    word_embedding = word_embedding_dict.item().get(word).squeeze()
    nn_words_embeddings = np.stack([word_embedding_dict.item().get(word).squeeze() for word in nn_words])
    #print(word_embedding.shape,nn_words_embeddings.shape)
    similarity = pairwise_kernels(word_embedding.reshape(1,-1),nn_words_embeddings, metric = 'cosine')
    awe_words_ranks = np.argsort(-similarity)+1
    
    #print(nn_words_ranks, awe_words_ranks)
    rho,p_value = stats.spearmanr(nn_words_ranks.ravel(), awe_words_ranks.ravel())
    return rho,p_value

In [11]:
#Load the nearest neighbours based on embeddings and orthographic/phonetic representation
#em_cosine_nn = pd.read_csv('Data/em_nearest_neighbours.txt')
edit_distance_nn = pd.read_csv('Data/edit_nearest_neighbours.txt')
sim_distance_nn = pd.read_csv('Data/sim_nearest_neighbours.txt')
homophones = pd.read_csv('Data/raw_ph_homophones.txt')

In [101]:
word_embedding_dict = np.load('Data/word_embedding_dict.npy', allow_pickle = True)

In [102]:
word_embedding_dict.item().get("thank").squeeze()

array([2.0415986e-01, 2.4921808e+01, 0.0000000e+00, ..., 0.0000000e+00,
       0.0000000e+00, 3.6583733e-04], dtype=float32)

In [93]:
em_cosine_nn

Unnamed: 0,word,neighbours
0,mmhmm,"regional,mhhmm,hmmmm,parallel,bumps,mmmmmm,mmm..."
1,thank,"thinked,think,thing,thingll,pinned,seemed,hang..."
2,uhhuh,"avril,avocado,crack,liger,addon,mmhmm,whatnot,..."
3,already,"roller,cloak,coordinate,laundry,figleaf,orally..."
4,analyse,"penlight,anonymous,fabulous,analysed,dialects,..."
...,...,...
9969,ponnen,"problem,scrollbutton,probabl,profitmargin,trun..."
9970,vanna,"dimensional,banana,bananabando,bananarama,frui..."
9971,origi,"exhibit,misplace,hourish,upstairs,azerty,robin..."
9972,refresh,"wordperfect,imagination,demonstration,weixuns,..."


In [13]:
sim_distance_nn

Unnamed: 0,word,orthographic,raw_phonetic,filtered_phonetic
0,cheapie,"('cheaper', 'cheaply', 'cheap', 'cheapy', 'che...","('cheapy', 'chippy', 'cheaps', 'cheaper', 'che...","('cheapy', 'cheap', 'cheaply', 'chippy', 'chea..."
1,conjunction,"('connection', 'conjunct', 'convention', 'cons...","('connection', 'consumption', 'conjunctural', ...","('conjunctural', 'connection', 'consumption', ..."
2,nicer,"('niche', 'never', 'nicety', 'timer', 'univer'...","('lesser', 'night', 'fibre', 'dicier', 'nines'...","('nigger', 'wiper', 'answer', 'never', 'buyer'..."
3,ourselves,"('yourselves', 'ourself', 'relies', 'courses',...","('ourself', 'yourselves', 'yourself', 'solves'...","('ourself', 'sells', 'cells', 'solves', 'thems..."
4,temporarily,"('temporary', 'temporal', 'arbitrarily', 'terr...","('necessarily', 'temporal', 'temporary', 'temp...","('temporary', 'temporal', 'necessarily', 'temp..."
...,...,...,...,...
9969,global,"('globe', 'local', 'loyal', 'globs', 'globby',...","('globally', 'globe', 'mobile', 'label', 'loca...","('globally', 'local', 'mobile', 'label', 'glob..."
9970,hearing,"('healing', 'heating', 'wearing', 'gearing', '...","('heating', 'healing', 'keyring', 'hitting', '...","('healing', 'heating', 'keyring', 'string', 'h..."
9971,slidebar,"('slider', 'slides', 'slidey', 'spider', 'line...","('slideshow', 'slider', 'sliders', 'sliding', ...","('sliders', 'slideshow', 'slides', 'slide', 's..."
9972,dedicated,"('dedicate', 'educated', 'indicated', 'delicat...","('dedicate', 'edited', 'indicated', 'dissected...","('indicated', 'dedicate', 'educated', 'edited'..."


In [83]:
def similarity_task(nn_df,column_name = "raw_phonetic"):
    words = set(sim_distance_nn["word"].to_list())
    #words = ["cheapie"]
    avg_rho = 0
    for word in words:
        query = nn_df.query("word == '%s'"%(word))[column_name].item()
        nn_words = alphabet_commas(query).split(",")
        rho,p_value = calc_spearman_with_awe(word,nn_words, word_embedding_dict)
        avg_rho += rho


    avg_rho = avg_rho/len(words)
    return avg_rho
    

In [82]:
print(similarity_task(sim_distance_nn,"raw_phonetic"))
print(similarity_task(sim_distance_nn,"filtered_phonetic"))
print(similarity_task(sim_distance_nn,"orthographic"))

0.4954177832060372
0.487934083161677
0.49291430446433687


In [84]:
print(similarity_task(edit_distance_nn,"raw_phonetic"))
print(similarity_task(edit_distance_nn,"filtered_phonetic"))
print(similarity_task(edit_distance_nn,"orthographic"))

0.4885781820612404
0.4897861713181568
0.4951018101609663


In [133]:
phoney = BigPhoney()

In [135]:
words = set(homophones["word"].to_list())
phoneme_dict = {}
for word in words:
    phoneme_dict[word] = phoney.phonize(word)

In [132]:
homophones[homophones["word"] == "slidey"]

Unnamed: 0,word,homophone_words
24,slidey,"slidy,"


In [139]:
def filter_alphabets(string):
	return ''.join(e for e in string if (e.isalpha() or e.isspace()))

In [148]:
def homophone_task(homophones):
    words = set(homophones["word"].to_list())
    homophones["homophone_words"] = homophones["homophone_words"].apply(alphabet_commas) 

    avg_precision = 0
    
    phoneme_eDistance_list = []
    for word in words:
        homophone_query = homophones.query("word == '%s'"%(word))
        awe_nn_words_query = em_cosine_nn.query("word == '%s'"%(word))

        homophone_words = list(filter(lambda x: x.isalpha(), homophone_query["homophone_words"].item().split(",")))
        awe_nn_words = list(awe_nn_words_query["neighbours"].item().split(","))[:len(homophone_words)]

        #print(awe_nn_words)

        #Set of homophone words
        set_homophone_words = set(homophone_words)

        #Set of Nearest neighbours based on cosine_similarity of embeddings
        set_awe_nn_words = set(awe_nn_words)
        
        
        #print(word,set_homophone_words,set_awe_nn_words)
        
        for word_1,word_2 in zip(list(set_homophone_words),list(set_awe_nn_words)):
            
            aligned_seq1, aligned_seq2, eDistance = alignSequences.align(filter_alphabets(phoney.phonize(word_1)),filter_alphabets(phoney.phonize(word_2)))
            
            print('{%s , %s , %s } Phoneme eDistance %d'%(word,word_1,word_2,eDistance))
            
            phoneme_eDistance_list.append(eDistance)
        
        
        #Calculate precision score
        word_precision = len(set_homophone_words.intersection(set_awe_nn_words))/len(set_homophone_words)

        avg_precision += word_precision

    avg_precision = avg_precision/len(words)
    print(avg_precision)
    
    return phoneme_eDistance_list

In [149]:
a = homophone_task(homophones)

{through , threw , carrier } Phoneme eDistance 4
{commitment , committment , equipments } Phoneme eDistance 5
{weights , waits , weight } Phoneme eDistance 1
{handy , handi , napki } Phoneme eDistance 3
{simpl , simple , should } Phoneme eDistance 6
{strawberrys , strawberries , forgo } Phoneme eDistance 7
{corinne , corrine , pretty } Phoneme eDistance 5
{prioritise , prioritize , prioritized } Phoneme eDistance 1
{pares , pairs , players } Phoneme eDistance 3
{plain , plane , playing } Phoneme eDistance 2
{parti , party , their } Phoneme eDistance 4
{mails , males , names } Phoneme eDistance 2
{acquaintance , aquaintance , gorgeous } Phoneme eDistance 7
{peoplell , peopl , poodle } Phoneme eDistance 2
{peoplell , people , people } Phoneme eDistance 0
{disks , discs , these } Phoneme eDistance 5
{franc , frank , throwing } Phoneme eDistance 4
{weighted , waited , wasting } Phoneme eDistance 2
{frank , franc , print } Phoneme eDistance 4
{minimalised , minimalized , encouragement } Pho

{characterization , characterisation , particularity } Phoneme eDistance 12
{shuts , schutz , schutz } Phoneme eDistance 0
{knowing , knoing , ellen } Phoneme eDistance 4
{custome , custom , customers } Phoneme eDistance 2
{stylized , stylised , sacrificed } Phoneme eDistance 7
{emphasise , emphasize , nondescript } Phoneme eDistance 10
{channelll , channel , transcribed } Phoneme eDistance 8
{usable , useable , usersll } Phoneme eDistance 3
{course , coarse , coolest } Phoneme eDistance 4
{hairy , harry , battery } Phoneme eDistance 4
{moldable , mouldable , animal } Phoneme eDistance 5
{theres , theirs , luxurious } Phoneme eDistance 8
{centr , centre , fruit } Phoneme eDistance 4
{board , bored , avoidable } Phoneme eDistance 7
{claire , clare , foldout } Phoneme eDistance 5
{waits , weights , lights } Phoneme eDistance 2
{pains , panes , change } Phoneme eDistance 2
{programmes , programs , provenance } Phoneme eDistance 7
{wouldn , wooden , wouldnt } Phoneme eDistance 1
{waited , 

In [150]:
c = Counter(a)

In [154]:
c

Counter({4: 38,
         5: 38,
         1: 18,
         3: 29,
         6: 28,
         7: 25,
         2: 25,
         0: 29,
         10: 4,
         8: 16,
         9: 5,
         11: 3,
         12: 2})

Rough

In [18]:
def sim_score(ser):
	return 10 - min(10,ser/10)

In [6]:
wordpairs_ser = pd.read_csv('Data/wordpairs_test_with_ser.txt')
wordpairs_ser = wordpairs_ser.head()

In [7]:
wordpairs_embedding_similarity = pd.read_csv('Data/wordpairs_test_embedding_similarity.txt')
wordpairs_embedding_similarity = wordpairs_embedding_similarity.head()

In [8]:
wordpairs_ser.head()

Unnamed: 0,word_1,word_2,orthographic_edit_distance,phonetic_edit_distance,word_1_phonetic_ser,word_2_phonetic_ser,word_1_orthographic_ser,word_2_orthographic_ser
0,marketing,share,7,7,0.777778,1.4,0.777778,1.4
1,complexity,simple,6,8,0.8,1.333333,0.6,1.0
2,chance,probably,7,8,1.333333,1.0,1.166667,0.875
3,covered,sheet,5,5,0.714286,1.0,0.714286,1.0
4,shifting,topics,7,6,0.75,1.0,0.875,1.166667


In [9]:
del wordpairs_ser["orthographic_edit_distance"]
for s in ["word_1","word_2"]:
    del wordpairs_ser["%s_phonetic_ser"%(s)]
    del wordpairs_ser["%s_orthographic_ser"%(s)]
#del wordpairs_ser["phonetic_edit_distance"]

In [10]:
wordpairs_ser.sort_values("phonetic_edit_distance").reset_index(drop = True)

Unnamed: 0,word_1,word_2,phonetic_edit_distance
0,covered,sheet,5
1,shifting,topics,6
2,marketing,share,7
3,complexity,simple,8
4,chance,probably,8


In [11]:
wordpairs_embedding_similarity.sort_values("cosine_similarity", ascending = False)

Unnamed: 0,word_1,word_2,cosine_similarity
3,covered,sheet,0.005748
2,chance,probably,0.000292
1,complexity,simple,0.000142
4,shifting,topics,0.000115
0,marketing,share,9e-05


In [18]:
wordpairs_distance = pd.merge(wordpairs_ser,wordpairs_embedding_similarity,on = ["word_1","word_2"])

In [19]:
wordpairs_distance

Unnamed: 0,word_1,word_2,phonetic_edit_distance,cosine_similarity
0,marketing,share,7,9e-05
1,complexity,simple,8,0.000142
2,chance,probably,8,0.000292
3,covered,sheet,5,0.005748
4,shifting,topics,6,0.000115


In [31]:
stats.spearmanr(wordpairs_distance["word_2_phonetic_ser"],wordpairs_distance["cosine_similarity"].to_list())

SpearmanrResult(correlation=-0.13328721425962478, pvalue=0.0)

In [24]:
stats.spearmanr(wordpairs_distance["orthographic_edit_distance"],wordpairs_distance["orthographic_edit_distance"])

SpearmanrResult(correlation=0.81043155844782, pvalue=0.0)