In [62]:
from scipy.spatial import distance
import pandas as pd
import numpy as np

## Exploring Wikipedia Distance

In [20]:
def compare(df, val1, val2):
    """Euclidean distance between the rows of `df` labelled `val1` and `val2`.

    Both labels are looked up with `df.loc`, so they must exist in the index.
    """
    row_a, row_b = (df.loc[label] for label in (val1, val2))
    return distance.euclidean(row_a, row_b)

### 3D embedding distances

In [3]:
# Load the 3D embedding of the Wikipedia vocabulary into memory.
# The directory names record the settings of the run that produced it
# (iterations, perplexity, pca, learning_rate) — presumably t-SNE-style
# hyperparameters; confirm against the script that generated the folder.
path = '../demo_embeddings/wikipedia_3000/iterations_250/perplexity_3/pca_25/learning_rate_10'
# Plain string literal: the original f-string had no placeholders to interpolate.
# The first CSV column (the word) becomes the index, so rows can be fetched with .loc[word].
embedding_df = pd.read_csv(path + '/data.csv', index_col=0)
embedding_df.head()

Unnamed: 0,x,y,z
the,0.010972,-0.019077,0.048307
of,-0.002067,-0.020699,0.039711
to,0.033334,-0.866722,-0.1539
and,-0.109757,-0.03304,0.00845
in,0.186069,-0.003899,0.181199


In [24]:
# Pairwise distances in the 3D embedding. Euclidean distance is symmetric, so
# the order of the two words in each call does not affect the printed value.
print("the vs of:", compare(embedding_df, val1='the', val2='of'))
print("of vs to:", compare(embedding_df, val1='to', val2='of'))

the vs of: 0.01570124387892331
of vs to: 0.8686157619115147


### Original Embeddings distances

In [17]:
# Original GloVe vectors for the vocabulary; the first CSV column is used as
# the index so that rows can be looked up by word.
glove_df = pd.read_csv(
    "../data/wikipedia_3000.csv",
    index_col=0,
)
glove_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.04656,0.21318,-0.007436,-0.45854,-0.035639,0.23643,-0.28836,0.21521,-0.13486,-1.6413,...,-0.013064,-0.29686,-0.079913,0.195,0.031549,0.28506,-0.087461,0.009061,-0.20989,0.053913
of,-0.076947,-0.021211,0.21271,-0.72232,-0.13988,-0.12234,-0.17521,0.12137,-0.070866,-1.5721,...,-0.36673,-0.38603,0.3029,0.015747,0.34036,0.47841,0.068617,0.18351,-0.29183,-0.046533
to,-0.25756,-0.057132,-0.6719,-0.38082,-0.36421,-0.082155,-0.010955,-0.082047,0.46056,-1.8477,...,-0.012806,-0.59707,0.31734,-0.25267,0.54384,0.063007,-0.049795,-0.16043,0.046744,-0.070621
and,0.038466,-0.039792,0.082747,-0.38923,-0.21431,0.1702,-0.025657,0.09578,0.2386,-1.6342,...,0.045194,-0.20405,-0.21097,-0.11025,0.021766,0.44129,0.32797,-0.33427,0.011807,0.059703
in,-0.44399,0.12817,-0.25247,-0.18582,-0.16614,0.25909,-0.22678,-0.069229,-0.077204,-1.5814,...,-0.2745,-0.037237,0.10104,0.10798,0.37727,0.87977,0.33583,-0.20043,-0.082191,-0.06255


In [27]:
# The same two word pairs as in the 3D section, measured on the original vectors.
print("Euclidean distances for Original 300D Glove embeddings:")
print("the, of:", compare(glove_df, val1='the', val2='of'))
print("of, to:", compare(glove_df, val1='to', val2='of'))

Euclidean distances for Original 300D Glove embeddings:
the, of: 4.08706684721173
of, to: 6.035719708879699


### Speed comparison, 3D vs original

In [31]:
# Time one pairwise distance on each frame: the 3-column embedding first,
# then the 300-column original vectors, using the same word pair.
print("Speed for 3D Embedding:")
%timeit compare(embedding_df, 'the', 'of')
print("\nSpeed for original Embedding (300D):")
%timeit compare(glove_df, 'the', 'of')

Speed for 3D Embedding:
324 µs ± 44.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Speed for original Embedding (300D):
321 µs ± 33.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [32]:
# Same benchmark as the previous cell, repeated with the word pair ('to', 'of')
# to check that the timing does not depend on the words chosen.
print("Speed for 3D Embedding:")
%timeit compare(embedding_df, 'to', 'of')
print("\nSpeed for original Embedding (300D):")
%timeit compare(glove_df, 'to', 'of')

Speed for 3D Embedding:
296 µs ± 37.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Speed for original Embedding (300D):
304 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


We can see that the two timings are nearly identical (roughly 300 µs per comparison for both the 3D and the 300D vectors), and that a single distance computation is very cheap (well under a millisecond).

In [86]:
# Reference word whose distance to every other word will be computed.
selected_word = 'united'
# Row of the 300D frame for that word; compare_pd (defined just below) reads
# this notebook-level variable when it is called.
selected_vec = glove_df.loc[selected_word]
# Single-column frame built from the index labels (the words) of glove_df.
# NOTE(review): not used anywhere in the visible part of the notebook — confirm it is needed later.
word_dict = pd.DataFrame(glove_df.index)

def compare_pd(vector, target=None):
    """Return the Euclidean distance between `vector` and a reference vector.

    Parameters
    ----------
    vector : array-like
        One embedding, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.
    target : array-like, optional
        Reference vector to measure against. When omitted, the notebook-level
        ``selected_vec`` is used, so the one-argument call behaves as before.
        Pass it explicitly (``df.apply(compare_pd, axis=1, target=vec)``) to
        avoid depending on which cell last assigned ``selected_vec``.

    Returns
    -------
    The value returned by ``distance.euclidean(vector, target)``.
    """
    if target is None:
        # Looked up at call time, not at definition time: re-assigning
        # `selected_vec` in a later cell changes the reference used here.
        target = selected_vec
    return distance.euclidean(vector, target)

# The %timeit line does not assign the value of the timed expression, so the
# same apply is repeated on the next line to keep the result.
%timeit glove_df.apply(compare_pd, axis=1)
# Row-wise apply: each row of glove_df is passed to compare_pd, giving one
# distance per word, indexed by the word.
distance_map = glove_df.apply(compare_pd, axis=1)
print(distance_map.shape)
distance_map.head()

198 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
(3000,)


0
the    6.498628
of     6.774521
to     6.892077
and    6.457886
in     6.284315
dtype: float64

This returns exactly what we want: the distance from every word in the vocabulary to a chosen word (in our case 'united'), and it does so in a reasonable amount of time (about 200 ms for all 3000 words).

In [87]:
# Same reference word as in the 300D cell, now looked up in the 3D embedding.
selected_word = 'united'
# Rebinding selected_vec changes the vector compare_pd measures against,
# because compare_pd reads this notebook-level variable when it is called.
selected_vec = embedding_df.loc[selected_word]

# NOTE(review): this repeats the `compare_pd` definition from the 300D cell
# above; it is kept here so this cell still runs on its own.
def compare_pd(vector, target=None):
    """Return the Euclidean distance between `vector` and a reference vector.

    Parameters
    ----------
    vector : array-like
        One embedding, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.
    target : array-like, optional
        Reference vector to measure against. When omitted, the notebook-level
        ``selected_vec`` is used, so the one-argument call behaves as before.
        Pass it explicitly (``df.apply(compare_pd, axis=1, target=vec)``) to
        avoid depending on which cell last assigned ``selected_vec``.

    Returns
    -------
    The value returned by ``distance.euclidean(vector, target)``.
    """
    if target is None:
        # Looked up at call time, not at definition time: the 3D vector
        # assigned just above is the one picked up here.
        target = selected_vec
    return distance.euclidean(vector, target)

# The %timeit line does not assign the value of the timed expression, so the
# same apply is repeated on the next line to keep the result.
%timeit embedding_df.apply(compare_pd, axis=1)
# One distance per word, measured in the 3D embedding against selected_vec.
distance_map_3d = embedding_df.apply(compare_pd, axis=1)
distance_map_3d.head()

155 ms ± 8.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


the    2.121015
of     2.128318
to     2.759959
and    2.199864
in     2.042792
dtype: float64

We again notice similar performance for the 3D embedding (about 155 ms versus about 200 ms for the 300D vectors). Speed is therefore not a good reason to prefer the 3D embedding for distance computations, since the original 300D embedding captures the distribution better in any case.

### Testing preprocessing

In [90]:
# Rank every word by its 300D distance to the selected word, nearest first
# (the selected word itself comes first, at distance 0).
sorted_map = distance_map.sort_values(ascending=True)
sorted_map

0
united           0.000000
states           3.213737
u.s.             5.808629
both             6.013402
nations          6.068058
although         6.113164
america          6.171285
also             6.199050
already          6.251269
which            6.252895
in               6.284315
american         6.286174
however          6.295584
now              6.299501
while            6.316134
but              6.349286
only             6.352042
though           6.388109
well             6.397458
as               6.403575
washington       6.445161
and              6.457886
same             6.470125
join             6.491023
despite          6.494019
the              6.498628
meanwhile        6.500599
americans        6.502239
addition         6.512599
example          6.523228
                  ...    
trim            10.065186
optional        10.083921
bowl            10.084070
del             10.095619
rep.            10.106383
municipality    10.112696
inflation       10.135611
province  

In [89]:
# Rank every word by its 3D-embedding distance to the selected word, nearest
# first, for comparison with the 300D ranking.
sorted_3d_map = distance_map_3d.sort_values(ascending=True)
sorted_3d_map

united         0.000000
states         0.007371
australian     0.032252
canadian       0.064197
indian         0.076431
african        0.084495
mexican        0.089349
british        0.105448
american       0.112825
zealand        0.117893
vietnam        0.119781
europe         0.131286
australia      0.141201
brazilian      0.141710
korean         0.149245
japanese       0.152028
chinese        0.155905
korea          0.157681
japan          0.158535
swiss          0.158535
swedish        0.162122
philippines    0.162242
european       0.162300
canada         0.163285
italian        0.165823
philippine     0.166912
spanish        0.166999
dutch          0.167640
africa         0.170548
thai           0.173024
                 ...   
trading        3.773671
shares         3.774356
jan.           3.774899
scheduled      3.776306
you            3.777855
dollar         3.782783
prices         3.783643
stocks         3.799382
yen            3.808897
million        3.814492
dlrs           3