In [60]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from numpy import dot
from numpy.linalg import norm

## Read SimLex

In [16]:
simlex = pd.read_csv('MSimLex999_Polish.txt', sep='\t', header=None)

In [17]:
simlex.columns = ['id', 'word1', 'word2', 'similarity', 'relatedness']

In [18]:
simlex.head()

Unnamed: 0,id,word1,word2,similarity,relatedness
0,1,stary,nowy,0.43,7.29
1,2,bystry,inteligentny,8.86,9.71
2,3,ciężki,trudny,4.86,7.29
3,4,szczęśliwy,radosny,8.14,8.86
4,5,łatwy,męczący,0.43,6.43


In [97]:
simlex.describe()

Unnamed: 0,id,similarity,relatedness
count,999.0,999.0,999.0
mean,500.0,2.476697,5.945395
std,288.530761,2.553304,2.531229
min,1.0,0.0,0.0
25%,250.5,0.43,4.43
50%,500.0,1.57,6.57
75%,749.5,4.0,7.86
max,999.0,9.71,9.86


In [24]:
unique_words = set(list(simlex['word1'].unique()) + list(simlex['word2'].unique()))

In [41]:
len(unique_words)

1139

## Read embeddings

In [50]:
embeddings = {}

with open("kgr10.plain.lemma.cbow.dim100.neg10.vec", "r") as vec:
    next(vec)
    for line in tqdm(vec):
        line = line.split()
        if line[0] in unique_words:
            embeddings[line[0]] = np.array(line[1:]).astype(float)

2137684it [00:28, 74136.51it/s]


In [51]:
len(embeddings)

1117

### Missing embeddings

In [55]:
found_embeddings = set(embeddings.keys())

In [56]:
unique_words - found_embeddings

{'byk',
 'cela',
 'duma',
 'dąb ',
 'furia',
 'głupi ',
 'kostka',
 'księżyc',
 'ludzie',
 'luka',
 'meble',
 'mężczyźni',
 'nasiona',
 'obładowany',
 'partia',
 'pieniądze',
 'przestraszony',
 'rak',
 'ruch',
 'sława',
 'ubrania',
 'łódź'}

In [89]:
filtered_simlex = simlex[(simlex['word1'].isin(found_embeddings)) & (simlex['word2'].isin(found_embeddings))]

In [90]:
print(len(simlex))
print(len(filtered_simlex))

999
967


## Similarity / relatedness metrics

In [91]:
def add_cosinus(simlex_row):
    v1 = embeddings[simlex_row['word1']]
    v2 = embeddings[simlex_row['word2']]

    simlex_row['cosinus'] = dot(v1, v2)/(norm(v1) * (norm(v2)))

    return simlex_row

In [92]:
def add_euclidean(simlex_row):
    v1 = embeddings[simlex_row['word1']]
    v2 = embeddings[simlex_row['word2']]

    simlex_row['euclidean'] = np.sqrt(np.sum((v1 - v2)**2))

    return simlex_row

In [93]:
tqdm.pandas()

In [94]:
filtered_simlex = filtered_simlex.progress_apply(add_cosinus, axis=1)

100%|██████████| 967/967 [00:00<00:00, 1199.65it/s]


In [95]:
filtered_simlex = filtered_simlex.progress_apply(add_euclidean, axis=1)

100%|██████████| 967/967 [00:00<00:00, 1148.71it/s]


In [96]:
filtered_simlex.tail()

Unnamed: 0,id,word1,word2,similarity,relatedness,cosinus,euclideann
994,995,dołączyć,zdobyć,0.43,2.29,0.657871,25.897192
995,996,wysyłać,uczestniczyć,0.0,0.86,0.560702,29.530632
996,997,zbierać,uczestniczyć,0.0,0.71,0.617206,26.957282
997,998,pochłonąć,wycofać,0.0,0.57,0.720902,21.570204
998,999,uczestniczyć,przybyć,0.57,3.43,0.727095,22.198529


In [98]:
filtered_simlex.to_csv('embeddings_results.csv')