In [1]:
import spacy
# Load the spaCy pipeline registered under the name 'pl_spacy_model'.
# The resulting `nlp` object is used below both for vocabulary lookups
# (`nlp.vocab[...]`) and for processing whole sentences (`nlp("...")`).
# NOTE(review): presumably a Polish model that ships word vectors -- confirm
# it is installed in the kernel's environment before running.
nlp = spacy.load('pl_spacy_model')

Adapted from: https://nlpforhackers.io/complete-guide-to-spacy/

In [2]:
# Look up the vocabulary entry for 'banan' and print its raw word vector.
print(nlp.vocab['banan'].vector)

[-0.0024764 -0.70863   -0.5028    -0.056419  -0.53134   -0.13114
 -0.40978   -0.14778   -0.064284   0.6401    -0.17423   -0.50371
  0.43842    0.55932   -0.075957  -0.17258    0.30186    0.60262
 -0.52406    0.11838    0.13601    0.89278    0.21444   -0.060095
 -0.41741    0.098456  -0.087808  -0.76608   -0.20468   -0.45683
  0.71817    0.57862   -0.24163   -0.038695  -0.9357     0.14653
 -0.53036   -0.21333   -0.28863   -0.2988     0.1382    -0.19232
 -0.072795  -0.53791    0.25016    0.31046   -0.51876   -0.35181
  0.070766  -0.0079212 -0.14035   -0.41757    0.034469   0.33171
 -0.26193   -0.3754     0.19537   -0.51506    0.3065    -0.30848
  0.36483   -0.086375  -0.061522   0.34212   -0.69735    0.23462
  0.24127    0.11332   -0.31009    0.28628   -0.83422    0.094826
  0.58068   -0.67866    0.29921    0.76477    0.033      0.15036
 -0.13588   -0.32511   -0.32224    0.0902     0.31761   -0.50214
 -0.27068   -0.14194   -0.15916    0.065393   0.059113   0.20234
 -0.0199     0.80114   

In [3]:
from scipy import spatial


def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors.

    Computed as ``1 - cosine distance``. The value is undefined when either
    vector is all zeros, because the norm in the denominator is then zero.
    """
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['mężczyzna'].vector
woman = nlp.vocab['kobieta'].vector
queen = nlp.vocab['królowa'].vector
# Vector of the expected answer; kept for reference, it is not used in the search below.
king = nlp.vocab['król'].vector

# Word-analogy arithmetic: find the vocabulary entries whose vectors are
# closest to the result of "man" - "woman" + "queen".
maybe_king = man - woman + queen

computed_similarities = []
for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector:
        continue

    candidate = word.vector
    # An all-zero vector has no direction, so its cosine similarity is
    # undefined; the resulting NaN would make the ranking below unreliable.
    if not candidate.any():
        continue

    computed_similarities.append((word, cosine_similarity(maybe_king, candidate)))

# Highest similarity first. The sort is stable, so ties keep vocabulary order.
computed_similarities.sort(key=lambda item: item[1], reverse=True)
print([w[0].text for w in computed_similarities[:10]])

['książę', 'król', 'królewicz', 'monarcha', 'hrabia', 'władca', 'Książę', 'arcyksiążę', 'książe', 'Arcyksiążę']


In [4]:
# Fetch the four vocabulary entries compared below.
banana, dog, fruit, animal = (
    nlp.vocab[text] for text in ('banan', 'pies', 'owoc', 'zwierzę')
)

# Each row is (word, related concept, unrelated concept); one output line per
# row shows the related similarity followed by the unrelated one.
# Reference values recorded alongside the original code (marked "ENG"):
#   dog:    0.6618534  0.23552845
#   banana: 0.67148364 0.2427285
for subject, related, unrelated in ((dog, animal, fruit), (banana, fruit, animal)):
    print(subject.similarity(related), subject.similarity(unrelated))

0.68410385 0.38233432
0.61139715 0.36240068


In [5]:
# Sentence that every other sentence is compared against.
target = nlp("Koty to piękne zwierzęta.")

# Three sentences to score against `target`.
doc1 = nlp("Psy są niesamowite.")
doc2 = nlp("Jednymi z najwspanialszych zwierząt są koty.")
doc3 = nlp("Delfiny to pływające ssaki.")

# Print one similarity score per sentence, in order doc1, doc2, doc3.
# Reference values recorded alongside the original code (marked "ENG"):
#   0.8901765218466683, 0.9115828449161616, 0.7822956752876101
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.8993484113232622
0.828325566207268
0.862666079636411
