In [38]:
# Load the term2doc matrix from its JSON dump into `matrix`.
# NOTE(review): this comment used to name the "...profs.id+en+ru" file, but the
# path below points at the "studs" file -- confirm which corpus is intended.
import json

filename = 'term2doc_nostopwords_en-ru.news+reviews.studs.id+en+ru.json'
with open(filename, 'rb') as matrix_file:
    matrix = json.load(matrix_file)

In [39]:
len(matrix)

9421

In [40]:
# Collect every key of the matrix and peek at the first ten
# (each one is a stem carrying a language prefix such as en_ or ru_).
keys = list(matrix)
print(keys[:10])

['ru_сын', 'en_whose', 'en_palu', 'en_glasgow', 'en_bloom', 'ru_обострен', 'ru_обеспеч', 'ru_плох', 'en_featur', 'en_goyo']


In [41]:
# import the cosine metric and the stemmers for english and the other language
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
stemmer_en = SnowballStemmer("english")
stemmer_other = SnowballStemmer("russian") # stemmer for the "other language", here russian; finnish for finnish etc.

In [42]:
# let's not forget the prefix for the language!
# I will make a function that goes from word to key to go faster
def word2key(word, lang):
    """Turn a surface word into the key format used by ``matrix``.

    The word is stemmed with ``stemmer_en`` when ``lang`` is ``'en'`` and with
    ``stemmer_other`` for any other value of ``lang``; the result is
    ``lang + '_' + stem``.
    """
    if lang == 'en':
        stem = stemmer_en.stem(word)
    else:
        stem = stemmer_other.stem(word)
    return lang + '_' + stem

# let's try it out
print(word2key('dog', 'en'))
print(word2key('собака', 'ru'))

en_dog
ru_собак


In [43]:
# comparing two words' similarity

english_key = word2key('book', 'en')
russian_key = word2key('книга', 'ru')
cosine_similarity([matrix[english_key]], [matrix[russian_key]])

array([[0.8912319]])

In [44]:
# Nearest neighbours of 'book': the goal is to find its closest word in the
# other language. Every key except english_key itself is scored here -- keys of
# both languages, no language filter -- and the ranking is done in the next cell.

to_sort = [
    (cosine_similarity([matrix[english_key]], [matrix[candidate]]), candidate)
    for candidate in keys
    if candidate != english_key
]

In [45]:
n=10
sorted(to_sort, reverse=True)[:n]

[(array([[0.8912319]]), 'ru_книг'),
 (array([[0.29740896]]), 'ru_торт'),
 (array([[0.29740896]]), 'en_cake'),
 (array([[0.24353971]]), 'ru_эт'),
 (array([[0.21029989]]), 'ru_страниц'),
 (array([[0.2079501]]), 'ru_ищет'),
 (array([[0.20134682]]), 'ru_спэн'),
 (array([[0.20134682]]), 'ru_купер'),
 (array([[0.20134682]]), 'en_style'),
 (array([[0.20134682]]), 'en_spann')]

In [46]:
# 'book' seems to have a steady translation

In [75]:
# I will also make a function to go from a word to its n nearest neighbours (n defaults to 4)
def nearest_neighbours(word, lang, n=4):
    """Return the ``n`` keys whose vectors are most similar to ``word``.

    The word is converted with ``word2key(word, lang)`` and compared, by
    cosine similarity, against every other key listed in ``keys``. Keys of
    both languages are ranked together; nothing is filtered by prefix.

    Args:
        word: Surface form of the query word.
        lang: Language prefix handed to ``word2key`` (e.g. ``'en'``).
        n: Number of neighbours to return.

    Returns:
        A list of at most ``n`` ``(similarity, key)`` tuples, most similar
        first; ties on similarity are ordered by key, descending.
        ``similarity`` is the array returned by ``cosine_similarity`` for
        that single pair of vectors.

    Raises:
        KeyError: If the stemmed key is not present in ``matrix``.
    """
    key = word2key(word, lang)
    if key not in matrix:
        # Fail up front with an explanation instead of a bare KeyError from
        # inside the scoring loop.
        raise KeyError(
            "{!r} (stemmed from {!r}) is not a key of the term matrix".format(key, word)
        )

    # The query row never changes, so build it once.
    query = [matrix[key]]
    scored = [
        (cosine_similarity(query, [matrix[other]]), other)
        for other in keys
        if other != key
    ]
    # Sort on plain floats rather than on the arrays themselves: comparing
    # tuples that hold numpy arrays only works because each array happens to
    # have exactly one element. The resulting order is the same.
    scored.sort(key=lambda pair: (pair[0].item(), pair[1]), reverse=True)
    return scored[:n]

In [76]:
# a very systematically translated word would be, for example, a proper name
nearest_neighbours('trump', 'en')

[(array([[1.]]), 'ru_трамп'),
 (array([[0.5976143]]), 'ru_дональд'),
 (array([[0.56694671]]), 'en_donald'),
 (array([[0.37953606]]), 'en_presid')]

In [77]:
# the first result for trump has a perfect similarity, the second is much lower

In [78]:
# on the opposite end of the spectrum, a word that has no systematic translation
# (in the corpus, arguably in russian)
nearest_neighbours('unless', 'en')

[(array([[0.57735027]]), 'ru_частност'),
 (array([[0.57735027]]), 'ru_успева'),
 (array([[0.57735027]]), 'ru_условн'),
 (array([[0.57735027]]), 'ru_транскрипц')]

In [79]:
# unless has a highly entropic cloud of mid-distance neighbours

In [80]:
# pain has a less entropic cloud than "unless". In the output below the closest russian stem is
# болезнен (this note originally named судорог as the closest russian word), but still it is
# pretty far (and in fact not a perfect translation if I understand, but that is also a problem
# of stemming).
nearest_neighbours('pain', 'en')

[(array([[0.81649658]]), 'ru_болезнен'),
 (array([[0.57735027]]), 'ru_спазм'),
 (array([[0.57735027]]), 'en_cramp'),
 (array([[0.40824829]]), 'ru_синт')]

In [81]:
# Unfortunately, rare words in the corpus can get several spurious perfect matches, so it is good practice to check their frequency in the corpus
nearest_neighbours('ecology', 'en') 

[(array([[1.]]), 'ru_опылител'),
 (array([[1.]]), 'en_pollin'),
 (array([[0.70710678]]), 'ru_устойчив'),
 (array([[0.70710678]]), 'ru_канцероген')]

In [82]:
# when a word has several acceptable translations, their similarity scores can fall off gradually
nearest_neighbours('night', 'en')

[(array([[0.69991324]]), 'ru_ноч'),
 (array([[0.32444284]]), 'ru_тумбочк'),
 (array([[0.32444284]]), 'ru_прикроватн'),
 (array([[0.32444284]]), 'ru_вечер')]

In [83]:
nearest_neighbours('war', 'en')

[(array([[0.9701425]]), 'ru_войн'),
 (array([[0.45834925]]), 'ru_торгов'),
 (array([[0.40422604]]), 'en_trade'),
 (array([[0.36380344]]), 'ru_окончан')]

In [84]:
nearest_neighbours('environmental', 'en') #two legit translations

[(array([[0.66666667]]), 'ru_экологическ'),
 (array([[0.66666667]]), 'ru_окружа'),
 (array([[0.57735027]]), 'ru_присуд'),
 (array([[0.57735027]]), 'ru_принадлеж')]

In [85]:
nearest_neighbours('syria', 'en') 

[(array([[0.81649658]]), 'ru_сир'),
 (array([[0.47140452]]), 'ru_возвращен'),
 (array([[0.47140452]]), 'en_readi'),
 (array([[0.40824829]]), 'ru_правдив')]

In [104]:
nearest_neighbours('country', 'en') 

[(array([[0.76338629]]), 'ru_стран'),
 (array([[0.26261287]]), 'ru_проголос'),
 (array([[0.26261287]]), 'ru_добровольн'),
 (array([[0.26261287]]), 'en_welcom')]

In [98]:
import numpy as np
from scipy.stats import entropy

def entropy_of_the_cloud(similarity_list):
    """Return the normalised Shannon entropy of a cloud of similarity scores.

    The scores are flattened into one vector and handed to
    ``scipy.stats.entropy``, which rescales them to sum to 1. The logarithm
    base is the number of scores, so a perfectly flat cloud gives 1 and a
    cloud where a single score carries all the mass gives 0.

    Args:
        similarity_list: Sequence of similarity scores. Plain numbers and
            single-element arrays (such as the ones ``cosine_similarity``
            returns for one pair) are both accepted.

    Returns:
        The entropy of the rescaled scores, in base ``number of scores``.
        Clouds with fewer than two scores return ``0.0``: with a single
        score the base would be 1 (a division by ``log(1) == 0``), and with
        no scores it would be 0.
    """
    values = np.asarray(similarity_list, dtype=float).ravel()
    # Use the flattened size as the base so it always matches the number of
    # outcomes actually passed to entropy().
    if values.size < 2:
        return 0.0
    return entropy(values, base=values.size)

# let's see the entropy of the clouds for the words we've seen
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('trump', 'en')]))

0.9644964611972059


In [99]:
# let's see how entropic the clouds are
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('unless', 'en')]))

1.0


In [103]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('country', 'en')]))

0.9023970328731514


In [95]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('bully', 'en')]))

1.0


In [96]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('fear', 'en')]))

0.9970372532381339


In [97]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('syria', 'en')]))

0.9707977567330853


In [101]:
# Score a handful of words by the entropy of their neighbour cloud, as
# (entropy, word) pairs, so they can be ordered from most to least
# systematically translated (lower entropy first) afterwards.

words = ["trump", "environmental", "war", "bully", "syria", "unless"]

list_to_sort = [
    (entropy_of_the_cloud([pair[0] for pair in nearest_neighbours(word, 'en')]), word)
    for word in words
]

[(1.0, 'unless'),
 (1.0, 'bully'),
 (0.9981392069160074, 'environmental'),
 (0.9707977567330853, 'syria'),
 (0.9562897546505799, 'trump'),
 (0.93574299558834, 'war')]

In [102]:
# from most to least systematically translated
sorted(list_to_sort, reverse=False)

[(0.93574299558834, 'war'),
 (0.9562897546505799, 'trump'),
 (0.9707977567330853, 'syria'),
 (0.9981392069160074, 'environmental'),
 (1.0, 'bully'),
 (1.0, 'unless')]

In [None]:
# imperfect measure, we can improve it. But it's a start.