In [37]:
# Load the JSON file below into `matrix` (a term-to-document matrix, judging by
# the "term2doc" file name -- TODO confirm against whatever produced the file).
import json
filename = 'term2doc_nostopwords_en-ru.news+reviews.profs.id+en+ru.json'
# json.load accepts a binary file handle, so the file is opened in 'rb' mode
# and no explicit text decoding is done here.
with open(filename, 'rb') as f:
    matrix = json.load(f)

In [38]:
len(matrix)

9414

In [39]:
# Collect every key of `matrix` into a list and preview the first ten.
# As the printed sample shows, a key is a language prefix ('en' or 'ru'),
# an underscore, and a stemmed token.
keys = list(matrix.keys())
print(keys[:10])


['ru_сын', 'en_whose', 'en_palu', 'en_glasgow', 'en_bloom', 'ru_обострен', 'ru_обеспеч', 'ru_плох', 'en_featur', 'en_goyo']


In [40]:
# import the cosine metric and the stemmers for english and the other language
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
stemmer_en = SnowballStemmer("english")
stemmer_other = SnowballStemmer("russian") # stemmer for the "other language", here russian; finnish for finnish etc.



In [41]:
# let's not forget the prefix for the language!
# I will make a function that goes from word to key to go faster
def word2key(word, lang):
    """Build the matrix key for `word`: language prefix, underscore, stem.

    Words with lang == 'en' are stemmed with `stemmer_en`; any other
    language code is stemmed with `stemmer_other`. The prefix is always
    the `lang` string that was passed in.
    """
    if lang == 'en':
        stem = stemmer_en.stem(word)
    else:
        stem = stemmer_other.stem(word)
    return lang + '_' + stem

# let's try it out
print(word2key('dog', 'en'))
print(word2key('собака', 'ru'))

en_dog
ru_собак


In [42]:
# Compare two words: cosine similarity between the vectors stored in `matrix`
# for the stemmed English key and the stemmed Russian key.

english_key = word2key('book', 'en')
russian_key = word2key('книга', 'ru')
# Each vector is wrapped in a list because cosine_similarity takes 2-D input;
# the result is therefore a 1x1 array rather than a plain float.
cosine_similarity([matrix[english_key]], [matrix[russian_key]])


array([[0.83487293]])

In [43]:
# Find the nearest neighbours of the word 'book' (`english_key`).
# NOTE: the loop scores every key in `keys` except the query itself, so the
# candidates include English keys as well as Russian ones -- there is no
# filter on the language prefix.

to_sort = []
for w in keys:
    if w != english_key:
        # Store (similarity, key) pairs; the similarity is a 1x1 array.
        to_sort.append((cosine_similarity([matrix[english_key]], [matrix[w]]),w))



In [8]:
# Show the n highest-scoring (similarity, key) pairs. Sorting the tuples in
# reverse puts the largest similarity first; equal similarities are ordered by
# key, also descending.
n=10
sorted(to_sort, reverse=True)[:n]

[(array([[0.83487293]]), 'ru_книг'),
 (array([[0.33833032]]), 'ru_торт'),
 (array([[0.29740896]]), 'en_cake'),
 (array([[0.2409612]]), 'ru_эт'),
 (array([[0.20134682]]), 'ru_спан'),
 (array([[0.20134682]]), 'ru_купер'),
 (array([[0.20134682]]), 'en_style'),
 (array([[0.20134682]]), 'en_spann'),
 (array([[0.20134682]]), 'en_cooper'),
 (array([[0.20024663]]), 'ru_.')]

In [44]:
# I will also make a function to go from a word to its 10 nearest neighbours
def nearest_neighbours(word, lang, n=10):
    """Return the `n` keys most cosine-similar to `word`, best match first.

    The word is converted to its matrix key with `word2key`, then scored
    against every other key in `keys` (both languages; only the query key
    itself is skipped).

    Returns a list of (similarity, key) tuples, where similarity is the
    1x1 array produced by `cosine_similarity`. The list is sorted in
    descending order, so ties on similarity fall back to descending key.
    """
    key = word2key(word, lang)
    scored = [
        (cosine_similarity([matrix[key]], [matrix[candidate]]), candidate)
        for candidate in keys
        if candidate != key
    ]
    scored.sort(reverse=True)
    return scored[:n]


In [11]:
# Exploring most frequent positive words in EN originals: good, great, well, favorite, nice, excellent, original, recommend, love

In [30]:
nearest_neighbours('good', 'en')

[(array([[0.51328943]]), 'ru_хорош'),
 (array([[0.19200614]]), 'en_realli'),
 (array([[0.18898224]]), 'ru_спокойн'),
 (array([[0.18898224]]), 'ru_добр'),
 (array([[0.18898224]]), 'ru_бабочк'),
 (array([[0.18898224]]), 'en_butterfli'),
 (array([[0.17078251]]), 'ru_товар'),
 (array([[0.16903085]]), 'en_idea'),
 (array([[0.16366342]]), 'ru_заказ'),
 (array([[0.15699646]]), 'en_.')]

In [29]:
nearest_neighbours('great', 'en')

[(array([[0.56468392]]), 'ru_отличн'),
 (array([[0.20766965]]), 'ru_классн'),
 (array([[0.19425717]]), 'ru_повсюд'),
 (array([[0.19425717]]), 'ru_доставк'),
 (array([[0.19425717]]), 'ru_бор'),
 (array([[0.19425717]]), 'en_candid'),
 (array([[0.19425717]]), 'en_alphabet'),
 (array([[0.18193851]]), 'en_work'),
 (array([[0.1737489]]), 'ru_работа'),
 (array([[0.1737489]]), 'ru_замечательн')]

In [14]:
nearest_neighbours('well', 'en')

[(array([[0.27741324]]), 'ru_хорош'),
 (array([[0.24283093]]), 'en_work'),
 (array([[0.23570226]]), 'ru_поп'),
 (array([[0.23570226]]), 'ru_отправля'),
 (array([[0.23570226]]), 'ru_основа'),
 (array([[0.21081851]]), 'en_written'),
 (array([[0.20412415]]), 'ru_удобн'),
 (array([[0.20412415]]), 'ru_размер'),
 (array([[0.16666667]]), 'ru_явл'),
 (array([[0.16666667]]), 'ru_хвалебн')]

In [20]:
nearest_neighbours('favorite', 'en')

[(array([[0.6172134]]), 'ru_любим'),
 (array([[0.37796447]]), 'ru_хобб'),
 (array([[0.37796447]]), 'ru_фавор'),
 (array([[0.37796447]]), 'ru_праздничн'),
 (array([[0.37796447]]), 'ru_пораз'),
 (array([[0.37796447]]), 'ru_нелюбим'),
 (array([[0.37796447]]), 'ru_мысл'),
 (array([[0.37796447]]), 'ru_мокк'),
 (array([[0.37796447]]), 'ru_кондитер'),
 (array([[0.37796447]]), 'ru_захват')]

In [21]:
nearest_neighbours('nice', 'en')

[(array([[0.32732684]]), 'ru_приятн'),
 (array([[0.26726124]]), 'ru_щенок'),
 (array([[0.26726124]]), 'ru_шлют'),
 (array([[0.26726124]]), 'ru_радова'),
 (array([[0.26726124]]), 'ru_поставля'),
 (array([[0.26726124]]), 'ru_накладн'),
 (array([[0.26726124]]), 'ru_линз'),
 (array([[0.26726124]]), 'ru_косточк'),
 (array([[0.26726124]]), 'ru_календар'),
 (array([[0.26726124]]), 'ru_исправ')]

In [22]:
nearest_neighbours('excellent', 'en')

[(array([[0.31622777]]), 'ru_эбенизер'),
 (array([[0.31622777]]), 'ru_шлангов'),
 (array([[0.31622777]]), 'ru_ходов'),
 (array([[0.31622777]]), 'ru_функционаа'),
 (array([[0.31622777]]), 'ru_финанс'),
 (array([[0.31622777]]), 'ru_супер'),
 (array([[0.31622777]]), 'ru_сосотоян'),
 (array([[0.31622777]]), 'ru_соединител'),
 (array([[0.31622777]]), 'ru_рынок'),
 (array([[0.31622777]]), 'ru_пэйн')]

In [24]:
nearest_neighbours('original', 'en')

[(array([[0.63900965]]), 'ru_оригинальн'),
 (array([[0.4472136]]), 'ru_оригина'),
 (array([[0.36514837]]), 'ru_ауд'),
 (array([[0.36514837]]), 'en_theme'),
 (array([[0.36514837]]), 'en_shown'),
 (array([[0.2981424]]), 'ru_увлекательн'),
 (array([[0.2981424]]), 'ru_налич'),
 (array([[0.27386128]]), 'ru_cd'),
 (array([[0.25819889]]), 'ru_цепочек'),
 (array([[0.25819889]]), 'ru_храбр')]

In [25]:
nearest_neighbours('recommend', 'en')

[(array([[0.52297636]]), 'ru_рекоменд'),
 (array([[0.40032038]]), 'ru_порекомендова'),
 (array([[0.38074981]]), 'en_anyon'),
 (array([[0.35082321]]), 'ru_рекомендова'),
 (array([[0.2773501]]), 'ru_советова'),
 (array([[0.26355339]]), 'en_would'),
 (array([[0.22645541]]), 'ru_попробова'),
 (array([[0.20965697]]), 'ru_всем'),
 (array([[0.19611614]]), 'ru_щенок'),
 (array([[0.19611614]]), 'ru_шоколадк')]

In [26]:
nearest_neighbours('love', 'en')

[(array([[0.34874292]]), 'ru_понрав'),
 (array([[0.33557803]]), 'ru_обожа'),
 (array([[0.3099937]]), 'ru_нрав'),
 (array([[0.23249528]]), 'ru_сперв'),
 (array([[0.23249528]]), 'ru_вся'),
 (array([[0.1898316]]), 'ru_любл'),
 (array([[0.1898316]]), 'ru_линейк'),
 (array([[0.1898316]]), 'ru_macely'),
 (array([[0.1898316]]), 'en_mace'),
 (array([[0.16439899]]), 'ru_щенок')]

In [None]:
# Exploring most frequent negative words in EN originals: fake, awful, bad, lame, mean, upset, weak, knockoff, mess, cramp, hate

In [27]:
nearest_neighbours('fake', 'en')

[(array([[0.61558701]]), 'ru_mac'),
 (array([[0.61558701]]), 'en_mac'),
 (array([[0.61313934]]), 'ru_подделк'),
 (array([[0.61177529]]), 'ru_поддельн'),
 (array([[0.39735971]]), 'en_dont'),
 (array([[0.3441236]]), 'ru_прода'),
 (array([[0.3441236]]), 'en_makeup'),
 (array([[0.32444284]]), 'ru_фейков'),
 (array([[0.32444284]]), 'ru_пигмент'),
 (array([[0.32444284]]), 'ru_опрос')]

In [29]:
nearest_neighbours('awful', 'en')

[(array([[0.66143783]]), 'ru_ужасн'),
 (array([[0.39036003]]), 'en_tast'),
 (array([[0.37796447]]), 'ru_скрип'),
 (array([[0.37796447]]), 'ru_невыносим'),
 (array([[0.37796447]]), 'ru_деревя'),
 (array([[0.37796447]]), 'en_wooden'),
 (array([[0.37796447]]), 'en_utensil'),
 (array([[0.37796447]]), 'en_gray'),
 (array([[0.37796447]]), 'en_episod'),
 (array([[0.37796447]]), 'en_clifhang')]

In [32]:
nearest_neighbours('lame', 'en')

[(array([[0.70710678]]), 'ru_хвалебн'),
 (array([[0.70710678]]), 'ru_убог'),
 (array([[0.70710678]]), 'ru_туп'),
 (array([[0.70710678]]), 'ru_пк'),
 (array([[0.70710678]]), 'ru_од'),
 (array([[0.70710678]]), 'ru_дюрант'),
 (array([[0.70710678]]), 'en_paean'),
 (array([[0.70710678]]), 'en_jimmi'),
 (array([[0.70710678]]), 'en_durant'),
 (array([[0.5]]), 'ru_стол')]

In [33]:
nearest_neighbours('mean', 'en')

[(array([[0.45226702]]), 'ru_знач'),
 (array([[0.42640143]]), 'ru_подлост'),
 (array([[0.42640143]]), 'ru_гнев'),
 (array([[0.42640143]]), 'en_nasti'),
 (array([[0.42640143]]), 'en_anger'),
 (array([[0.30151134]]), 'ru_уступк'),
 (array([[0.30151134]]), 'ru_трам'),
 (array([[0.30151134]]), 'ru_радиоведущ'),
 (array([[0.30151134]]), 'ru_означа'),
 (array([[0.30151134]]), 'ru_обсужд')]

In [34]:
nearest_neighbours('upset', 'en')

[(array([[0.5]]), 'ru_расстраива'),
 (array([[0.5]]), 'ru_похищен'),
 (array([[0.5]]), 'ru_овечк'),
 (array([[0.5]]), 'ru_недорог'),
 (array([[0.5]]), 'ru_бааба'),
 (array([[0.5]]), 'en_super'),
 (array([[0.5]]), 'en_sheep'),
 (array([[0.5]]), 'en_lose'),
 (array([[0.5]]), 'en_kidnap'),
 (array([[0.5]]), 'en_baabaa')]

In [35]:
nearest_neighbours('weak', 'en')

[(array([[0.70710678]]), 'ru_слабеньк'),
 (array([[0.70710678]]), 'ru_пресн'),
 (array([[0.70710678]]), 'ru_непримечательн'),
 (array([[0.70710678]]), 'ru_блестящ'),
 (array([[0.70710678]]), 'ru_ароматн'),
 (array([[0.70710678]]), 'en_unremark'),
 (array([[0.40824829]]), 'ru_повествован'),
 (array([[0.40824829]]), 'en_ok'),
 (array([[0.35355339]]), 'ru_напиток'),
 (array([[0.35355339]]), 'en_narrat')]

In [36]:
nearest_neighbours('knockoff', 'en')

[(array([[0.57735027]]), 'ru_бдительн'),
 (array([[0.57735027]]), 'en_buyer'),
 (array([[0.46291005]]), 'ru_подделк'),
 (array([[0.40824829]]), 'ru_покупател'),
 (array([[0.40824829]]), 'ru_будьт'),
 (array([[0.40824829]]), 'en_bewar'),
 (array([[0.33333333]]), 'ru_отз'),
 (array([[0.21821789]]), 'ru_прочита'),
 (array([[0.20412415]]), 'ru_спрос'),
 (array([[0.19245009]]), 'en_left')]

In [38]:
nearest_neighbours('hate', 'en')

[(array([[0.5]]), 'ru_сим'),
 (array([[0.5]]), 'ru_предсказуем'),
 (array([[0.5]]), 'ru_постановк'),
 (array([[0.5]]), 'ru_ненавидет'),
 (array([[0.5]]), 'ru_алистер'),
 (array([[0.5]]), 'ru_1951'),
 (array([[0.5]]), 'en_predict'),
 (array([[0.5]]), 'en_alist'),
 (array([[0.5]]), 'en_1951'),
 (array([[0.35355339]]), 'ru_математик')]

In [53]:
import numpy as np
from scipy.stats import entropy

def entropy_of_the_cloud(similarity_list):
    """Return the normalised Shannon entropy of a cloud of similarity scores.

    The scores are flattened to a 1-D array and passed to
    `scipy.stats.entropy`, which rescales them to sum to 1 before computing
    the entropy. The logarithm base is the number of entries in
    `similarity_list`, so a perfectly flat cloud (all scores equal) gives
    1.0 and a cloud dominated by one score approaches 0.0.

    Parameters
    ----------
    similarity_list : sequence
        Similarity scores, either plain numbers or size-1 arrays such as the
        1x1 arrays returned by `cosine_similarity`. The base equals the
        number of flattened values only when every entry holds one score.

    Returns
    -------
    float
        The normalised entropy. Clouds with fewer than two scores return
        0.0: the base would be 0 or 1, whose logarithm makes the
        normalisation undefined (previously nan plus a RuntimeWarning for a
        single score).
    """
    cloud_size = len(similarity_list)
    if cloud_size < 2:
        # log(1) == 0 and log(0) == -inf: no meaningful normalisation exists,
        # and a cloud of at most one point carries no uncertainty anyway.
        return 0.0
    weights = np.asarray(similarity_list, dtype=float).ravel()
    return entropy(weights, base=cloud_size)

# let's see the entropy of the clouds for the words we've seen
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('good', 'en')]))


0.962835767063005


In [54]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('great', 'en')]))

0.9601465064422539


In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('well', 'en')]))

In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('favorite', 'en')]))

In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('nice', 'en')]))

In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('excellent', 'en')]))

In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('original', 'en')]))

In [None]:
print(entropy_of_the_cloud([x[0] for x in nearest_neighbours('recommend', 'en')]))