In [52]:
# loading the term-document matrix term2doc_nostopwords_en-ru.news+reviews.studs.id+en+ru.json
import json
# Read the whole JSON file as bytes and decode it into `matrix`.
filename = 'term2doc_nostopwords_en-ru.news+reviews.studs.id+en+ru.json'
with open(filename, 'rb') as source_file:
    matrix = json.loads(source_file.read())

In [50]:
len(matrix)

9421

In [53]:
# Collect every key of the matrix once and peek at the first ten;
# the printed sample shows the '<language>_<stem>' naming scheme.
keys = list(matrix)
print(keys[0:10])


['ru_сын', 'en_whose', 'en_palu', 'en_glasgow', 'en_bloom', 'ru_обострен', 'ru_обеспеч', 'ru_плох', 'en_featur', 'en_goyo']


In [54]:
# Import the cosine-similarity metric and build one Snowball stemmer per language
# of the pair: `stemmer_en` for English, `stemmer_other` for the second language.
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
stemmer_en = SnowballStemmer("english")
stemmer_other = SnowballStemmer("russian") # second language of the pair: Russian here; use "finnish" for Finnish, etc.



In [55]:
# Matrix keys carry a language prefix, so a raw word has to be stemmed and
# prefixed before it can be looked up; this helper does both steps.
def word2key(word, lang):
    """Build the matrix key for `word`: language code, underscore, then the stem.

    The English stemmer is used when `lang` is 'en'; every other language
    code is stemmed with `stemmer_other`.
    """
    stemmer = stemmer_en if lang == 'en' else stemmer_other
    return lang + '_' + stemmer.stem(word)

# let's try it out
print(word2key('dog', 'en'))
print(word2key('собака', 'ru'))

en_dog
ru_собак


In [56]:
# Cosine similarity for a single English/Russian word pair.
# Both words are first turned into their prefixed, stemmed matrix keys.
english_key, russian_key = word2key('book', 'en'), word2key('книга', 'ru')

cosine_similarity([matrix[english_key]], [matrix[russian_key]])


array([[0.8912319]])

In [57]:
# Nearest neighbours of 'book': score every other key in the matrix
# (keys of both languages are candidates) against the English key.
# Each entry is a (similarity, key) pair so the list can be ranked afterwards.

to_sort = [
    (cosine_similarity([matrix[english_key]], [matrix[candidate]]), candidate)
    for candidate in keys
    if candidate != english_key
]



In [46]:
# Rank the scored candidates from most to least similar and show the first n.
n = 10
ranked = sorted(to_sort, reverse=True)
ranked[:n]

[(array([[0.8912319]]), 'ru_книг'),
 (array([[0.29740896]]), 'ru_торт'),
 (array([[0.29740896]]), 'en_cake'),
 (array([[0.24353971]]), 'ru_эт'),
 (array([[0.21029989]]), 'ru_страниц'),
 (array([[0.2079501]]), 'ru_ищет'),
 (array([[0.20134682]]), 'ru_спэн'),
 (array([[0.20134682]]), 'ru_купер'),
 (array([[0.20134682]]), 'en_style'),
 (array([[0.20134682]]), 'en_spann')]

In [58]:
# Helper that goes from a word to its n nearest neighbours (10 by default).
def nearest_neighbours(word, lang, n=10):
    """Return the `n` matrix keys whose vectors are most cosine-similar to `word`.

    Parameters:
        word: the surface word to look up; it is stemmed and prefixed via `word2key`.
        lang: language code of `word` ('en' or the code of the other language).
        n:    how many neighbours to return (default 10).

    Returns:
        A list of at most `n` tuples `(similarity, key)`, best match first.
        `similarity` is the value returned by `cosine_similarity` for the
        pair, unchanged. Candidates are all entries of `keys` except the
        word's own key, so neighbours from both languages can appear.

    Raises:
        KeyError: if the stemmed, prefixed key of `word` is not in `matrix`.
    """
    key = word2key(word, lang)
    if key not in matrix:
        # Fail up front with a message naming both the word and its stem,
        # instead of a bare KeyError from inside the scoring loop.
        raise KeyError(
            "{!r} (key for {!r}) is not in the term-document matrix".format(key, word)
        )

    # Loop-invariant: wrap the query vector once rather than on every iteration.
    query = [matrix[key]]
    scored = [
        (cosine_similarity(query, [matrix[candidate]]), candidate)
        for candidate in keys
        if candidate != key
    ]
    # Tuples compare by similarity first and by key second, so equal
    # similarities come out in reverse key order.
    return sorted(scored, reverse=True)[:n]


In [11]:
# Exploring most frequent positive words in EN originals: good, great, well, favorite, nice, excellent, original, recommend, love

In [51]:
nearest_neighbours('good', 'en')

[(array([[0.52610428]]), 'ru_хорош'),
 (array([[0.21821789]]), 'ru_добр'),
 (array([[0.19200614]]), 'en_realli'),
 (array([[0.18898224]]), 'ru_тариф'),
 (array([[0.18898224]]), 'ru_бабочк'),
 (array([[0.18898224]]), 'en_butterfli'),
 (array([[0.17928429]]), 'ru_представлен'),
 (array([[0.16903085]]), 'en_idea'),
 (array([[0.15699646]]), 'en_.'),
 (array([[0.15675134]]), 'ru_.')]

In [59]:
nearest_neighbours('great', 'en')

[(array([[0.54207975]]), 'ru_отличн'),
 (array([[0.24571805]]), 'ru_доставк'),
 (array([[0.19425717]]), 'ru_повсюд'),
 (array([[0.19425717]]), 'ru_вещ'),
 (array([[0.19425717]]), 'ru_алфавит'),
 (array([[0.19425717]]), 'en_candid'),
 (array([[0.19425717]]), 'en_alphabet'),
 (array([[0.18193851]]), 'en_work'),
 (array([[0.16483268]]), 'ru_цен'),
 (array([[0.16381203]]), 'ru_работа')]

In [60]:
nearest_neighbours('well', 'en')

[(array([[0.29633363]]), 'ru_хорош'),
 (array([[0.25]]), 'ru_поп'),
 (array([[0.24283093]]), 'en_work'),
 (array([[0.23570226]]), 'ru_элемент'),
 (array([[0.23570226]]), 'ru_предубежден'),
 (array([[0.21081851]]), 'en_written'),
 (array([[0.19245009]]), 'ru_обеспечива'),
 (array([[0.19245009]]), 'ru_жесток'),
 (array([[0.18898224]]), 'ru_потряса'),
 (array([[0.1767767]]), 'ru_удобн')]

In [61]:
nearest_neighbours('favorite', 'en')

[(array([[0.53452248]]), 'ru_праздничн'),
 (array([[0.50709255]]), 'ru_любим'),
 (array([[0.37796447]]), 'ru_фавор'),
 (array([[0.37796447]]), 'ru_поб'),
 (array([[0.37796447]]), 'ru_нелюбим'),
 (array([[0.37796447]]), 'ru_мокк'),
 (array([[0.37796447]]), 'en_pasttim'),
 (array([[0.37796447]]), 'en_mocha'),
 (array([[0.37796447]]), 'en_holiday'),
 (array([[0.37796447]]), 'en_ecstat')]

In [62]:
nearest_neighbours('nice', 'en')

[(array([[0.26726124]]), 'ru_эндрюс'),
 (array([[0.26726124]]), 'ru_шнур'),
 (array([[0.26726124]]), 'ru_шла'),
 (array([[0.26726124]]), 'ru_шикарн'),
 (array([[0.26726124]]), 'ru_прилож'),
 (array([[0.26726124]]), 'ru_плюшек'),
 (array([[0.26726124]]), 'ru_переключа'),
 (array([[0.26726124]]), 'ru_ощуп'),
 (array([[0.26726124]]), 'ru_открытк'),
 (array([[0.26726124]]), 'ru_объектив')]

In [63]:
nearest_neighbours('excellent', 'en')

[(array([[0.34426519]]), 'ru_отличн'),
 (array([[0.31622777]]), 'ru_эбенизер'),
 (array([[0.31622777]]), 'ru_распределител'),
 (array([[0.31622777]]), 'ru_пейн'),
 (array([[0.31622777]]), 'ru_нискольк'),
 (array([[0.31622777]]), 'ru_латун'),
 (array([[0.31622777]]), 'ru_двухканальн'),
 (array([[0.31622777]]), 'en_prompt'),
 (array([[0.31622777]]), 'en_payn'),
 (array([[0.31622777]]), 'en_functiona')]

In [64]:
nearest_neighbours('original', 'en')

[(array([[0.65319726]]), 'ru_оригинальн'),
 (array([[0.36514837]]), 'ru_оригина'),
 (array([[0.36514837]]), 'ru_клиффхэнгер'),
 (array([[0.36514837]]), 'en_theme'),
 (array([[0.36514837]]), 'en_shown'),
 (array([[0.2981424]]), 'ru_сериа'),
 (array([[0.2981424]]), 'ru_cd'),
 (array([[0.25819889]]), 'ru_ярк'),
 (array([[0.25819889]]), 'ru_цепочк'),
 (array([[0.25819889]]), 'ru_храбр')]

In [65]:
nearest_neighbours('recommend', 'en')

[(array([[0.63737744]]), 'ru_рекоменд'),
 (array([[0.43852901]]), 'ru_порекомендова'),
 (array([[0.38074981]]), 'en_anyon'),
 (array([[0.2773501]]), 'ru_ребят'),
 (array([[0.26355339]]), 'en_would'),
 (array([[0.23782575]]), 'ru_всем'),
 (array([[0.20965697]]), 'ru_попробова'),
 (array([[0.19611614]]), 'ru_чудес'),
 (array([[0.19611614]]), 'ru_чолул'),
 (array([[0.19611614]]), 'ru_фронтер')]

In [66]:
nearest_neighbours('love', 'en')

[(array([[0.42059978]]), 'ru_нрав'),
 (array([[0.37715714]]), 'ru_понрав'),
 (array([[0.24659848]]), 'ru_обожа'),
 (array([[0.24659848]]), 'ru_любл'),
 (array([[0.23249528]]), 'ru_вся'),
 (array([[0.1898316]]), 'ru_линейк'),
 (array([[0.1898316]]), 'ru_иде'),
 (array([[0.1898316]]), 'ru_macely'),
 (array([[0.1898316]]), 'en_mace'),
 (array([[0.18652107]]), 'ru_очен')]

In [None]:
# Exploring most frequent negative words in EN originals: fake, awful, bad, lame, mean, upset, weak, knockoff, mess, cramp, hate

In [68]:
nearest_neighbours('fake', 'en')

[(array([[0.82717019]]), 'ru_поддельн'),
 (array([[0.61558701]]), 'ru_mac'),
 (array([[0.61558701]]), 'en_mac'),
 (array([[0.39735971]]), 'en_dont'),
 (array([[0.3441236]]), 'en_makeup'),
 (array([[0.32444284]]), 'ru_фейков'),
 (array([[0.32444284]]), 'ru_рассыпчат'),
 (array([[0.32444284]]), 'ru_выяв'),
 (array([[0.32444284]]), 'ru_instagram'),
 (array([[0.32444284]]), 'ru_20000')]

In [69]:
nearest_neighbours('awful', 'en')

[(array([[0.57142857]]), 'ru_отвратительн'),
 (array([[0.39036003]]), 'en_tast'),
 (array([[0.37796447]]), 'ru_предназнач'),
 (array([[0.37796447]]), 'ru_деревя'),
 (array([[0.37796447]]), 'en_wooden'),
 (array([[0.37796447]]), 'en_utensil'),
 (array([[0.37796447]]), 'en_gray'),
 (array([[0.37796447]]), 'en_episod'),
 (array([[0.37796447]]), 'en_clifhang'),
 (array([[0.37796447]]), 'en_chop')]

In [70]:
nearest_neighbours('lame', 'en')

[(array([[0.70710678]]), 'ru_смахива'),
 (array([[0.70710678]]), 'ru_рассказчик'),
 (array([[0.70710678]]), 'ru_ерес'),
 (array([[0.70710678]]), 'ru_дюрант'),
 (array([[0.70710678]]), 'ru_джимм'),
 (array([[0.70710678]]), 'ru_бредятин'),
 (array([[0.70710678]]), 'ru_90х'),
 (array([[0.70710678]]), 'en_paean'),
 (array([[0.70710678]]), 'en_jimmi'),
 (array([[0.70710678]]), 'en_durant')]

In [71]:
nearest_neighbours('mean', 'en')

[(array([[0.52223297]]), 'ru_подлост'),
 (array([[0.42640143]]), 'ru_гнев'),
 (array([[0.42640143]]), 'en_nasti'),
 (array([[0.42640143]]), 'en_anger'),
 (array([[0.34815531]]), 'ru_страда'),
 (array([[0.34815531]]), 'ru_знач'),
 (array([[0.30151134]]), 'ru_цзиньпин'),
 (array([[0.30151134]]), 'ru_требован'),
 (array([[0.30151134]]), 'ru_си'),
 (array([[0.30151134]]), 'ru_решим')]

In [72]:
nearest_neighbours('upset', 'en')

[(array([[0.57735027]]), 'ru_расстро'),
 (array([[0.5]]), 'ru_похищен'),
 (array([[0.5]]), 'ru_побольш'),
 (array([[0.5]]), 'ru_овц'),
 (array([[0.5]]), 'ru_дефектн'),
 (array([[0.5]]), 'ru_баб'),
 (array([[0.5]]), 'en_super'),
 (array([[0.5]]), 'en_sheep'),
 (array([[0.5]]), 'en_lose'),
 (array([[0.5]]), 'en_kidnap')]

In [73]:
nearest_neighbours('weak', 'en')

[(array([[0.81649658]]), 'ru_слаб'),
 (array([[0.70710678]]), 'ru_примечательн'),
 (array([[0.70710678]]), 'ru_приемлем'),
 (array([[0.70710678]]), 'ru_ароматн'),
 (array([[0.70710678]]), 'en_unremark'),
 (array([[0.5]]), 'ru_повествован'),
 (array([[0.40824829]]), 'ru_сюжет'),
 (array([[0.40824829]]), 'ru_откровен'),
 (array([[0.40824829]]), 'en_ok'),
 (array([[0.35355339]]), 'en_narrat')]

In [36]:
nearest_neighbours('knockoff', 'en')

[(array([[0.57735027]]), 'ru_бдительн'),
 (array([[0.57735027]]), 'en_buyer'),
 (array([[0.46291005]]), 'ru_подделк'),
 (array([[0.40824829]]), 'ru_покупател'),
 (array([[0.40824829]]), 'ru_будьт'),
 (array([[0.40824829]]), 'en_bewar'),
 (array([[0.33333333]]), 'ru_отз'),
 (array([[0.21821789]]), 'ru_прочита'),
 (array([[0.20412415]]), 'ru_спрос'),
 (array([[0.19245009]]), 'en_left')]

In [38]:
nearest_neighbours('hate', 'en')

[(array([[0.5]]), 'ru_сим'),
 (array([[0.5]]), 'ru_предсказуем'),
 (array([[0.5]]), 'ru_постановк'),
 (array([[0.5]]), 'ru_ненавидет'),
 (array([[0.5]]), 'ru_алистер'),
 (array([[0.5]]), 'ru_1951'),
 (array([[0.5]]), 'en_predict'),
 (array([[0.5]]), 'en_alist'),
 (array([[0.5]]), 'en_1951'),
 (array([[0.35355339]]), 'ru_математик')]