In [2]:
# uploading term2doc_nostopwords_en-ru.news+reviews.profs.id+en+ru
import json
filename = 'term2doc_nostopwords_en-fi.reviews.profs.id+en+fi.json'
with open(filename, 'rb') as f:
    matrix = json.load(f)

In [3]:
len(matrix)

5210

In [4]:
# the keys look like this:
keys = list(matrix.keys())
print(keys[:10])


['en_featur', 'fi_kitar', 'en_).', 'fi_helpot', 'en_found', 'en_strain', 'en_diet', 'fi_pelas', 'en_waist', 'fi_päivitys']


In [8]:
# import the cosine metric and the stemmers for english and the other language
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
stemmer_en = SnowballStemmer("english")
stemmer_other = SnowballStemmer("finnish") # stemmer for the "other language", here russian; finnish for finnish etc.



In [9]:
# let's not forget the prefix for the language!
# I will make a function that goes from word to key to go faster
def word2key(word, lang):
    return lang + '_' + stemmer_en.stem(word) if lang == 'en' else lang + '_' + stemmer_other.stem(word)

# let's try it out
print(word2key('dog', 'en'))

en_dog


In [11]:
# comparing two words' similarity

english_key = word2key('book', 'en')
finnish_key = word2key('книга', 'ru')
cosine_similarity([matrix[english_key]], [matrix[finnish_key]])


KeyError: 'ru_книга'

In [12]:
# finding the nearest neighbours
# what is the nearest neighbour to the word 'book' in the other language?

to_sort = []
for w in keys:
    if w != english_key:
        to_sort.append((cosine_similarity([matrix[english_key]], [matrix[w]]),w))



In [13]:
n=10
sorted(to_sort, reverse=True)[:n]

[(array([[0.83163204]]), 'fi_kirj'),
 (array([[0.31980107]]), 'en_cake'),
 (array([[0.25]]), 'en_page'),
 (array([[0.23862251]]), 'fi_.'),
 (array([[0.2362278]]), 'fi_sivu'),
 (array([[0.23482614]]), 'en_.'),
 (array([[0.22097087]]), 'fi_kaku'),
 (array([[0.21650635]]), 'fi_neuvo'),
 (array([[0.19869426]]), 'fi_,'),
 (array([[0.19764235]]), 'en_wed')]

In [14]:
# I will also make a function to go from a word to its 10 nearest neighbours
def nearest_neighbours(word, lang, n=10):
    key = word2key(word, lang)
    to_sort = []
    for w in keys:
        if w != key:
            to_sort.append((cosine_similarity([matrix[key]], [matrix[w]]),w))
    return sorted(to_sort, reverse=True)[:n]


In [11]:
# Exploring most frequent positive words in EN originals: good, great, well, favorite, nice, excellent, original, recommend, love

In [15]:
nearest_neighbours('good', 'en')

[(array([[0.48935018]]), 'fi_hyvä'),
 (array([[0.21081851]]), 'fi_vink'),
 (array([[0.21081851]]), 'en_butterfli'),
 (array([[0.20869968]]), 'en_realli'),
 (array([[0.18606758]]), 'fi_.'),
 (array([[0.18257419]]), 'en_idea'),
 (array([[0.18120657]]), 'en_.'),
 (array([[0.17233217]]), 'fi_,'),
 (array([[0.17213259]]), 'fi_vaihtoehto'),
 (array([[0.17213259]]), 'en_truli')]

In [16]:
nearest_neighbours('great', 'en')

[(array([[0.29711254]]), 'fi_hyvä'),
 (array([[0.2857738]]), 'fi_mahtav'),
 (array([[0.22860023]]), 'fi_toimi'),
 (array([[0.21602469]]), 'fi_loistav'),
 (array([[0.21320072]]), 'en_work'),
 (array([[0.21213203]]), 'fi_nope'),
 (array([[0.2]]), 'fi_yksityiskoht'),
 (array([[0.2]]), 'fi_tarjous'),
 (array([[0.2]]), 'fi_näppär'),
 (array([[0.2]]), 'en_n')]

In [17]:
nearest_neighbours('well', 'en')

[(array([[0.44644187]]), 'fi_hyv'),
 (array([[0.32163376]]), 'fi_kirjoitetu'),
 (array([[0.28074496]]), 'en_written'),
 (array([[0.27994626]]), 'en_work'),
 (array([[0.26261287]]), 'fi_kokoin'),
 (array([[0.23346307]]), 'fi_toimi'),
 (array([[0.19165499]]), 'en_veri'),
 (array([[0.18569534]]), 'fi_älyttöm'),
 (array([[0.18569534]]), 'fi_yrittäj'),
 (array([[0.18569534]]), 'fi_ymmärtäm')]

In [18]:
nearest_neighbours('favorite', 'en')

[(array([[0.53452248]]), 'fi_suosik'),
 (array([[0.37796447]]), 'fi_yllättyny'),
 (array([[0.37796447]]), 'fi_suosikkijouluelokuv'),
 (array([[0.37796447]]), 'fi_suosikkiharrastuks'),
 (array([[0.37796447]]), 'fi_onnellin'),
 (array([[0.37796447]]), 'fi_näytäi'),
 (array([[0.37796447]]), 'fi_mok'),
 (array([[0.37796447]]), 'fi_mauis'),
 (array([[0.37796447]]), 'fi_lempijouluohjelm'),
 (array([[0.37796447]]), 'fi_kakkuleipur')]

In [19]:
nearest_neighbours('nice', 'en')

[(array([[0.46291005]]), 'fi_kiva'),
 (array([[0.37796447]]), 'fi_mukav'),
 (array([[0.26726124]]), 'fi_ystäväl'),
 (array([[0.26726124]]), 'fi_ylimäär'),
 (array([[0.26726124]]), 'fi_yksilö'),
 (array([[0.26726124]]), 'fi_vuorovaikutu'),
 (array([[0.26726124]]), 'fi_virh'),
 (array([[0.26726124]]), 'fi_viest'),
 (array([[0.26726124]]), 'fi_vastau'),
 (array([[0.26726124]]), 'fi_vaikeud')]

In [20]:
nearest_neighbours('excellent', 'en')

[(array([[0.31622777]]), 'fi_toiminnallisuut'),
 (array([[0.31622777]]), 'fi_taust'),
 (array([[0.31622777]]), 'fi_talouseläm'),
 (array([[0.31622777]]), 'fi_payne'),
 (array([[0.31622777]]), 'fi_mieletön'),
 (array([[0.31622777]]), 'fi_messinkis'),
 (array([[0.31622777]]), 'fi_markkino'),
 (array([[0.31622777]]), 'fi_letkuliittim'),
 (array([[0.31622777]]), 'fi_kaksipäis'),
 (array([[0.31622777]]), 'fi_esitäe')]

In [21]:
nearest_neighbours('original', 'en')

[(array([[0.6289709]]), 'fi_alkuperäis'),
 (array([[0.49613894]]), 'fi_jakso'),
 (array([[0.48038446]]), 'fi_original'),
 (array([[0.48038446]]), 'fi_alkuperäin'),
 (array([[0.39223227]]), 'fi_cliffhanger'),
 (array([[0.39223227]]), 'en_theme'),
 (array([[0.39223227]]), 'en_shown'),
 (array([[0.32025631]]), 'fi_yhä'),
 (array([[0.30769231]]), 'en_show'),
 (array([[0.2773501]]), 'fi_ylistyslaulu')]

In [22]:
nearest_neighbours('recommend', 'en')

[(array([[0.68473679]]), 'fi_suosittel'),
 (array([[0.43412157]]), 'en_anyon'),
 (array([[0.39223227]]), 'fi_suosit'),
 (array([[0.34544086]]), 'en_would'),
 (array([[0.2941742]]), 'fi_lämpim'),
 (array([[0.2773501]]), 'fi_suosittelis'),
 (array([[0.2773501]]), 'fi_suositelt'),
 (array([[0.2773501]]), 'fi_kel'),
 (array([[0.2773501]]), 'en_7'),
 (array([[0.22645541]]), 'fi_jotk')]

In [23]:
nearest_neighbours('love', 'en')

[(array([[0.48809353]]), 'fi_rakast'),
 (array([[0.30316953]]), 'fi_tyk'),
 (array([[0.24253563]]), 'fi_kuor'),
 (array([[0.20180184]]), 'fi_täs'),
 (array([[0.19802951]]), 'fi_tosi'),
 (array([[0.19802951]]), 'fi_pidä'),
 (array([[0.19802951]]), 'fi_macely'),
 (array([[0.19802951]]), 'en_mace'),
 (array([[0.18190172]]), 'fi_!!'),
 (array([[0.17712298]]), 'en_case')]

In [None]:
# Exploring most frequent negative words in EN originals: fake, awful, bad, lame, mean, upset, weak, knockoff, mess, cramp, hate

In [24]:
nearest_neighbours('fake', 'en')

[(array([[0.65079137]]), 'en_mac'),
 (array([[0.61205637]]), 'fi_mac'),
 (array([[0.54232614]]), 'fi_väärennettyj'),
 (array([[0.48507125]]), 'fi_feik'),
 (array([[0.48507125]]), 'en_sale'),
 (array([[0.42008403]]), 'fi_väärennety'),
 (array([[0.42008403]]), 'en_dont'),
 (array([[0.36380344]]), 'en_makeup'),
 (array([[0.34299717]]), 'fi_väärennöks'),
 (array([[0.34299717]]), 'fi_tue')]

In [25]:
nearest_neighbours('awful', 'en')

[(array([[0.39036003]]), 'en_tast'),
 (array([[0.37796447]]), 'fi_yhty'),
 (array([[0.37796447]]), 'fi_syömäpuiko'),
 (array([[0.37796447]]), 'fi_puuaterim'),
 (array([[0.37796447]]), 'fi_luokaton'),
 (array([[0.37796447]]), 'fi_lopetuks'),
 (array([[0.37796447]]), 'fi_kestämätö'),
 (array([[0.37796447]]), 'fi_kauhe'),
 (array([[0.37796447]]), 'fi_jättäv'),
 (array([[0.37796447]]), 'fi_jälj')]

In [26]:
nearest_neighbours('lame', 'en')

[(array([[0.70710678]]), 'fi_ylistyslaulu'),
 (array([[0.70710678]]), 'fi_show'),
 (array([[0.70710678]]), 'fi_myön'),
 (array([[0.70710678]]), 'fi_lukulais'),
 (array([[0.70710678]]), 'fi_korrektiud'),
 (array([[0.70710678]]), 'fi_kerto'),
 (array([[0.70710678]]), 'fi_jimy'),
 (array([[0.70710678]]), 'fi_durant'),
 (array([[0.70710678]]), 'fi_alkumusiik'),
 (array([[0.70710678]]), 'fi_1990')]

In [27]:
nearest_neighbours('mean', 'en')

[(array([[0.57735027]]), 'fi_ään'),
 (array([[0.57735027]]), 'fi_tekem'),
 (array([[0.57735027]]), 'fi_suurin'),
 (array([[0.57735027]]), 'fi_ruudu'),
 (array([[0.57735027]]), 'fi_pitiv'),
 (array([[0.57735027]]), 'fi_kalo'),
 (array([[0.57735027]]), 'fi_??'),
 (array([[0.57735027]]), 'en_grandkid'),
 (array([[0.57735027]]), 'en_fish'),
 (array([[0.57735027]]), 'en_anyway')]

In [28]:
nearest_neighbours('upset', 'en')

[(array([[0.5]]), 'fi_viallis'),
 (array([[0.5]]), 'fi_valikoim'),
 (array([[0.5]]), 'fi_tuntikaup'),
 (array([[0.5]]), 'fi_suru'),
 (array([[0.5]]), 'fi_super'),
 (array([[0.5]]), 'fi_raivostuttav'),
 (array([[0.5]]), 'fi_menet'),
 (array([[0.5]]), 'fi_lata'),
 (array([[0.5]]), 'fi_lamp'),
 (array([[0.5]]), 'fi_kidnapat')]

In [29]:
nearest_neighbours('weak', 'en')

[(array([[0.70710678]]), 'fi_olemato'),
 (array([[0.70710678]]), 'fi_muuks'),
 (array([[0.70710678]]), 'fi_mauttom'),
 (array([[0.70710678]]), 'fi_laiha'),
 (array([[0.70710678]]), 'fi_kutsu'),
 (array([[0.70710678]]), 'fi_juoni'),
 (array([[0.70710678]]), 'fi_joks'),
 (array([[0.70710678]]), 'en_unremark'),
 (array([[0.5]]), 'fi_kerron'),
 (array([[0.40824829]]), 'fi_kahv')]

In [30]:
nearest_neighbours('knockoff', 'en')

[(array([[0.57735027]]), 'fi_roina'),
 (array([[0.57735027]]), 'fi_piraat'),
 (array([[0.57735027]]), 'fi_kysyt'),
 (array([[0.57735027]]), 'fi_huijau'),
 (array([[0.57735027]]), 'fi_annetu'),
 (array([[0.57735027]]), 'en_buyer'),
 (array([[0.40824829]]), 'fi_varok'),
 (array([[0.40824829]]), 'en_bewar'),
 (array([[0.33333333]]), 'en_ask'),
 (array([[0.28867513]]), 'fi_ainoa')]

In [31]:
nearest_neighbours('hate', 'en')

[(array([[0.5]]), 'fi_vihas'),
 (array([[0.5]]), 'fi_vihaam'),
 (array([[0.5]]), 'fi_sim'),
 (array([[0.5]]), 'fi_roolihahmoa'),
 (array([[0.5]]), 'fi_näytel'),
 (array([[0.5]]), 'fi_matematiik'),
 (array([[0.5]]), 'fi_inhoav'),
 (array([[0.5]]), 'fi_arvattav'),
 (array([[0.5]]), 'fi_alister'),
 (array([[0.5]]), 'fi_1951')]