In [1]:
# uploading term2doc_nostopwords_en-ru.news+reviews.profs.id+en+ru
import json
filename = 'term2doc_nostopwords_en-fi.reviews.studs.id+en+fi.json'
with open(filename, 'rb') as f:
    matrix = json.load(f)

In [2]:
len(matrix)

5249

In [3]:
# the keys look like this:
keys = list(matrix.keys())
print(keys[:10])


['en_featur', 'fi_kitar', 'en_).', 'fi_helpot', 'en_found', 'fi_välin', 'en_strain', 'en_diet', 'fi_pelas', 'fi_totuus']


In [4]:
# import the cosine metric and the stemmers for english and the other language
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
stemmer_en = SnowballStemmer("english")
stemmer_other = SnowballStemmer("finnish") # stemmer for the "other language", here russian; finnish for finnish etc.



In [5]:
# let's not forget the prefix for the language!
# I will make a function that goes from word to key to go faster
def word2key(word, lang):
    return lang + '_' + stemmer_en.stem(word) if lang == 'en' else lang + '_' + stemmer_other.stem(word)

# let's try it out
print(word2key('dog', 'en'))

en_dog


In [7]:
# comparing two words' similarity

english_key = word2key('book', 'en')
finnish_key = word2key('книга', 'ru')
cosine_similarity([matrix[english_key]], [matrix[finnish_key]])


KeyError: 'ru_книга'

In [8]:
# finding the nearest neighbours
# what is the nearest neighbour to the word 'book' in the other language?

to_sort = []
for w in keys:
    if w != english_key:
        to_sort.append((cosine_similarity([matrix[english_key]], [matrix[w]]),w))



In [9]:
n=10
sorted(to_sort, reverse=True)[:n]

[(array([[0.90099965]]), 'fi_kirj'),
 (array([[0.31980107]]), 'en_cake'),
 (array([[0.25]]), 'fi_sivu'),
 (array([[0.25]]), 'en_page'),
 (array([[0.24207223]]), 'fi_.'),
 (array([[0.23482614]]), 'en_.'),
 (array([[0.20833333]]), 'fi_kaku'),
 (array([[0.20578269]]), 'fi_,'),
 (array([[0.19764235]]), 'en_wed'),
 (array([[0.1821232]]), 'en_use')]

In [10]:
# I will also make a function to go from a word to its 10 nearest neighbours
def nearest_neighbours(word, lang, n=10):
    key = word2key(word, lang)
    to_sort = []
    for w in keys:
        if w != key:
            to_sort.append((cosine_similarity([matrix[key]], [matrix[w]]),w))
    return sorted(to_sort, reverse=True)[:n]


In [11]:
# Exploring most frequent positive words in EN originals: good, great, well, favorite, nice, excellent, original, recommend, love

In [11]:
nearest_neighbours('good', 'en')

[(array([[0.56343617]]), 'fi_hyvä'),
 (array([[0.21081851]]), 'fi_yötä'),
 (array([[0.21081851]]), 'fi_vuodenaiko'),
 (array([[0.21081851]]), 'fi_un'),
 (array([[0.21081851]]), 'fi_olem'),
 (array([[0.21081851]]), 'en_butterfli'),
 (array([[0.21035158]]), 'fi_,'),
 (array([[0.20869968]]), 'en_realli'),
 (array([[0.18519606]]), 'fi_.'),
 (array([[0.18257419]]), 'en_idea')]

In [28]:
nearest_neighbours('great', 'en')

[(array([[0.27439774]]), 'fi_mahtav'),
 (array([[0.27216553]]), 'fi_toimi'),
 (array([[0.26832816]]), 'fi_loistav'),
 (array([[0.25584086]]), 'fi_erinomais'),
 (array([[0.24944383]]), 'fi_hyvä'),
 (array([[0.21320072]]), 'en_work'),
 (array([[0.2]]), 'fi_laatiko'),
 (array([[0.2]]), 'en_n'),
 (array([[0.2]]), 'en_alphabet'),
 (array([[0.16329932]]), 'fi_vihr')]

In [29]:
nearest_neighbours('well', 'en')

[(array([[0.48141065]]), 'fi_hyv'),
 (array([[0.33218192]]), 'fi_kirjoitetu'),
 (array([[0.28589668]]), 'fi_toimi'),
 (array([[0.28074496]]), 'en_written'),
 (array([[0.27994626]]), 'en_work'),
 (array([[0.26261287]]), 'fi_noh'),
 (array([[0.26261287]]), 'fi_kokoin'),
 (array([[0.19165499]]), 'en_veri'),
 (array([[0.18569534]]), 'fi_”...'),
 (array([[0.18569534]]), 'fi_yrittäj')]

In [30]:
nearest_neighbours('favorite', 'en')

[(array([[0.43643578]]), 'fi_lemppar'),
 (array([[0.37796447]]), 'fi_taita'),
 (array([[0.37796447]]), 'fi_näytäi'),
 (array([[0.37796447]]), 'fi_mok'),
 (array([[0.37796447]]), 'fi_lomaelokuv'),
 (array([[0.37796447]]), 'fi_lempparilef'),
 (array([[0.37796447]]), 'fi_lempikark'),
 (array([[0.37796447]]), 'fi_lempijouluelokuv'),
 (array([[0.37796447]]), 'fi_kakunsuunnittelj'),
 (array([[0.37796447]]), 'fi_juhlapyh')]

In [31]:
nearest_neighbours('nice', 'en')

[(array([[0.47809144]]), 'fi_kiva'),
 (array([[0.46291005]]), 'fi_mukav'),
 (array([[0.37796447]]), 'fi_viest'),
 (array([[0.3086067]]), 'fi_yrityks'),
 (array([[0.3086067]]), 'fi_henkilö'),
 (array([[0.26726124]]), 'fi_ystävällis'),
 (array([[0.26726124]]), 'fi_yritys'),
 (array([[0.26726124]]), 'fi_väitet'),
 (array([[0.26726124]]), 'fi_vuorovaikutu'),
 (array([[0.26726124]]), 'fi_veloituks')]

In [32]:
nearest_neighbours('excellent', 'en')

[(array([[0.42163702]]), 'fi_erinomain'),
 (array([[0.31622777]]), 'fi_toiminnallisuus'),
 (array([[0.31622777]]), 'fi_payne'),
 (array([[0.31622777]]), 'fi_osais'),
 (array([[0.31622777]]), 'fi_onnistun'),
 (array([[0.31622777]]), 'fi_näyttel'),
 (array([[0.31622777]]), 'fi_messinkis'),
 (array([[0.31622777]]), 'fi_markkino'),
 (array([[0.31622777]]), 'fi_letkuliittim'),
 (array([[0.31622777]]), 'fi_kuvail')]

In [33]:
nearest_neighbours('original', 'en')

[(array([[0.67936622]]), 'fi_alkuperäin'),
 (array([[0.5547002]]), 'fi_alkuperäis'),
 (array([[0.39223227]]), 'fi_teemo'),
 (array([[0.39223227]]), 'fi_cliffhanger'),
 (array([[0.39223227]]), 'en_theme'),
 (array([[0.39223227]]), 'en_shown'),
 (array([[0.30769231]]), 'en_show'),
 (array([[0.2773501]]), 'fi_”),'),
 (array([[0.2773501]]), 'fi_ylistyslaulu'),
 (array([[0.2773501]]), 'fi_yhteys')]

In [18]:
nearest_neighbours('recommend', 'en')

[(array([[0.76486616]]), 'fi_suosittel'),
 (array([[0.43852901]]), 'fi_suosit'),
 (array([[0.43412157]]), 'en_anyon'),
 (array([[0.34544086]]), 'en_would'),
 (array([[0.33968311]]), 'fi_kene'),
 (array([[0.2773501]]), 'fi_suosittelis'),
 (array([[0.2773501]]), 'fi_kokeilet'),
 (array([[0.2773501]]), 'en_7'),
 (array([[0.26148818]]), 'fi_kaik'),
 (array([[0.23652496]]), 'fi_voin')]

In [34]:
nearest_neighbours('love', 'en')

[(array([[0.84084099]]), 'fi_rakast'),
 (array([[0.24253563]]), 'fi_kotelo'),
 (array([[0.21004201]]), 'fi_vanh'),
 (array([[0.19802951]]), 'fi_pian'),
 (array([[0.19802951]]), 'fi_macely'),
 (array([[0.19802951]]), 'en_mace'),
 (array([[0.17712298]]), 'en_case'),
 (array([[0.17149859]]), 'fi_äärimmäis'),
 (array([[0.17149859]]), 'fi_äänit'),
 (array([[0.17149859]]), 'fi_äid')]

In [None]:
# Exploring most frequent negative words in EN originals: fake, awful, bad, lame, mean, upset, weak, knockoff, mess, cramp, hate

In [35]:
nearest_neighbours('fake', 'en')

[(array([[0.65079137]]), 'fi_mac'),
 (array([[0.65079137]]), 'en_mac'),
 (array([[0.59408853]]), 'fi_väärennety'),
 (array([[0.48507125]]), 'en_sale'),
 (array([[0.42008403]]), 'fi_väärennettyj'),
 (array([[0.42008403]]), 'fi_myynt'),
 (array([[0.42008403]]), 'fi_feik'),
 (array([[0.42008403]]), 'en_dont'),
 (array([[0.36380344]]), 'en_makeup'),
 (array([[0.34299717]]), 'fi_väärennös')]

In [36]:
nearest_neighbours('awful', 'en')

[(array([[0.84515425]]), 'fi_kauhe'),
 (array([[0.45584231]]), 'fi_maku'),
 (array([[0.39036003]]), 'en_tast'),
 (array([[0.37796447]]), 'fi_syömäpuiko'),
 (array([[0.37796447]]), 'fi_sietä'),
 (array([[0.37796447]]), 'fi_puist'),
 (array([[0.37796447]]), 'fi_nähdy'),
 (array([[0.37796447]]), 'fi_muodo'),
 (array([[0.37796447]]), 'fi_metallis'),
 (array([[0.37796447]]), 'fi_kamal')]

In [37]:
nearest_neighbours('lame', 'en')

[(array([[1.]]), 'fi_vaisu'),
 (array([[0.70710678]]), 'fi_ylistyslaulu'),
 (array([[0.70710678]]), 'fi_tunnar'),
 (array([[0.70710678]]), 'fi_show'),
 (array([[0.70710678]]), 'fi_nykyis'),
 (array([[0.70710678]]), 'fi_myön'),
 (array([[0.70710678]]), 'fi_jimy'),
 (array([[0.70710678]]), 'fi_durant'),
 (array([[0.70710678]]), 'en_paean'),
 (array([[0.70710678]]), 'en_jimmi')]

In [38]:
nearest_neighbours('mean', 'en')

[(array([[0.57735027]]), 'fi_tekem'),
 (array([[0.57735027]]), 'fi_suurin'),
 (array([[0.57735027]]), 'fi_kourallis'),
 (array([[0.57735027]]), 'fi_kala'),
 (array([[0.57735027]]), 'fi_??'),
 (array([[0.57735027]]), 'en_grandkid'),
 (array([[0.57735027]]), 'en_fish'),
 (array([[0.57735027]]), 'en_anyway'),
 (array([[0.57735027]]), 'en_??'),
 (array([[0.40824829]]), 'fi_ruudu')]

In [39]:
nearest_neighbours('upset', 'en')

[(array([[0.5]]), 'fi_vihain'),
 (array([[0.5]]), 'fi_viallis'),
 (array([[0.5]]), 'fi_tuntitolku'),
 (array([[0.5]]), 'fi_surullin'),
 (array([[0.5]]), 'fi_muist'),
 (array([[0.5]]), 'fi_lataudu'),
 (array([[0.5]]), 'fi_lamp'),
 (array([[0.5]]), 'fi_kidnapat'),
 (array([[0.5]]), 'fi_kats'),
 (array([[0.5]]), 'fi_ikäv')]

In [40]:
nearest_neighbours('weak', 'en')

[(array([[1.]]), 'fi_heiko'),
 (array([[0.70710678]]), 'fi_tuiki'),
 (array([[0.70710678]]), 'fi_muuks'),
 (array([[0.70710678]]), 'fi_lain'),
 (array([[0.70710678]]), 'fi_kutsu'),
 (array([[0.70710678]]), 'fi_joks'),
 (array([[0.70710678]]), 'en_unremark'),
 (array([[0.5]]), 'fi_mauttom'),
 (array([[0.5]]), 'fi_kerron'),
 (array([[0.40824829]]), 'en_ok')]

In [41]:
nearest_neighbours('knockoff', 'en')

[(array([[0.8660254]]), 'fi_kopio'),
 (array([[0.57735027]]), 'fi_varok'),
 (array([[0.57735027]]), 'fi_kysyt'),
 (array([[0.57735027]]), 'fi_jätety'),
 (array([[0.57735027]]), 'en_buyer'),
 (array([[0.40824829]]), 'fi_ostaj'),
 (array([[0.40824829]]), 'en_bewar'),
 (array([[0.33333333]]), 'fi_halp'),
 (array([[0.33333333]]), 'en_ask'),
 (array([[0.28867513]]), 'fi_luin')]

In [42]:
nearest_neighbours('hate', 'en')

[(array([[0.5]]), 'fi_vihas'),
 (array([[0.5]]), 'fi_vihaav'),
 (array([[0.5]]), 'fi_vihaam'),
 (array([[0.5]]), 'fi_sim'),
 (array([[0.5]]), 'fi_roolihahmoa'),
 (array([[0.5]]), 'fi_näytel'),
 (array([[0.5]]), 'fi_harmit'),
 (array([[0.5]]), 'fi_arvattav'),
 (array([[0.5]]), 'fi_alister'),
 (array([[0.5]]), 'fi_1951')]