# Eigenwords

In [1]:
import numpy as np
import pandas as pd
from eigenwords import EigenwordsOSCCA

## training

In [2]:
%%time

model = EigenwordsOSCCA()
model.load_corpus('./data/text8_with_phrase', verbose=False)

CPU times: user 5min 35s, sys: 38.5 s, total: 6min 14s
Wall time: 3min 49s


## most similar words

In [3]:
model.wv.most_similar('cat')

  if np.issubdtype(vec.dtype, np.int):


[('dog', 0.6467665433883667),
 ('bird', 0.4829463064670563),
 ('dogs', 0.4766378700733185),
 ('dolphin', 0.47493958473205566),
 ('horse', 0.46100080013275146),
 ('cats', 0.46080493927001953),
 ('goat', 0.45060423016548157),
 ('mammals', 0.44402754306793213),
 ('genus', 0.43115168809890747),
 ('whale', 0.42810702323913574)]

In [4]:
model.wv.most_similar(positive=['king','woman'], negative=['man'])

[('queen', 0.529191255569458),
 ('princess', 0.47166308760643005),
 ('prince', 0.46154117584228516),
 ('ruler', 0.4600673019886017),
 ('crown_prince', 0.44104325771331787),
 ('monarch', 0.4402952492237091),
 ('emperor', 0.4147062301635742),
 ('duke', 0.40347960591316223),
 ('regent', 0.4031509757041931),
 ('empress', 0.3490029573440552)]

## word similarity task

In [5]:
word_table = pd.read_csv('./data/combined.csv', names=['word1', 'word2', 'score'], skiprows=[0])
word_table.head(10)

Unnamed: 0,word1,word2,score
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62
5,computer,internet,7.58
6,plane,car,5.77
7,train,car,6.31
8,telephone,communication,7.5
9,television,radio,6.77


In [6]:
%%time
model.evaluate_word_pairs(word_table) # pearson, spearman, oov-ratio

CPU times: user 62.7 ms, sys: 1.68 ms, total: 64.3 ms
Wall time: 61.9 ms


((0.5709904071461659, 4.0645804778960944e-23),
 SpearmanrResult(correlation=0.6044125190322798, pvalue=2.175995763839577e-26),
 28.89518413597734)

## word analogy task

In [10]:
model.wv.evaluate_word_analogies('./data/questions-words.txt')

(0.41578947368421054,
 [{'section': 'capital-common-countries',
   'correct': [('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'),
    ('ATHENS', 'GREECE', 'ROME', 'ITALY'),
    ('BAGHDAD', 'IRAQ', 'KABUL', 'AFGHANISTAN'),
    ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'),
    ('BEIJING', 'CHINA', 'MADRID', 'SPAIN'),
    ('BERLIN', 'GERMANY', 'HELSINKI', 'FINLAND'),
    ('BERLIN', 'GERMANY', 'MADRID', 'SPAIN'),
    ('BERLIN', 'GERMANY', 'MOSCOW', 'RUSSIA'),
    ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'),
    ('BERLIN', 'GERMANY', 'ROME', 'ITALY'),
    ('BERLIN', 'GERMANY', 'STOCKHOLM', 'SWEDEN'),
    ('BERLIN', 'GERMANY', 'BEIJING', 'CHINA'),
    ('CAIRO', 'EGYPT', 'KABUL', 'AFGHANISTAN'),
    ('CAIRO', 'EGYPT', 'BAGHDAD', 'IRAQ'),
    ('CAIRO', 'EGYPT', 'BEIJING', 'CHINA'),
    ('CAIRO', 'EGYPT', 'BERLIN', 'GERMANY'),
    ('HELSINKI', 'FINLAND', 'MOSCOW', 'RUSSIA'),
    ('HELSINKI', 'FINLAND', 'PARIS', 'FRANCE'),
    ('HELSINKI', 'FINLAND', 'ROME', 'ITALY'),
    ('HELSINKI', 'FINLAND', 'STOCKHOLM', 