# 1- Word  vectors evaluation

The vectors were built from a corpus containing the first 5 GoT books, using the scripts from the official GloVe repository:
https://github.com/stanfordnlp/GloVe

Here, we evaluate their "performance" on the word2vec question-words dataset: "question-words.txt"

In [1]:
import csv

import numpy as np
import pandas as pd

# Load the word vectors: the first space-separated field of each line is used
# as the row label (the token), the remaining fields are the vector components.
df = pd.read_table(
    "../data/got_word_vectors.txt",
    delimiter=" ",
    header=None,
    index_col=0,
    # Tokens are raw text, not CSV fields: a token containing a double quote
    # must not open a quoted field that swallows the following values/lines.
    quoting=csv.QUOTE_NONE,
    # Keep tokens such as "nan", "null" or "na" as plain strings; with the
    # default NA detection they would become a NaN index label and the word
    # could no longer be looked up with df.loc[...].
    na_filter=False,
)
df.loc[["man", "woman", "stark", "baratheon"]]

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
man,-0.195465,0.413849,-0.062685,-0.548064,-0.541133,-0.105243,0.081112,-0.248856,0.676081,-0.049481,...,0.249135,-0.070089,0.104099,0.015332,-0.417004,-0.028931,0.092852,0.183472,0.17146,-0.44332
woman,-0.349365,0.204624,0.088298,-0.56458,0.085438,0.267319,-0.091283,-0.097102,0.77564,0.084764,...,0.386981,-0.051463,0.227382,0.056128,0.046533,-0.401586,-0.37443,0.386475,-0.400704,0.164753
stark,0.449176,-0.038293,-0.373862,-0.023245,0.02117,0.313665,-0.425727,-0.040682,-0.385934,-0.857776,...,-0.559304,-0.495321,0.202061,0.78545,-0.096234,-0.610166,0.12189,0.026967,-0.20382,0.233859
baratheon,0.739617,0.21601,-0.253746,0.730075,-0.274791,0.456936,-0.237462,0.38267,-0.458887,-0.882413,...,0.6738,-0.515718,0.757681,0.261031,0.001758,-0.243125,-0.373064,-0.186257,-0.198327,0.921797


#### Vector normalization

In [2]:
# Scale every row to unit L2 norm so that a dot product between two rows
# equals their cosine similarity.
row_norms = np.linalg.norm(df, axis=1)
df = df.div(row_norms, axis=0)

# Sanity check on one word: its norm should now be 1.
man_norm = np.linalg.norm(df.loc["man"])
print("'man' vector L2-norm:", man_norm)

'man' vector L2-norm: 1.0


#### Utility functions

In [3]:
def most_similar(df, word, n=5, exceptions=[]):
    """Rank the rows of `df` by dot product with `word` and keep the best n.

    Rows whose index label appears in `exceptions` are left out of the
    ranking. The result is a Series (row label -> score) sorted by
    descending score. The score is a cosine similarity only when both the
    rows of `df` and `word` have unit length.
    """
    keep_mask = ~df.index.isin(exceptions)
    candidates = df.loc[keep_mask]
    scores = candidates.dot(word)
    ranked = scores.sort_values(ascending=False)
    return ranked.head(n)

def op(operation, n=1):
    """Solve the analogy "w1 - w2 + w3" and print the n most similar words.

        equation: w1 - w2 + w3 = w4
        analogy: (w2 is to w3) as (w1 is to w4)

    Parameters
    ----------
    operation : str
        Whitespace-separated expression such as "king - man + woman". Only
        the tokens at even positions (w1, w2, w3) are used; the operator
        tokens in between are not interpreted, the computation is always
        w1 - w2 + w3.
    n : int
        Maximum number of candidates to print.

    Reads the notebook-level `df` for the word vectors. Prints one line per
    candidate and returns None. Raises ValueError if `operation` does not
    contain exactly three words at even positions, and KeyError if a word is
    not in `df`.
    """
    w1, w2, w3 = operation.split()[::2]
    vec = df.loc[w1] - df.loc[w2] + df.loc[w3]

    # The three input words are already excluded from the candidates, so
    # asking for exactly n results is enough.
    topn = most_similar(df, vec, exceptions=[w1, w2, w3], n=n)

    analogy = "%s is to %s AS %s is to %s" % (w2, w3, w1, "?")
    # Iterate over (label, score) pairs instead of indexing the Series with
    # integers: topn is indexed by words, so topn[i] relies on a deprecated
    # positional fallback. This also avoids an IndexError when fewer than n
    # candidates are available.
    top_res = ["%s = %s (%.2f)" % (analogy, word, score) for word, score in topn.items()]
    print("\n".join(top_res))

In [9]:
# Nearest neighbours of "jon" by dot product; the query word itself is not
# excluded from the ranking.
jon_vector = df.loc["jon"]
most_similar(df, jon_vector)

0
jon        1.000000
snow       0.630133
sam        0.511579
pyp        0.470277
ygritte    0.446722
dtype: float64

In [4]:
# Classic analogy check: print the 3 best candidates for king - man + woman.
op(operation="king - man + woman", n=3)

man is to woman AS king is to ? = queen (0.74)
man is to woman AS king is to ? = joffrey (0.65)
man is to woman AS king is to ? = margaery (0.61)


## Evaluation on word2vec question-words dataset

In [5]:
# Counters for the analogy benchmark.
task_count = 0                   # analogy questions read (headers/blank lines excluded)
skipped_tasks = 0                # questions with at least one word missing from df
success_count = 0                # expected word ranked first
second_guess_success_count = 0   # expected word ranked second

with open("../data/question-words.txt", "r") as f:
    for task in f:
        task = task.strip()
        # Section headers (lines starting with ":") and blank lines are not
        # questions, so they must not be counted in the totals.
        if not task or task.startswith(":"):
            continue
        task_count += 1

        # format is: "a b c d"
        # for "a is to b AS c is to d"
        # => a->b ~ c->d
        # equation: b - a ~ d - c
        # test: b - a + c ?= d
        a, b, c, d = [w.lower() for w in task.split()]

        # A question can only be evaluated when all four words have a vector.
        if any(w not in df.index for w in (a, b, c, d)):
            skipped_tasks += 1
            continue

        topn = most_similar(df, df.loc[b] - df.loc[a] + df.loc[c], exceptions=[a, b, c])
        if topn.index[0] == d:
            success_count += 1
            print("'%s' is to '%s' AS '%s' is to '%s'" % (a, b, c, d))
        elif topn.index[1] == d:
            second_guess_success_count += 1

filtered_task_count = task_count - skipped_tasks
top2_success_count = success_count + second_guess_success_count
# Avoid a ZeroDivisionError when no question could be evaluated
# (empty file or every question skipped); accuracies are then reported as 0.
denominator = filtered_task_count or 1

print("----")
print("questions: %d filtered from %d (missing words)" % (filtered_task_count, task_count))
print("first-guess accuracy = %.2f%% (%d/%d)" % (100*success_count/denominator, success_count, filtered_task_count))
print("second-guess accuracy = %.2f%% (%d/%d)" % (100*top2_success_count/denominator, top2_success_count, filtered_task_count))

'boy' is to 'girl' AS 'brother' is to 'sister'
'boy' is to 'girl' AS 'he' is to 'she'
'boy' is to 'girl' AS 'king' is to 'queen'
'boy' is to 'girl' AS 'man' is to 'woman'
'boy' is to 'girl' AS 'son' is to 'daughter'
'boy' is to 'girl' AS 'sons' is to 'daughters'
'brother' is to 'sister' AS 'brothers' is to 'sisters'
'brother' is to 'sister' AS 'father' is to 'mother'
'brother' is to 'sister' AS 'king' is to 'queen'
'brother' is to 'sister' AS 'man' is to 'woman'
'brother' is to 'sister' AS 'prince' is to 'princess'
'brother' is to 'sister' AS 'son' is to 'daughter'
'brother' is to 'sister' AS 'sons' is to 'daughters'
'brother' is to 'sister' AS 'boy' is to 'girl'
'brothers' is to 'sisters' AS 'king' is to 'queen'
'brothers' is to 'sisters' AS 'son' is to 'daughter'
'brothers' is to 'sisters' AS 'sons' is to 'daughters'
'brothers' is to 'sisters' AS 'brother' is to 'sister'
'father' is to 'mother' AS 'king' is to 'queen'
'father' is to 'mother' AS 'man' is to 'woman'
'father' is to 'mot

'long' is to 'longer' AS 'hard' is to 'harder'
'long' is to 'longer' AS 'high' is to 'higher'
'long' is to 'longer' AS 'large' is to 'larger'
'loud' is to 'louder' AS 'warm' is to 'warmer'
'loud' is to 'louder' AS 'cold' is to 'colder'
'loud' is to 'louder' AS 'good' is to 'better'
'loud' is to 'louder' AS 'hard' is to 'harder'
'loud' is to 'louder' AS 'large' is to 'larger'
'low' is to 'lower' AS 'warm' is to 'warmer'
'old' is to 'older' AS 'short' is to 'shorter'
'old' is to 'older' AS 'tall' is to 'taller'
'old' is to 'older' AS 'warm' is to 'warmer'
'old' is to 'older' AS 'young' is to 'younger'
'old' is to 'older' AS 'big' is to 'bigger'
'old' is to 'older' AS 'cold' is to 'colder'
'old' is to 'older' AS 'hard' is to 'harder'
'old' is to 'older' AS 'high' is to 'higher'
'old' is to 'older' AS 'large' is to 'larger'
'quick' is to 'quicker' AS 'strong' is to 'stronger'
'quick' is to 'quicker' AS 'tall' is to 'taller'
'quick' is to 'quicker' AS 'good' is to 'better'
'quick' is to 'qu

'screaming' is to 'screamed' AS 'sitting' is to 'sat'
'screaming' is to 'screamed' AS 'dancing' is to 'danced'
'screaming' is to 'screamed' AS 'flying' is to 'flew'
'screaming' is to 'screamed' AS 'moving' is to 'moved'
'screaming' is to 'screamed' AS 'paying' is to 'paid'
'screaming' is to 'screamed' AS 'playing' is to 'played'
'seeing' is to 'saw' AS 'taking' is to 'took'
'seeing' is to 'saw' AS 'walking' is to 'walked'
'seeing' is to 'saw' AS 'knowing' is to 'knew'
'seeing' is to 'saw' AS 'looking' is to 'looked'
'seeing' is to 'saw' AS 'running' is to 'ran'
'singing' is to 'sang' AS 'dancing' is to 'danced'
'singing' is to 'sang' AS 'playing' is to 'played'
'singing' is to 'sang' AS 'running' is to 'ran'
'singing' is to 'sang' AS 'screaming' is to 'screamed'
'sitting' is to 'sat' AS 'taking' is to 'took'
'sitting' is to 'sat' AS 'walking' is to 'walked'
'sitting' is to 'sat' AS 'dancing' is to 'danced'
'sitting' is to 'sat' AS 'falling' is to 'fell'
'sitting' is to 'sat' AS 'lookin