# 3- Addition and Similarity

Here, we will try to find some "useful" Game of Thrones analogies.

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

# Word vectors: one row per word, the word itself (first column) is the index.
df = pd.read_table("../data/got_word_vectors.txt", delimiter=" ", header=None, index_col=0)
# Scale every row to unit L2 norm so that a dot product between rows is a cosine similarity.
row_norms = np.linalg.norm(df, axis=1)
df = df.div(row_norms, axis=0)
# Character table, indexed by first name (the "first_name" column is kept as a column too).
characters = pd.read_csv("../data/got_characters.csv").set_index("first_name", drop=False)

In [2]:
# utility functions
def most_similar(df, word, n=5, exceptions=None):
    """Return the n rows of ``df`` with the largest dot product against ``word``.

    When the rows of ``df`` and ``word`` are unit-length vectors, the dot
    product is their cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        One vector per row, indexed by the row's label.
    word : vector-like
        Query vector; passed unchanged to ``DataFrame.dot``.
    n : int, default 5
        Maximum number of rows to return.
    exceptions : list-like of labels, optional
        Index labels to leave out of the ranking. ``None`` excludes nothing.

    Returns
    -------
    pandas.Series
        Scores indexed by label, sorted from highest to lowest, at most
        ``n`` entries.
    """
    # ``None`` replaces the former mutable ``[]`` default, which is a single
    # list object shared by every call.
    excluded = [] if exceptions is None else exceptions
    candidates = df.loc[~df.index.isin(excluded)]
    return candidates.dot(word).sort_values(ascending=False).head(n)

def op(operation, n=1):
    """Compute the vector addition "w1 - w2 + w3" and print the n most similar words

        equation: w1 - w2 + w3 = w4
        analogy: (w2 is to w3) as (w1 is to w4)

    Parameters
    ----------
    operation : str
        Whitespace-separated expression such as ``"tully - stark + winterfell"``.
        Only the tokens at positions 0, 2 and 4 are read as w1, w2 and w3;
        the operator tokens between them are not inspected.
    n : int, default 1
        Number of candidate answers to print.

    Returns
    -------
    None
        One line per candidate is printed, formatted as
        ``"<w2> is to <w3> AS <w1> is to ? = <candidate> (<score>)"``.

    Raises
    ------
    ValueError
        If ``operation`` does not yield exactly three words.
    KeyError
        If one of the words is not in the index of the global ``df``.
    """
    w1, w2, w3 = operation.split()[::2]
    vec = df.loc[w1] - df.loc[w2] + df.loc[w3]

    # The three query words are already excluded from the ranking, so asking
    # for n rows is enough (no need to over-fetch).
    topn = most_similar(
        df, vec,
        exceptions=[w1, w2, w3],
        n=max(n, 0)
    )
    analogy = "%s is to %s AS %s is to %s" % (w2, w3, w1, "?")
    # Iterate over the rows actually returned: no positional ``topn[i]`` lookup
    # on a label-indexed Series (deprecated in pandas 2.x), and no IndexError
    # when fewer than n candidates are available.
    top_res = ["%s = %s (%.2f)" % (analogy, word, score) for word, score in topn.items()]
    print("\n".join(top_res))

### Family Name

In [3]:
# Family-name offset: the mean "house - first name" displacement over three
# known (first name, house) pairs.
v1, v2, v3 = (
    df.loc[house] - df.loc[member]
    for member, house in (
        ("robert", "baratheon"),
        ("jaime", "lannister"),
        ("catelyn", "stark"),
    )
)
offset = np.mean([v1, v2, v3], axis=0)

def family_name(first_name, n=1):
    """Print the n best family-name guesses for ``first_name``.

    The first name is lower-cased, looked up in the global ``df``, shifted by
    the global ``offset`` vector, and the nearest rows (excluding the first
    name itself) are printed as ``"First Last (score)"``, one per line.
    """
    key = first_name.lower()
    shifted = df.loc[key] + offset
    ranked = most_similar(df, shifted, n=n, exceptions=[key])
    for rank in range(n):
        candidate = ranked.index[rank]
        similarity = ranked.values[rank]
        print("%s %s (%f)" % (key.title(), candidate.title(), similarity))

# Print the top family-name guess for a sample of characters.
for first_name in [
    "Jon", "Eddard", "Asha", "Doran",
    "Edmure", "Roose", "Renly", "Loras",
    "Daenerys", "Sandor", "Jorah", "Tyrion",
    "Petyr", "Kevan", "Rhaegar", "Margaery",
]:
    family_name(first_name)

Jon Snow (0.603530)
Eddard Stark (0.911670)
Asha Greyjoy (0.787724)
Doran Martell (0.972809)
Edmure Tully (0.767302)
Roose Bolton (0.996987)
Renly Baratheon (1.053815)
Loras Tyrell (0.827190)
Daenerys Targaryen (0.770334)
Sandor Clegane (0.908605)
Jorah Mormont (0.636546)
Tyrion Lannister (0.655923)
Petyr Baelish (0.881676)
Kevan Lannister (0.745010)
Rhaegar Baratheon (0.742945)
Margaery Tyrell (0.816739)


### House to Seat relationship

In [4]:
# "stark is to winterfell AS <house> is to ?"
for query in (
    "tully - stark + winterfell",
    "martell - stark + winterfell",
):
    op(query)

stark is to winterfell AS tully is to ? = riverrun (0.79)
stark is to winterfell AS martell is to ? = sunspear (0.82)


### Family Relations

In [5]:
# Family-relation analogies, printed one per line in the order listed.
for query in (
    "sansa - child + mother",
    "arya - sister + brother",
    "catelyn - wife + husband",
    "cersei - wife + husband",
    "catelyn - cersei + joffrey",
    "jaime - brother + sister",
):
    op(query)

child is to mother AS sansa is to ? = catelyn (0.84)
sister is to brother AS arya is to ? = bran (0.57)
wife is to husband AS catelyn is to ? = ned (0.79)
wife is to husband AS cersei is to ? = jaime (0.64)
cersei is to joffrey AS catelyn is to ? = robb (0.88)
brother is to sister AS jaime is to ? = cersei (0.85)


### R + L = J

In [6]:
# Each query is annotated with the answer it is hoped to produce.
for query in (
    "jon - nephew + aunt",   # should be daenerys
    "jon - nephew + uncle",  # should be viserys
    "jon - child + father",  # should be rhaegar
    "jon - child + mother",  # should be lyanna
):
    op(query)

nephew is to aunt AS jon is to ? = rafford (0.69)
nephew is to uncle AS jon is to ? = wall (0.69)
child is to father AS jon is to ? = eddard (0.77)
child is to mother AS jon is to ? = catelyn (0.67)


In [7]:
# last attempt with "compositionality by vector addition"
# from "Learning Representations of Text using Neural Networks", Tomas Mikolov, NIPS 2013
# Sum the three word vectors, then rank every other word against that sum.
composed = df.loc["rhaegar"] + df.loc["secret"] + df.loc["child"]
most_similar(df, composed, n=8, exceptions=["rhaegar", "secret", "child"])

0
fathered      0.791592
targaryen     0.765303
princess      0.753265
elia          0.750862
khal          0.730476
acceptance    0.718154
lysene        0.704869
wildling      0.693927
dtype: float64

#### Bonus Functions for "multiplication-based analogy testing"

In [8]:
# Levy & Goldberg (2014) formula tested by: 
# http://www.marekrei.com/blog/linguistic-regularities-word-representations/
# did not give better results in my case, but I leave it here.

def most_similar_bis(df, a, b, c, n=1):
    """Rank candidates for "a is to b AS c is to ?" with a multiplicative score.

    For every row x of ``df`` the score is ``s(x, c) * s(x, b) / (s(x, a) + 0.001)``,
    where ``s(x, w) = (x . w + 1) / 2`` and ``w`` is the row of ``df`` labelled
    by that word. The rows ``a``, ``b`` and ``c`` themselves are left out.

    Returns a pandas.Series of the n highest scores, indexed by label and
    sorted from highest to lowest.
    """
    def shifted(word):
        # Dot product of every row with the row for `word`, mapped through (x + 1) / 2.
        return (df.dot(df.loc[word]) + 1) / 2

    # The small constant keeps the denominator away from zero.
    scores = shifted(c) * shifted(b) / (shifted(a) + 0.001)
    keep = ~scores.index.isin([a, b, c])
    return scores[keep].sort_values(ascending=False).head(n)

def op_bis(operation, n=1):
    """Print the n best answers to "w1 - w2 + w3" using ``most_similar_bis``.

    Parameters
    ----------
    operation : str
        Whitespace-separated expression such as ``"king - man + woman"``.
        Only the tokens at positions 0, 2 and 4 are read as w1, w2 and w3;
        the operator tokens between them are not inspected.
    n : int, default 1
        Number of candidate answers to print.

    Returns
    -------
    None
        One line per candidate is printed, formatted as
        ``"<w2> is to <w3> AS <w1> is to ? = <candidate> (<score>)"``.

    Raises
    ------
    ValueError
        If ``operation`` does not yield exactly three words.
    KeyError
        If one of the words is not in the index of the global ``df``.
    """
    w1, w2, w3 = operation.split()[::2]
    # most_similar_bis already drops the three query words, so n rows suffice.
    topn = most_similar_bis(df, w2, w3, w1, n=max(n, 0))
    analogy = "%s is to %s AS %s is to %s" % (w2, w3, w1, "?")
    # Iterate over the rows actually returned: no positional ``topn[i]`` lookup
    # on a label-indexed Series (deprecated in pandas 2.x), and no IndexError
    # when fewer than n candidates are available.
    top_res = ["%s = %s (%.2f)" % (analogy, word, score) for word, score in topn.items()]
    print("\n".join(top_res))

# Print the three best answers to "man is to woman AS king is to ?".
op_bis("king - man + woman", n=3)

man is to woman AS king is to ? = queen (0.93)
man is to woman AS king is to ? = margaery (0.91)
man is to woman AS king is to ? = joffrey (0.86)
