# Word2Vec result visualizer

In [None]:
import pandas as pd
!pip install plotly
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


## Evaluate 

In [None]:
# Load the relatedness table only to list its columns, i.e. the benchmark names
# that the "Select and prepare results" cell chooses from.
df = pd.read_csv("../output/all_relatedness_table_v2.csv")
df.columns

Index(['Unnamed: 0', 'EN-VERB-143', 'EN-SimVerb-3500', 'EN-RG-65',
       'EN-RW-STANFORD', 'EN-MTurk-771', 'EN-MEN-TR-3k', 'EN-MC-30',
       'EN-MTurk-287', 'EN-SIMLEX-999', 'EN-WS-353-REL', 'EN-YP-130',
       'EN-WS-353-ALL', 'EN-WS-353-SIM'],
      dtype='object')

## Select and prepare results

In [None]:
# read and extract result files
## ANALOGY
# Columns retained from the analogy results: the model identifier plus the two 3CosMul scores.
analogy_to_keep = ['model_name', 'google_analogy_cos3mul_score', 'bats_analogy_cos3mul_score']

# The first (unnamed) CSV column carries the model identifier; name it, then
# normalise the identifier so it matches the other result tables.
analogy_result = pd.read_csv("../output/all_analogy_result.csv").rename(columns={'Unnamed: 0': 'model_name'})
analogy_result['model_name'] = analogy_result['model_name'].map(
    lambda name: name.replace("_word2vec_mc=10_", "word2vec_")
)
analogy_result = analogy_result[analogy_to_keep]

## ASSOCIATIONS
# The association table stores the model identifier in a column called 'col';
# rename it so all three tables share the 'model_name' join key.
association_result = (
    pd.read_csv("../output/all_association_result_v2.csv")
    .rename(columns={'col': 'model_name'})
)

## SIMILARITY
# Benchmarks that enter the relatedness average (first entry is the join key).
# Alternative selection kept for reference:
#   'EN-WS-353-ALL', 'EN-MEN-TR-3k', 'EN-YP-130', 'EN-SIMLEX-999', 'EN-VERB-143', 'EN-SimVerb-3500'
relatedness_to_keep = ['model_name','EN-WS-353-SIM', 'EN-RG-65', 'EN-SIMLEX-999', 'EN-VERB-143', 'EN-SimVerb-3500']

# The first (unnamed) CSV column carries the model identifier; name it, then
# strip the "mc=10" part so the identifier matches the other result tables.
relatedness_result = pd.read_csv("../output/all_relatedness_table_v2.csv").rename(columns={'Unnamed: 0': 'model_name'})
relatedness_result['model_name'] = relatedness_result['model_name'].map(
    lambda name: name.replace("word2vec_mc=10_", "word2vec_")
)
relatedness_result = relatedness_result[relatedness_to_keep]

## COMBINE
# Default (inner) joins on model_name: a model missing from any of the three
# tables is dropped from final_result.
final_result = pd.merge(analogy_result, association_result, on='model_name')
final_result = pd.merge(final_result, relatedness_result, on='model_name')

# model_name looks like "word2vec_size=100_window=50_sg=0_IN-IN"; split it into its five fields.
final_result[['algorithm', 'dimension', 'window', 'method', 'model_type']] = final_result['model_name'].str.split("_", expand=True)

# Plain column assignment (not `.loc[:, col] = ...`) so the converted values replace
# the column and its dtype really becomes int; `.loc[:, col] = ...` writes into the
# existing object column in recent pandas and leaves the dtype as object.
final_result['dimension'] = final_result['dimension'].map(lambda x: x.replace("size=", "")).astype(int)
final_result = final_result.sort_values(by=['dimension'])

# `window` keeps its "window=<n>" string form here; the table and pivot cells
# further down strip the prefix themselves.
final_result['method'] = (
    final_result['method']
    .map(lambda x: x.replace("sg=", ""))
    .map({'0': 'cbow', '1': 'sg'})
)

# add aggregated scores (row-wise means over the selected benchmark columns)
final_result['final_analogy_score'] = final_result[analogy_to_keep[1:]].mean(axis=1)
final_result['final_association_hit_rate'] = final_result[['swow8500_hit_rate', 'eat_hit_rate']].mean(axis=1)
final_result['final_association_coverage'] = final_result[['swow8500_avg_coverage', 'eat_avg_coverage']].mean(axis=1)
final_result['final_relatedness_score'] = final_result[relatedness_to_keep[1:]].mean(axis=1)

# Any remaining missing value (in any column) is replaced by 0.
final_result = final_result.fillna(0)

# Visualize

In [None]:
# Preview the merged table: raw scores, the fields parsed from model_name and
# the aggregated final_* scores.
final_result.head()
# final_result.columns

Unnamed: 0,model_name,google_analogy_cos3mul_score,bats_analogy_cos3mul_score,swow8500_hit_rate,swow8500_avg_coverage,eat_hit_rate,eat_avg_coverage,EN-WS-353-SIM,EN-RG-65,EN-SIMLEX-999,...,EN-SimVerb-3500,algorithm,dimension,window,method,model_type,final_analogy_score,final_association_hit_rate,final_association_coverage,final_relatedness_score
0,word2vec_size=100_window=50_sg=0_IN-IN,0.6167,0.2,0.231593,0.17161,0.235747,0.148577,0.747688,0.803224,0.379981,...,0.225355,word2vec,100,window=50,cbow,IN-IN,0.40835,0.23367,0.160093,0.496236
45,word2vec_size=100_window=50_sg=1_IN-OUT,0.399396,0.102326,0.216137,0.160442,0.212821,0.134412,0.66158,0.615201,0.241359,...,0.131706,word2vec,100,window=50,sg,IN-OUT,0.250861,0.214479,0.147427,0.368855
44,word2vec_size=100_window=50_sg=1_IN-IN,0.504024,0.14186,0.153584,0.110292,0.164555,0.099133,0.665748,0.631034,0.27067,...,0.129438,word2vec,100,window=50,sg,IN-IN,0.322942,0.15907,0.104712,0.408108
46,word2vec_size=100_window=50_sg=1_OUT-IN,0.2334,0.046512,0.037848,0.026175,0.042232,0.023896,0.679914,0.639019,0.209513,...,0.115384,word2vec,100,window=50,sg,OUT-IN,0.139956,0.04004,0.025036,0.375853
15,word2vec_size=100_window=5_sg=0_OUT-OUT,0.589537,0.204651,0.114032,0.083577,0.122775,0.075928,0.594989,0.605548,0.324987,...,0.196568,word2vec,100,window=5,cbow,OUT-OUT,0.397094,0.118404,0.079752,0.396461


In [None]:
# Analogy score against embedding dimension: one line per window setting,
# faceted by training method (rows) and model type (columns).
(
    px.line(
        final_result,
        x="dimension",
        y="final_analogy_score",
        color="window",
        facet_row="method",
        facet_col="model_type",
        title="(C) Analogy result",
    )
    .update_traces(mode="lines+markers")
)

In [None]:
# Relatedness score against embedding dimension: one line per window setting,
# faceted by training method (rows) and model type (columns).
(
    px.line(
        final_result,
        x="dimension",
        y="final_relatedness_score",
        color="window",
        facet_row="method",
        facet_col="model_type",
        title="Similarity result",
    )
    .update_traces(mode="lines+markers")
)

In [None]:
# Long format: one row per (model, association metric) so hit rate and coverage
# can share a single figure.
asso_result = pd.melt(
    final_result,
    id_vars=['model_name', 'dimension', 'window', 'method', 'model_type'],
    value_vars=["final_association_hit_rate", "final_association_coverage"],
).rename(columns={'variable': 'score'})

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on a temporary object and does not
# reliably update the frame (it is a no-op under pandas Copy-on-Write).
asso_result['score'] = asso_result['score'].replace(
    {"final_association_hit_rate": 'hit rate', "final_association_coverage": "coverage"}
)

# Combined legend key, e.g. "hit rate @ window=5".
asso_result['score, window'] = asso_result['score'] + " @ " + asso_result['window'].astype(str)

(
    px.scatter(
        asso_result,
        x="dimension",
        y="value",
        color="score, window",
        facet_row="method",
        facet_col="model_type",
        title="(B) Association result (hit rate & coverage)",
    )
    .update_traces(mode="lines+markers")
)

In [None]:
# Coverage only: association coverage against embedding dimension, one trace per
# window setting, faceted by training method (rows) and model type (columns).
# The title names coverage alone because hit rate is not plotted in this figure.
(
    px.scatter(
        final_result,
        x="dimension",
        y="final_association_coverage",
        color="window",
        facet_row="method",
        facet_col="model_type",
        title="(B) Association result (coverage)",
    )
    .update_traces(mode="lines+markers")
)

# Add table

In [None]:
# Per-model table of the raw (non-aggregated) scores, one row per model.
final_table = final_result.copy()

# Numeric window so rows sort by value rather than by string. Plain column
# assignment is used so the int dtype actually replaces the object dtype.
final_table['window'] = final_table['window'].map(lambda x: x.replace("window=", "")).astype(int)
final_table = final_table.sort_values(by=['method', 'window', 'dimension', 'model_type'])

# The relatedness benchmark columns come from relatedness_to_keep rather than a
# hard-coded list: final_result only contains the benchmarks selected there, and
# a stale hard-coded list raises KeyError for columns that were not kept.
final_table = final_table[
    ['dimension', 'window', 'method', 'model_type',
     'google_analogy_cos3mul_score', 'bats_analogy_cos3mul_score',
     'swow8500_hit_rate', 'swow8500_avg_coverage',
     'eat_hit_rate', 'eat_avg_coverage']
    + relatedness_to_keep[1:]
]

# Hyphenated headers for the exported table; benchmark names are left unchanged.
final_table = final_table.rename(columns={
    'model_type': 'model-type',
    'google_analogy_cos3mul_score': 'google-analogy',
    'bats_analogy_cos3mul_score': 'bats-analogy',
    'swow8500_hit_rate': 'swow8500-hit-rate',
    'swow8500_avg_coverage': 'swow8500-avg-coverage',
    'eat_hit_rate': 'eat-hit-rate',
    'eat_avg_coverage': 'eat-avg-coverage',
})
final_table = final_table.round(2)

KeyError: "['EN-WS-353-ALL', 'EN-YP-130', 'EN-MEN-TR-3k'] not in index"

In [None]:
# print(final_table.to_latex(index=False))
# final_table.round(2)

In [None]:
# Preview the first rows of the merged table (the dangling `final_result.` was a syntax error).
final_result.head()

Unnamed: 0,model_name,google_analogy_cos3mul_score,bats_analogy_cos3mul_score,swow8500_hit_rate,swow8500_avg_coverage,eat_hit_rate,eat_avg_coverage,EN-WS-353-ALL,EN-MEN-TR-3k,EN-YP-130,...,EN-SimVerb-3500,algorithm,dimension,window,method,model_type,final_analogy_score,final_association_hit_rate,final_association_coverage,final_relatedness_score
0,word2vec_size=100_window=50_sg=0_IN-IN,0.6167,0.2,0.231593,0.740997,0.235747,0.630241,0.681489,0.755303,0.548792,...,0.225355,word2vec,100,window=50,cbow,IN-IN,0.40835,0.23367,0.685619,0.485975
45,word2vec_size=100_window=50_sg=1_IN-OUT,0.399396,0.102326,0.216137,0.742314,0.212821,0.631573,0.620555,0.697948,0.370416,...,0.131706,word2vec,100,window=50,sg,IN-OUT,0.250861,0.214479,0.686944,0.376069
44,word2vec_size=100_window=50_sg=1_IN-IN,0.504024,0.14186,0.153584,0.718119,0.164555,0.602429,0.619936,0.707534,0.398936,...,0.129438,word2vec,100,window=50,sg,IN-IN,0.322942,0.15907,0.660274,0.411694
46,word2vec_size=100_window=50_sg=1_OUT-IN,0.2334,0.046512,0.037848,0.691586,0.042232,0.565833,0.627239,0.697433,0.325844,...,0.115384,word2vec,100,window=50,sg,OUT-IN,0.139956,0.04004,0.62871,0.368475
15,word2vec_size=100_window=5_sg=0_OUT-OUT,0.589537,0.204651,0.114032,0.732924,0.122775,0.618428,0.545319,0.590263,0.300444,...,0.196568,word2vec,100,window=5,cbow,OUT-OUT,0.397094,0.118404,0.675676,0.369632


# Result summary table

In [None]:
# List the columns available in final_result before building the summary pivots.
final_result.columns

Index(['model_name', 'google_analogy_cos3mul_score',
       'bats_analogy_cos3mul_score', 'swow8500_hit_rate',
       'swow8500_avg_coverage', 'eat_hit_rate', 'eat_avg_coverage',
       'EN-WS-353-ALL', 'EN-MEN-TR-3k', 'EN-YP-130', 'EN-SIMLEX-999',
       'EN-VERB-143', 'EN-SimVerb-3500', 'algorithm', 'dimension', 'window',
       'method', 'model_type', 'final_analogy_score',
       'final_association_hit_rate', 'final_association_coverage',
       'final_relatedness_score'],
      dtype='object')

In [None]:
def agg_scores(x):
    """Summarise a group of scores as the string ``"<max> (<mean>)"``.

    Used as the ``aggfunc`` of the summary pivot table, where each call receives
    the scores of the rows that fall into one pivot cell.

    Parameters
    ----------
    x : pandas.Series
        Numeric scores of one group (any object exposing ``.values`` works).

    Returns
    -------
    str
        Maximum and mean of the group, each formatted with three decimals,
        e.g. ``"0.500 (0.300)"``.

    Notes
    -----
    The former average-change computation indexed the first three values
    unconditionally and raised ``IndexError`` for groups with fewer than three
    rows, although its result was never returned; it has been removed.
    """
    y = x.values
    best, average = np.max(y), np.mean(y)
    return f"{best:.3f} ({average:.3f})"

def get_pivot_score(val):
    """Pivot one score column of ``final_result`` into a method/window x model-type table.

    Parameters
    ----------
    val : str
        Name of the ``final_result`` column to summarise
        (e.g. ``'final_analogy_score'``).

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by ``(type, method, window)`` where ``type`` is ``val``
        itself, columns are the ``model_type`` values, and each cell is the
        string produced by ``agg_scores`` for that group.

    Notes
    -----
    Reads the notebook-level ``final_result`` frame but does not modify it: the
    "window=" prefix is stripped on a local copy, so calling this function does
    not change what other cells see in ``final_result['window']``.
    """
    scores = final_result.copy()
    scores['window'] = scores['window'].map(lambda x: x.replace("window=", ""))
    pivot = pd.pivot_table(scores, values=val, index=['method', 'window'],
                           columns=['model_type'], aggfunc=agg_scores)
    # Prepend the score name as an outer index level so several pivots can be stacked.
    tuples = [(val,) + key for key in pivot.index]
    pivot.index = pd.MultiIndex.from_tuples(tuples, names=['type', 'method', 'window'])
    return pivot
# Stack the relatedness, association-coverage and analogy pivots (in that order)
# into a single summary table.
pivot_results = pd.concat([
    get_pivot_score(score_column)
    for score_column in ('final_relatedness_score',
                         'final_association_coverage',
                         'final_analogy_score')
])

In [None]:
# Emit the summary table as LaTeX source; print() renders the line breaks
# instead of showing the escaped string repr.
print(pivot_results.to_latex(index=True))
# pivot_results.to_csv('../output/word2vec_pivot_result_v2.csv')

\begin{tabular}{lllllll}
\toprule
                    &    & model\_type &          IN-IN &         IN-OUT &         OUT-IN &        OUT-OUT \\
type & method & window &                &                &                &                \\
\midrule
final\_relatedness\_score & cbow & 5 &  0.520 (0.504) &  0.430 (0.430) &  0.440 (0.423) &  0.419 (0.408) \\
                    &    & 50 &  0.526 (0.514) &  0.516 (0.505) &  0.510 (0.506) &  0.348 (0.343) \\
                    & sg & 5 &  0.528 (0.511) &  0.476 (0.466) &  0.471 (0.466) &  0.499 (0.478) \\
                    &    & 50 &  0.424 (0.418) &  0.420 (0.398) &  0.410 (0.395) &  0.387 (0.373) \\
final\_association\_coverage & cbow & 5 &  0.148 (0.147) &  0.244 (0.236) &  0.001 (0.000) &  0.080 (0.066) \\
                    &    & 50 &  0.191 (0.176) &  0.274 (0.254) &  0.000 (0.000) &  0.081 (0.074) \\
                    & sg & 5 &  0.127 (0.122) &  0.242 (0.237) &  0.089 (0.081) &  0.113 (0.107) \\
                    &    & 50 &

In [None]:
# Rich display of the same summary table that was printed as LaTeX above.
pivot_results

Unnamed: 0_level_0,Unnamed: 1_level_0,model_type,IN-IN,IN-OUT,OUT-IN,OUT-OUT
type,method,window,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
final_relatedness_score,cbow,5,0.520 (0.504),0.430 (0.430),0.440 (0.423),0.419 (0.408)
final_relatedness_score,cbow,50,0.526 (0.514),0.516 (0.505),0.510 (0.506),0.348 (0.343)
final_relatedness_score,sg,5,0.528 (0.511),0.476 (0.466),0.471 (0.466),0.499 (0.478)
final_relatedness_score,sg,50,0.424 (0.418),0.420 (0.398),0.410 (0.395),0.387 (0.373)
final_association_coverage,cbow,5,0.148 (0.147),0.244 (0.236),0.001 (0.000),0.080 (0.066)
final_association_coverage,cbow,50,0.191 (0.176),0.274 (0.254),0.000 (0.000),0.081 (0.074)
final_association_coverage,sg,5,0.127 (0.122),0.242 (0.237),0.089 (0.081),0.113 (0.107)
final_association_coverage,sg,50,0.123 (0.113),0.162 (0.153),0.040 (0.033),0.121 (0.104)
final_analogy_score,cbow,5,0.529 (0.494),0.364 (0.351),0.012 (0.004),0.410 (0.404)
final_analogy_score,cbow,50,0.495 (0.462),0.454 (0.433),0.000 (0.000),0.411 (0.404)
