In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys 
from os.path import join as pjoin

sys.path.append('/home/hhansen/decon/decon_env/DecontextEmbeddings')
import os 
os.environ['EMBEDDING_EVALUATION_DATA_PATH'] = '/home/hhansen/decon/decon_env/DecontextEmbeddings/helpers/embedding_evaluation/data/'

DATA_DIR = '/home/hhansen/DecontextEmbeddings/data'
os.environ['DATA_DIR'] = DATA_DIR

FIGURE_OUTPUT_DIR = pjoin(DATA_DIR, 'figures', 'similarity_datasets')


from scipy.spatial.distance import squareform


from helpers.embedding_evaluation.evaluate import Evaluation as wordsim_evaluate
from helpers.things_evaluation.evaluate import read_embeddings, load_behav, load_sorting, match_behv_sim, evaluate as run_evaluation
from helpers.data import yield_static_data, load_simlex, load_wordsim
from helpers.similarity_values import rsa_matrix_to_pair_list, get_spose_matrix
from helpers.plot import set_style_and_font_size
set_style_and_font_size()

from scipy.stats import spearmanr, pearsonr

/home/hhansen/DecontextEmbeddings/data


In [2]:
tex_fonts = {
        # Use LaTeX to write all text
        #"text.usetex": True,
        #"font.family": "serif",
        # Use 10pt font in plots, to match 10pt font in document
        "axes.labelsize": 12,
        "font.size": 12,
        # Make the legend/label fonts a little smaller
        "legend.fontsize": 10,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10
}

plt.rcParams.update(tex_fonts)

# Similarity distribution

In [3]:
simlex_df = load_simlex()
wordsim_df = load_wordsim()

In [4]:
things_df = get_spose_matrix()
things_list = rsa_matrix_to_pair_list(things_df)
things_list = things_list.rename(columns={'sim': 'human_pred'})

In [5]:
def plot(vec, title, ax, bins):
    ax.hist(vec, bins=bins)
    ax.set_xlabel('Similarity value')
    ax.set_ylabel('Number of word pairs')
    ax.set_title(title)

In [6]:
fig, axes = plt.subplots(1,3, figsize=(11.68,4))
plot(simlex_df['human_pred'], 'Simlex-999', axes[0], 10)
plot(wordsim_df['human_pred'], 'Wordsim-353', axes[1], 10)
plot(things_list['human_pred'], 'THINGS', axes[2], 100)
plt.tight_layout()
plt.savefig(pjoin(FIGURE_OUTPUT_DIR, 'dataset_hist.pdf'), bbox_inches='tight')

FileNotFoundError: missing font metrics file: cmr10

FileNotFoundError: missing font metrics file: cmr12

<Figure size 840.96x288 with 3 Axes>

# Comparison between datasets

# Wordsim

In [6]:
wordsim_df.sort_values('human_pred', ascending=False).head(10)

Unnamed: 0,word1,word2,human_pred
2,tiger,tiger,10.0
40,fuck,sex,9.44
68,journey,voyage,9.29
73,midday,noon,9.29
266,dollar,buck,9.22
31,money,cash,9.15
70,coast,shore,9.1
97,money,cash,9.08
98,money,currency,9.04
42,football,soccer,9.03


# Simlex

In [7]:
simlex_df.sort_values('human_pred', ascending=False).head(10)

Unnamed: 0,word1,word2,human_pred
782,vanish,disappear,9.8
22,quick,rapid,9.7
205,creator,maker,9.62
8,stupid,dumb,9.58
16,insane,crazy,9.57
42,large,big,9.55
3,happy,cheerful,9.55
140,cow,cattle,9.52
134,area,region,9.47
18,large,huge,9.47


# THINGS

In [8]:
things_list.sort_values('human_pred', ascending=False).head(10)

Unnamed: 0,word1,word2,human_pred
917274,finger,knee,0.99603
1061181,grate,screen2,0.995483
316548,bowling_ball,marble,0.994858
1241745,knee,toe,0.994624
1708284,trombone,tuba,0.994454
967054,foot,knee,0.994447
41389,ankle,knee,0.994389
929570,firewood,wood,0.991492
918065,finger,toe,0.991269
958072,flute,tuba,0.990567


# Simlex-Wordsim

In [9]:
rows = []

for row in wordsim_df.iterrows():
    word1 = row[1].word1
    word2 = row[1].word2
    
    simlex_sim = simlex_df[((simlex_df['word1'] == word1) & (simlex_df['word2'] == word2) | (simlex_df['word1'] == word2) & (simlex_df['word2'] == word1))]
    if not simlex_sim.empty:
        simlex_sim = simlex_sim.reset_index().loc[0, 'human_pred']
        rows.append({'word1': word1, 'word2': word2, 'wordsim_sim': row[1].human_pred, 'simlex_sim': simlex_sim})

intersection_df = pd.DataFrame(rows)
intersection_df['sim_rank'] = intersection_df['simlex_sim'].rank()
intersection_df['wordsim_sim'] = intersection_df['wordsim_sim'].rank()
intersection_df['di^2'] = intersection_df.apply(lambda row: (row['sim_rank'] - row['wordsim_sim']) ** 2, axis=1)
df = intersection_df.sort_values('di^2', ascending=False)
df = df[['word1', 'word2', 'wordsim_sim', 'simlex_sim']]
df

Unnamed: 0,word1,word2,wordsim_sim,simlex_sim
7,man,woman,8.0,3.33
5,closet,clothes,6.0,3.27
0,professor,doctor,1.0,4.65
3,psychology,science,2.0,4.92
8,aluminum,metal,5.0,7.25
1,student,professor,3.0,1.95
6,day,dawn,4.0,5.47
2,coast,shore,9.0,8.83
4,planet,moon,7.0,5.87


In [10]:
print(df.to_latex(index=False))

\begin{tabular}{llrr}
\toprule
     word1 &     word2 &  wordsim\_sim &  simlex\_sim \\
\midrule
       man &     woman &          8.0 &        3.33 \\
    closet &   clothes &          6.0 &        3.27 \\
 professor &    doctor &          1.0 &        4.65 \\
psychology &   science &          2.0 &        4.92 \\
  aluminum &     metal &          5.0 &        7.25 \\
   student & professor &          3.0 &        1.95 \\
       day &      dawn &          4.0 &        5.47 \\
     coast &     shore &          9.0 &        8.83 \\
    planet &      moon &          7.0 &        5.87 \\
\bottomrule
\end{tabular}



# Simlex-THINGS

In [11]:
things_words = list(pd.read_csv(f'{DATA_DIR}/things/things_concepts.tsv', sep='\t')['uniqueID'].unique())

In [12]:
rows = []

for row in simlex_df.iterrows():
    word1 = row[1].word1
    word2 = row[1].word2
    word1_things = word1.replace(' ', '_')
    word2_things = word2.replace(' ', '_')

    try:
        things_sim = things_df.loc[word1, word2]
    except:
        continue
    
    rows.append({'word1': word1, 'word2': word2, 'things_sim': things_sim, 'simlex_sim': row[1].human_pred})

intersection_df = pd.DataFrame(rows)
intersection_df['sim_rank'] = intersection_df['simlex_sim'].rank()
intersection_df['things_rank'] = intersection_df['things_sim'].rank()
intersection_df['di^2'] = intersection_df.apply(lambda row: (row['sim_rank'] - row['things_rank']) ** 2, axis=1)
df = intersection_df.sort_values('di^2', ascending=False).head(10)
df = df[['word1', 'word2', 'things_sim', 'simlex_sim']]
df

Unnamed: 0,word1,word2,things_sim,simlex_sim
89,bottle,container,0.312603,7.93
3,dog,cat,0.929862,1.75
65,arm,knee,0.9807,2.75
66,cat,rabbit,0.939365,2.37
29,door,gate,0.658421,5.25
44,purse,bag,0.783658,8.33
24,chair,bench,0.72502,6.67
94,dog,horse,0.929813,2.38
7,boat,anchor,0.908871,2.25
10,wood,log,0.751845,7.3


In [13]:
print(df.to_latex(index=False))

\begin{tabular}{llrr}
\toprule
 word1 &     word2 &  things\_sim &  simlex\_sim \\
\midrule
bottle & container &    0.312603 &        7.93 \\
   dog &       cat &    0.929862 &        1.75 \\
   arm &      knee &    0.980700 &        2.75 \\
   cat &    rabbit &    0.939365 &        2.37 \\
  door &      gate &    0.658421 &        5.25 \\
 purse &       bag &    0.783658 &        8.33 \\
 chair &     bench &    0.725020 &        6.67 \\
   dog &     horse &    0.929813 &        2.38 \\
  boat &    anchor &    0.908871 &        2.25 \\
  wood &       log &    0.751845 &        7.30 \\
\bottomrule
\end{tabular}



# THINGS-Wordsim

In [14]:
rows = []

for row in wordsim_df.iterrows():
    word1 = row[1].word1
    word2 = row[1].word2
    word1_things = word1.replace(' ', '_')
    word2_things = word2.replace(' ', '_')

    try:
        things_sim = things_df.loc[word1, word2]
    except Exception as e:
        continue
    
    rows.append({'word1': word1, 'word2': word2, 'things_sim': things_sim, 'wordsim_sim': row[1].human_pred})

intersection_df = pd.DataFrame(rows)
intersection_df['thing_rank'] = intersection_df['things_sim'].rank()
intersection_df['wordsim_rank'] = intersection_df['wordsim_sim'].rank()
intersection_df['di^2'] = intersection_df.apply(lambda row: (row['thing_rank'] - row['wordsim_rank']) ** 2, axis=1)

df = intersection_df.sort_values('di^2', ascending=False).head(10)
df = df[['word1', 'word2', 'things_sim', 'wordsim_sim']]
df

Unnamed: 0,word1,word2,things_sim,wordsim_sim
15,money,bank,0.392443,8.5
4,train,car,0.96357,6.31
8,bank,money,0.392443,8.12
14,bird,crane,0.236589,7.38
9,football,basketball,0.949402,6.81
24,television,film,0.687924,7.72
7,cucumber,potato,0.863655,5.92
13,gem,jewel,0.872318,8.96
5,television,radio,0.888968,6.77
0,tiger,cat,0.93284,7.35


In [15]:
print(df.to_latex(index=False))

\begin{tabular}{llrr}
\toprule
     word1 &      word2 &  things\_sim &  wordsim\_sim \\
\midrule
     money &       bank &    0.392443 &         8.50 \\
     train &        car &    0.963570 &         6.31 \\
      bank &      money &    0.392443 &         8.12 \\
      bird &      crane &    0.236589 &         7.38 \\
  football & basketball &    0.949402 &         6.81 \\
television &       film &    0.687924 &         7.72 \\
  cucumber &     potato &    0.863655 &         5.92 \\
       gem &      jewel &    0.872318 &         8.96 \\
television &      radio &    0.888968 &         6.77 \\
     tiger &        cat &    0.932840 &         7.35 \\
\bottomrule
\end{tabular}



# Simlex-Wordsim-THINGS

In [16]:
rows = []

for row in simlex_df.iterrows():
    word1 = row[1].word1
    word2 = row[1].word2
    word1_things = word1.replace(' ', '_')
    word2_things = word2.replace(' ', '_')  
        
    try:
        things_sim = things_df.loc[word1, word2]
    except:
        continue
    
    wordsim_sim = wordsim_df[((wordsim_df['word1'] == word1) & (wordsim_df['word2'] == word2) | (wordsim_df['word1'] == word2) & (wordsim_df['word2'] == word1))]
    if not wordsim_sim.empty:
        wordsim_sim = wordsim_sim.reset_index().loc[0, 'human_pred']
        rows.append({'word1': word1, 'word2': word2, 'things_sim': things_sim, 'simlex_sim': row[1].human_pred, 'wordsim_sim': wordsim_sim})

intersection_df = pd.DataFrame(rows)
df = intersection_df.sort_values('things_sim', ascending=False).head(30)
df

Unnamed: 0,word1,word2,things_sim,simlex_sim,wordsim_sim
0,woman,man,0.884549,3.33,8.3
1,clothes,closet,0.874987,3.27,8.0


In [17]:
print(df.to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
  word1 &  word2 &  things\_sim &  simlex\_sim &  wordsim\_sim \\
\midrule
  woman &    man &    0.884549 &        3.33 &          8.3 \\
clothes & closet &    0.874987 &        3.27 &          8.0 \\
\bottomrule
\end{tabular}

