# File and GPU access

In [None]:
import json
import itertools
import scipy.spatial.distance

import pandas as pd
import numpy as np

from numpy.random import randint
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler, normalize
from numpy.linalg import matrix_power
from functools import lru_cache

In [None]:
# Vocabulary table: the position of a word in its "Word" column is used further
# down as that word's row index into the embedding and similarity matrices.
sample_df = pd.read_csv(filepath_or_buffer="../data/vocab.csv")

# Constructing word similarities

For each of the 12218 words, we have (1) GloVe, (2) SWOW, and (3) non-contextual BERT embeddings.
SWOW has two versions (PPMI and Random Walk); we use the Random Walk (RW) version.
The context-free BERT embeddings are obtained by encoding "CLS [word] SEP" and summing across the last four layers (768-dim).


In [None]:
# One embedding matrix per representation. Each CSV is transposed after loading;
# rows of the resulting arrays are later addressed by vocabulary position.
representations = {
    'glove': pd.read_csv("../data/glove_embeddings.csv").T.values,
    'swow': pd.read_csv("../data/swow_embeddings.csv").T.values,
    'bert-sum': pd.read_csv("../data/bert_embeddings.csv").T.values,
}

In [None]:
def create_similarity_matrix(vector1, vector2, N=None):
    """Return the pairwise cosine-similarity matrix between two sets of row vectors.

    Parameters
    ----------
    vector1 : 2-D array-like
        One vector per row.
    vector2 : 2-D array-like
        One vector per row, with the same number of columns as ``vector1``.
    N : ignored, optional
        Accepted only so that existing three-argument calls keep working. The
        output shape is always derived from the inputs, so the value passed
        here has no effect.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vector1), len(vector2))`` whose ``[i, j]`` entry is
        ``1 - cosine_distance(vector1[i], vector2[j])``.
    """
    # cdist already returns a (len(vector1), len(vector2)) array, so no
    # flatten-and-reshape round trip is needed.
    return 1 - scipy.spatial.distance.cdist(vector1, vector2, 'cosine')

# Word-by-word cosine similarity matrix for every representation.
# All three are built the same way, always passing the row count as the third
# argument (the 'glove' call previously omitted it and raised a TypeError).
sim_matrices = {
    name: create_similarity_matrix(
        representations[name], representations[name], len(representations[name])
    )
    for name in ('glove', 'swow', 'bert-sum')
}

In [None]:
# Sanity check on the SWOW arrays: the similarity matrix is built from the
# representation against itself, so it should be square with one row per vector.
print('representation shape', np.shape(representations['swow']))
print('similarity matrix shape', np.shape(sim_matrices['swow']))

# Importing Empirical Data

In [None]:
## Load the empirical data and keep only the columns needed here
## (selected by position: columns 3 through 36 of the raw file).
rawdata = pd.read_csv("../data/raw_data.csv")
rawdata = rawdata.loc[:, rawdata.columns[3:37]]
print(f"our data has {len(rawdata)} rows")

# Defining composite vectors

We construct a baseline model based simply on the average vector of W1 and W2.

In [None]:
def average_vec(word1, word2, labels_df, embeddings, embeddingsize):
    """Return the element-wise mean of two words' embedding rows.

    Each word is located by its first position in ``labels_df["Word"]`` and
    that position is used as the row index into ``embeddings``.

    Parameters
    ----------
    word1, word2 : words to look up in ``labels_df["Word"]``.
    labels_df : table with a "Word" column.
    embeddings : indexable collection of row vectors, aligned with ``labels_df``.
    embeddingsize : length of one embedding row; used for the output shape.

    Returns
    -------
    The averaged vector reshaped to ``(1, embeddingsize)``.

    Raises
    ------
    ValueError
        If either word is missing from the "Word" column, or if the averaged
        vector cannot be reshaped to ``(1, embeddingsize)``.
    """
    vocabulary = list(labels_df["Word"])
    # Resolve both positions before touching the embeddings.
    first_row, second_row = vocabulary.index(word1), vocabulary.index(word2)

    midpoint = (embeddings[first_row] + embeddings[second_row]) / 2
    return midpoint.reshape(1, embeddingsize)

# Quick check on GloVe: cosine similarity between "apple" and the
# apple/mango average vector (displayed as the cell output).
avg = average_vec("apple", "mango", sample_df, representations['glove'], 300)
1 - scipy.spatial.distance.cdist(
    representations['glove'][list(sample_df["Word"]).index("apple")].reshape((1, 300)),
    avg.reshape((1, 300)),
    metric='cosine',
)

We also define the average of W1, W2, and Clue1 (used for the feedback analyses).

In [None]:
def w1w2c1_vec(word1, word2, clue1, labels_df, embeddings, embeddingsize):
    """Return the element-wise mean of the embedding rows of two words and a clue.

    Each term is located by its first position in ``labels_df["Word"]`` and
    that position is used as the row index into ``embeddings``.

    Parameters
    ----------
    word1, word2, clue1 : terms to look up in ``labels_df["Word"]``.
    labels_df : table with a "Word" column.
    embeddings : indexable collection of row vectors, aligned with ``labels_df``.
    embeddingsize : length of one embedding row; used for the output shape.

    Returns
    -------
    The three-way average reshaped to ``(1, embeddingsize)``.

    Raises
    ------
    ValueError
        If any term is missing from the "Word" column, or if the averaged
        vector cannot be reshaped to ``(1, embeddingsize)``.
    """
    vocabulary = list(labels_df["Word"])
    # Resolve all three positions first, in the order word1, word2, clue1.
    rows = [vocabulary.index(term) for term in (word1, word2, clue1)]

    total = embeddings[rows[0]] + embeddings[rows[1]] + embeddings[rows[2]]
    return (total / 3).reshape(1, embeddingsize)

# Quick check: the three-way average should come back as a single row of length 300.
np.shape(
    w1w2c1_vec("lion", "tiger", "animal", sample_df, representations['glove'], 300)
)

## Constructing all combinations

initialize with NAs

In [None]:
# Work on a copy of the raw data and pre-create one "<representation>_<pair>_sim"
# column per representation/pair combination, filled with the string 'NA'.
maindata_sample = rawdata.copy()
for representation, col in itertools.product(
    ['bert-sum', 'glove', 'swow'],
    ['w1w2', 'c1w1', 'c1w2', 'c1avg', 'c2w1', 'c2w2', 'c2_w1w2avg', 'c2_c1avg'],
):
    maindata_sample[representation + "_" + col + "_sim"] = 'NA'

Now compute the similarities between the different words/clues by looping through the empirical data. Note that the "embedding" and "sim_matrix" variables are reassigned at the start of each pass of the outer loop, so the columns for each embedding are filled in one representation after another. Because the loop already iterates over all three representations, the cell only needs to run once.

In [None]:
# Build the word -> row-position lookup once, instead of re-creating
# list(sample_df["Word"]) for every membership test and index lookup.
# setdefault keeps the first occurrence of a word, matching list.index().
word_to_index = {}
for position, vocab_word in enumerate(sample_df["Word"]):
    word_to_index.setdefault(vocab_word, position)


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors as a Python float."""
    distance = scipy.spatial.distance.cdist(
        vec_a.reshape(1, -1), vec_b.reshape(1, -1), 'cosine'
    )
    return 1 - distance.tolist()[0][0]


for representation in ['bert-sum', 'glove', 'swow']:
    embedding = representations[representation]
    sim_matrix = sim_matrices[representation]

    for index, row in maindata_sample.iterrows():
        w1_index = word_to_index.get(str(row["Word1"]))
        w2_index = word_to_index.get(str(row["Word2"]))
        # Nothing is written for a row unless both target words are in the
        # vocabulary; its similarity columns keep their initial values.
        if w1_index is None or w2_index is None:
            continue

        # A clue that is not in the vocabulary resolves to None and is skipped.
        c1_index = word_to_index.get(str(row["Clue1"]))
        c2_index = word_to_index.get(str(row["Clue2"]))

        maindata_sample.loc[index, representation + "_w1w2_sim"] = sim_matrix[w1_index, w2_index]

        # Average of the two target-word vectors, shared by both clue branches.
        w1w2avg = (embedding[w1_index] + embedding[w2_index]) / 2

        if c1_index is not None:
            maindata_sample.loc[index, representation + "_c1w1_sim"] = sim_matrix[w1_index, c1_index]
            maindata_sample.loc[index, representation + "_c1w2_sim"] = sim_matrix[w2_index, c1_index]
            maindata_sample.loc[index, representation + "_c1avg_sim"] = _cosine_similarity(embedding[c1_index], w1w2avg)

        if c2_index is not None:
            maindata_sample.loc[index, representation + "_c2w1_sim"] = sim_matrix[w1_index, c2_index]
            maindata_sample.loc[index, representation + "_c2w2_sim"] = sim_matrix[w2_index, c2_index]
            maindata_sample.loc[index, representation + "_c2_w1w2avg_sim"] = _cosine_similarity(embedding[c2_index], w1w2avg)

            if c1_index is not None:
                # Second clue against the average of word1, word2 and the first clue.
                c1avg_vec = (embedding[w1_index] + embedding[w2_index] + embedding[c1_index]) / 3
                maindata_sample.loc[index, representation + "_c2_c1avg_sim"] = _cosine_similarity(embedding[c2_index], c1avg_vec)
                    
                    
                    

In [None]:
# Display the word1-word2 similarities computed from the BERT representation.
maindata_sample.loc[:, 'bert-sum_w1w2_sim']

In [None]:
# Persist the empirical data together with all computed similarity columns.
# NOTE(review): no index= argument is passed, so pandas' default applies;
# confirm the downstream analysis expects that layout.
maindata_sample.to_csv(path_or_buf="../data/descriptive_precomputed.csv")

These estimates will now be analysed in a separate Rmd file.