# **Word Similarity**

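This notebook evaluates word embeddings on word-similarity benchmarks: for every benchmark word pair it computes the cosine similarity of the two embeddings and reports the Spearman rank correlation between those similarities and the gold scores.

Reference for the correlation function (`scipy.stats.spearmanr`): https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html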

## **Setup: Mount Drive and Import Packages**

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd 'drive/My Drive/TFG/Code/MyModel'

Mounted at /content/drive
/content/drive/My Drive/TFG/Code/MyModel


In [2]:
import pandas as pd
import numpy as np
import torch
import scipy

from numpy import dot
from numpy.linalg import norm
from scipy import stats

## **1. Define Functions**

In [3]:
def words_pairs(keys, benchmark, embeddings):
    """Collect gold scores and cosine similarities for the benchmark word pairs.

    Pairs in which either word is missing from ``keys`` are skipped, so the two
    returned lists may be shorter than ``benchmark``.

    Args:
        keys: Mapping from a word to its row index in ``embeddings``.
        benchmark: DataFrame with the columns ``'word1'``, ``'word2'`` and
            ``'score'``.
        embeddings: Indexable collection of vectors (nested lists or a 2-D
            array), addressed by the indices stored in ``keys``.

    Returns:
        A tuple ``(GOLD_LABEL, COSINE_SIMILARITY)`` of two equally long lists:
        the benchmark score and the cosine similarity of every retained pair,
        in benchmark order.

    Raises:
        KeyError: If ``benchmark`` lacks one of the required columns.
        IndexError: If an index stored in ``keys`` is out of range for
            ``embeddings``.
    """
    GOLD_LABEL = []
    COSINE_SIMILARITY = []

    # Walk the three columns in lockstep instead of materialising a row Series
    # with ``benchmark.iloc[i]`` on every iteration.
    for word1, word2, score in zip(benchmark['word1'], benchmark['word2'], benchmark['score']):
        try:
            # Take index of word pair
            pos_word1 = keys[word1]
            pos_word2 = keys[word2]
        except KeyError:
            # Out-of-vocabulary word: skip only this pair. Any other error
            # (e.g. an index outside ``embeddings``) is a real problem and
            # must surface instead of silently shrinking the evaluation set.
            continue

        vec1 = embeddings[pos_word1]
        vec2 = embeddings[pos_word2]
        # Compute cosine similarity of word pair
        cos_sim = dot(vec1, vec2) / (norm(vec1) * norm(vec2))

        # Save 'score' and cosine similarity of word pair
        GOLD_LABEL.append(score)
        COSINE_SIMILARITY.append(cos_sim)

    return GOLD_LABEL, COSINE_SIMILARITY

def spearman(GOLD, COSINE):
    """Return the Spearman rank correlation between two score sequences.

    Args:
        GOLD: Sequence of gold-standard similarity scores.
        COSINE: Sequence of predicted similarities, aligned with ``GOLD``.

    Returns:
        A tuple ``(coefficient, p)`` as computed by ``scipy.stats.spearmanr``.
    """
    result = stats.spearmanr(a=GOLD, b=COSINE, axis=0)
    # The result unpacks as (correlation, p-value); return it as a plain tuple.
    return result[0], result[1]

## **2. Evaluate with Benchmarks**

In [4]:
def evaluate(embeddings, keys_path='Keys_att_row_short.npy', benchmarks=None):
    """Print the Spearman score of ``embeddings`` on each similarity benchmark.

    For every benchmark the word pairs are looked up in the vocabulary, their
    cosine similarities are computed with ``words_pairs`` and the rank
    correlation with the gold scores is obtained from ``spearman``.

    Args:
        embeddings: Indexable collection of vectors whose row order matches the
            vocabulary stored in ``keys_path``.
        keys_path: Path of the ``.npy`` file holding the vocabulary; the
            position of a word in that array is used as its row index.
        benchmarks: Optional iterable of ``(name, csv_path, separator)``
            tuples. Each CSV must provide the columns read by ``words_pairs``.
            Defaults to the SemSim and VisSim benchmarks.

    Returns:
        None. One line per benchmark is printed.
    """
    if benchmarks is None:
        benchmarks = (
            ('SemSim', './Benchmarks/SemSim.csv', '\t'),
            ('VisSim', './Benchmarks/VisSim.csv', '\t'),
            # Currently disabled benchmarks; uncomment to include them.
            # ('SimLex', './Benchmarks/SimLex-999.csv', '\t'),
            # ('MEN', './Benchmarks/MEN.csv', '\t'),
            # ('WordSim', './Benchmarks/wordsim353.csv', ','),
        )

    # Previously used vocabulary file: '0_keys.npy'.
    KEYS = np.load(keys_path)
    # Map every word to its position, i.e. its row in ``embeddings``.
    DATA_KEYS = {word: row for row, word in enumerate(KEYS)}

    for name, csv_path, separator in benchmarks:
        BENCHMARK = pd.read_csv(csv_path, sep=separator)
        GOLD_LABEL, COSINE_SIMILARITY = words_pairs(DATA_KEYS, BENCHMARK, embeddings)
        SCORE, _ = spearman(GOLD_LABEL, COSINE_SIMILARITY)
        print(f'    {name} score: {SCORE:.3f}')

## **3. Import Features and Embeddings**

In [5]:
def _load_as_list(path, label):
    """Load a ``.npy`` matrix, print its shape under ``label``, return it as nested lists."""
    matrix = np.load(path)
    print(f'{label}: {matrix.shape}')
    return matrix.tolist()


# Textual modality (alternative feature file: 'Textual_att_matrix_short.npy').
Xo1 = _load_as_list('0_textual_features_439.npy', 'Textual features')
Xo2 = _load_as_list('Textual_Embeddings.npy', 'Textual embeddings')

print(' ')

# Visual modality (alternative feature file: 'Visual_att_matrix_short.npy').
Xo3 = _load_as_list('0_visual_features_439.npy', 'Visual features')
Xo4 = _load_as_list('Visual_Embeddings.npy', 'Visual embeddings')

print(' ')

# Multimodal modality.
Xo5 = _load_as_list('Multimodal_Features.npy', 'Multimodal features')
Xo6 = _load_as_list('Multimodal_Embeddings.npy', 'Multimodal embeddings')

Textual features: (439, 300)
Textual embeddings: (439, 100)
 
Visual features: (439, 300)
Visual embeddings: (439, 250)
 
Multimodal features: (439, 350)
Multimodal embeddings: (439, 150)


## **4. Obtain Scores**

### **4.1. Textual**

In [6]:
# Score the textual input features, then the textual embeddings, so the two
# reports can be compared directly.
print('Textual Features')
evaluate(Xo1)
print('--------------------------------', 'Textual Embeddings (GCN)', sep='\n')
evaluate(Xo2)

Textual Features
    SemSim score: 0.657
    VisSim score: 0.518
--------------------------------
Textual Embeddings (GCN)
    SemSim score: 0.753
    VisSim score: 0.564


### **4.2. Visual**

In [7]:
# Score the visual input features, then the visual embeddings, so the two
# reports can be compared directly.
print('Visual Features')
evaluate(Xo3)
print('--------------------------------', 'Visual Embeddings (GCN)', sep='\n')
evaluate(Xo4)

Visual Features
    SemSim score: 0.427
    VisSim score: 0.368
--------------------------------
Visual Embeddings (GCN)
    SemSim score: 0.650
    VisSim score: 0.582


### **4.3. Multimodal**

In [8]:
# Score the multimodal input features, then the multimodal embeddings, so the
# two reports can be compared directly.
print('Multimodal Features')
evaluate(Xo5)
print('--------------------------------', 'Multimodal Embeddings (GCN)', sep='\n')
evaluate(Xo6)

Multimodal Features
    SemSim score: 0.729
    VisSim score: 0.612
--------------------------------
Multimodal Embeddings (GCN)
    SemSim score: 0.718
    VisSim score: 0.583
