## Semantic Similarities

In [1]:
# imports
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import cmlreaders as cml
from tqdm.notebook import tqdm
import itertools
import gensim
import warnings; warnings.filterwarnings("ignore")

In [6]:
def find_all_encoding_words(df_select):
    """Collect the unique encoding words presented across the selected sessions.

    Parameters
    ----------
    df_select : pandas.DataFrame
        One row per session. Each row must expose ``subject``, ``experiment``,
        ``session``, ``localization`` and ``montage``, which are handed to
        ``cml.CMLReader`` to load that session's events.

    Returns
    -------
    numpy.ndarray
        Sorted unique words (the result of ``np.unique``) taken from the
        ``WORD`` events of every session that could be loaded.

    Notes
    -----
    Sessions whose events fail to load, or that lack the expected word
    column, are skipped silently (best-effort collection).
    """
    all_words = []
    # Subjects excluded by ID (Spanish-language subjects, per the variable name).
    spanish_subs = {'R1039M', 'R1070T', 'R1094T', 'R1134T', 'R1331T', 'R1461T', 'R1499T'}
    # For pyFR, only subjects whose code starts with one of these prefixes are kept.
    pyfr_prefixes = ('UP', 'TJ', 'CP', 'BW', 'CH')

    for _, row in tqdm(df_select.iterrows(), total=len(df_select)):
        # only english subjects
        if row.subject in spanish_subs:
            continue
        if row.experiment == 'pyFR' and not row.subject.startswith(pyfr_prefixes):
            continue

        reader = cml.CMLReader(row.subject, row.experiment, row.session, row.localization, row.montage)

        try:
            evs = reader.load('events')

            # select encoding events
            evs = evs[evs['type'] == 'WORD']

            # pyFR stores the presented word under 'item'; the other
            # experiments use 'item_name'.
            word_column = 'item' if row.experiment == 'pyFR' else 'item_name'
            words = evs[word_column].unique()
        except Exception:
            # Skip sessions that cannot be read. Catching Exception (not
            # BaseException) lets KeyboardInterrupt / SystemExit propagate so
            # a long run can still be interrupted.
            continue

        all_words.extend(words)

    return np.unique(all_words)

In [35]:
def find_vector_representation(w, model, toggle):
    """Look up the vector for ``w`` in ``model`` under a single letter case.

    Parameters
    ----------
    w : str
        Word to look up.
    model : mapping
        Anything indexable by string key that raises ``KeyError`` for a
        missing key.
    toggle : bool
        Truthy -> query the upper-cased word; falsy -> query the lower-cased
        word.

    Returns
    -------
    tuple
        ``(True, vector)`` when the key exists, otherwise ``(False, None)``.
    """
    key = w.upper() if toggle else w.lower()
    try:
        vector = model[key]
    except KeyError:
        return False, None
    return True, vector

In [38]:
def generate_semantic_similarities(all_words, model):
    """Compute pairwise cosine similarities between word vectors.

    Parameters
    ----------
    all_words : iterable of str
        Words to compare; every unordered pair is visited once.
    model : mapping
        Word-vector lookup indexable by string key that raises ``KeyError``
        for unknown words. Each word is tried upper-cased first, then
        lower-cased.

    Returns
    -------
    pandas.DataFrame
        Columns ``word_i``, ``word_j``, ``cosine_similarity``. Each pair with
        both vectors available contributes two rows, (w1, w2) and (w2, w1).
        Pairs where either word has no vector are printed and left out.
    """
    words = list(all_words)

    # Resolve each distinct word once up front instead of once per pair: the
    # lookup (up to two attempts) and the norm do not depend on the partner.
    resolved = {}
    for w in words:
        if w in resolved:
            continue
        found, wv = find_vector_representation(w, model, True)        # uppercase
        if not found:
            found, wv = find_vector_representation(w, model, False)   # lowercase
        resolved[w] = (wv, np.linalg.norm(wv)) if found else None

    # Number of unordered pairs, so the progress bar can show completion.
    n_pairs = len(words) * (len(words) - 1) // 2

    sem_sim = []
    for w1, w2 in tqdm(itertools.combinations(words, 2), total=n_pairs):
        entry1 = resolved[w1]
        entry2 = resolved[w2]

        if entry1 is None or entry2 is None:
            # at least one word has no vector: report the pair and skip it
            print(w1, w2)
            continue

        wv1, norm1 = entry1
        wv2, norm2 = entry2

        # cosine similarity
        cos_sim = np.dot(wv1, wv2) / (norm1 * norm2)

        # store both directions
        sem_sim.append((w1, w2, cos_sim))
        sem_sim.append((w2, w1, cos_sim))

    return pd.DataFrame(sem_sim, columns=['word_i', 'word_j', 'cosine_similarity'])

In [4]:
# Session index as returned by cmlreaders.
df = cml.get_data_index()

# Sessions belonging to the FR1, pyFR and IFR1 experiments.
df_intrac = df[df['experiment'].isin(['FR1', 'pyFR', 'IFR1'])]
# ltpFR2 sessions, leaving out session 23.
df_scalp = df[(df['session'] != 23) & (df['experiment'] == 'ltpFR2')]

# Stack both selections into a single table with a fresh integer index.
df_select = pd.concat((df_intrac, df_scalp), ignore_index=True)

In [8]:
# Unique encoding words gathered from every selected session.
all_words = find_all_encoding_words(df_select)

0it [00:00, ?it/s]

In [3]:
# Load word vectors stored in binary word2vec format from the GoogleNews file
# (path is relative to the working directory).
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [39]:
# Cosine similarity for every pair of encoding words that both have a vector.
sem_sim = generate_semantic_similarities(all_words, model)

0it [00:00, ?it/s]

In [41]:
# Save the similarity table to CSV, omitting the DataFrame index column.
sem_sim.to_csv('semantic_cml.csv', index=False)