## Semantic Similarities

In [1]:
# imports
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import cmlreaders as cml
from tqdm.notebook import tqdm
import itertools
import gensim
import warnings; warnings.filterwarnings("ignore")

In [6]:
# Index of every available 'ltp' session.
df = cml.get_data_index('ltp')

# Restrict to the ltpFR2 experiment, leaving out session number 23.
# NOTE(review): the reason session 23 is excluded is not recorded here — confirm.
df_select = df.loc[df['experiment'].eq('ltpFR2') & df['session'].ne(23)]

In [7]:
# all words (encoding and recalls)
def find_all_words(df_select):
    """Collect the unique item names presented or recalled in the given sessions.

    Parameters
    ----------
    df_select : pandas.DataFrame
        Data-index rows. Each row must expose ``subject``, ``experiment`` and
        ``session``, which are handed to ``cml.CMLReader``.

    Returns
    -------
    numpy.ndarray
        Result of ``np.unique`` over the ``item_name`` values of all events
        whose ``type`` is ``'WORD'`` or ``'REC_WORD'``.

    Notes
    -----
    Sessions whose events cannot be loaded or processed are skipped silently
    (best effort). Only ``Exception`` subclasses are swallowed, so
    ``KeyboardInterrupt`` and ``SystemExit`` still abort the loop.
    """
    all_words = []
    # iterrows() has no length, so pass the row count to get a real progress bar.
    for _, row in tqdm(df_select.iterrows(), total=len(df_select)):
        reader = cml.CMLReader(row.subject, row.experiment, row.session)

        try:
            evs = reader.load('events')

            # select encoding and recall events
            evs = evs[evs['type'].isin(['WORD', 'REC_WORD'])]
            words = evs['item_name'].unique()

            # Names are kept verbatim: entries containing '<' or '>'
            # (vocalizations, per the original author's note) are NOT removed.
            all_words.extend(list(words))

        except Exception:
            # Unreadable or malformed session: skip it and keep going.
            continue

    return np.unique(all_words)

In [8]:
def find_vector_representation(w, model, toggle):
    """Look up the vector for ``w`` in ``model`` under a single casing.

    Parameters
    ----------
    w : str
        Word to look up.
    model : mapping-like
        Object indexed as ``model[key]`` that raises ``KeyError`` for
        unknown keys.
    toggle : bool
        Truthy -> query ``w.upper()``; falsy -> query ``w.lower()``.

    Returns
    -------
    tuple
        ``(True, vector)`` when the key exists, otherwise ``(False, None)``.
    """
    key = w.upper() if toggle else w.lower()
    try:
        return True, model[key]
    except KeyError:
        return False, None

In [9]:
def generate_semantic_similarities(all_words, model):
    """Compute pairwise cosine similarities between word vectors.

    Parameters
    ----------
    all_words : iterable of str
        Words to compare.
    model : mapping-like
        Passed to ``find_vector_representation``; indexed by word to obtain
        its vector.

    Returns
    -------
    pandas.DataFrame
        Columns ``word_i``, ``word_j``, ``cosine_similarity``. Every unordered
        pair of words that both have a vector appears twice, once in each
        direction. Words with no vector under either the uppercase or the
        lowercase key are left out.
    """
    # Resolve each word once (uppercase key first, then lowercase) and cache
    # its norm; repeating this inside the pair loop costs O(n^2) lookups.
    embedded = []
    for word in all_words:
        found, vec = find_vector_representation(word, model, True)        # uppercase
        if not found:
            found, vec = find_vector_representation(word, model, False)   # lowercase
        if found:
            embedded.append((word, vec, np.linalg.norm(vec)))

    # combinations() has no length, so compute n-choose-2 for the progress bar.
    n_pairs = len(embedded) * (len(embedded) - 1) // 2

    sem_sim = []
    for (w1, wv1, norm1), (w2, wv2, norm2) in tqdm(itertools.combinations(embedded, 2), total=n_pairs):
        # cosine similarity
        cos_sim = np.dot(wv1, wv2) / (norm1 * norm2)

        # store both directions
        sem_sim.append((w1, w2, cos_sim))
        sem_sim.append((w2, w1, cos_sim))

    return pd.DataFrame(sem_sim, columns=['word_i', 'word_j', 'cosine_similarity'])

In [10]:
# Gather the unique item names across every selected session
# (loads the events of each session, so this can take a while).
all_words = find_all_words(df_select)

0it [00:00, ?it/s]

In [11]:
# Load word2vec-format vectors (binary) from the GoogleNews archive; the path is
# relative, so the file must be in the notebook's working directory.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [12]:
# Pairwise cosine similarities for every pair of words that have a vector in the model.
sem_sim = generate_semantic_similarities(all_words, model)

0it [00:00, ?it/s]

In [13]:
# Write the similarity table to CSV in the working directory, without the row index.
sem_sim.to_csv('semantic_cml.csv', index=False)