In [2]:
# If using python 3.6: /Applications/Python\ 3.6/Install\ Certificates.command

# conda activate universal_sentence_encoder
# pip3 uninstall tensorflow
# pip3 install tensorflow==2.0
# pip3 install --upgrade tensorflow_hub   (installs the latest version)

In [3]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
import os
import pandas as pd
import re
import seaborn as sns

In [4]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
# `model` is the callable used by the embedding helper defined further down.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

# Confirm which module was loaded
print(f"module {module_url} loaded")

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


## Define functions

In [5]:
def embed(input):
    """Pass `input` to the notebook-level `model` and return whatever it produces.

    In this notebook `input` is a list of strings (annotations or recall
    transcripts) and the result exposes `.shape` and row indexing.
    """
    embeddings = model(input)
    return embeddings

In [6]:
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as dot(vector1, vector2) / (||vector1|| * ||vector2||).
    No special handling is applied when either vector has zero magnitude.
    """
    # Product of the two Euclidean norms forms the denominator
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    return np.dot(vector1, vector2) / norm_product

## Embed movie annotations

In [None]:
# Only show absl log messages at ERROR level or above
logging.set_verbosity(logging.ERROR)

# Read the movie annotation table
path = '../../data/2_behav/1_movie'
filename = os.path.join(path, "filmfest_annotations_KG.csv")
annotation_file = pd.read_csv(filename)

# Pull the 'annotation' column out as a plain list, skipping missing (NaN) cells
encoding_annotations = [
    text
    for text in annotation_file['annotation'].values.tolist()
    if str(text) != 'nan'
]

# One embedding per remaining annotation
annotation_embeddings = embed(encoding_annotations)

## Embed recall transcript

In [None]:
# Directories used by the per-subject loop below:
#   recall_path - where each subject's "<sub-XX>_recall_concat.csv" is read from
#   save_path   - where each subject's "_recall_fidelity.csv" is written
recall_path = '../../data/2_behav/2_recall/1_transcripts'
save_path = '../../data/2_behav/2_recall/2_embeddings'

In [None]:
# Subject numbers to process; each is zero-padded into a "sub-XX" label
# when building that subject's file names
subject_ids = [1, 2, 3, 4]

In [None]:
# Make sure the output directory exists once, before any subject is processed
os.makedirs(save_path, exist_ok=True)

# Convert the annotation embeddings to a NumPy array once; one row per movie event
annotation_matrix = np.asarray(annotation_embeddings)
num_rows_movie = annotation_matrix.shape[0]

for subid in subject_ids:
    bidsid = f"sub-{subid:02d}"

    # Import data
    recall_filename = os.path.join(recall_path, bidsid + "_recall_concat.csv")
    df = pd.read_csv(recall_filename)

    # Row l of the recall file is compared with annotation l below, so both
    # must describe the same number of events.
    if len(df) != num_rows_movie:
        raise ValueError(
            f"{recall_filename} has {len(df)} rows but there are "
            f"{num_rows_movie} annotation embeddings"
        )

    # Events with a missing transcript were not recalled: embed only the rest,
    # but remember which rows they were so results can be put back in event order
    recalled_mask = df['transcript'].notna().to_numpy()
    recall_transcript_no_nans = df.loc[recalled_mask, 'transcript'].tolist()
    num_rows_recall = len(recall_transcript_no_nans)

    # Cosine similarity between every recalled event and every annotation
    # (rows: recalled events in order, columns: movie events)
    cosine_similarity_matrix = np.zeros((num_rows_recall, num_rows_movie))

    # Skip the embedding call entirely when the subject recalled nothing
    if num_rows_recall > 0:
        recall_embeddings = np.asarray(embed(recall_transcript_no_nans))
        for i, recall_vector in enumerate(recall_embeddings):
            for j, annotation_vector in enumerate(annotation_matrix):
                cosine_similarity_matrix[i, j] = cosine_similarity(recall_vector, annotation_vector)

    # Expand back to one row per movie event (events x events matrix);
    # rows of unrecalled events start out as NaN
    cosine_similarity_with_nans = np.full((num_rows_movie, num_rows_movie), np.nan)
    cosine_similarity_with_nans[recalled_mask] = cosine_similarity_matrix

    # Replace nan with zero: any row that contains a NaN is zeroed in full
    rows_with_nan = np.isnan(cosine_similarity_with_nans).any(axis=1)
    cosine_similarity_with_nans[rows_with_nan] = 0.0

    # Recall fidelity = similarity between an event's recall and that same
    # event's annotation, i.e. the diagonal of the matrix
    recall_fid = np.diagonal(cosine_similarity_with_nans).copy()

    # Create similarity_df
    similarity_df = pd.DataFrame({
        'subj': [subid] * num_rows_movie,                           # Current subject, repeated once per event
        'events': list(range(1, num_rows_movie + 1)),               # Event number, 1-based
        'recalled': df['recalled'],                                 # Binary index of recall (1=recalled, 0=not recalled)
        'recall_fidelity': recall_fid
    })

    # Save under the same "sub-XX" label used for the input file
    output_filename = os.path.join(save_path, bidsid + "_recall_fidelity.csv")
    similarity_df.to_csv(output_filename, index=False)