In [1]:
# If using python 3.6: /Applications/Python\ 3.6/Install\ Certificates.command 

# conda activate universal_sentence_encoder
# pip3 uninstall tensorflow
# pip3 install tensorflow==2.0
# pip3 install --upgrade tensorflow_hub  # latest release

In [1]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
import os
import pandas as pd
import re
import seaborn as sns

In [2]:
# Download the Universal Sentence Encoder (version 4) to the local machine.
# `model` is called directly on input text by `embed` below.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

# Confirm which module was loaded
print("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


## Define functions

In [3]:
def embed(input):
    """Run the module-level sentence-encoder ``model`` on ``input``.

    ``input`` is handed to ``model`` unchanged; callers in this notebook pass
    a list of text entries. Whatever ``model`` returns is returned as-is.
    """
    embeddings = model(input)
    return embeddings

## Embed movie annotations

In [None]:
# Reduce logging output.
logging.set_verbosity(logging.ERROR)

# Number of identical copies of the concatenated annotation text to embed.
# The recall cell compares row i of these embeddings with row i of each
# subject's transcript.
# NOTE(review): presumably this matches the number of rows per recall
# transcript -- confirm against the data.
N_ANNOTATION_COPIES = 50

# Import annotations
path = '../../data/2_behav/1_movie/encoding_annotations'
filename = os.path.join(path, "Sherlock_annotations_by_events.csv")
annotation_file = pd.read_csv(filename)

# Extract the annotation column directly as a plain list (needed for joining)
encoding_annotations = annotation_file['annotations'].values.tolist()

# Concatenate every annotation into one space-separated string
annotations_concat = ' '.join(encoding_annotations)

# One copy of the full annotation text per row. Built in a single step; the
# previous `[None] * 50` pre-allocation was overwritten immediately.
annotations_list = [annotations_concat] * N_ANNOTATION_COPIES

# Embed annotations
# NOTE(review): every row is the same text, so all rows of the result should
# be identical; embedding once and repeating would likely be equivalent.
annotation_embeddings = embed(annotations_list)

## Embed recall transcript

In [8]:
# Subject IDs to process: 1 through 17 inclusive
subject_ids = list(range(1, 18))

In [None]:
def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors.

    The result is NaN when either vector contains NaN or has zero norm.
    """
    # Product of the two norms; the original nested one norm inside the other.
    return np.dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))


# The annotation embeddings are the same for every subject, so convert them to
# a NumPy array once, outside the loop.
annotation_matrix = np.asarray(annotation_embeddings, dtype=float)
n_rows_compared, embedding_dim = annotation_matrix.shape

for subject_number in subject_ids:

    # Zero-padded subject label used in the file names, e.g. 3 -> "03".
    # Kept separate from the integer loop variable instead of rebinding it.
    subid = "{:02d}".format(subject_number)

    # Import data
    path = '../../data/2_behav/2_recall'
    filename = os.path.join(path, "sub-" + subid + "_recall_transcript.csv")
    df = pd.read_csv(filename)

    recall_transcript = df['transcript'].values.tolist()

    # Rows are paired by position with the annotation embeddings, so the
    # transcript must have at least that many rows. Extra rows are ignored,
    # as before.
    if len(recall_transcript) < n_rows_compared:
        raise ValueError(
            "sub-%s transcript has %d rows but %d are required"
            % (subid, len(recall_transcript), n_rows_compared)
        )

    # Use ONE missing-value mask for both filtering and re-insertion so the
    # embeddings can never be written back to the wrong rows.
    recall_transcript_nan_indices = pd.isnull(recall_transcript)
    recall_transcript_no_nans = [
        entry
        for entry, is_missing in zip(recall_transcript, recall_transcript_nan_indices)
        if not is_missing
    ]

    # Rows for missing entries stay NaN; the rest receive their embedding.
    # The width comes from the annotation embeddings, which are produced by
    # the same `embed` function.
    recall_embeddings_with_nans = np.full((len(recall_transcript), embedding_dim), np.nan)
    if recall_transcript_no_nans:  # skip the model call when every entry is missing
        recall_embeddings = embed(recall_transcript_no_nans)
        recall_embeddings_with_nans[~recall_transcript_nan_indices] = np.asarray(recall_embeddings)

    # Cosine similarity between row i of the annotation embeddings and row i
    # of the recall embeddings
    cos_sim_list = [
        _cosine_similarity(annotation_matrix[i], recall_embeddings_with_nans[i])
        for i in range(n_rows_compared)
    ]

    # Replace nan with zero
    cos_sim_list = [0 if np.isnan(x) else x for x in cos_sim_list]

    # Save cosine similarity
    path = '../../data/2_behav/2_recall'
    filename = os.path.join(path, "sub-" + subid + "_recall_fidelity.csv")

    cos_sim_df = pd.DataFrame({'cosine_similarity': cos_sim_list})

    # Writing to disk is intentionally left disabled, as in the original cell.
    # cos_sim_df.to_csv(filename, index = False)