In [2]:
import numpy as np
import pandas as pd

## Functions for calculating consensus & accuracy

In [19]:
def calc_consensus(message_embeddings, isc=False):
    """Score how strongly a set of text embeddings agree with one another.

    Parameters
    ----------
    message_embeddings : np.ndarray
        2-D array laid out as [num_text X text_vector], one embedding per row.
    isc : bool, default False
        Selects the consensus method (see Returns).

    Returns
    -------
    float
        ``isc=False`` (method 1): NaN-ignoring mean of the full matrix of
        pairwise inner products between rows. The diagonal (each row with
        itself) is part of that average.
        ``isc=True`` (method 2): for every row, the inner product between
        that row and the mean of all *other* rows, averaged over rows.

    Notes
    -----
    Inner products are only cosine similarities if the rows are
    unit-normalised -- presumably they are; confirm against the embedding step.
    """
    if not isc:
        # consensus method 1: average of the similarity matrix
        similarity = np.inner(message_embeddings, message_embeddings)
        return np.nanmean(similarity)

    # consensus method 2: ISC (leave-one-out)
    n_text = message_embeddings.shape[0]
    loo_scores = [
        np.inner(
            message_embeddings[idx],
            # mean embedding of everyone except row `idx`
            np.mean(np.delete(message_embeddings, idx, axis=0), axis=0),
        )
        for idx in range(n_text)
    ]
    return np.mean(loo_scores)

        

In [30]:
def obtain_avg_description(prediction_offset, film_name=None,
                           data_dir='../../Cleaned Data/Description_Cleaned_Data'):
    """Return the mean description embedding for the segment starting at a prediction stop.

    The description CSV for the film is filtered to ``phase_type == 'test'``
    and the first row whose ``onset`` equals ``prediction_offset`` selects the
    counterbalance / description_stop pair that names the ``.npy`` embedding
    file to load.

    Parameters
    ----------
    prediction_offset : number
        Offset of the prediction stop; matched against the description
        ``onset`` column.
    film_name : str, optional
        Film identifier used in file names. When omitted, the notebook-level
        variable ``film`` is used, so existing single-argument calls made
        inside a ``for film in ...`` loop keep working.
    data_dir : str, optional
        Folder holding ``<film>_description_cleaned.csv`` and the
        ``embedding/<film>/`` sub-folder.

    Returns
    -------
    np.ndarray or float
        NaN-ignoring mean over axis 0 of the loaded embedding array, or
        ``np.nan`` when ``prediction_offset < 10``.
        NOTE(review): the 10 cut-off presumably reflects that no description
        segment starts that early -- confirm against the task design.

    Raises
    ------
    IndexError
        If no test-phase description row has ``onset == prediction_offset``.
    """
    if prediction_offset<10:
        return np.nan
    if film_name is None:
        # Backward compatible: fall back to the notebook's loop variable.
        film_name = film

    description_file = f'{data_dir}/{film_name}_description_cleaned.csv'
    embedding_folder = f'{data_dir}/embedding'
    df = pd.read_csv(description_file)
    data_description = df[df['phase_type']=='test']

    description_row = data_description[data_description['onset']==prediction_offset]
    if description_row.empty:
        # Same exception type the bare .iloc[0] would raise, but with context.
        raise IndexError(
            f"no test-phase description with onset == {prediction_offset} "
            f"in {description_file}"
        )

    # int() strips a float suffix ("1.0" -> "1") so the label matches the file name.
    description_c = str(int(description_row['counterbalance'].iloc[0]))
    description_seg = str(int(description_row['description_stop'].iloc[0]))
    filename = f"{embedding_folder}/{film_name}/{film_name}_test_c{description_c}_seg{description_seg}.npy"
    description_embed = np.load(filename)
    return np.nanmean(description_embed,axis=0)


In [41]:
def calc_accuracy(prediction_embed,avg_desc_embed):
    """Average inner product between each prediction embedding and a reference.

    Parameters
    ----------
    prediction_embed : np.ndarray
        Array whose first axis indexes individual predictions.
    avg_desc_embed : np.ndarray or float
        Reference vector every prediction is compared with. A scalar NaN
        (as returned by ``obtain_avg_description`` for early offsets) yields
        a NaN result.

    Returns
    -------
    float
        Mean of the per-prediction inner products.
    """
    n_pred = prediction_embed.shape[0]
    per_prediction = [
        np.inner(prediction_embed[row], avg_desc_embed) for row in range(n_pred)
    ]
    return np.mean(per_prediction)
    

## Prediction
Calculate prediction consensus, accuracy, and confidence for each prediction stop.

In [55]:
# Build one summary CSV per film: for every (counterbalance, video_segment)
# group of test-phase predictions, record response counts, mean confidence,
# embedding consensus and accuracy against the matching description.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
# Debug switch: uncomment to run a single film.
#film_ls=['theshoe']
# NOTE: `film` has to remain a notebook-level name -- obtain_avg_description()
# reads it as a global when building its file paths.
for film in film_ls:
    filename=f'../../Cleaned Data/Prediction_Cleaned_Data/{film}_prediction_cleaned.csv'
    embedding_folder = '../../Cleaned Data/Prediction_Cleaned_Data/embedding'
    df=pd.read_csv(filename)
    # Filter for test phase
    df_test = df[df['phase_type'] == 'test']
    
    results = []
    # Loop through each group of counterbalance and video_segment
    for (c, seg), group in df_test.groupby(['counterbalance', 'video_segment']):
        # Path to the saved embedding file (one .npy per group; the group keys
        # are interpolated as-is, so their dtype determines the file name)
        file_path = f"{embedding_folder}/{film}/{film}_test_c{c}_seg{seg}.npy"
    
        # Load embeddings
        # assumes one row per response in `group` -- TODO confirm; not checked here
        message_embeddings = np.load(file_path)
    
        # Get offset (assumed constant within each group; only the first row is read)
        offset = group['offset'].iloc[0]
        
        # Number of responses (rows) and of distinct participants
        n_response = len(group)
        n_subj = group['ID'].nunique()

        # Average confidence
        confidence=group['confidence'].mean()
        # Calculate consensus
        # NOTE(review): calc_consensus is called with its default isc=False, so
        # despite the `consensus_isc` name this is the mean of the pairwise
        # inner-product matrix, not the leave-one-out ISC -- confirm intent.
        consensus_isc = calc_consensus(message_embeddings)
        
        # Calculate accuracy by comparing prediction to description
        # (avg_desc is np.nan for offset < 10, which propagates to a NaN accuracy)
        avg_desc = obtain_avg_description(offset)
        accuracy=calc_accuracy(message_embeddings,avg_desc)

        # Append result
        results.append({
            "counterbalance": c,
            "offset": offset,
            "video_segment": seg,
            "n_response": n_response,
            "n_subj":n_subj,
            "consensus_isc": consensus_isc,
            "accuracy":accuracy,
            "confidence":confidence
        })
    
    # Create the result DataFrame (one row per group)
    df_consensus = pd.DataFrame(results)
        
    # Written without the index; read back by the TR-mapping cell further down.
    output_path=f'../../Analysis Data/Prediction/{film}_prediction_accuracy_consensus_summary.csv'
    df_consensus.to_csv(output_path,index=False)

## Description

In [73]:
# Build one summary CSV per film: for every (counterbalance, description_stop)
# group of test-phase descriptions, record the participant count, mean
# importance rating and embedding consensus.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
for film in film_ls:
    filename=f'../../Cleaned Data/Description_Cleaned_Data/{film}_description_cleaned.csv'
    embedding_folder = '../../Cleaned Data/Description_Cleaned_Data/embedding'
    df=pd.read_csv(filename)
    # Filter for test phase
    df_test = df[df['phase_type'] == 'test']
    
    results = []
    # Loop through each group of counterbalance and description_stop
    for (c, seg), group in df_test.groupby(['counterbalance', 'description_stop']):
        # Path to the saved embedding file (group keys are interpolated as-is,
        # so their dtype determines the file name)
        file_path = f"{embedding_folder}/{film}/{film}_test_c{c}_seg{seg}.npy"
    
        # Load embeddings
        message_embeddings = np.load(file_path)
    
        # Get offset / onset (assumed constant within each group; only the
        # first row is read)
        offset = group['offset'].iloc[0]
        onset = group['onset'].iloc[0]

        # Number of distinct participants
        n_subj = group['ID'].nunique()

        # Average importance
        importance=group['importance'].mean()
        # Calculate consensus
        # NOTE(review): calc_consensus is called with its default isc=False, so
        # despite the `consensus_isc` name this is the mean of the pairwise
        # inner-product matrix, not the leave-one-out ISC -- confirm intent.
        consensus_isc = calc_consensus(message_embeddings)
        
        # Append result
        results.append({
            "counterbalance": c,
            "onset":onset,
            "offset": offset,
            "description_stop": seg,
            "n_subj":n_subj,
            "consensus_isc": consensus_isc,
            "importance":importance
        })
    
    # Create the result DataFrame (one row per group)
    df_consensus = pd.DataFrame(results)
        
    # Written without the index; read back by the TR-mapping cell further down.
    output_path=f'../../Analysis Data/Description/{film}_description_consensus_summary.csv'
    df_consensus.to_csv(output_path,index=False)

# Map movie timestamps to brain data

In [74]:
def _is_first_segment(segment):
    """Return True when `segment` labels the opening stop ('Title' or segment 1)."""
    label = str(segment).strip()
    if label == 'Title':
        return True
    try:
        # Accept '1', 1 and 1.0 alike: a CSV round trip turns an all-digit
        # column into numbers, which never compare equal to the string '1'.
        return float(label) == 1
    except ValueError:
        return False


def seconds_to_tr(second,onset=True,prediction=True):
    """Convert a filmfest time in seconds to a TR index (1.5 s per TR).

    Parameters
    ----------
    second : number, or mapping / pandas row
        With ``onset=False``: a plain number of seconds.
        With ``onset=True``: a row providing ``'filmfest_onset'`` (or, failing
        that, ``'filmfest_onset30'``) and, when ``prediction=True``,
        ``'video_segment'``.
    onset : bool, default True
        True converts a segment onset, False a segment offset.
    prediction : bool, default True
        When True, rows whose ``video_segment`` is the title / first segment
        are converted without the +1 shift.

    Returns
    -------
    int
        ``round(seconds / 1.5)`` for offsets and for first-segment prediction
        onsets; ``round(seconds / 1.5) + 1`` for every other onset.
        Python's ``round`` rounds halves to the nearest even integer.

    Raises
    ------
    KeyError
        If an onset row has neither ``'filmfest_onset'`` nor
        ``'filmfest_onset30'``, or lacks ``'video_segment'`` while
        ``prediction=True``.
    """
    if not onset:
        return round(second/1.5)

    if prediction and _is_first_segment(second['video_segment']):
        return round(second['filmfest_onset']/1.5)

    # Explicit key lookup instead of a bare `except:`, so unrelated errors
    # (bad values, typos) surface instead of being masked by the fallback.
    onset_key = 'filmfest_onset' if 'filmfest_onset' in second else 'filmfest_onset30'
    return round(second[onset_key]/1.5)+1

In [75]:
# Start of each film on the filmfest timeline, in seconds; added to per-film
# offsets before they are converted to TRs.
movie_start = {
    'busstop': 948,
    'cmiyc_long': 40,
    'keithreynolds': 1134,
    'theboyfriend': 528,
    'therock': 40,
    'theshoe': 999,
}

## Prediction

In [76]:
# Place every prediction stop on the filmfest timeline and convert its
# onset / offset to TR indices; one output CSV per film.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
for film in film_ls:
    folder=f'../../Analysis Data/Prediction'
    data=pd.read_csv(f'{folder}/{film}_prediction_accuracy_consensus_summary.csv')
    data=data.sort_values(by='offset')
    # Find the offset and onset of each stop in the filmfest movie:
    # shift the per-film offset by the film's start time ...
    data['filmfest_offset']=data['offset'].apply(lambda x:x+movie_start[film])
    # ... and take the onset 10 s before the offset, clamped to the film's start.
    data['filmfest_onset']=data['filmfest_offset'].apply(lambda x: x-10 if x>(10+movie_start[film]) else movie_start[film])
    # Row-wise so seconds_to_tr can see 'video_segment' and treat the
    # '1' / 'Title' segment differently from the rest.
    data['TR_onset']=data[['filmfest_onset','video_segment']].apply(lambda x: seconds_to_tr(x,onset=True),axis=1)
    # Offsets are plain scalars: round(seconds / 1.5).
    data['TR_offset']=data['filmfest_offset'].apply(lambda x: seconds_to_tr(x,onset=False))
    output_path=f'{folder}/{film}_consensus_mapped_to_neuro.csv'
    data.to_csv(output_path,index=False)

## Description

In [77]:
# Place every description stop on the filmfest timeline and convert its
# onset (10 s and 30 s windows) and offset to TR indices; one output CSV per film.
film_ls=['therock','theshoe','theboyfriend','keithreynolds','cmiyc_long','busstop']
for film in film_ls:
    folder=f'../../Analysis Data/Description'
    data=pd.read_csv(f'{folder}/{film}_description_consensus_summary.csv')
    data=data.sort_values(by='offset')
    # Find the offset and onset of each stop in the filmfest movie:
    # shift the per-film offset by the film's start time ...
    data['filmfest_offset']=data['offset'].apply(lambda x:x+movie_start[film])
    # ... then derive onsets 10 s and 30 s before the offset, each clamped to
    # the film's start.
    data['filmfest_onset']=data['filmfest_offset'].apply(lambda x: x-10 if x>(10+movie_start[film]) else movie_start[film])
    data['filmfest_onset30']=data['filmfest_offset'].apply(lambda x: x-30 if x>(30+movie_start[film]) else movie_start[film])
    # prediction=False: every onset is converted as round(seconds / 1.5) + 1.
    data['TR_onset']=data[['filmfest_onset']].apply(lambda x: seconds_to_tr(x,prediction=False),axis=1)
    # The row passed here only has 'filmfest_onset30'; seconds_to_tr falls back
    # to that key when 'filmfest_onset' is absent from the row.
    data['TR_onset30']=data[['filmfest_onset30']].apply(lambda x: seconds_to_tr(x,prediction=False),axis=1)
    # Offsets are plain scalars: round(seconds / 1.5).
    data['TR_offset']=data['filmfest_offset'].apply(lambda x: seconds_to_tr(x,onset=False))
    output_path=f'{folder}/{film}_consensus_mapped_to_neuro.csv'
    data.to_csv(output_path,index=False)