# Intersubject Correlation: Synchronizing our Social Cognition (Part 2)
In the previous notebook [isc-generation](https://github.com/lindseytepfer/psypose-isc/blob/main/code/isc-generation.ipynb), we correlated individual subjects with the rest of the individuals that watched the same movies that they did, performing this correlation every 20 seconds throughout the entire length of the movie. Then we averaged the subjects together (within each movie), so that we could ultimately model that data using a general linear model. 

However, before we get to the model, we'll need to prepare a few things in advance: namely, getting the regressors themselves in order, and then before we create a design matrix, we'll need to ensure that the regressors and averaged-isc data are the same shape.

In [None]:
import os
import sys
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#brain packages
import nibabel as nb
import nltools as nlt
from nltools import Brain_Data
from nltools import Design_Matrix
import nilearn as nil
from nilearn import plotting

In [None]:
datapath='/Volumes/Scraplab/data/ds002837/derivatives/'
isc_outs = '/Volumes/Scraplab/psypose_fmri/isc_analysis/'

# Generate the subject list and filenames.
# Every entry of the derivatives folder whose name contains 'sub-' is treated
# as a subject ID (used for easy filtering).
func_data = os.listdir(datapath)
sub_ids = [x for x in func_data if 'sub-' in x]

# Net together one functional file per subject, independent of which task
# (movie) the file comes from.
all_task_subs = []
for sub_id in sub_ids:  # named `sub_id` so the builtin `id` is not shadowed
    pattern = os.path.join(datapath, sub_id, 'func', '*blur_censor_ica.nii.gz')
    # Sorted so the file that is picked is deterministic if several match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Name the offending subject instead of failing with a bare IndexError.
        raise FileNotFoundError(
            'no *blur_censor_ica.nii.gz file found for ' + sub_id + ' (pattern: ' + pattern + ')'
        )
    all_task_subs.append(matches[0])

In [None]:
# Each pair is (task name as used in the fMRI file names,
#               video name as used in the person-tracking file names).
_task_video_pairs = [
    ('12yearsaslave', '12_years_a_slave'),
    ('500daysofsummer', '500_days_of_summer'),
    ('backtothefuture', 'back_to_the_future'),
    ('citizenfour', 'citizenfour'),
    ('littlemisssunshine', 'little_miss_sunshine'),
    ('pulpfiction', 'pulp_fiction'),
    ('split', 'split'),
    ('theprestige', 'the_prestige'),
    ('theshawshankredemption', 'the_shawshank_redemption'),
    ('theusualsuspects', 'the_usual_suspects'),
]

# keys
tasknames = [task_name for task_name, _ in _task_video_pairs]
# values
vidnames = [video_name for _, video_name in _task_video_pairs]

# Lookup from task name to the matching video name.
tasktovidmap = dict(_task_video_pairs)

# Building the Design Matrix

We are interested in investigating how social stimuli influence cognition. Here, we specifically want to see whether the presence of a person on screen, and whether dialogue is occurring, increases the inter-subject correlation among the HRF signals measured in our participants. 

We use a neural network to automate the detection of a person (or people) on screen. 

### Person Presence Regressors


In [None]:
# Load in the pose data for our movie
pare_outputs = '/Volumes/Scraplab/data/psypose_outs/psypose_pare_nndb_outs/'

person_tracking_path = '/Volumes/Scraplab/psypose_fmri/person_tracking_data'
person_track_dir = os.listdir(person_tracking_path)
person_tracking_files = [x for x in person_track_dir if ('correlated_timeseries.nii.gz') in x]

# For every video, turn the per-row people count into five 0/1 indicator
# columns: exactly zero, one, two or three people, and four or more.
for vid in vidnames:
    # Read through person_tracking_path rather than a path relative to the
    # current working directory, so the cell does not depend on where the
    # notebook was launched from.
    df = pd.read_csv(os.path.join(person_tracking_path, vid + '_regressor.csv'), encoding='utf-8')
    df.columns = ['People']

    people = df['People']
    df['Zero'] = np.where(people == 0, 1, 0)
    df['One'] = np.where(people == 1, 1, 0)
    df['Two'] = np.where(people == 2, 1, 0)
    df['Three'] = np.where(people == 3, 1, 0)
    df['Four+'] = np.where(people > 3, 1, 0)

    # NOTE(review): the write below is disabled, so `df` is rebuilt and
    # discarded on every iteration. The merge step later in this notebook reads
    # isc_outs/<task>/<video>_person_presence_dm.csv -- confirm where that file
    # is meant to be produced before re-enabling this line.
    #df.to_csv(vid+"_person_presence_dm.csv", index=False)

### Speaker Change Detection

In [None]:
# For every movie, write <task>_speaker_change.csv with one row per row of the
# pose regressor file and three columns:
#   seconds        - the row position taken from the pose regressor file
#   speaker_change - 1 when the speaker differs from the previous row that had
#                    speech (rows without speech in between are skipped), else 0
#   speaker        - diarization label active at that second, 0 when nobody speaks
for task in tasknames:
    diar_df = pd.read_csv(isc_outs+task+os.sep+task+"_diarization_cleaned.csv")
    pose_df = pd.read_csv(isc_outs+task+os.sep+task+"_regressor.csv")

    # Map every second covered by a diarization segment to its speaker. Start
    # and stop are truncated to whole seconds and are both inclusive; when
    # segments cover the same second, the later row of the file wins.
    speaker_tracks = {}
    for start, stop, speaker in zip(diar_df["start"], diar_df["stop"], diar_df["speaker"]):
        for second in range(int(start), int(stop) + 1):
            speaker_tracks[second] = speaker

    df = pd.DataFrame({"seconds": pose_df.index})
    df["speaker_change"] = 0
    # Direct dictionary lookup per row instead of scanning the whole frame once
    # per annotated second. Seconds beyond the end of the frame are ignored.
    df["speaker"] = [speaker_tracks.get(int(second), 0) for second in df["seconds"]]

    # Compare each speech row with the previous speech row. The first speech
    # row has no predecessor and therefore never counts as a change.
    speech_rows = df.index[df["speaker"] != 0]
    labels = df.loc[speech_rows, "speaker"].to_numpy()
    changed = np.asarray(labels[1:] != labels[:-1], dtype=bool)
    # Written straight into `df`, so every row (including the last one) keeps
    # an integer 0/1 flag; no reindexing or string fill values are needed.
    df.loc[speech_rows[1:][changed], "speaker_change"] = 1

    df.to_csv(isc_outs+task+os.sep+task+"_speaker_change.csv")

### Compiling Speech Annotations into a single dataframe

In [None]:
# For every movie, combine the speaker-change table and the overlapped-speech
# table into <task>_speech_regressors.csv with the columns
# speaker_change, speech and overlap.
for task in tasknames:
    # Read from isc_outs, the same folder the speaker-change file is written
    # to, instead of a path relative to the current working directory.
    task_dir = isc_outs + task + os.sep
    overlap = pd.read_csv(task_dir + task + '_overlapped-speech.csv')
    scd = pd.read_csv(task_dir + task + '_speaker_change.csv')

    # speech = 1 wherever a speaker label is present, 0 where the label is the
    # "no speaker" marker 0. Comparing the string form also handles a file
    # without any speech, whose speaker column is parsed as integer 0 rather
    # than the string "0".
    scd["speech"] = (scd["speaker"].astype(str) != "0").astype(int)

    # Aligned row by row on the default index of the two tables.
    scd["overlap"] = overlap["overlap"]

    df = scd[["speaker_change","speech","overlap"]].copy()
    # Rows without a speaker-change value count as "no change".
    df["speaker_change"] = df["speaker_change"].fillna(0)
    df.to_csv(isc_outs+task+os.sep+task+"_speech_regressors.csv", index=False)

## Valence and Arousal Annotations
First, I resample the emotion annotation so that it properly matches the timecourse of the movie. Then, I merge the emotion and diarization annotations, writing the combined file, once cleaned, into a speech_emotion file. 

In [None]:
# For every movie, expand the emotion annotations to one value per second,
# attach them to the speech regressors and write <task>_speech_emotion.csv.
for task in tasknames:
    anno = pd.read_csv(isc_outs+task+os.sep+task+"_emotion_annotations.csv")
    dm = pd.read_csv(isc_outs+task+os.sep+task+"_speech_regressors.csv")
    dm["time"] = dm.index  # temporary merge key: one row per position

    # Expand every annotation to one row per second it covers. Start and stop
    # are truncated to whole seconds and are both inclusive.
    rows = []
    for start, stop, arousal, valence in zip(anno["start"], anno["stop"], anno["arousal"], anno["valence"]):
        for second in range(int(start), int(stop) + 1):
            rows.append((second, arousal, valence))

    df = pd.DataFrame(rows, columns=["time", "arousal", "valence"])
    # Explicit dtypes keep the merge key compatible with dm["time"] even when
    # the annotation file is empty.
    df = df.astype({"time": "int64", "arousal": "float64", "valence": "float64"})
    # Because stop is inclusive, annotations that touch or overlap yield the
    # same second twice; a left merge would then duplicate rows of dm and shift
    # everything after them. Keep one row per second (the later annotation wins).
    df = df.drop_duplicates(subset="time", keep="last")

    dm = pd.merge(dm, df, on="time", how="left")

    # mean center continuous values (seconds without an annotation are ignored
    # when computing the mean)
    dm["arousal"] = dm["arousal"] - dm["arousal"].mean()
    dm["valence"] = dm["valence"] - dm["valence"].mean()

    # Remaining missing values in any column become 0; for the centered
    # columns that is the mean.
    dm = dm.fillna(0)
    dm = dm.drop("time", axis=1)
    dm.to_csv(isc_outs+task+os.sep+task+"_speech_emotion.csv",index=False)

## Merging & length matching the regressors
Now that we have all of our regressors generated (person presence, speech overlap, speaker change, and emotion), we can merge them together into a single design matrix.

Our average ISC data and our regressors won't be the same length due to the time shifting that occurred as a result of the moving window calculation. Specifically, the ISC data is going to be shorter by 20 on each side than the original length of the video in TRs. So, we need to shift and trim our regressors data so that it lines up appropriately with the ISC data (e.g., if the window is 20 s long, the regressors will be moved up 10 seconds so that they sit in the middle of the window).

In [None]:
# For every movie, put the person-presence indicators and the speech/emotion
# regressors side by side, trim both ends and write <task>_full_dm_shifted.csv.
speech_emotion_cols = ["speaker_change","speech","overlap","arousal","valence"]

# Positions kept are lead_trim .. n - tail_trim - 1, i.e. n - 20 rows in total.
# NOTE(review): the bounds are unchanged from the original slice
# `[9:index.max()-10]`; confirm that they match the ISC window offset.
lead_trim = 9
tail_trim = 11

for task in tasknames:
    video = tasktovidmap[task]
    task_dir = isc_outs + task + os.sep
    vid_regr = pd.read_csv(task_dir + video + "_person_presence_dm.csv")
    speech_emotion = pd.read_csv(task_dir + task + "_speech_emotion.csv")

    # Keep only the indicator columns; the raw people count is not a regressor.
    regressors = vid_regr.drop(columns="People")

    # Copy each regressor by name so a different column order in the CSV cannot
    # silently attach values to the wrong label. Rows are aligned on the index.
    for col in speech_emotion_cols:
        regressors[col] = speech_emotion[col]

    dm_shifted = regressors.iloc[lead_trim:len(regressors) - tail_trim]
    dm_shifted.to_csv(isc_outs+task+os.sep+task+"_full_dm_shifted.csv", index=False)

## Running a GLM
Now that all of our regressors have been generated, and we have made sure that the shapes of the regressors and the average-ISC data match, we can run our GLM. 

In [None]:
TR = 1

# Only the first task in `tasknames` is processed here.
for task in tasknames[0:1]:
    # Common prefix of every input and output file of this task.
    task_prefix = isc_outs+task+os.sep+task

    # Dependent variable: the movie's average ISC image, with the last 20
    # volumes along the fourth axis removed (the rolling window extends beyond
    # the valid correlations there).
    isc_img = nb.load(task_prefix+"_nanmean_isc.nii.gz")
    n_volumes_kept = isc_img.shape[3]-20
    avg_isc = Brain_Data(isc_img.slicer[...,:n_volumes_kept])

    # Design matrix: load the trimmed regressors, then apply nltools'
    # convolve() and clean() with their defaults, and save the result.
    regressor_table = pd.read_csv(task_prefix+'_full_dm_shifted.csv')
    design = Design_Matrix(regressor_table, sampling_freq=1./TR)
    dm_cleaned = design.convolve().clean(verbose=True)
    dm_cleaned.to_csv(task_prefix+"_design_matrix.csv")

    # Attach the cleaned design matrix as the X attribute of the Brain_Data
    # object and fit the regression.
    avg_isc.X = dm_cleaned
    stats = avg_isc.regress()

    # Write the resulting beta map to a nii file.
    stats['beta'].write(task_prefix+'_betamap.nii.gz')
