# notebook to explore extracted movie features from neuroscout with pyns

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pyns import Neuroscout
api = Neuroscout()

In [3]:
#there are 12 datasets
len(api.datasets.get())

12

In [4]:
datasets = api.datasets.get()
for i in datasets:
    print(i['name'], i['id'])

Budapest 27
NaturalisticNeuroimagingDatabase 28
HealthyBrainNetwork 8
SchematicNarrative 20
studyforrest 11
Raiders 10
Life 9
ParanoiaStory 18
Sherlock 21
SherlockMerlin 5
LearningTemporalStructure 19
ReadingBrainProject 29


In [5]:
budapest = datasets[0]

In [6]:
budapest['tasks']

[{'TR': 1.0,
  'avg_run_duration': 610,
  'id': 48,
  'n_runs_subject': 5,
  'n_subjects': 25,
  'name': 'movie',
  'summary': 'Movie watching'}]

### let's just look at budapest to start

In [7]:
api.runs.get(dataset_id=27, subject='sid000005',number=1)

[{'acquisition': None,
  'dataset_id': 27,
  'duration': 598.0,
  'id': 1433,
  'number': 1,
  'session': None,
  'subject': 'sid000005',
  'task': 48,
  'task_name': 'movie'}]

In [8]:
#get the run number for just BUDAPEST, SUBJECT5, PART1
run_id=api.runs.get(dataset_id=27, subject='sid000005',number=1)[0]['id']
run_duration=api.runs.get(dataset_id=27, subject='sid000005',number=1)[0]['duration']

In [8]:
[{'dataset_id': 27,
  'description': 'Average signal in CSF mask',
  'id': 37048,
  'max': 437.444,
  'mean': 369.345,
  'min': 287.63,
  'name': 'csf',
  'num_na': 0,
  'private': False,
  'source': 'fmriprep'},

SyntaxError: unexpected EOF while parsing (<ipython-input-8-2d0fea57a0a6>, line 10)

### get just the non-fmriprep predictors

In [160]:
budapest_predictors[10]

{'dataset_id': 27,
 'description': 'The dominant (most frequent) part of speech of each entry',
 'extracted_feature': {'created_at': '2020-09-24 03:59:08.164107',
  'description': 'The dominant (most frequent) part of speech of each entry',
  'extractor_name': 'PredefinedDictionaryExtractor',
  'id': 424429,
  'modality': 'text',
  'resample_frequency': None},
 'id': 38034,
 'max': None,
 'mean': None,
 'min': None,
 'name': 'subtlexusfrequency_Dom_PoS_SUBTLEX',
 'num_na': None,
 'private': False,
 'source': 'extracted'}

In [10]:
budapest_predictors=api.predictors.get(run_id=run_id)
for i in budapest_predictors:
    if not i['source'] == 'fmriprep' and not i['mean'] == None:
    #if i['source'] == 'extracted':
            print(i['name'],i['description'])

mel_10 Melspectrogram bin 10
abstract Clarifai image recognition label: abstract
action Clarifai image recognition label: action
adult Clarifai image recognition label: adult
animal Clarifai image recognition label: animal
architecture Clarifai image recognition label: architecture
art Clarifai image recognition label: art
blur Clarifai image recognition label: blur
business Clarifai image recognition label: business
car Clarifai image recognition label: car
child Clarifai image recognition label: child
city Clarifai image recognition label: city
competition Clarifai image recognition label: competition
creativity Clarifai image recognition label: creativity
dark Clarifai image recognition label: dark
daylight Clarifai image recognition label: daylight
desktop Clarifai image recognition label: desktop
empty Clarifai image recognition label: empty
equipment Clarifai image recognition label: equipment
face Clarifai image recognition label: face
fashion Clarifai image recognition label: f

#### get ids and names of predictors?

In [89]:
budapest_predictor_ids = []
budapest_predictor_names = []
budapest_predictor_modality = []

for i in budapest_predictors:
    if not i['source'] == 'fmriprep' and not i['mean'] == None:
        budapest_predictor_ids.append(i['id'])
        budapest_predictor_names.append(i['name'])
        try:
            budapest_predictor_modality.append(i['extracted_feature']['modality'])
        except:
            budapest_predictor_modality.append(None)

### sort predictors

In [90]:
df_predictors=pd.DataFrame(data= np.array([budapest_predictor_ids,budapest_predictor_modality,budapest_predictor_names]).T , columns=['id','modality','names'])

In [91]:
df_predictors = df_predictors.sort_values(by=['id','names','modality'])
# df_predictors = df_predictors.sort_values(by=['names'])
# df_predictors = df_predictors.sort_values(by=['modality'])
budapest_predictor_ids= df_predictors['id'].to_numpy()
budapest_predictor_names= df_predictors['names'].to_numpy()
budapest_predictor_modality= df_predictors['modality'].to_numpy()


#### how many values are in each predictor??

In [92]:
predictor_lengths = []
for i in budapest_predictor_ids:
    predictor_lengths.append(
        len(api.predictor_events.get(predictor_id=i, run_id=run_id))
                                )

KeyboardInterrupt: 

In [165]:
predictor_lengths=np.array(predictor_lengths)
print(predictor_lengths.min(), predictor_lengths.max(), predictor_lengths.mean())

10 5765 1688.7083333333333


### given an event... convert it from duration onset value to timeseries
- sort it (the dicts are out of order)
- convert to timeseries
- resample it (eg some are variable mean 150 hz?? others are regular 0.6 hz)

load an event into df, sort it by onset

In [46]:
# an_event=api.predictor_events.get(predictor_id=budapest_predictor_ids[1],run_id=run_id,stimulus_timing=True)
# #stimulus_timing flag returns addtl info including the timing relative to stimulus not run (without the offset if there is one)
# #but actualkly stimulus_timing doesn't work for all of them
# df = pd.DataFrame(columns=['onset','duration','value'])
# for i in an_event:
#     df = df.append({'onset': i['onset'], 'duration': i['duration'], 'value': i['value']}, ignore_index=True)
# df = df.sort_values(by='onset')
# df

In [93]:
import math
all_feats = []
for pred_id in budapest_predictor_ids:
    an_event=api.predictor_events.get(predictor_id=pred_id,run_id=run_id,stimulus_timing=True)
    data = np.zeros((int(run_duration)))
    for i in an_event:
        start = round(i['onset'])
        stop = start + math.ceil(i['duration'])
        value = i['value']
        #onset=round(onset)
        try:
            data[start:stop]=value
        except:
            #print()
            print(f'skipped {value}')

    all_feats.append(data)
#all_feats is length = # predictors each predictor is size = run duration

In [94]:
all_feats = np.asarray(all_feats)

In [95]:
all_feats

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [96]:
all_feats.shape

(192, 598)

In [97]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set("paper", "white")


In [98]:
df = pd.DataFrame(data=all_feats.T,columns = budapest_predictor_names)

In [99]:
df_corr = df.corr()

In [100]:
df_corr

Unnamed: 0,brightness,vibrance,sharpness,abstract,action,adult,alphabet,animal,architecture,art,...,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19
brightness,1.000000,0.862295,0.420257,-0.246911,-0.163031,0.365624,0.188202,0.149826,-0.190279,-0.212188,...,-0.185837,-0.204675,-0.143126,-0.201830,-0.339832,-0.362886,-0.410221,-0.168382,-0.122521,-0.156855
vibrance,0.862295,1.000000,0.326070,-0.193615,-0.088840,0.270109,0.148475,0.296418,-0.175916,-0.210241,...,-0.159699,-0.214002,-0.188981,-0.186696,-0.324633,-0.316560,-0.369466,-0.195330,-0.124251,-0.139844
sharpness,0.420257,0.326070,1.000000,-0.199610,0.156667,0.332275,-0.005656,0.017175,0.106359,0.177910,...,-0.106668,-0.126136,0.001002,0.005819,-0.031421,-0.235355,-0.082068,-0.109797,0.043579,-0.142635
abstract,-0.246911,-0.193615,-0.199610,1.000000,0.319929,-0.357859,0.320456,0.233028,0.267395,0.445234,...,0.182693,0.118506,0.110013,0.098505,0.108077,0.173470,0.195048,0.002255,0.079668,0.203977
action,-0.163031,-0.088840,0.156667,0.319929,1.000000,-0.167708,0.054857,0.231701,0.399183,0.317751,...,0.237184,0.137560,0.294242,0.120286,0.294292,0.052776,0.373542,-0.138763,0.065177,0.100567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mfcc_15,-0.362886,-0.316560,-0.235355,0.173470,0.052776,-0.199457,-0.094755,-0.090259,0.106242,0.108391,...,0.448066,0.333125,0.240781,0.427974,0.532746,1.000000,0.558343,0.291782,0.042636,0.374163
mfcc_16,-0.410221,-0.369466,-0.082068,0.195048,0.373542,-0.352326,-0.007061,-0.040438,0.375798,0.295895,...,0.475808,0.417447,0.471634,0.360847,0.601754,0.558343,1.000000,0.222671,0.090436,0.310851
mfcc_17,-0.168382,-0.195330,-0.109797,0.002255,-0.138763,-0.030282,-0.011550,-0.062550,-0.082434,-0.078061,...,0.064250,0.202182,0.157870,0.282059,0.145451,0.291782,0.222671,1.000000,0.426251,0.142676
mfcc_18,-0.122521,-0.124251,0.043579,0.079668,0.065177,-0.152630,0.150969,-0.153433,0.036101,-0.020962,...,-0.122603,0.172293,0.264657,0.228891,0.256000,0.042636,0.090436,0.426251,1.000000,0.258114


In [None]:
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(df_corr, dtype=np.bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(30, 26))

# Generate a custom diverging colormap
#cmap = sns.diverging_palette(220, 10, as_cmap=True)


# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(df_corr, cmap = 'bwr', vmin=-1, vmax=1)
plt.title("budapest part 1")

Text(0.5, 1.0, 'budapest part 1')