# notebook to get all features from pliers and save as csv

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set("paper", "white")
from pyns import Neuroscout
import math

# Neuroscout API client; the cells and helper functions below read it as the global `api`.
api = Neuroscout()

In [2]:
# Query the dataset list once and reuse it for both the count and the listing
# (a second api.datasets.get() call would cost another request and could
# disagree with the list printed below).
datasets = api.datasets.get()
print(f'dataset count = {len(datasets)}\n')
print('Datasets and IDs:\n')
for dataset in datasets:
    print(dataset['name'], dataset['id'])

dataset count = 13

Datasets and IDs:

HealthyBrainNetwork 8
studyforrest 11
Raiders 10
SchematicNarrative 20
SherlockMerlin 5
Sherlock 21
narratives 30
Life 9
ParanoiaStory 18
LearningTemporalStructure 19
Budapest 27
NaturalisticNeuroimagingDatabase 28
ReadingBrainProject 29


In [3]:
# ParanoiaStory (audio only), LearningTemporalStructure, and ReadingBrainProject should be excluded

#### get the id of a subject

In [11]:
# Take the subject label of the first run returned for dataset 8
# (listed as HealthyBrainNetwork in the dataset printout above).
subject = api.runs.get(dataset_id=8)[0]['subject']

### let's just look at HealthyBrainNetwork (dataset_id=8)

In [12]:
# Display every run of dataset 8 that belongs to the selected subject.
api.runs.get(dataset_id=8, subject=subject)

[{'acquisition': None,
  'dataset_id': 8,
  'duration': 600.0000089406967,
  'id': 211,
  'number': None,
  'session': None,
  'subject': 'NDARYX592YYR',
  'task': 7,
  'task_name': 'movieDM'}]

In [8]:
# Get the id and duration of the subject's first run in dataset 8 (HBN).
# The run list is fetched once and reused so both values come from the same
# response instead of two separate API requests.
first_run = api.runs.get(dataset_id=8, subject=subject)[0]
run_id = first_run['id']
run_duration = first_run['duration']

### get just the non-fmriprep predictors that have a calculated mean (floats, ints, binary, etc) and whose name does not contain "bert"

In [9]:
def get_predictors(run_id, client=None):
    """Collect the predictors of a run that pass the notebook's filters.

    A predictor is kept when its ``source`` is not ``'fmriprep'``, its
    ``mean`` is not ``None`` and its name does not contain ``"bert"``.

    Parameters
    ----------
    run_id
        Neuroscout run id, forwarded to ``predictors.get(run_id=...)``.
    client : optional
        Object exposing ``predictors.get``. Defaults to the notebook-level
        ``api`` client, so existing ``get_predictors(run_id)`` calls keep
        working.

    Returns
    -------
    tuple
        ``(df_predictors, predictor_ids, predictor_names, predictor_modality)``
        where ``df_predictors`` has the columns ``id``, ``modality`` and
        ``names`` sorted by id, names, modality, and the remaining three items
        are those columns as NumPy arrays in the same order.
    """
    client = api if client is None else client

    rows = []
    for predictor in client.predictors.get(run_id=run_id):
        if predictor['source'] == 'fmriprep' or predictor['mean'] is None:
            continue
        if 'bert' in str(predictor['name']):
            continue
        # 'extracted_feature' may be missing or None; record no modality then.
        try:
            modality = predictor['extracted_feature']['modality']
        except (KeyError, TypeError):
            modality = None
        rows.append({'id': predictor['id'],
                     'modality': modality,
                     'names': predictor['name']})

    # Build the frame column-wise from records so each column keeps its own
    # type. Stacking ids, modalities and names into a single NumPy array turns
    # the ids into strings, which makes the sort below lexicographic
    # ('10' before '9') and returns string ids to the caller.
    df_predictors = pd.DataFrame(rows, columns=['id', 'modality', 'names'])
    df_predictors = df_predictors.sort_values(by=['id', 'names', 'modality'])

    predictor_ids = df_predictors['id'].to_numpy()
    predictor_names = df_predictors['names'].to_numpy()
    predictor_modality = df_predictors['modality'].to_numpy()

    return (df_predictors, predictor_ids, predictor_names, predictor_modality)

In [10]:
# Fetch the filtered predictor table (plus id / name / modality arrays) for the selected run.
df_predictors, predictor_ids, predictor_names, predictor_modality = get_predictors(run_id)

NameError: name 'run_id' is not defined

### load into pandas df, sort predictors by id, name, modality

#### how many values are in each predictor??

load an event into df, sort it by onset

In [11]:
def get_timeseries(predictor_ids, run_id, run_duration, client=None):
    """Convert each predictor's events into a 1 Hz time series.

    Every event (``onset``, ``duration``, ``value``) is painted onto a
    zero-filled vector with one sample per second: samples
    ``round(onset)`` up to ``round(onset) + ceil(duration)`` receive the
    event value.

    Parameters
    ----------
    predictor_ids
        Iterable of predictor ids to fetch.
    run_id
        Run whose events are requested.
    run_duration
        Run length in seconds; truncated to an integer number of samples.
    client : optional
        Object exposing ``predictor_events.get``. Defaults to the
        notebook-level ``api`` client, so existing three-argument calls keep
        working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(predictor_ids), int(run_duration))``; one row
        per predictor, in the order of ``predictor_ids``.
    """
    client = api if client is None else client
    n_samples = int(run_duration)

    all_feats = []
    for pred_id in predictor_ids:
        events = client.predictor_events.get(
            predictor_id=pred_id, run_id=run_id, stimulus_timing=True)
        data = np.zeros(n_samples)

        # Events can arrive out of order; process them by onset so that, where
        # events overlap, the later event consistently wins.
        for event in sorted(events, key=lambda ev: ev['onset']):
            start = round(event['onset'])
            stop = start + math.ceil(event['duration'])
            value = event['value']

            # An event that ends before the run starts has nothing to paint.
            if stop <= 0:
                continue
            try:
                # Clamp the start at 0: a negative slice index would wrap
                # around and write the value near the END of the series.
                data[max(start, 0):stop] = float(value)
            except (TypeError, ValueError):
                # Non-numeric values cannot be placed in the float series.
                print(f'skipped {value}')

        all_feats.append(data)

    if not all_feats:
        # Keep the (n_predictors, n_samples) shape even with no predictors.
        return np.zeros((0, n_samples))
    return np.asarray(all_feats)

In [12]:
# Build one time series per predictor (one row per predictor in all_feats).
all_feats = get_timeseries(predictor_ids,run_id,run_duration)

# Transpose so each predictor becomes a column labelled with its name.
df = pd.DataFrame(data=all_feats.T,columns =predictor_names)

In [14]:
# Save the feature table as CSV; with the default to_csv settings the
# DataFrame index is written as the first column.
df.to_csv('../sourcedata/data/HBN/features/DM_pliers_all.csv')