<a href="https://colab.research.google.com/github/laurenneal/capstone-visual-neuroscience/blob/Lauren/Feature_Engineering_from_Raw_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
import pandas as pd
import numpy as np
import sys
from numpy.ma.core import ceil, floor
from more_itertools import sliced

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# FUNCTIONS (Just for setting up functions, not running code)
This section contains functions that can be used on datasets for feature engineering

## Train Test Split on Stacks

In [87]:
def trainTestSplit(df, splt = [.7,.2,.1]):
  # splits on roi, rois accross stacks will have the same group (important if stacks are from the same video)
  # split is the train test valuidation split (proportions given in that order)
  if sum(splt) < 0.9999 or sum(splt) > 1.0001:
    sys.exit('Splt must add to 1')
  
  num_rois = len(df.index.unique(level='roi'))

  tr_ind = int(ceil(num_rois*splt[0]))
  ts_ind = int(ceil(num_rois*splt[1])) + tr_ind

  ar = np.arange(num_rois)
  np.random.shuffle(ar)
  ar = ar+1 # roi id's start from 1

  train = ar[:tr_ind]
  test = ar[tr_ind:ts_ind]
  val = ar[ts_ind:]

  for i in train:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'train'

  for i in test:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'test'

  for i in val:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'validate'

  return(df)

In [None]:
# df_temp = trainTestSplit(raw_df,splt=[.5,.3,.3]) # should give error
df_temp = trainTestSplit(raw_df,splt=[.5,.3,.2]) # need to pull in sample data
df_temp.sample(15)

NameError: ignored

## Stim 1 feature engineering

In [88]:
from operator import setitem
def stimResponse(df, stim, response, zero_point = 0):
  """
  Takes a df, the name of the col holding the stimulus, the name of the col holding the response, and the value to use as the zero point

  Returns the df with the relationship type and the relation value added

  A different function will calculate the relationship between time periods
  """


  # takes a stim column and response column and returns two new feature column
  # PR is Positive Response
  # NR is Negative Response
  # PS is Positive Stimulus
  # NS is Negative Stimulus
  df_in = df.copy()
  #
  PR = df_in[response] > zero_point
  NR = df_in[response] <= zero_point
  PS = df_in[stim] > zero_point
  NS = df_in[stim] <= zero_point

  PR = PR.rename('PR')
  NR = NR.rename('NR')
  PS = PS.rename('PS')
  NS = NS.rename('NS')

  # combine into df
  df = pd.concat([df_in[stim], df_in[response], PR,NR,PS,NS], axis=1)
  #set the data types back to boolean
  df = df.astype({'PR': bool,
                'PS': bool,
                'NR': bool,
                'PR': bool})

  # get min and max values for normalizing
  S_min = df[stim].min()
  S_max = df[stim].max()
  R_min = df[response].min()
  R_max = df[response].max()

  #add a column normalizing the stim and response
  df['stim_norm'] = df[stim].apply(lambda x: x/S_max if x>0 else x/S_min)
  df['resp_norm'] = df[response].apply(lambda x: x/R_max if x>0 else x/R_min)


  # multiply stim and resp
  df['relation'] = df['stim_norm'] * df['resp_norm']

  #Set a new column with the category each row's relationship falls under
  df.loc[(df['PR'] & df['PS']),'relation_type'] = 'PRPS'
  df.loc[(df['NR'] & df['NS']),'relation_type'] = 'NRNS'
  df.loc[(df['PR'] & df['NS']),'relation_type'] = 'PRNS'
  df.loc[(df['NR'] & df['PS']),'relation_type'] = 'NRPS'

  #UNCOMMENT this to create the len-4 matrix for the current time period directly

  # #one-hot encode the four categories
  # df = pd.get_dummies(df, columns=['relation_type'], prefix = stim)

  # #get a list of the columns created
  # dummy_cols = [col for col in df.columns if 'relation_type_' in col]

  # #multiply the dummy columns by the relation value to distribute the value to the appropriate column
  # for col in dummy_cols:
  #   df[col] = df[col] * df['relation']
  
  df_in['relation_type_' + stim] = df['relation_type']
  df_in['relation_' + stim] = df['relation']
  return(df_in)

In [None]:
#pass in the raw df, the name of the stimulus we want to use, the response column, and the zero-value (in this case zero)
df_temp_3 = stimResponse(raw_df, 'stim1', 'resp', 0)

NameError: ignored

In [None]:
df_temp_3.head()

In [89]:
def get_future_stim_and_relationship(df_in, stim, time_window): 
  """
  takes a df, the stimulus name we're comparing, and the time window we're looking to in the future (in frames)

  returns a new df with two extra columns containing the relationship type and the value at the future time period
  """

  #make copy of the df to avoid editing the one in memory
  df = df_in.copy()

  #procedurally generate the column names we need to reference
  current_relation_colname = 'relation_'+stim
  current_relation_type_colname = 'relation_type_'+stim

  #and the column names we're creating
  future_relation_colname = 'relation_'+stim+'_+'+str(time_window)
  future_relation_type_colname = 'relation_type_'+stim+'_+'+str(time_window)

  #reset the index so that we can join the new columns later
  df = df.reset_index()


  # for each of the new columns, do four steps

  #get the existing array of relation values or relation types

  #cut off the first x values, where x is the time window we want to look in the future (done by indexing [time_window:])
        #this will drop the time x values in the future to line up with the current time

  #pad the end of the array with zeros by the same number of values so that the lengths still match (done by concatenating with np.zeros(time_window))

  #set the new array as the new 'future' column
  df[future_relation_colname] = pd.concat([df[current_relation_colname] \
                                           .reset_index(drop=True)[time_window:], \
                                           pd.Series((np.zeros(time_window)))], ignore_index = True)
  
  #repeat for the relation type
  df[future_relation_type_colname] = pd.concat([df[current_relation_type_colname] \
                                           .reset_index(drop=True)[time_window:], \
                                           pd.Series((np.zeros(time_window)))], ignore_index = True)

  #need to stop the end of one stack from referencing the beginning of another

  #get the index of any row where the frame # is high enough that x-frames in the future would run into another stack
  #by checking that the frame number is greater than the max frame number - (time window-1) (corrects for index number)
  end_of_stack_index = df.loc[df['frame'] >= max(df['frame'])-(time_window-1)].index

  #use .loc to set the two new columns to None for those rows
  df.loc[end_of_stack_index, future_relation_colname] = None
  df.loc[end_of_stack_index, future_relation_type_colname] = None

  #set the index back to the way it was
  df = df.set_index(['stack','label','roi','frame'])

  return df

In [None]:
#check that the values match the records 10 in the future and that the last 10 rows in each stack are null for future values
df_temp_4 = get_future_stim_and_relationship(df_temp_3, 'stim1', 10)
df_temp_4.tail(30)

In [90]:
def current_future_relationship(df, stim, time_window):
  """
  Takes df with a column for stim and a column for stim + a given time window. Returns the relationship.
  """
  # column names for current row
  current_relation_colname = 'relation_'+stim
  current_relation_type_colname = 'relation_type_'+stim

  # column names for rows in the future
  future_relation_colname = 'relation_'+stim+'_+'+str(time_window)
  future_relation_type_colname = 'relation_type_'+stim+'_+'+str(time_window)

  #with this approach, we need to drop the last x frames from each stack, so dropna
  df = df.dropna()

  #make a new column with the combo of current relation type and future relation type
  df['full_relation_type'] = df[current_relation_type_colname] + '-' + df[future_relation_type_colname]

  #one-hot encode the overall relationship, then multiply the new columns by the product of the relation values
  df = pd.get_dummies(df, columns=['full_relation_type'], prefix = current_relation_colname)

  #get a list of the columns created
  dummy_cols = [col for col in df.columns if current_relation_colname in col]

  #multiply the dummy columns by the relation value to distribute the value to the appropriate column
  for col in dummy_cols:
    df[col] = df[col] * (df[current_relation_colname]*df[future_relation_colname])

  return df

In [None]:
df = df_temp_4

In [None]:
# # #make a new column with the combo of current relation type and future relation type
# df['full_relation_type'] = df['relation_type_stim1'] + '-' + df['relation_type_stim1_+10']
# df.head()

In [None]:
# #one-hot encode the overall relationship, then multiply the new columns by the product of the relation values

# df = pd.get_dummies(df, columns=['full_relation_type'], prefix = 'relation_stim1')

# #get a list of the columns created
# dummy_cols = [col for col in df.columns if 'relation_stim1' in col]

# #multiply the dummy columns by the relation value to distribute the value to the appropriate column
# for col in dummy_cols:
#   df[col] = df[col] * (df['relation_stim1']*df['relation_stim1_+10'])

# df.head(10) 

In [None]:
# df.to_csv('drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/210815_0_20220213T070259_sparse_stim1_draft.csv')

In [None]:
df_temp_5 = current_future_relationship(df_temp_4, 'stim1', 10)
df_temp_5.head()

In [91]:
! ls drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/

10_frames_stim1
210728_0_20220304T003321RAW_stimulus_data.csv
210728_0_20220304T003321_sparse_stim1_10f.csv
210728_0_20220304T003321_sparse_stim1_15f.csv
210728_0_20220304T003321_sparse_stim1_20f.csv
210728_0_20220304T003321_sparse_stim1_5f.csv
210728_0_20220309T002951RAW_stimulus_data.csv
210728_0_allStacks_20220309T002951_rawExtracts
210731_0_20220304T005413RAW_stimulus_data.csv
210731_0_20220304T005413_sparse_stim1_10f.csv
210731_0_20220304T005413_sparse_stim1_15f.csv
210731_0_20220304T005413_sparse_stim1_20f.csv
210731_0_20220304T005413_sparse_stim1_5f.csv
210731_0_20220308T120131RAW_stimulus_data.csv
210731_0_allStacks_20220308T120131_rawExtracts
210802_0_20220308T122044RAW_stimulus_data.csv
210802_0_allStacks_20220308T122044_rawExtracts
210808_0_20220309T012702RAW_stimulus_data.csv
210808_0_allStacks_20220309T012702_rawExtracts
210809_2_20220308T131617RAW_stimulus_data.csv
210809_2_allStacks_20220308T131617_rawExtracts
210815_0__20220213T070259RAW_stimulus_data.csv
210815_0_20220

# RUN FUNCTIONS ON DATA

In [None]:
# raw_df = pd.read_csv('drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/210815_0__20220213T070259RAW_stimulus_data.csv', index_col=['stack','label','roi','frame']).drop(columns='Unnamed: 0')
# raw_df

In [92]:
# get all raw DF's
from os import listdir
root = 'drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/'
raw_fnames_init = listdir(root)
raw_fnames_init = [x for x in raw_fnames_init if 'RAW_stimulus' in x]
raw_fnames_init

['210815_0__20220213T070259RAW_stimulus_data.csv',
 '210816_0_20220304T112124RAW_stimulus_data.csv',
 '210816_1_20220304T113821RAW_stimulus_data.csv',
 '210728_0_20220304T003321RAW_stimulus_data.csv',
 '210731_0_20220304T005413RAW_stimulus_data.csv',
 '210815_1_20220308T140826RAW_stimulus_data.csv',
 '210816_1_20220308T150735RAW_stimulus_data.csv',
 '210816_0_20220308T143121RAW_stimulus_data.csv',
 '210809_2_20220308T131617RAW_stimulus_data.csv',
 '210815_0_20220308T134319RAW_stimulus_data.csv',
 '210808_0_20220309T012702RAW_stimulus_data.csv',
 '210731_0_20220308T120131RAW_stimulus_data.csv',
 '210728_0_20220309T002951RAW_stimulus_data.csv',
 '210802_0_20220308T122044RAW_stimulus_data.csv']

In [93]:
from more_itertools.more import substrings
subs = ['20220308', '20220309']

def filter_raw(list, list2):
  temp=[]
  raw_fnames=[]
  for i in list2:
      ##raw_fnames_sub = [x for x in list if i in x]
      temp.append([x for x in list if i in x])
  for elem in temp:
    raw_fnames.extend(elem)
  return raw_fnames


In [94]:
raw_fnames = filter_raw(raw_fnames_init, subs)
raw_fnames

['210815_1_20220308T140826RAW_stimulus_data.csv',
 '210816_1_20220308T150735RAW_stimulus_data.csv',
 '210816_0_20220308T143121RAW_stimulus_data.csv',
 '210809_2_20220308T131617RAW_stimulus_data.csv',
 '210815_0_20220308T134319RAW_stimulus_data.csv',
 '210731_0_20220308T120131RAW_stimulus_data.csv',
 '210802_0_20220308T122044RAW_stimulus_data.csv',
 '210808_0_20220309T012702RAW_stimulus_data.csv',
 '210728_0_20220309T002951RAW_stimulus_data.csv']

In [95]:
# Dictionary of df's. One for each video. You can call each df my name
raw_dfs = {}
for p in raw_fnames:
  df = pd.read_csv(root + p, index_col=['stack','label','roi','frame']).drop(columns='Unnamed: 0')
  raw_dfs[p] = df
raw_dfs[raw_fnames[-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,tm9,1,0,210728_0_1_stackRaw_mc_tm2_tm9_syt_result_2022...,0.119422,0.600920,-110.553431,0.333161
1,tm9,1,1,210728_0_1_stackRaw_mc_tm2_tm9_syt_result_2022...,0.065380,0.239804,16.216478,0.033097
1,tm9,1,2,210728_0_1_stackRaw_mc_tm2_tm9_syt_result_2022...,0.036917,-0.382424,111.599505,0.052631
1,tm9,1,3,210728_0_1_stackRaw_mc_tm2_tm9_syt_result_2022...,0.156975,-0.450328,84.015116,0.007321
1,tm9,1,4,210728_0_1_stackRaw_mc_tm2_tm9_syt_result_2022...,0.195883,-0.408975,135.214141,0.012863
...,...,...,...,...,...,...,...,...
9,tm9,14,5508,210728_0_9_stackRaw_mc_tm2_tm9_syt_result_2022...,0.126178,-0.226009,-30.003356,0.013561
9,tm9,14,5509,210728_0_9_stackRaw_mc_tm2_tm9_syt_result_2022...,0.030026,0.313921,-65.032233,0.060451
9,tm9,14,5510,210728_0_9_stackRaw_mc_tm2_tm9_syt_result_2022...,0.061619,0.034199,172.441152,0.009707
9,tm9,14,5511,210728_0_9_stackRaw_mc_tm2_tm9_syt_result_2022...,0.270762,-0.058334,-114.288345,0.001702


In [96]:
raw_dfs.keys()

dict_keys(['210815_1_20220308T140826RAW_stimulus_data.csv', '210816_1_20220308T150735RAW_stimulus_data.csv', '210816_0_20220308T143121RAW_stimulus_data.csv', '210809_2_20220308T131617RAW_stimulus_data.csv', '210815_0_20220308T134319RAW_stimulus_data.csv', '210731_0_20220308T120131RAW_stimulus_data.csv', '210802_0_20220308T122044RAW_stimulus_data.csv', '210808_0_20220309T012702RAW_stimulus_data.csv', '210728_0_20220309T002951RAW_stimulus_data.csv'])

## For each df, run through pipeline

In [97]:
stim = 'stim1'
response = 'resp'
time_window = 10
for p in raw_fnames:
  df = raw_dfs[p]
  df = stimResponse(df, stim, response, zero_point = 0)
  df = get_future_stim_and_relationship(df, stim, time_window)
  df = current_future_relationship(df, stim, time_window)
  raw_dfs[p] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [98]:
raw_dfs[raw_fnames[0]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm4,1,0,210815_1_1_stackRaw_mc_result_20220308T140826.h5,-0.306057,-0.238462,10.722960,0.115015,NRNS,4.109005e-05,5.000088e-07,PRNS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
1,tm4,1,1,210815_1_1_stackRaw_mc_result_20220308T140826.h5,0.112233,-0.238462,0.000000,0.000000,PRNS,5.104073e-07,1.336279e-09,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,6.820467e-16,0.0,0.0,0.0,0.0
1,tm4,1,2,210815_1_1_stackRaw_mc_result_20220308T140826.h5,-0.400750,-0.238462,0.000000,0.000000,NRNS,4.191306e-05,1.805216e-07,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
1,tm4,1,3,210815_1_1_stackRaw_mc_result_20220308T140826.h5,-0.136589,-0.285537,-157.891452,0.002145,NRNS,4.645035e-06,8.857321e-09,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
1,tm4,1,4,210815_1_1_stackRaw_mc_result_20220308T140826.h5,-0.213385,-0.309075,-64.567770,0.001663,NRNS,1.704822e-05,5.355264e-08,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,tm4,2,5134,210815_1_9_stackRaw_mc_result_20220308T140826.h5,0.296965,-0.100819,0.000000,0.000000,PRNS,4.640063e-07,6.410203e-10,PRNS,0.0,...,0.0,0.0,0.0,0.0,2.974375e-16,0.000000e+00,0.0,0.0,0.0,0.0
9,tm4,2,5135,210815_1_9_stackRaw_mc_result_20220308T140826.h5,0.541931,-0.064963,53.603302,0.003655,PRNS,8.843982e-07,2.321655e-09,PRNS,0.0,...,0.0,0.0,0.0,0.0,2.053268e-15,0.000000e+00,0.0,0.0,0.0,0.0
9,tm4,2,5136,210815_1_9_stackRaw_mc_result_20220308T140826.h5,0.054308,-0.029107,4.153462,0.008312,PRNS,2.991585e-09,2.210921e-11,PRNS,0.0,...,0.0,0.0,0.0,0.0,6.614159e-20,0.000000e+00,0.0,0.0,0.0,0.0
9,tm4,2,5137,210815_1_9_stackRaw_mc_result_20220308T140826.h5,0.486857,-0.029107,0.000000,0.000000,PRNS,3.332160e-07,4.730280e-09,PRNS,0.0,...,0.0,0.0,0.0,0.0,1.576205e-15,0.000000e+00,0.0,0.0,0.0,0.0


In [99]:
raw_dfs[raw_fnames[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm4,1,0,210816_1_1_stackRaw_mc_result_20220308T150735.h5,0.153797,-0.426309,-3.620520,0.123511,PRNS,8.112342e-05,1.128304e-06,PRNS,0.0,...,0.0,0.0,0.0,0.0,9.153184e-11,0.000000e+00,0.0,0.0,0.0,0.0
1,tm4,1,1,210816_1_1_stackRaw_mc_result_20220308T150735.h5,0.280574,-0.426309,0.000000,0.000000,PRNS,1.803326e-05,1.118963e-09,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,2.017855e-14,0.0,0.0,0.0,0.0
1,tm4,1,2,210816_1_1_stackRaw_mc_result_20220308T150735.h5,0.493612,-0.426309,0.000000,0.000000,PRNS,1.646111e-04,8.884157e-08,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,1.462431e-11,0.0,0.0,0.0,0.0
1,tm4,1,3,210816_1_1_stackRaw_mc_result_20220308T150735.h5,0.667897,-0.520911,-176.994026,0.013512,PRNS,1.150323e-03,4.057404e-06,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,4.667327e-09,0.0,0.0,0.0,0.0
1,tm4,1,4,210816_1_1_stackRaw_mc_result_20220308T150735.h5,0.497233,-0.615513,-145.677535,0.009002,PRNS,9.087288e-05,3.340377e-09,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,3.035496e-13,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,tm1,10,5498,210816_1_10_stackRaw_mc_result_20220308T150735.h5,-0.151383,-0.443618,116.023402,0.014892,NRNS,2.698928e-04,1.107963e-06,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
10,tm1,10,5499,210816_1_10_stackRaw_mc_result_20220308T150735.h5,-0.196144,-0.443618,0.000000,0.000000,NRNS,4.228705e-04,1.512100e-06,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0
10,tm1,10,5500,210816_1_10_stackRaw_mc_result_20220308T150735.h5,0.009124,-0.443618,0.000000,0.000000,PRNS,2.331602e-07,1.844294e-09,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,4.300159e-16,0.0,0.0,0.0,0.0
10,tm1,10,5501,210816_1_10_stackRaw_mc_result_20220308T150735.h5,-0.186227,-0.364794,113.889704,0.017712,NRNS,2.957251e-04,1.391868e-06,PRPS,0.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0


In [100]:
raw_dfs[raw_fnames[2]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm9,1,0,210816_0_1_stackRaw_mc_result_20220308T143121.h5,-0.312670,-0.903345,68.617929,0.021570,NRNS,1.277461e-02,3.681268e-04,NRNS,0.000005,...,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
1,tm9,1,1,210816_0_1_stackRaw_mc_result_20220308T143121.h5,-0.210353,-0.903345,0.000000,0.000000,NRNS,3.102312e-04,2.573695e-08,NRPS,0.000000,...,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
1,tm9,1,2,210816_0_1_stackRaw_mc_result_20220308T143121.h5,-0.130939,-0.903345,0.000000,0.000000,NRNS,6.197836e-04,1.366898e-06,NRPS,0.000000,...,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
1,tm9,1,3,210816_0_1_stackRaw_mc_result_20220308T143121.h5,-0.263520,-0.626360,130.245566,0.021945,NRNS,1.754881e-03,8.182815e-06,NRPS,0.000000,...,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
1,tm9,1,4,210816_0_1_stackRaw_mc_result_20220308T143121.h5,-0.188122,-0.349374,167.691310,0.039503,NRNS,1.411792e-04,1.694718e-07,NRPS,0.000000,...,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,tm2,12,5498,210816_0_10_stackRaw_mc_result_20220308T143121.h5,0.854108,-0.613958,97.230249,0.002670,PRNS,9.096179e-04,5.457828e-06,NRPS,0.000000,...,0.0,0.0,0.0,4.964538e-09,0.0,0.0,0.0,0.0,0.0,0.0
10,tm2,12,5499,210816_0_10_stackRaw_mc_result_20220308T143121.h5,0.450335,-0.613958,0.000000,0.000000,PRNS,3.169837e-04,2.988561e-06,NRPS,0.000000,...,0.0,0.0,0.0,9.473250e-10,0.0,0.0,0.0,0.0,0.0,0.0
10,tm2,12,5500,210816_0_10_stackRaw_mc_result_20220308T143121.h5,0.482010,-0.613958,0.000000,0.000000,PRNS,3.914126e-04,4.287191e-06,NRPS,0.000000,...,0.0,0.0,0.0,1.678061e-09,0.0,0.0,0.0,0.0,0.0,0.0
10,tm2,12,5501,210816_0_10_stackRaw_mc_result_20220308T143121.h5,0.360845,-0.215192,124.003790,0.066660,PRNS,1.678155e-05,7.127891e-08,NRPS,0.000000,...,0.0,0.0,0.0,1.196171e-12,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
raw_dfs[raw_fnames[3]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm1,1,0,210809_2_1_stackRaw_mc_result_20220308T131617.h5,0.154626,0.203364,-103.861223,0.097277,PRPS,1.915974e-08,4.665061e-14,NRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.0,8.938138e-22,0.0,0.000000e+00,0.0
1,tm1,1,1,210809_2_1_stackRaw_mc_result_20220308T131617.h5,0.002451,0.203364,0.000000,0.000000,PRPS,1.228784e-11,1.949336e-16,NRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.0,2.395313e-27,0.0,0.000000e+00,0.0
1,tm1,1,2,210809_2_1_stackRaw_mc_result_20220308T131617.h5,0.005994,0.203364,0.000000,0.000000,PRPS,2.282803e-12,3.494068e-20,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,7.976269e-32,0.0
1,tm1,1,3,210809_2_1_stackRaw_mc_result_20220308T131617.h5,0.265182,-0.203048,-102.899807,0.032910,PRNS,2.571197e-08,1.320224e-14,NRNS,0.0,...,0.0,0.0,3.394557e-22,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.0
1,tm1,1,4,210809_2_1_stackRaw_mc_result_20220308T131617.h5,0.354837,-0.473989,-104.693881,0.049055,PRNS,5.623215e-07,1.450716e-12,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,8.157686e-19,0.0,0.000000e+00,0.0,0.000000e+00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,t5,9,5498,210809_2_10_stackRaw_mc_result_20220308T131617.h5,0.427613,-0.054739,33.185508,0.022966,PRNS,1.206089e-07,3.815518e-11,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,4.601856e-18,0.0,0.000000e+00,0.0,0.000000e+00,0.0
10,t5,9,5499,210809_2_10_stackRaw_mc_result_20220308T131617.h5,0.631440,-0.054739,0.000000,0.000000,PRNS,1.512944e-08,1.584012e-14,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,2.396522e-22,0.0,0.000000e+00,0.0,0.000000e+00,0.0
10,t5,9,5500,210809_2_10_stackRaw_mc_result_20220308T131617.h5,0.378177,-0.054739,0.000000,0.000000,PRNS,3.841358e-09,2.015074e-15,NRPS,0.0,...,0.0,0.0,0.000000e+00,7.740620e-24,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.0
10,t5,9,5501,210809_2_10_stackRaw_mc_result_20220308T131617.h5,0.082939,-0.072495,49.868443,0.000628,PRNS,8.622221e-10,3.201931e-15,NRPS,0.0,...,0.0,0.0,0.000000e+00,2.760776e-24,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.0


In [102]:
raw_dfs[raw_fnames[4]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm2,1,0,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,1.066616,-0.395877,160.447960,0.110063,PRNS,1.215453e-07,1.056666e-15,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,1.284328e-22,0.000000e+00,0.0,0.0,0.0,0.000000e+00
1,tm2,1,1,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.681816,-0.395877,0.000000,0.000000,PRNS,1.002589e-05,3.551850e-09,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,3.561045e-14,0.0,0.0,0.0,0.000000e+00
1,tm2,1,2,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.388823,-0.395877,0.000000,0.000000,PRNS,1.245495e-05,6.438341e-08,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,8.018924e-13,0.0,0.0,0.0,0.000000e+00
1,tm2,1,3,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.052077,-0.138699,170.252676,0.030790,PRNS,1.767581e-08,3.795351e-11,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,6.708592e-19,0.0,0.0,0.0,0.000000e+00
1,tm2,1,4,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.585917,0.890013,-62.485104,0.122134,PRPS,5.881238e-05,5.159909e-08,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.0,3.034665e-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,tm1,13,5498,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.257419,-0.289693,-42.557796,0.001677,PRNS,2.629884e-06,1.100248e-08,NRPS,0.0,...,0.0,0.0,0.000000e+00,2.893524e-14,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5499,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.308152,-0.273720,69.083034,0.001833,PRNS,1.984236e-08,2.887260e-15,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,5.729007e-23,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5500,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.221243,-0.270981,94.083375,0.018014,PRNS,1.304170e-07,3.211918e-12,NRNS,0.0,...,0.0,0.0,4.188887e-19,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5501,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.431414,-0.370507,93.189718,0.021557,PRNS,9.391020e-08,2.373404e-14,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,2.228869e-21,0.000000e+00,0.0,0.0,0.0,0.000000e+00


In [103]:
# train test split on movie (when saving out csv file)

## Remove outlier from 210815_0__20220213T070259RAW_stimulus_data.csv

In [104]:
#access the 11th roi in the first stack and drop all frames for that roi
raw_dfs['210815_0_20220308T134319RAW_stimulus_data.csv'] = raw_dfs['210815_0_20220308T134319RAW_stimulus_data.csv'].drop(raw_dfs['210815_0_20220308T134319RAW_stimulus_data.csv'].loc[1,:,11].index)
raw_dfs['210815_0_20220308T134319RAW_stimulus_data.csv']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,relation_type_stim1,relation_stim1,relation_stim1_+10,relation_type_stim1_+10,relation_stim1_NRNS-NRNS,...,relation_stim1_NRPS-PRNS,relation_stim1_NRPS-PRPS,relation_stim1_PRNS-NRNS,relation_stim1_PRNS-NRPS,relation_stim1_PRNS-PRNS,relation_stim1_PRNS-PRPS,relation_stim1_PRPS-NRNS,relation_stim1_PRPS-NRPS,relation_stim1_PRPS-PRNS,relation_stim1_PRPS-PRPS
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,tm2,1,0,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,1.066616,-0.395877,160.447960,0.110063,PRNS,1.215453e-07,1.056666e-15,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,1.284328e-22,0.000000e+00,0.0,0.0,0.0,0.000000e+00
1,tm2,1,1,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.681816,-0.395877,0.000000,0.000000,PRNS,1.002589e-05,3.551850e-09,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,3.561045e-14,0.0,0.0,0.0,0.000000e+00
1,tm2,1,2,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.388823,-0.395877,0.000000,0.000000,PRNS,1.245495e-05,6.438341e-08,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,8.018924e-13,0.0,0.0,0.0,0.000000e+00
1,tm2,1,3,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.052077,-0.138699,170.252676,0.030790,PRNS,1.767581e-08,3.795351e-11,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,6.708592e-19,0.0,0.0,0.0,0.000000e+00
1,tm2,1,4,210815_0_1_stackRaw_mc_mix1_syt_result_2022030...,0.585917,0.890013,-62.485104,0.122134,PRPS,5.881238e-05,5.159909e-08,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.0,3.034665e-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,tm1,13,5498,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.257419,-0.289693,-42.557796,0.001677,PRNS,2.629884e-06,1.100248e-08,NRPS,0.0,...,0.0,0.0,0.000000e+00,2.893524e-14,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5499,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.308152,-0.273720,69.083034,0.001833,PRNS,1.984236e-08,2.887260e-15,PRPS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,5.729007e-23,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5500,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.221243,-0.270981,94.083375,0.018014,PRNS,1.304170e-07,3.211918e-12,NRNS,0.0,...,0.0,0.0,4.188887e-19,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.000000e+00
10,tm1,13,5501,210815_0_10_stackRaw_mc_mix1_syt_result_202203...,0.431414,-0.370507,93.189718,0.021557,PRNS,9.391020e-08,2.373404e-14,PRNS,0.0,...,0.0,0.0,0.000000e+00,0.000000e+00,2.228869e-21,0.000000e+00,0.0,0.0,0.0,0.000000e+00


## Save out DF's with train/test labels

In [105]:
# SET THESE VALUES EACH TIME

folder_name = '10_frames_stim1' # create a new folder name for saving out these files (make sure this folder already exists)
num_train = 5
num_test = 3
num_val = 1

In [106]:
arr = np.array(['train'] * num_train + ['test'] * num_test + ['val'] * num_val)
np.random.shuffle(arr)

# save out each DF with a label indicating train, test or validation
for i, p in enumerate(raw_fnames):
  p_split = p.split('.')[-2] # chop off csv
  
  raw_dfs[p].to_csv(root + folder_name + '/' + p_split + '_' + arr[i] + '.csv') #saves to new folder

In [107]:
listdir(root + folder_name)

['210728_0_20220304T003321RAW_stimulus_data_test.csv',
 '210816_1_20220304T113821RAW_stimulus_data_test.csv',
 '210731_0_20220304T005413RAW_stimulus_data_train.csv',
 '210815_0__20220213T070259RAW_stimulus_data_train.csv',
 '210816_0_20220304T112124RAW_stimulus_data_test.csv',
 '210815_0__20220213T070259RAW_stimulus_data_test.csv',
 '210815_0__20220213T070259RAW_stimulus_data_val.csv',
 '210816_0_20220304T112124RAW_stimulus_data_train.csv',
 '210816_1_20220304T113821RAW_stimulus_data_train.csv',
 '210728_0_20220304T003321RAW_stimulus_data_train.csv',
 '210815_1_20220308T140826RAW_stimulus_data_test.csv',
 '210816_1_20220308T150735RAW_stimulus_data_test.csv',
 '210731_0_20220304T005413RAW_stimulus_data_test.csv',
 '210816_0_20220308T143121RAW_stimulus_data_train.csv',
 '210815_1_20220308T140826RAW_stimulus_data_val.csv',
 '210816_1_20220308T150735RAW_stimulus_data_train.csv',
 '210809_2_20220308T131617RAW_stimulus_data_test.csv',
 '210815_0_20220308T134319RAW_stimulus_data_test.csv',
 '