<a href="https://colab.research.google.com/github/laurenneal/capstone-visual-neuroscience/blob/main/Feature_Engineering_from_Raw_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import sys
from numpy.ma.core import ceil, floor
from more_itertools import sliced

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! ls drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/

210815_0__20220213T070259RAW_stimulus_data.csv
210815_0_allStacks_mc_mix1_syt_rawExtracts


## Functions
This section contains functions that can be used on datasets for feature engineering

In [4]:
raw_df = pd.read_csv('drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/210815_0__20220213T070259RAW_stimulus_data.csv', index_col=['stack','label','roi','frame']).drop(columns='Unnamed: 0')
raw_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,tm2,1,0,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-0.106575,-0.395877,160.447960,0.110063
1,tm2,1,1,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.244535,-0.395877,0.000000,0.000000
1,tm2,1,2,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.398458,-0.395877,0.000000,0.000000
1,tm2,1,3,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.619290,-0.138699,170.252676,0.030790
1,tm2,1,4,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.627615,0.890013,-62.485104,0.122134
...,...,...,...,...,...,...,...,...
10,tm4,15,5508,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.578604,0.113874,0.000000,0.000000
10,tm4,15,5509,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.417371,-0.289958,-96.522646,0.089182
10,tm4,15,5510,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.324171,-0.370724,-89.799347,0.019588
10,tm4,15,5511,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.255033,-0.370724,0.000000,0.000000


## Train Test Split on Stacks

In [31]:
def trainTestSplit(df, splt = [.7,.2,.1]):
  # splits on roi, rois accross stacks will have the same group (important if stacks are from the same video)
  # split is the train test valuidation split (proportions given in that order)
  if sum(splt) < 0.9999 or sum(splt) > 1.0001:
    sys.exit('Splt must add to 1')
  
  num_rois = len(df.index.unique(level='roi'))

  tr_ind = int(ceil(num_rois*splt[0]))
  ts_ind = int(ceil(num_rois*splt[1])) + tr_ind

  ar = np.arange(num_rois)
  np.random.shuffle(ar)
  ar = ar+1 # roi id's start from 1

  train = ar[:tr_ind]
  test = ar[tr_ind:ts_ind]
  val = ar[ts_ind:]

  for i in train:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'train'

  for i in test:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'test'

  for i in val:
    df.loc[pd.IndexSlice[:, :, i],'training'] = 'validate'

  return(df)

In [32]:
# df_temp = trainTestSplit(raw_df,splt=[.5,.3,.3]) # should give error
df_temp = trainTestSplit(raw_df,splt=[.5,.3,.2])
df_temp.sample(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,training
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,t5,12,145,210815_0_2_stackRaw_mc_mix1_syt_result_2022021...,-0.09849,0.61459,15.911532,0.05233888,train
5,tm4,9,483,210815_0_5_stackRaw_mc_mix1_syt_result_2022021...,1.529856,-0.193807,-51.724158,0.005638846,train
5,tm4,8,2079,210815_0_5_stackRaw_mc_mix1_syt_result_2022021...,1.469592,-0.88241,0.0,0.0,test
4,tm2,11,2410,210815_0_4_stackRaw_mc_mix1_syt_result_2022021...,1.839242,0.229121,67.955884,0.001565665,train
2,tm4,15,1733,210815_0_2_stackRaw_mc_mix1_syt_result_2022021...,-0.087449,-0.063588,-80.393714,4.81009e-07,test
7,tm2,2,3008,210815_0_7_stackRaw_mc_mix1_syt_result_2022021...,-0.196725,-0.083662,49.000808,0.02523414,validate
1,tm9,6,3182,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-0.378269,0.090992,96.015098,0.006015696,train
7,tm4,8,3,210815_0_7_stackRaw_mc_mix1_syt_result_2022021...,-0.244456,0.031676,4.061709,0.01684923,test
1,tm2,14,498,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.153922,0.027916,0.0,0.0,validate
8,tm4,9,2057,210815_0_8_stackRaw_mc_mix1_syt_result_2022021...,-0.327165,0.32948,-87.924257,0.0008111958,train


## Combine Frames into Temporal Chunks

In [35]:
# select frame size
temporal_period_length = 200

In [37]:

def groupFrames(df, period):
  # inputs are data frame and temporal period length
  # df needs to be sorted correctly, the indexing should take care of that

  # check that all rois are the same size
  # if this is a problem we can write a more computationally heavy workthrough
  if (raw_df.groupby(['stack','roi']).count()['filename'].max() != raw_df.groupby(['stack','roi']).count()['filename'].min()):
    sys.exit('ROIs need to be the same length')

  num_stacks = len(df.index.unique(level='stack'))
  num_roi = len(df.index.unique(level='roi'))

  # get first label
  lbl_1 = raw_df.head(1).index.values[0][1]
  # use it to find the number of frames in a roi
  num_frames = raw_df.loc[(1,lbl_1,1),'filename'].count()
  # how many groups will ther be in each roi?
  num_pds = int(floor(num_frames/period))
  # how many left over
  remainder_pds = num_frames%period

  # iterate a column for one roi
  lst = [[li + 1] * period for li in range(0,num_pds)]
  lst = [li for sublist in lst for li in sublist] # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
  lst = lst + [lst[-1] + 1] * remainder_pds
  lst

  new_col = np.array(lst * num_stacks * num_roi)
  
  df['frame_group'] = new_col

  return(df)

In [38]:
df_temp2 = groupFrames(raw_df,temporal_period_length)
df_temp2

  return self._getitem_tuple(key)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,training,frame_group
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,tm2,1,0,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-0.106575,-0.395877,160.447960,0.110063,test,1
1,tm2,1,1,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.244535,-0.395877,0.000000,0.000000,test,1
1,tm2,1,2,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.398458,-0.395877,0.000000,0.000000,test,1
1,tm2,1,3,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.619290,-0.138699,170.252676,0.030790,test,1
1,tm2,1,4,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,0.627615,0.890013,-62.485104,0.122134,test,1
...,...,...,...,...,...,...,...,...,...,...
10,tm4,15,5508,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.578604,0.113874,0.000000,0.000000,test,28
10,tm4,15,5509,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.417371,-0.289958,-96.522646,0.089182,test,28
10,tm4,15,5510,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.324171,-0.370724,-89.799347,0.019588,test,28
10,tm4,15,5511,210815_0_10_stackRaw_mc_mix1_syt_result_202202...,0.255033,-0.370724,0.000000,0.000000,test,28


In [33]:
def stimResponse(stim, response):
  # takes a stim column and response column and returns two new feature column
  # PR is Positive Response
  # NR is Negative Response
  # PS is Positive Stimulus
  # NS is Negative Stimulus

  PR = response > 0
  NR = response <= 0
  PS = stim > 0
  NS = stim <= 0

  PR = PR.rename('PR')
  NR = NR.rename('NR')
  PS = PS.rename('PS')
  NS = NS.rename('NS')

  # combine into df
  df = pd.concat([stim, response, PR,NR,PS,NS], axis=1)

  # get min and max values for normalizing
  S_min = stim.min()
  S_max = stim.max()
  R_min = response.min()
  R_max = response.max()
  df['stim_norm'] = df['stim1'].apply(lambda x: x/S_max if x>0 else x/S_min)
  df['resp_norm'] = df['resp'].apply(lambda x: x/R_max if x>0 else x/R_min)
  df = df.astype({'PR': bool,
                  'PS': bool,
                  'NR': bool,
                  'PR': bool})
  
  # multiply stim and resp
  df['relation'] = df['stim_norm'] * df['resp_norm']
  df.loc[(df['PR'] & df['PS']),'relation_type'] = 'PRPS'
  df.loc[(df['NR'] & df['NS']),'relation_type'] = 'NRNS'
  df.loc[(df['PR'] & df['NS']),'relation_type'] = 'PRNS'
  df.loc[(df['NR'] & df['PS']),'relation_type'] = 'NRPS'

  #one-hot encode the four categories
  df = pd.get_dummies(df, columns=['relation_type'])
  dummy_cols = [col for col in df.columns if 'relation_type_' in col]
  print(dummy_cols)
  for col in dummy_cols:
    df[col] = df[col] * df['relation']
    
  # Change to four columns (for PRPS...) and add interaction with future frame
  return(df)

In [34]:
df_temp_t2 = stimResponse(df_temp_t['stim1'],df_temp_t['resp'])
df_temp_t2#[df_temp_t2['NR']].iloc[:,0]
# df_temp_t2[['relation','relation_type']]
# df_temp_3 = raw_df
# df_temp_3[['stim1_rel', 'stim1_rel_type']] = stimResponse(raw_df['stim1'],raw_df['resp'])
# df_temp_3

stim1    float64
resp     float64
PR          bool
NR          bool
PS          bool
NS          bool
dtype: object
stim1    float64
resp     float64
PR        object
NR        object
PS        object
NS        object
dtype: object


TypeError: ignored

In [14]:
df_temp_t = raw_df.sample(50)
# df_temp

In [192]:
# df_temp_t2[(df_temp_t2.PR == True) and (df_temp_t2.PS == True)]#['type'] = 'PRPS'
df_temp_t2.loc[(df_temp_t2['PR'] & df_temp_t2['PS']),'type'] = 'PRPS'
df_temp_t2.loc[(df_temp_t2['NR'] & df_temp_t2['NS']),'type'] = 'NRNS'
df_temp_t2.loc[(df_temp_t2['PR'] & df_temp_t2['NS']),'type'] = 'PRNS'
df_temp_t2.loc[(df_temp_t2['NR'] & df_temp_t2['PS']),'type'] = 'NRPS'
df_temp_t2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,stim1,resp,PR,NR,PS,NS,type
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5,tm2,14,207,0.480511,-0.234109,False,True,True,False,NRPS
8,tm1,7,3580,-0.144492,0.354413,True,False,False,True,PRNS
6,tm2,10,1095,0.003241,1.768867,True,False,True,False,PRPS
7,tm9,6,3292,-0.146369,-0.189285,False,True,False,True,NRNS
2,tm2,11,4106,-0.301201,0.740846,True,False,False,True,PRNS
9,tm4,15,622,0.053877,0.469881,True,False,True,False,PRPS
7,tm2,10,131,-0.102012,0.244015,True,False,False,True,PRNS
8,tm2,4,1275,0.091965,0.312462,True,False,True,False,PRPS
2,tm2,1,3189,0.163293,-0.04998,False,True,True,False,NRPS
1,tm4,3,4621,-0.229767,0.664271,True,False,False,True,PRNS


In [68]:
10001%10

1

In [40]:
raw_df.groupby('roi').max()

Unnamed: 0_level_0,filename,resp,stim1,stim2,stim3,training,frame_group
roi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.655949,0.984015,179.9591,0.446474,test,28
2,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.843598,0.976826,179.981163,0.404031,validate,28
3,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.412077,0.978226,179.975336,0.399451,train,28
4,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.627852,0.97863,179.977641,0.438806,test,28
5,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.965227,0.969341,179.972149,0.397681,train,28
6,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.825897,0.98283,179.993674,0.352636,train,28
7,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.956323,0.976128,179.998445,0.383183,train,28
8,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.103902,0.975246,179.964546,0.439268,test,28
9,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.456315,0.980053,179.99838,0.408045,train,28
10,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.971988,0.96845,179.99899,0.396471,test,28


In [41]:
raw_df.sort_values('resp', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filename,resp,stim1,stim2,stim3,training,frame_group
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,tm2,11,4163,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,14.493103,-0.758193,-30.063237,0.000060,train,21
1,tm2,11,4162,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,14.111589,-0.758193,-12.773923,0.000097,train,21
1,tm2,11,4164,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,14.021841,-0.187423,-166.237403,0.042734,train,21
1,tm2,11,4161,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,12.464707,-0.758193,70.954725,0.005342,train,21
1,tm2,11,4159,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,9.735276,-0.416768,62.764434,0.000026,train,21
1,tm2,11,...,...,...,...,...,...,...,...
1,tm2,11,3730,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-1.431442,0.562750,65.301653,0.018584,train,19
1,tm2,11,3727,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-1.479197,0.732240,-79.201351,0.022205,train,19
1,tm2,11,3729,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-1.506416,0.732240,0.000000,0.000000,train,19
1,tm2,11,3788,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,-1.560779,0.216339,90.386882,0.002186,train,19


In [43]:
raw_df.groupby('label').max()

Unnamed: 0_level_0,filename,resp,stim1,stim2,stim3,training,frame_group
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
t5,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.089126,0.989168,179.991856,0.379431,train,28
tm1,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.956323,0.976128,179.998445,0.383183,train,28
tm2,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,14.493103,0.984015,179.99899,0.467988,validate,28
tm4,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,3.456315,0.983499,179.998876,0.439268,train,28
tm9,210815_0_9_stackRaw_mc_mix1_syt_result_2022021...,2.825897,0.98283,179.993674,0.352636,train,28


In [49]:
# raw_df.loc[pd.IndexSlice[:,'tm2']]
raw_df.loc[1, 'tm2'].groupby('roi').max()
# df.loc[pd.IndexSlice[:, :, i],'training'] = 'train'

  


Unnamed: 0_level_0,filename,resp,stim1,stim2,stim3,training,frame_group
roi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,2.379384,0.949333,179.482667,0.437928,test,28
2,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,2.394398,0.971157,179.981163,0.310473,validate,28
4,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,2.476549,0.97486,179.934541,0.304792,test,28
5,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,2.809133,0.949049,179.898534,0.235928,train,28
10,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,3.971988,0.955173,179.99899,0.282797,test,28
11,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,14.493103,0.973177,179.9472,0.307283,train,28
14,210815_0_1_stackRaw_mc_mix1_syt_result_2022021...,1.150222,0.965363,179.961154,0.381993,validate,28


In [47]:
raw_df.index

MultiIndex([( 1, 'tm2',  1,    0),
            ( 1, 'tm2',  1,    1),
            ( 1, 'tm2',  1,    2),
            ( 1, 'tm2',  1,    3),
            ( 1, 'tm2',  1,    4),
            ( 1, 'tm2',  1,    5),
            ( 1, 'tm2',  1,    6),
            ( 1, 'tm2',  1,    7),
            ( 1, 'tm2',  1,    8),
            ( 1, 'tm2',  1,    9),
            ...
            (10, 'tm4', 15, 5503),
            (10, 'tm4', 15, 5504),
            (10, 'tm4', 15, 5505),
            (10, 'tm4', 15, 5506),
            (10, 'tm4', 15, 5507),
            (10, 'tm4', 15, 5508),
            (10, 'tm4', 15, 5509),
            (10, 'tm4', 15, 5510),
            (10, 'tm4', 15, 5511),
            (10, 'tm4', 15, 5512)],
           names=['stack', 'label', 'roi', 'frame'], length=826950)