<a href="https://colab.research.google.com/github/laurenneal/capstone-visual-neuroscience/blob/main/Feature_Engineering_from_Raw_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import sys
from numpy.ma.core import ceil, floor
from more_itertools import sliced

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# FUNCTIONS (Just for setting up functions, not running code)
This section contains functions that can be used on datasets for feature engineering

## Train Test Split on Stacks

In [2]:
# def trainTestSplit(df, splt = [.7,.2,.1]):
#   # splits on roi, rois accross stacks will have the same group (important if stacks are from the same video)
#   # split is the train test valuidation split (proportions given in that order)
#   if sum(splt) < 0.9999 or sum(splt) > 1.0001:
#     sys.exit('Splt must add to 1')
  
#   num_rois = len(df.index.unique(level='roi'))

#   tr_ind = int(ceil(num_rois*splt[0]))
#   ts_ind = int(ceil(num_rois*splt[1])) + tr_ind

#   ar = np.arange(num_rois)
#   np.random.shuffle(ar)
#   ar = ar+1 # roi id's start from 1

#   train = ar[:tr_ind]
#   test = ar[tr_ind:ts_ind]
#   val = ar[ts_ind:]

#   for i in train:
#     df.loc[pd.IndexSlice[:, :, i],'training'] = 'train'

#   for i in test:
#     df.loc[pd.IndexSlice[:, :, i],'training'] = 'test'

#   for i in val:
#     df.loc[pd.IndexSlice[:, :, i],'training'] = 'validate'

#   return(df)

## Combine Frames into Temporal Chunks - NOT Working

In [3]:
# # select frame size
# temporal_period_length = 200

In [4]:

# def groupFrames(df, period):
#   # inputs are data frame and temporal period length
#   # df needs to be sorted correctly, the indexing should take care of that

#   # check that all rois are the same size
#   # if this is a problem we can write a more computationally heavy workthrough
#   if (raw_df.groupby(['stack','roi']).count()['filename'].max() != raw_df.groupby(['stack','roi']).count()['filename'].min()):
#     sys.exit('ROIs need to be the same length')

#   num_stacks = len(df.index.unique(level='stack'))
#   num_roi = len(df.index.unique(level='roi'))

#   # get first label
#   lbl_1 = raw_df.head(1).index.values[0][1]
#   # use it to find the number of frames in a roi
#   num_frames = raw_df.loc[(1,lbl_1,1),'filename'].count()
#   # how many groups will ther be in each roi?
#   num_pds = int(floor(num_frames/period))
#   # how many left over
#   remainder_pds = num_frames%period

#   # iterate a column for one roi
#   lst = [[li + 1] * period for li in range(0,num_pds)]
#   lst = [li for sublist in lst for li in sublist] # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
#   lst = lst + [lst[-1] + 1] * remainder_pds
#   lst

#   new_col = np.array(lst * num_stacks * num_roi)
  
#   df['frame_group'] = new_col

#   return(df)

In [16]:
def groupFrames(df_in, period):
  
  # inputs are data frame and number of periods per stack
  df = df_in.copy()

  df = df.reset_index()
  # set a unique id for each roi stack combo
  df['long_id'] = df['movie_ID'] + '+' + df['stack'].astype('str') + '+' + df['roi'].astype('str')
  df = df.set_index('long_id')
  long_ids = df.index.unique()

  # for each unique roi we calculate a grouping index
  new_col = []
  for id in long_ids:
    # number of rows for each unique id
    num_rows = df.loc[id].shape[0]
    
    # how many frames per division to split the roi into number of periods
    frm_pd = int(floor(num_rows/period))

    # iterate a column for one roi
    lst = [[li + 1] * frm_pd for li in range(0,period)]
    lst = [li for sublist in lst for li in sublist] # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists

    if len(lst) > num_rows:
      lst = lst[:num_rows] # remove some on the back if needed
    elif len(lst) < num_rows:
      lst = lst + [lst[-1]] * (num_rows - len(lst)) # add a few extra on the end if needed

    # checks to make sure the list has the correct number of rows
    if len(lst) != num_rows:
      sys.exit('Error 1: Somethings wrong with this code')

    new_col += lst

  # new col length should match the column length
  if len(new_col) != df.shape[0]:
    sys.exit('Error 2: Somethings wrong with this code')

  df['frame_group'] = new_col

  df = df.reset_index().drop(columns = 'long_id')

  return(df)

## Stim 1 feature engineering

For 16-len vector

In [47]:
from operator import setitem
def stimResponse(df, stim, response, zero_point = 0):
  """
  Takes a df, the name of the col holding the stimulus, the name of the col holding the response, and the value to use as the zero point

  Returns the df with the relationship type and the relation value added

  A different function will calculate the relationship between time periods
  """


  # takes a stim column and response column and returns two new feature column
  # PR is Positive Response
  # NR is Negative Response
  # PS is Positive Stimulus
  # NS is Negative Stimulus
  df_in = df.copy()
  #
  PR = df_in[response] > zero_point
  NR = df_in[response] <= zero_point
  PS = df_in[stim] > zero_point
  NS = df_in[stim] <= zero_point

  PR = PR.rename('PR')
  NR = NR.rename('NR')
  PS = PS.rename('PS')
  NS = NS.rename('NS')

  # combine into df
  df = pd.concat([df_in[stim], df_in[response], PR,NR,PS,NS], axis=1)
  #set the data types back to boolean
  df = df.astype({'PR': bool,
                'PS': bool,
                'NR': bool,
                'PR': bool})

  # get min and max values for normalizing
  S_min = df[stim].min()
  S_max = df[stim].max()
  R_min = df[response].min()
  R_max = df[response].max()

  #add a column normalizing the stim and response
  df['stim_norm'] = df[stim].apply(lambda x: x/S_max if x>0 else x/S_min)
  df['resp_norm'] = df[response].apply(lambda x: x/R_max if x>0 else x/R_min)


  # multiply stim and resp
  df['relation'] = df['stim_norm'] * df['resp_norm']

  #Set a new column with the category each row's relationship falls under
  df.loc[(df['PR'] & df['PS']),'relation_type'] = 'PRPS'
  df.loc[(df['NR'] & df['NS']),'relation_type'] = 'NRNS'
  df.loc[(df['PR'] & df['NS']),'relation_type'] = 'PRNS'
  df.loc[(df['NR'] & df['PS']),'relation_type'] = 'NRPS'
  
  df_in['relation_type_' + stim] = df['relation_type']
  df_in['relation_' + stim] = df['relation']
  return(df_in)

In [48]:
def get_future_stim_and_relationship(df_in, stim, time_window): 
  """
  takes a df, the stimulus name we're comparing, and the time window we're looking to in the future (in frames)

  returns a new df with two extra columns containing the relationship type and the value at the future time period
  """

  #make copy of the df to avoid editing the one in memory
  df = df_in.copy()

  #procedurally generate the column names we need to reference
  current_relation_colname = 'relation_'+stim
  current_relation_type_colname = 'relation_type_'+stim

  #and the column names we're creating
  future_relation_colname = 'relation_'+stim+'_+'+str(time_window)
  future_relation_type_colname = 'relation_type_'+stim+'_+'+str(time_window)

  #reset the index so that we can join the new columns later
  df = df.reset_index()


  # for each of the new columns, do four steps

  #get the existing array of relation values or relation types

  #cut off the first x values, where x is the time window we want to look in the future (done by indexing [time_window:])
        #this will drop the time x values in the future to line up with the current time

  #pad the end of the array with zeros by the same number of values so that the lengths still match (done by concatenating with np.zeros(time_window))

  #set the new array as the new 'future' column
  df[future_relation_colname] = pd.concat([df[current_relation_colname] \
                                           .reset_index(drop=True)[time_window:], \
                                           pd.Series((np.zeros(time_window)))], ignore_index = True)
  
  #repeat for the relation type
  df[future_relation_type_colname] = pd.concat([df[current_relation_type_colname] \
                                           .reset_index(drop=True)[time_window:], \
                                           pd.Series((np.zeros(time_window)))], ignore_index = True)

  #need to stop the end of one stack from referencing the beginning of another

  #get the index of any row where the frame # is high enough that x-frames in the future would run into another stack
  #by checking that the frame number is greater than the max frame number - (time window-1) (corrects for index number)
  end_of_stack_index = df.loc[df['frame'] >= max(df['frame'])-(time_window-1)].index

  #use .loc to set the two new columns to None for those rows
  df.loc[end_of_stack_index, future_relation_colname] = None
  df.loc[end_of_stack_index, future_relation_type_colname] = None

  #set the index back to the way it was
  df = df.set_index(['stack','label','roi','frame'])

  return df

In [49]:
def current_future_relationship(df, stim, time_window):
  """
  Takes df with a column for stim and a column for stim + a given time window. Returns the relationship.
  """
  # column names for current row
  current_relation_colname = 'relation_'+stim
  current_relation_type_colname = 'relation_type_'+stim

  # column names for rows in the future
  future_relation_colname = 'relation_'+stim+'_+'+str(time_window)
  future_relation_type_colname = 'relation_type_'+stim+'_+'+str(time_window)

  #with this approach, we need to drop the last x frames from each stack, so dropna
  df = df.dropna()

  #make a new column with the combo of current relation type and future relation type
  df['full_relation_type'] = df[current_relation_type_colname] + '-' + df[future_relation_type_colname]

  #one-hot encode the overall relationship, then multiply the new columns by the product of the relation values
  df = pd.get_dummies(df, columns=['full_relation_type'], prefix = current_relation_colname)

  #get a list of the columns created
  dummy_cols = [col for col in df.columns if current_relation_colname in col]

  #multiply the dummy columns by the relation value to distribute the value to the appropriate column
  for col in dummy_cols:
    df[col] = df[col] * (df[current_relation_colname]*df[future_relation_colname])

  return df

In [72]:
#for ease of use, package all four functions along with the dropna into one function for use later

def contrast_16_len_representation(df_in, stim, response, time_window, zero_point = 0):
  """
  helper function to run all of the stim1 functions that create the 16-length representation

  takes the df, stim name, response column name, time period to look in the future, and the zero point to compare positive and negative
  """

  df = df_in.copy()

  #get the current stim and response relationship
  df = stimResponse(df, stim, response, zero_point)
  #get the future stim and response relationship
  df = get_future_stim_and_relationship(df, stim, time_window)
  #get the relationship b/w the current and future stim/response columns
  df = current_future_relationship(df, stim, time_window)

  return df

4-len representation (assumes we will drop all negative response

In [51]:
from operator import setitem
def contrast_4_len_representation(df, stim, response, time_window, zero_point = 0):
  """
  Takes a df, the name of the col holding the stimulus, the name of the col holding the response, and the value to use as the zero point

  Returns the df with four extra columns holding the normalized response * stim1 for the four possible relationships

  the sign of the response is ignored because all frames with negative response will be dropped
  """


  # takes a stim column and returns two new feature column
  # PS is Positive Stimulus
  # NS is Negative Stimulus
  df_in = df.copy()

  #
  PS = df_in[stim] > zero_point
  NS = df_in[stim] <= zero_point

  PS = PS.rename('PS')
  NS = NS.rename('NS')

  # combine into df
  df = pd.concat([df_in[stim], df_in[response], PS,NS], axis=1)
  #set the data types back to boolean
  df = df.astype({'NS': bool,
                  'PS': bool})

  # get min and max values for normalizing
  S_min = df[stim].min()
  S_max = df[stim].max()
  R_min = df[response].min()
  R_max = df[response].max()

  #add a column normalizing the stim and response
  df['stim_norm'] = df[stim].apply(lambda x: x/S_max if x>0 else x/S_min)
  df['resp_norm'] = df[response].apply(lambda x: x/R_max if x>0 else x/R_min)


  # multiply stim and resp
  df['relation'] = df['stim_norm'] * df['resp_norm']

  #Set a new column with the category each row's relationship falls under
  df.loc[(df['PS']),'relation_type'] = 'PS'
  df.loc[(df['NS']),'relation_type'] = 'NS'
  
  df_in['relation_type_' + stim] = df['relation_type']
  df_in['relation_' + stim] = df['relation']

  df_in = get_future_stim_and_relationship(df_in, stim, time_window)
  df_in = df_in.dropna()

  df_in = current_future_relationship(df_in, stim, time_window)

  return(df_in)

# Stim 2 and 3 feature engineering

In [40]:
# def directional_stim_categories(df, stim, response):
#   """
#   DEPRECATED - this was for when the direction was a single attribute of the degree of motion


#   Takes a df, the name of the col holding the direction of motion stimululi, the name of the col holding the response
#   Returns the df with the stim broken out into five cols based on direction of motion

#   A different function will calculate the relationship between response and direction
#   """


#   # add a column that categorizes the direction into four quadrants (segments can be changed)
#   df_in = df.copy()

#   #set conditions checking the direction of the stimulus - there has to be a better way but this is it for now
#   conditions = [
#     ((-45 <= df_in[stim]) & (df_in[stim] < 0)), #northwest
#     ((0 < df_in[stim]) & (df_in[stim] < 45)), #northeast
#     ((45 <= df_in[stim]) & (df_in[stim] < 135)), #east
#     ((135 <= df_in[stim]) & (df_in[stim] <= 180)),#southeast
#     ((-180 <= df_in[stim]) & (df_in[stim] < -135)), #southwest
#     ((-135 <= df_in[stim]) & (df_in[stim] < -45)), #west
#     (df_in[stim] == 0) #no movement
#     ]

#   #set the names we assign to those conditions
#   values = ['up', 'up', 'right', 'down', 'down', 'left', 'no_motion']

#   #add a column to the df holding the direction category
#   df['direction'] = np.select(conditions, values)

#   #one-hot encode the quadrants
#   df = pd.get_dummies(df, columns=['direction'], prefix = 'direction')

#   #get the names of the 4 direction columns we just added
#   direction_cols = [col for col in df.columns if 'direction' in col]


#   return(df)

In [64]:
def direction_orientation_stim_features(df, stim_x, stim_y, stim_orientation, response):
  """
  DEPRECATED - this was for when the direction was a single attribute of the degree of motion


  Takes a df, the name of the col holding the direction of motion stimululi (x and y), the name of the col holding the response
  Returns the df with two three extra columns holding the resp*stim for each feature

  """


  df_in = df.copy()


  #multiply the response by the three directional features
  df['stim2_feature'] = df[response] * df[stim_x]

  df['stim3_feature'] = df[response] * df[stim_y]

  df['stim5_feature'] = df[response] * df[stim_orientation]


  return(df)

In [65]:
direction_orientation_stim_features(raw_dfs[raw_fnames[0]], 'stim2', 'stim3', 'stim5', 'resp')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,movie_ID,resp,stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim2_feature,stim3_feature,stim5_feature
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,tm9,1,0,210728_0,0.119422,1.827970,-0.668979,-1.323903,1.969955,0.0,0.000488,0.0,0.0,-0.079891,-0.158103,0.0
1,tm9,1,1,210728_0,0.065380,0.676591,1.133205,0.209498,-0.011115,0.0,0.083134,0.0,0.0,0.074088,0.013697,0.0
1,tm9,1,2,210728_0,0.036917,-1.276472,-0.881429,1.220991,0.512083,0.0,0.308143,0.0,0.0,-0.032540,0.045075,0.0
1,tm9,1,3,210728_0,0.156975,-1.676237,0.505447,1.208871,-0.439756,0.0,0.574456,0.0,0.0,0.079342,0.189762,0.0
1,tm9,1,4,210728_0,0.195883,-1.351235,-0.832430,1.249990,0.074422,0.0,0.656676,0.0,0.0,-0.163059,0.244851,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,tm9,14,5508,210728_0,0.126178,-0.589866,0.878659,-0.854775,-0.210759,0.0,0.061990,0.0,0.0,0.110867,-0.107854,0.0
9,tm9,14,5509,210728_0,0.030026,0.622075,-0.084702,-1.416621,1.015806,0.0,-0.690350,0.0,0.0,-0.002543,-0.042535,0.0
9,tm9,14,5510,210728_0,0.061619,0.177987,-1.219253,0.914307,-0.824696,0.0,-0.865998,0.0,0.0,-0.075130,0.056339,0.0
9,tm9,14,5511,210728_0,0.270762,-0.198916,0.915779,-0.799899,-0.628389,0.0,-0.946322,0.0,0.0,0.247958,-0.216582,0.0


# RUN FUNCTIONS ON DATA

In [7]:
# get all raw DF's
from os import listdir
root = 'drive/MyDrive/DS6011_Capstone_VisualNeuroscience/Seeded_CNMF/Extracted_Features/'
raw_fnames = listdir(root + 'pre-feature extract/')
raw_fnames = [x for x in raw_fnames if 'RAW_extracted_data' in x]
raw_fnames

['210728_0_RAW_extracted_data.csv',
 '210731_0_RAW_extracted_data.csv',
 '210802_0_RAW_extracted_data.csv',
 '210808_0_RAW_extracted_data.csv',
 '210809_2_RAW_extracted_data.csv',
 '210815_0_RAW_extracted_data.csv',
 '210815_1_RAW_extracted_data.csv',
 '210816_0_RAW_extracted_data.csv',
 '210816_1_RAW_extracted_data.csv']

In [81]:
# Dictionary of df's. One for each video. You can call each df my name
raw_dfs = {}
for p in raw_fnames:
  df = pd.read_csv(root + 'pre-feature extract/' + p, index_col=['stack','label','roi','frame']).drop(columns='Unnamed: 0')
  raw_dfs[p] = df
raw_dfs[raw_fnames[-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,movie_ID,resp,stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8
stack,label,roi,frame,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,tm4,1,0,210816_1,0.153797,-2.195615,-1.954008,0.382234,3.567930,0.000000,0.000699,0.000000,0.000000
1,tm4,1,1,210816_1,0.280574,-2.195615,0.777023,-0.001856,-0.540388,0.000000,-0.077014,0.000000,0.000000
1,tm4,1,2,210816_1,0.493612,-2.195615,0.777023,-0.001856,-0.540388,0.000000,-0.340523,0.000000,0.000000
1,tm4,1,3,210816_1,0.667897,-1.966966,-0.498707,1.864190,-0.438672,0.000000,-0.878509,0.000000,0.000000
1,tm4,1,4,210816_1,0.497233,-1.738318,-1.713994,1.108878,-0.225283,0.000000,-1.669368,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,tm1,10,5508,210816_1,0.336650,1.353177,0.772721,0.007225,-0.536081,-1.417802,0.451827,-0.353452,0.084494
9,tm1,10,5509,210816_1,0.297701,1.411397,-0.710081,-1.866442,-0.351771,-1.562954,1.437190,-0.353452,0.084494
9,tm1,10,5510,210816_1,0.438173,1.423041,-0.597663,-1.871432,-0.497068,-1.569130,1.767656,-0.353452,0.321161
9,tm1,10,5511,210816_1,0.337997,1.423041,0.772721,0.007225,-0.536081,-1.546306,1.726848,-0.353452,0.321161


In [82]:
raw_dfs.keys()

dict_keys(['210728_0_RAW_extracted_data.csv', '210731_0_RAW_extracted_data.csv', '210802_0_RAW_extracted_data.csv', '210808_0_RAW_extracted_data.csv', '210809_2_RAW_extracted_data.csv', '210815_0_RAW_extracted_data.csv', '210815_1_RAW_extracted_data.csv', '210816_0_RAW_extracted_data.csv', '210816_1_RAW_extracted_data.csv'])

## For each df, run through pipeline

In [83]:
#Set parameters here

response = 'resp'
time_window = 5

#declare if we're chunking the stacks into smaller segments
chunk_stacks = True
#if we are, declare how many chunks we're breaking each stack into
periods_per_stack = 10

#declare if we're only keeping positive response values or if we're using all
#if we're removing them, we get the 4-length stim1 feature, otherwise it's the 16-length stim1
only_keep_positive_resp = True



#loop through the different movies' df's and run the feature extraction functions on each
for p in raw_fnames:
  df = raw_dfs[p]

  #extract directional and orientation features
  df = direction_orientation_stim_features(df, stim_x = 'stim2', stim_y = 'stim3', stim_orientation = 'stim5', response = response)


  #ADD OTHER FEATURE EXTRACTION FUNCTIONS HERE before the contrast ones remove some rows and potentially throw things off





  #check if we're keeping negative response, then process the two contrast stimuli with whatever the choice is
  #each of these approaches will remove the final x frames from each stack, where x is the time_window declared above
  if only_keep_positive_resp == True:
    df = contrast_4_len_representation(df, 'stim1', response, time_window, zero_point = 0)
    df = contrast_4_len_representation(df, 'stim6', response, time_window, zero_point = 0)
    #remove the last frames in each stack that didn't have a future frame to look into
    df = df.dropna()
  else:
    df = contrast_16_len_representation(df, 'stim1', response, time_window, zero_point = 0)
    df = contrast_16_len_representation(df, 'stim6', response, time_window, zero_point = 0)
    #remove the last frames in each stack that didn't have a future frame to look into
    df = df.dropna()

  #if we're removing negative response frames, remove them here
  if only_keep_positive_resp == True:
    df = df.loc[df['resp'] > 0]


  #if we're chunking stacks into smaller segments, run groupFrames to assign frames to a group
  if chunk_stacks == True:
    df = groupFrames(df, periods_per_stack)

  
  raw_dfs[p] = df

In [87]:
#look at one of the df's to see if all the columns are there
print(raw_dfs[raw_fnames[0]].columns)
raw_dfs[raw_fnames[0]]

Index(['stack', 'label', 'roi', 'frame', 'movie_ID', 'resp', 'stim1', 'stim2',
       'stim3', 'stim4', 'stim5', 'stim6', 'stim7', 'stim8', 'stim2_feature',
       'stim3_feature', 'stim5_feature', 'relation_type_stim1',
       'relation_stim1', 'relation_stim1_+5', 'relation_type_stim1_+5',
       'relation_stim1_NS-NS', 'relation_stim1_NS-PS', 'relation_stim1_PS-NS',
       'relation_stim1_PS-PS', 'relation_type_stim6', 'relation_stim6',
       'relation_stim6_+5', 'relation_type_stim6_+5', 'relation_stim6_NS-NS',
       'relation_stim6_NS-PS', 'relation_stim6_PS-NS', 'relation_stim6_PS-PS',
       'frame_group'],
      dtype='object')


Unnamed: 0,stack,label,roi,frame,movie_ID,resp,stim1,stim2,stim3,stim4,...,relation_stim1_PS-PS,relation_type_stim6,relation_stim6,relation_stim6_+5,relation_type_stim6_+5,relation_stim6_NS-NS,relation_stim6_NS-PS,relation_stim6_PS-NS,relation_stim6_PS-PS,frame_group
0,1,tm9,1,0,210728_0,0.119422,1.827970,-0.668979,-1.323903,1.969955,...,0.000000e+00,PS,4.856838e-13,2.497951e-16,NS,0.000000e+00,0.000000e+00,1.213215e-28,0.0,1
1,1,tm9,1,1,210728_0,0.065380,0.676591,1.133205,0.209498,-0.011115,...,0.000000e+00,PS,1.112616e-08,3.954954e-11,NS,0.000000e+00,0.000000e+00,4.400346e-19,0.0,1
2,1,tm9,1,2,210728_0,0.036917,-1.276472,-0.881429,1.220991,0.512083,...,0.000000e+00,PS,5.882838e-08,3.046802e-10,NS,0.000000e+00,0.000000e+00,1.792384e-17,0.0,1
3,1,tm9,1,3,210728_0,0.156975,-1.676237,0.505447,1.208871,-0.439756,...,0.000000e+00,PS,1.561918e-06,1.444193e-09,NS,0.000000e+00,0.000000e+00,2.255711e-15,0.0,1
4,1,tm9,1,4,210728_0,0.195883,-1.351235,-0.832430,1.249990,0.074422,...,0.000000e+00,PS,2.560257e-06,1.536247e-09,NS,0.000000e+00,0.000000e+00,3.933188e-15,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499434,9,tm9,14,5498,210728_0,0.460237,1.083291,-1.113454,-1.038412,0.984346,...,0.000000e+00,NS,1.099666e-05,1.540014e-08,PS,0.000000e+00,1.693501e-13,0.000000e+00,0.0,10
499435,9,tm9,14,5499,210728_0,0.632090,2.014169,-0.812166,-1.263069,-0.316891,...,3.496641e-14,NS,3.614308e-06,8.524720e-11,PS,0.000000e+00,3.081096e-16,0.000000e+00,0.0,10
499436,9,tm9,14,5500,210728_0,0.411466,1.376301,-0.201387,-1.421431,-0.502525,...,0.000000e+00,NS,1.233379e-09,9.723598e-19,NS,1.199288e-27,0.000000e+00,0.000000e+00,0.0,10
499437,9,tm9,14,5501,210728_0,0.724411,0.126393,-1.211222,0.924204,0.352490,...,0.000000e+00,PS,1.222336e-05,1.025256e-10,NS,0.000000e+00,0.000000e+00,1.253207e-15,0.0,10


## assign DF's with train/test labels

In [21]:
# # SET THESE VALUES EACH TIME

# folder_name = '5_frames_stim1' # create a new folder name for saving out these files (make sure this folder already exists)
# num_train = 3
# num_test = 2
# num_val = 0

In [22]:
# arr = np.array(['train'] * num_train + ['test'] * num_test + ['val'] * num_val)
# np.random.shuffle(arr)

# # save out each DF with a label indicating train, test or validation
# for i, p in enumerate(raw_fnames):
#   p_split = p.split('.')[-2] # chop off csv
  
#   raw_dfs[p].to_csv(root + folder_name + '/' + p_split + '_' + arr[i] + '.csv') #saves to new folder

## Save data - assign the folder manually, probably label the csv's based on the settings we used

In [23]:
#skipped train/test because it was giving me trouble

folder_name = '5_frames_stim1'

for i, p in enumerate(raw_fnames):
  p_split = p.split('.')[-2] # chop off csv
  
  raw_dfs[p].to_csv(root + folder_name + '/' + p_split + '_features.csv') #saves to new folder

In [24]:
listdir(root + folder_name)

['210809_2_RAW_extracted_data_features.csv',
 '210728_0_RAW_extracted_data_features.csv',
 '210802_0_RAW_extracted_data_features.csv',
 '210808_0_RAW_extracted_data_features.csv',
 '210731_0_RAW_extracted_data_features.csv',
 '210815_1_RAW_extracted_data_features.csv',
 '210816_0_RAW_extracted_data_features.csv',
 '210815_0_RAW_extracted_data_features.csv',
 '210816_1_RAW_extracted_data_features.csv']