# STEP 1: 1_preprocess_data.ipynb

Press SHIFT + ENTER to run code

### USER INPUT!
Specify where to store the csvs. change the data_root_dir

In [2]:
## Define project
project_name = 'project_ACC_MiniscopeSNI_3WeekMorphine'

## Say where your data holding your DLC-analyzed CSVs is stored
    # i.e. Apple: '/Users/justinjames/LUPE_Corder-Lab/'+project_XXX+'/dlc_csvs'
data_root_dir = '/Users/justinjames/LUPE_Corder-Lab/project_ACC_MiniscopeSNI_3WeekMorphine/dlc_csvs'

## Breakdown how your data is organized in the folders-- name of folders that are groups? within groups, name condition folders
    # i.e. groups = ['Group1', 'Group2','Group3']
    # i.e. conditions = ['control','experiment']
groups = ['NoInjury','SNI']
conditions = ['Combined']

### Updating meta.py for project

In [2]:
import os

def update_meta_file(project_name):
    meta_file_path = '../utils/meta.py'
    
    groups_var = f"groups_{project_name} = {groups}"
    conditions_var = f"conditions_{project_name} = {conditions}"
    
    # Read the current contents of the meta file
    if os.path.exists(meta_file_path):
        with open(meta_file_path, 'r') as file:
            lines = file.readlines()
    else:
        lines = []

    # Check if the variables are already defined and update them if necessary
    groups_defined = False
    conditions_defined = False
    for i, line in enumerate(lines):
        if line.startswith(f"groups_{project_name} ="):
            lines[i] = groups_var + '\n'
            groups_defined = True
        elif line.startswith(f"conditions_{project_name} ="):
            lines[i] = conditions_var + '\n'
            conditions_defined = True

    # If the variables are not defined, add them to the end of the file
    if not groups_defined:
        lines.append(groups_var + '\n')
    if not conditions_defined:
        lines.append(conditions_var + '\n')

    # Write the updated contents back to the meta file
    with open(meta_file_path, 'w') as file:
        file.writelines(lines)
    
    print(f'Updated {meta_file_path} with project-specific groups and conditions.')

# Example usage
update_meta_file(project_name)

Updated ../utils/meta.py with project-specific groups and conditions.


### Main Code: store all data in dictionary
WAIT UNTIL PROCESSING DATA FINISHES

In [3]:
###### RUN DEPENDENCIES ######
import glob
import pickle
import os
import sys
if not os.path.join(os.path.abspath(''), '../') in sys.path:
    sys.path.append(os.path.join(os.path.abspath(''), '../'))
import numpy as np
import pandas as pd
from tqdm import notebook
from utils.feature_utils import filter_pose_noise

###### MAIN CODE ######
filenames = {key: [] for key in groups}
data = {key: [] for key in groups}
for group in notebook.tqdm(groups):
    filenames[group] = {key: [] for key in conditions}
    data[group] = {key: [] for key in conditions}
    for condition in notebook.tqdm(conditions):
        
        filenames[group][condition] = glob.glob(str.join('/', 
                                                              (data_root_dir,
                                                               f'{group}', 
                                                               f'{condition}', 
                                                               '*.csv')))
        data[group][condition] = {os.path.splitext(os.path.basename(csv))[0]: [] 
                                  for csv in filenames[group][condition]}
        
        for csv in notebook.tqdm(filenames[group][condition]):
            temp_df = pd.read_csv(csv, header=[0, 1, 2, 3], sep=",", index_col=0)
            selected_pose_idx = np.arange(temp_df.shape[1])
            idx_llh = selected_pose_idx[2::3]
            # the loaded sleap file has them too, so exclude for both
            idx_selected = [i for i in selected_pose_idx if i not in idx_llh]
            currdf_filt, _ = filter_pose_noise(temp_df, idx_selected=idx_selected, idx_llh=idx_llh, 
                                               llh_value=0.1)
            file_name = os.path.splitext(os.path.basename(csv))[0]
            data[group][condition][file_name] = currdf_filt

###### WAIT UNTIL PROCESSING DATA FINISHES ######

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
###### STORE ALL DATA IN DICTIONARY ######  
directory = f"../processed_dataset/{project_name}/"
os.makedirs(directory, exist_ok=True)

raw_data_pkl_filename = f"../processed_dataset/{project_name}/raw_data_{project_name}.pkl"
with open(raw_data_pkl_filename, 'wb') as f:
    pickle.dump(data, f)

print(f'{raw_data_pkl_filename} is created and saved!')

../processed_dataset/project_ACC_MiniscopeSNI_3WeekMorphine/raw_data_project_ACC_MiniscopeSNI_3WeekMorphine.pkl is created and saved!


In [3]:
###### CHECK DATA STORED CORRECTLY IN DICTIONARY ######
from utils.classification import load_behaviors, load_data
data = load_data(f"../processed_dataset/{project_name}/raw_data_{project_name}.pkl")
data

{'NoInjury': {'Combined': {'MM105_Morphine_Basler_acA2040-120um__23670847__20241218_113306366DLC_resnet50_LUPE_MALEDec5shuffle1_350000': array([[184.80142021, 497.38641334, 187.97276437, ..., 628.30914342,
           298.12312961, 654.56782484],
          [184.84799552, 498.49660438, 187.28578293, ..., 626.08816373,
           295.40284586, 653.36587071],
          [184.26515245, 497.50362778, 187.33456075, ..., 628.50035614,
           298.8609525 , 654.72610331],
          ...,
          [333.96572546, 145.86093712, 341.28974462, ...,  99.38473845,
           542.40511364, 108.85613585],
          [333.5993017 , 147.70040703, 340.33030939, ...,  98.28577018,
           543.57034099, 107.09849763],
          [334.16852427, 152.22038001, 341.35654068, ...,  96.16453373,
           542.29028153, 106.44597602]]),
   'MM103_Morphine_Basler_acA2040-120um__23670847__20241218_101558370DLC_resnet50_LUPE_MALEDec5shuffle1_350000': array([[510.61209893, 367.27414048, 518.06395102, ..., 400.49400

# MOVE TO STEP 2!
2_preprocess_get_features.ipynb 