# STEP 1: 1_preprocess_data.ipynb

Press SHIFT + ENTER to run code

### USER INPUT!
Specify where your DLC-analyzed CSVs are stored by changing `data_root_dir`, then list your group and condition folder names.

In [1]:
## Define project: this name is used for the meta.py variable names and the output folder/file names
project_name = 'project_acc_MiniscopeCap'

## Directory that holds your DLC-analyzed CSVs
    # e.g. Apple: '/Users/justinjames/LUPE_Corder-Lab/+project_XXX+/dlc_csvs'
data_root_dir = '/Users/justinjames/LUPE_Corder-Lab/project_acc_MiniscopeCap/dlc_csvs'

## Describe how the data is organized on disk: CSVs are read from
## <data_root_dir>/<group>/<condition>/*.csv, so list the group folder names and,
## within each group, the condition folder names
    # e.g. groups = ['Group1', 'Group2','Group3']
    # e.g. conditions = ['control','experiment']
groups = ['Combined']
conditions = ['A_baseline','B_exp_cap','C_baseline_morphine','D_exp_morphine-cap']

### Updating meta.py for project

In [2]:
import os

def update_meta_file(project_name, project_groups=None, project_conditions=None,
                     meta_file_path='../utils/meta.py'):
    """Write this project's group and condition lists into the meta file.

    Two assignment lines, ``groups_<project_name> = [...]`` and
    ``conditions_<project_name> = [...]``, are stored in the meta file. A line
    that already starts with the same variable name is replaced in place;
    otherwise the assignment is appended to the end of the file. If the file
    does not exist it is created.

    Parameters
    ----------
    project_name : str
        Suffix used to build the two variable names.
    project_groups : list, optional
        Group names to record. Defaults to the notebook-level ``groups``.
    project_conditions : list, optional
        Condition names to record. Defaults to the notebook-level ``conditions``.
    meta_file_path : str, optional
        Path of the file to update. Defaults to ``'../utils/meta.py'``.

    Raises
    ------
    OSError
        If the meta file cannot be read or written (for example, its parent
        directory does not exist).
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if project_groups is None:
        project_groups = groups
    if project_conditions is None:
        project_conditions = conditions

    groups_prefix = f"groups_{project_name} ="
    conditions_prefix = f"conditions_{project_name} ="
    groups_var = f"{groups_prefix} {project_groups}"
    conditions_var = f"{conditions_prefix} {project_conditions}"

    # Read the current contents of the meta file
    if os.path.exists(meta_file_path):
        with open(meta_file_path, 'r') as file:
            lines = file.readlines()
    else:
        lines = []

    # Replace any existing definitions in place
    groups_defined = False
    conditions_defined = False
    for i, line in enumerate(lines):
        if line.startswith(groups_prefix):
            lines[i] = groups_var + '\n'
            groups_defined = True
        elif line.startswith(conditions_prefix):
            lines[i] = conditions_var + '\n'
            conditions_defined = True

    # A file whose last line has no trailing newline would otherwise have the
    # appended assignment glued onto that line, corrupting both statements.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    # If the variables are not defined, add them to the end of the file
    if not groups_defined:
        lines.append(groups_var + '\n')
    if not conditions_defined:
        lines.append(conditions_var + '\n')

    # Write the updated contents back to the meta file
    with open(meta_file_path, 'w') as file:
        file.writelines(lines)

    print(f'Updated {meta_file_path} with project-specific groups and conditions.')

# Record this project's groups and conditions in ../utils/meta.py
# (this call rewrites that file; it is not just a usage example)
update_meta_file(project_name)

Updated ../utils/meta.py with project-specific groups and conditions.


### Main Code: store all data in dictionary
WAIT UNTIL PROCESSING DATA FINISHES

In [3]:
###### RUN DEPENDENCIES ######
import glob
import pickle
import os
import sys
if not os.path.join(os.path.abspath(''), '../') in sys.path:
    sys.path.append(os.path.join(os.path.abspath(''), '../'))
import numpy as np
import pandas as pd
from tqdm import notebook
from utils.feature_utils import filter_pose_noise

###### MAIN CODE ######
# Build two nested dictionaries keyed group -> condition:
#   filenames[group][condition] -> list of CSV paths found on disk
#   data[group][condition][f'file{i}'] -> filtered pose output for the i-th CSV
filenames = {}
data = {}
for group in notebook.tqdm(groups):
    filenames[group] = {}
    data[group] = {}
    for condition in notebook.tqdm(conditions):
        csv_pattern = os.path.join(data_root_dir, f'{group}', f'{condition}', '*.csv')
        # glob returns matches in arbitrary, OS-dependent order; sort so that the
        # file{i} -> CSV mapping is the same on every run and every machine.
        csv_paths = sorted(glob.glob(csv_pattern))
        if not csv_paths:
            # Surface misspelled folder names instead of silently storing nothing.
            print(f'WARNING: no CSV files matched {csv_pattern}')
        filenames[group][condition] = csv_paths
        data[group][condition] = {}
        for file_idx, csv_path in enumerate(notebook.tqdm(csv_paths)):
            # The CSVs are read with four header rows and the first column as index.
            temp_df = pd.read_csv(csv_path, header=[0, 1, 2, 3], sep=",", index_col=0)
            selected_pose_idx = np.arange(temp_df.shape[1])
            # Every third column starting at position 2 is treated as a likelihood
            # column (presumably the x, y, likelihood triplet layout -- confirm
            # against your tracking output).
            idx_llh = selected_pose_idx[2::3]
            # the loaded sleap file has them too, so exclude for both
            idx_selected = [col for col in selected_pose_idx if col not in idx_llh]
            # llh_value is forwarded to filter_pose_noise (presumably the likelihood
            # cutoff -- see utils.feature_utils); its second return value is unused here.
            currdf_filt, _ = filter_pose_noise(temp_df, idx_selected=idx_selected, idx_llh=idx_llh,
                                               llh_value=0.1)
            data[group][condition][f'file{file_idx}'] = currdf_filt

###### WAIT UNTIL PROCESSING DATA FINISHES ######

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
###### SAVE THE DATA DICTIONARY TO DISK ######
# Make sure the project's output folder exists before writing into it.
directory = f"../processed_dataset/{project_name}/"
os.makedirs(directory, exist_ok=True)

# Serialize the nested `data` dictionary into a single pickle file for this project.
raw_data_pkl_filename = f"../processed_dataset/{project_name}/raw_data_{project_name}.pkl"
with open(raw_data_pkl_filename, 'wb') as pkl_file:
    pickle.dump(data, pkl_file)

# Confirm where the file was written.
print(f'{raw_data_pkl_filename} is created and saved!')

../processed_dataset/project_acc_MiniscopeCap/raw_data_project_acc_MiniscopeCap.pkl is created and saved!


In [5]:
###### CHECK DATA STORED CORRECTLY IN DICTIONARY ######
# Sanity check: read back the pickle file saved for this project and display it.
# NOTE: this rebinds `data` to whatever load_data returns, and the bare `data`
# expression at the end renders the entire structure, which can be very large.
# NOTE(review): load_behaviors is imported here but not used in this cell.
from utils.classification import load_behaviors, load_data
data = load_data(f"../processed_dataset/{project_name}/raw_data_{project_name}.pkl")
data

{'Combined': {'A_baseline': {'file0': array([[309.83081508, 579.99483061, 319.83498585, ..., 610.65679789,
           515.34495378, 607.28549612],
          [308.82217073, 580.8638652 , 320.65303707, ..., 611.1807282 ,
           517.88944221, 607.61251211],
          [306.7515583 , 580.48179293, 320.06240487, ..., 612.27098238,
           521.70951915, 605.60188457],
          ...,
          [109.92903638, 424.57951093,  94.662992  , ..., 408.39288974,
           181.34879708, 392.89077282],
          [109.92903638, 424.57951093,  94.662992  , ..., 406.38709331,
           178.15924931, 394.57894969],
          [109.92903638, 424.57951093,  94.662992  , ..., 408.38613772,
           179.72269332, 393.83204317]]),
   'file1': array([[602.54467046, 391.63392997, 606.25422907, ..., 321.06584269,
           603.39918017, 382.88936496],
          [602.54467046, 391.63392997, 606.25422907, ..., 321.2814796 ,
           604.4606787 , 380.10105974],
          [602.54467046, 391.63392997, 606.

# MOVE TO STEP 2!
2_preprocess_get_features.ipynb 