1. Import all of the needed packages:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import os
import sys
import re
import shutil
sys.path.append('..\\neurokin\\')
from neurokin.kinematic_data import KinematicDataRun
from neurokin.utils.neural import processing, importing
from neurokin.utils.experiments import neural_states_helper, neural_correlates_plot, spider_factory
from neurokin.locomotion_states import NeuralCorrelatesStates
from neurokin.utils.helper.load_config import read_config
from neurokin.utils.experiments.neural_states_helper import get_runs_list
from IPython.core.pylabtools import figsize
from matplotlib.pyplot import figure
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


2. Get the experiment structure:

In [2]:
# Location of the YAML file that describes the experiment structure.
experiment_structure_path = "C:/Users/User/Documents/data_labrotation/gait_feats_SpringAndFall_2023/experiment_structure_SpringAndFall_for_gait.yaml"

In [3]:
# Load the experiment structure from the configured path with neurokin's read_config.
experiment_structure = read_config(
    experiment_structure_path,
    converts_keys_to_string=True,
)

3. Create the metadata for your dataset (date, subject, condition and run number of each run):

In [4]:
# List every run described in the experiment structure; no subjects or
# conditions are skipped.
trial_list = get_runs_list(experiment_structure, skip_subjects=[], skip_conditions=[])

# Plain copy of the run entries, kept under its original name in case other
# cells refer to it. The previous element-by-element loop unpacked each entry
# into variables it never used (and left them behind in the kernel namespace).
trial_data = list(trial_list)

# One row per run, with one column per field of a run entry.
metadata = pd.DataFrame(trial_data, columns=["date", "subject", "condition", "run"])

4. Define which animals are healthy:

In [5]:
# IDs of the animals that are considered healthy.
healthy = [
    "NWE00131",
    "NWE00158",
    "NWE00166",
    "NWE00188",
    "NWE00189",
    "NWE00198",
    "NWE00197",
]

5. Define the parameters you want to keep in, or drop from, your dataset:

In [6]:
# Full list of gait parameter names, including the run-level "SD ..." columns.
green_param = [
    "FILENAME", "SIDE", "ANIMAL", "CONDITION 1", "CONDITION 2",
    "DURATION", "VELOCITY", "ST DUR", "SW DUR", "%ST DUR", "DOUBLE STANCE",
    "HIP-END BW", "HIP-END FW", "STEP HEIGHT nor",
    "MAX ENDPOINT Vel", "T MAX ENDPOINT Vel", "Acc ENDPOINT", "Vel ENDPOINT", "Angle Vel ENPOINT",
    "FOOT SW LAT", "STEP WIDTH", "MAX HipY nor", "MIN HipY nor", "AMP HipY",
    "SD MidTrunk XY", "SD MidTrunk XY Vel", "SD MidHip Z", "SD MidHip Y",
    "SD MidSh Y", "SD MidSh Z", "SD Shoulders", "SD Hips",
    "L VirtCOM fw", "L VirtCOM lat", "L VirtCOM vert", "L VirtCOM 3D",
    "MIN ELE 1", "MIN ELE 2", "MIN ELE 3", "MIN ELE 4", "MIN ELE 6",
    "MAX ELE 1", "MAX ELE 2", "MAX ELE 3", "MAX ELE 4", "MAX ELE 6",
    "MAX JOINT 1", "MAX JOINT 2", "MAX JOINT 3", "MAX LIMBLAT",
    "MIN JOINT 1", "MIN JOINT 2", "MIN JOINT 3", "MIN LIMBLAT",
    "AMP ELE 1", "AMP ELE 2", "AMP ELE 3", "AMP ELE 4", "AMP LIMB",
    "AMP JOINT 1", "AMP JOINT 2", "AMP JOINT 3", "AMP LIMBLAT",
    "MIN SPEEDLIMB 1", "MIN SPEEDJOINT 1", "MIN SPEEDJOINT 2", "MIN SPEEDJOINT 3",
    "MAX SPEEDLIMB 1", "MAX SPEEDJOINT 1", "MAX SPEEDJOINT 2", "MAX SPEEDJOINT 3",
    "AMP SPEEDLIMB 1", "AMP SPEEDJOINT 1", "AMP SPEEDJOINT 2", "AMP SPEEDJOINT 3",
    "PH2 PH1", "PH3 PH2", "PH4 PH3",
    "R CREST-THIGH", "R THIGH-LEG", "R LEG-FOOT",
    "R HIP-KNEE", "R KNEE-ANKLE", "R ANKLE-MTP",
    "CREST-THIGH timingMIN", "CREST-THIGH timingMAX",
    "THIGH-LEG timingMIN", "THIGH-LEG timingMAX",
    "LEG-FOOT timingMIN", "LEG-FOOT timingMAX",
    "STEP HEIGHT",
    "PH1", "AMP1", "PH2", "AMP2", "PH3", "AMP3", "PH4", "AMP4",
    "LAG CREST-THIGH", "LAG THIGH-LEG", "LAG LEG-FOOT",
    "LAG HIP-KNEE", "LAG KNEE-ANKLE", "LAG ANKLE-MTP",
    "SD DURATION", "SD L STRIDE", "SD ST DUR", "SD SW DUR", "SD STEP WIDTH",
    "SD SD MidTrunk XY", "SD SD MidTrunk XY Vel", "SD STEP HEIGHT nor",
    "SD HIP-END BW", "SD HIP-END FW", "SD ST contra HL", "SD L PATH",
    "SD MAX ENDPOINT Vel", "SD T MAX ENDPOINT Vel",
    "SD PH2 PH1", "SD PH3 PH2", "SD PH4 PH3",
    "SD DOUBLE STANCE", "SD Acc ENDPOINT", "SD Vel ENDPOINT", "SD Angle Vel ENPOINT",
    "L STRIDE", "L STEP", "L PATH",
]

# Columns selected from each *_GAIT_SUM.txt file by the loading loop. Same as
# green_param, but without "CONDITION 1"/"CONDITION 2" and without the
# "SD DURATION" ... "SD Angle Vel ENPOINT" group.
green_param_sum = [
    "FILENAME", "SIDE", "ANIMAL",
    "DURATION", "VELOCITY", "ST DUR", "SW DUR", "%ST DUR", "DOUBLE STANCE",
    "HIP-END BW", "HIP-END FW", "STEP HEIGHT nor",
    "MAX ENDPOINT Vel", "T MAX ENDPOINT Vel", "Acc ENDPOINT", "Vel ENDPOINT", "Angle Vel ENPOINT",
    "FOOT SW LAT", "STEP WIDTH", "MAX HipY nor", "MIN HipY nor", "AMP HipY",
    "SD MidTrunk XY", "SD MidTrunk XY Vel", "SD MidHip Z", "SD MidHip Y",
    "SD MidSh Y", "SD MidSh Z", "SD Shoulders", "SD Hips",
    "L VirtCOM fw", "L VirtCOM lat", "L VirtCOM vert", "L VirtCOM 3D",
    "MIN ELE 1", "MIN ELE 2", "MIN ELE 3", "MIN ELE 4", "MIN ELE 6",
    "MAX ELE 1", "MAX ELE 2", "MAX ELE 3", "MAX ELE 4", "MAX ELE 6",
    "MAX JOINT 1", "MAX JOINT 2", "MAX JOINT 3", "MAX LIMBLAT",
    "MIN JOINT 1", "MIN JOINT 2", "MIN JOINT 3", "MIN LIMBLAT",
    "AMP ELE 1", "AMP ELE 2", "AMP ELE 3", "AMP ELE 4", "AMP LIMB",
    "AMP JOINT 1", "AMP JOINT 2", "AMP JOINT 3", "AMP LIMBLAT",
    "MIN SPEEDLIMB 1", "MIN SPEEDJOINT 1", "MIN SPEEDJOINT 2", "MIN SPEEDJOINT 3",
    "MAX SPEEDLIMB 1", "MAX SPEEDJOINT 1", "MAX SPEEDJOINT 2", "MAX SPEEDJOINT 3",
    "AMP SPEEDLIMB 1", "AMP SPEEDJOINT 1", "AMP SPEEDJOINT 2", "AMP SPEEDJOINT 3",
    "PH2 PH1", "PH3 PH2", "PH4 PH3",
    "R CREST-THIGH", "R THIGH-LEG", "R LEG-FOOT",
    "R HIP-KNEE", "R KNEE-ANKLE", "R ANKLE-MTP",
    "CREST-THIGH timingMIN", "CREST-THIGH timingMAX",
    "THIGH-LEG timingMIN", "THIGH-LEG timingMAX",
    "LEG-FOOT timingMIN", "LEG-FOOT timingMAX",
    "STEP HEIGHT",
    "PH1", "AMP1", "PH2", "AMP2", "PH3", "AMP3", "PH4", "AMP4",
    "LAG CREST-THIGH", "LAG THIGH-LEG", "LAG LEG-FOOT",
    "LAG HIP-KNEE", "LAG KNEE-ANKLE", "LAG ANKLE-MTP",
    "L STRIDE", "L STEP", "L PATH",
]

# Column names earmarked for removal (not referenced by the cells visible here).
to_drop = [
    "COND1", "COND2", "COND3", "GAIT_TYPE", "LIMB", "PATHNAME",
    "FILENAME", "ANIMAL", "CONDITION 1", "CONDITION 2",
    "MIN ELE 1", "MIN ELE 2", "MAX ELE 1", "MAX ELE 2",
    "MAX JOINT 1", "MAX JOINT 2", "MIN JOINT 1", "MIN JOINT 2",
    "AMP ELE 1", "AMP ELE 2", "AMP JOINT 1", "AMP JOINT 2",
    "MIN SPEEDJOINT 1", "MIN SPEEDJOINT 2",
    "AMP SPEEDJOINT 1", "AMP SPEEDJOINT 2",
    "AMP3", "AMP4",
    "ANIMAL.1", "COND1.1", "COND2.1", "SIDE.1", "SPEED",
]

# Full parameter list extended by two extra column names.
columns = green_param + ["ANIMAL_ID", "EXPERIMENT"]

6. Define the folder from which the raw gait files are taken:

In [7]:
# Top-level folder containing the raw gait files.
source_folder = "C:/Users/User/Downloads/gait_only_test"
# Name of the sub-folder that is appended to each run folder in the loading loop.
SUM = "SUM/"

7. This loop iterates through the folder structure (which is: dates/animals/runs). Then:
- Each "run" folder has a "SUM" subfolder containing different files with the raw features of that run. We take the data for that run from the file that ends with "_GAIT_SUM.txt".
- From that one file of each run, we select all of the gait features we want (with their values from that run). Then we combine these gait features with our metadata (the corresponding date, animal, run and condition).
- At the end of every loop iteration we append the data for that run to `df_list`. After the loop, the entries of that list are concatenated into one dataframe.
- This final dataframe contains the raw values of all selected gait features plus the metadata for every day, every animal and every run.

In [8]:
# Collect the selected gait features of every run found under source_folder
# (layout: <date>/<animal>/<run>/SUM/*_GAIT_SUM.txt) into one DataFrame.
df_list = []

# Give the metadata the same key names as the columns written below so the
# merge keys match. Non-in-place rename: labels that are already renamed are
# simply ignored, so re-running this cell is harmless.
metadata = metadata.rename(columns={"subject": "ANIMAL", "run": "RUN"})

# Only purely numeric directory names are treated as dates. Everything is
# iterated in sorted order so the row order of the result does not depend on
# the order in which the operating system lists directory entries.
dates = sorted(f.name for f in os.scandir(source_folder) if f.is_dir() and f.name.isnumeric())
for date in dates:
    path_animal = os.path.join(source_folder, date)
    animals = sorted(f.name for f in os.scandir(path_animal) if f.is_dir())
    for animal in animals:
        path_run = os.path.join(path_animal, animal)
        for run in sorted(os.listdir(path_run)):
            run_path = os.path.join(path_run, run)
            path_to_SUM = os.path.join(run_path, SUM)
            print(path_to_SUM)

            # isdir (instead of exists) also rejects a plain file named like
            # the SUM folder, which would otherwise break the listing below.
            if not os.path.isdir(path_to_SUM):
                print(f"no SUM folder for run {run}")
                continue

            # Names of the non-directory entries directly inside the SUM folder.
            sums = sorted(next(os.walk(path_to_SUM))[2])
            print(sums)

            # First file whose name ends with "_GAIT_SUM.txt", or None.
            file = next((os.path.join(path_to_SUM, i) for i in sums if i.endswith("_GAIT_SUM.txt")), None)
            print(file)

            if file is None:
                continue

            # .copy() turns the column selection into an independent frame, so
            # the assignments below cannot trigger SettingWithCopyWarning.
            temp_df = pd.read_csv(file, sep="\t")[green_param_sum].copy()
            temp_df["date"] = date
            # Overwrites the ANIMAL column read from the file with the folder name.
            temp_df["ANIMAL"] = animal
            temp_df["RUN"] = os.path.basename(os.path.normpath(run_path))
            # NOTE(review): the keys on this side are folder names (strings);
            # this assumes metadata stores date/ANIMAL/RUN as strings too — confirm.
            temp_df = temp_df.merge(
                metadata[["date", "ANIMAL", "RUN", "condition"]],
                on=["date", "ANIMAL", "RUN"],
                how="left",
            )
            temp_df["healthy"] = animal in healthy
            df_list.append(temp_df)

# pd.concat raises an unspecific "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not df_list:
    raise ValueError(f"no '*_GAIT_SUM.txt' file found below {source_folder!r}")

df_raw_Dataframe = pd.concat(df_list, ignore_index=True)

C:/Users/User/Downloads/gait_only_test\230525\NWE00130\01\SUM/
['cropped_runway01_GAIT_SUM.txt', 'cropped_runway01_MEAN_SUM.txt']
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\01\SUM/cropped_runway01_GAIT_SUM.txt
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\07\SUM/
['runway07_GAIT_SUM.txt', 'runway07_MEAN_SUM.txt']
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\07\SUM/runway07_GAIT_SUM.txt
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\09\SUM/
['runway09_GAIT_SUM.txt', 'runway09_MEAN_SUM.txt']
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\09\SUM/runway09_GAIT_SUM.txt
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\11\SUM/
['cropped_runway11_GAIT_SUM.txt', 'cropped_runway11_MEAN_SUM.txt']
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\11\SUM/cropped_runway11_GAIT_SUM.txt
C:/Users/User/Downloads/gait_only_test\230525\NWE00130\12\SUM/
no SUM folder for run 12


8. If we decide to drop more gait features, we don't have to run the whole loop again. We can just:
- define which columns (= features) we want to drop, and then
- drop them.

In [12]:
# Select every column whose name matches the regex 'LAG|timing' (i.e. contains
# "LAG" or "timing") and build a new frame without those columns.
is_lag_or_timing = df_raw_Dataframe.columns.str.contains('LAG|timing')
cols_to_drop = df_raw_Dataframe.columns[is_lag_or_timing]
df_raw_post_dropping = df_raw_Dataframe.drop(columns=cols_to_drop)

9. You can also drop animals you want to exclude, and check that they were dropped successfully:

In [14]:
# Animals to exclude from the dataset.
animals_to_exclude = ["NWE00189", "NWE00191", "NWE00193"]

# Build the mask from the same frame that is being filtered. The earlier
# version took the mask from df_raw_Dataframe and applied it to
# df_raw_post_dropping, which only works while both frames share an index.
is_excluded = df_raw_post_dropping["ANIMAL"].isin(animals_to_exclude)

# Keep all rows of the remaining animals; the original row labels are retained.
df_raw_without_certain_animal = df_raw_post_dropping.loc[~is_excluded].copy()

# Sanity check: the excluded IDs must no longer show up in this list.
print(df_raw_without_certain_animal["ANIMAL"].unique())

['NWE00130']


10. Save your dataframe:

In [17]:
# Write the filtered table to a CSV file (relative path, so it lands in the
# current working directory); the row index is not written.
output_csv = "df_raw_Dataframe_test.csv"
df_raw_without_certain_animal.to_csv(output_csv, index=False)