## Imports

In [1]:
import pandas as pd
import pandas as pd
from tqdm.notebook import tqdm
import json
import nimblephysics as nimble
import os
from pprint import pprint
import numpy as np
from typing import List, Tuple, Union, Dict

## Generate/Load our initial dataframe

In [4]:


def extract_frame_data(frame) -> Dict[str, Union[np.ndarray, float, bool, int, List[Tuple[str, np.ndarray]]]]:
    """Copy the per-frame fields of ``frame`` into a plain dictionary.

    Each attribute named below is read from ``frame`` exactly once and stored
    under a key of the same name, in the order listed.
    """
    field_names = (
        'acc',
        'customValues',
        'groundContactCenterOfPressure',
        'groundContactForce',
        'groundContactTorque',
        'groundContactWrenches',
        'pos',
        'probablyMissingGRF',
        't',
        'tau',
        'trial',
        'vel',
    )
    return {name: getattr(frame, name) for name in field_names}

def objects_to_dataframe(objects_list):
    """Build a DataFrame with one row per element of ``objects_list``.

    Every element is converted to a dictionary by ``extract_frame_data``;
    the resulting dictionaries become the rows of the returned frame.
    """
    rows = [extract_frame_data(frame) for frame in objects_list]
    return pd.DataFrame(rows)

def get_indexed_filenames(path="data/filename_lookup.json"):
    """Load the filename lookup table from a JSON file.

    Parameters:
    - path: Location of the JSON file. Defaults to the lookup file this
      notebook has always used, so existing zero-argument calls are unchanged.

    Returns:
    - The decoded JSON content. Callers in this file index it with the string
      form of an integer (``lookup[str(index)]``) to obtain a filename.

    Raises:
    - FileNotFoundError if ``path`` does not exist.
    - json.JSONDecodeError if the file is not valid JSON.
    """
    # The previous version returned a misspelled name (``data_dicty``) and
    # therefore raised NameError on every call.
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def frame_from_filename(filename: str, trial: int = 0):
    """Read every frame of one trial from a subject file on disk.

    Parameters:
    - filename: Path handed to ``nimble.biomechanics.SubjectOnDisk``.
    - trial: Index of the trial to read. Defaults to 0, which is the only
      trial the original implementation ever read.

    Returns:
    - Whatever ``SubjectOnDisk.readFrames`` returns for the requested trial,
      starting at frame 0 and spanning ``getTrialLength(trial)`` frames.
    """
    subject = nimble.biomechanics.SubjectOnDisk(filename)
    # The earlier version copied the frames into a second list that was never
    # returned; the result of readFrames is handed back directly instead.
    return subject.readFrames(
        trial=trial,
        startFrame=0,
        numFramesToRead=subject.getTrialLength(trial),
    )

def frames_from_file_index(indexed_filenames, index):
    """Return the frames of the file registered under ``index``.

    Parameters:
    - indexed_filenames: Lookup table mapping ``str(index)`` to a filename.
      When a dict is supplied it is used as-is, so callers can load the table
      once and reuse it. Any other value (for example ``None``) makes the
      function load the table itself via ``get_indexed_filenames``.
    - index: Key of the wanted file; it is converted with ``str`` before the
      lookup.

    Returns:
    - The result of ``frame_from_filename`` for the resolved filename.

    Raises:
    - KeyError if ``str(index)`` is not present in the lookup table.
    """
    # Previously the argument was overwritten unconditionally, which ignored a
    # caller-supplied table and re-read the JSON file on every call. Non-dict
    # values still fall back to loading from disk so older call sites that
    # passed a placeholder keep working.
    if not isinstance(indexed_filenames, dict):
        indexed_filenames = get_indexed_filenames()
    filename = indexed_filenames[str(index)]
    return frame_from_filename(filename)

def build_df_from_index_list(list):
    """Build one DataFrame holding the frames of every requested capture.

    Parameters:
    - list: Iterable of capture indices (for example ``range(0, 471)``). The
      parameter name shadows the builtin and is kept only so existing callers
      are unaffected.

    Returns:
    - A DataFrame with the rows of all captures stacked in input order, a
      fresh 0..n-1 index, and a ``capture_id`` column holding ``int(key)`` of
      the capture each row came from. An empty DataFrame when no indices are
      given.
    """
    keys = tuple(list)
    if not keys:
        # Nothing requested: skip reading the lookup file altogether.
        return pd.DataFrame()

    # Load the lookup table once instead of once per capture.
    indexed_filenames = get_indexed_filenames()

    per_capture = []
    for key in keys:
        # The key is the file index. The previous call passed it in the
        # lookup-table slot and a constant 0 as the index, so every iteration
        # read the same file.
        objects_list = frames_from_file_index(indexed_filenames, key)
        video_df = objects_to_dataframe(objects_list)
        video_df["capture_id"] = int(key)
        per_capture.append(video_df)

    # A single concat avoids re-copying the accumulated rows on every step.
    return pd.concat(per_capture, ignore_index=True)

        
def print_prioritize(df, column_name, ascending):
    """Display ``df`` sorted by ``column_name`` with that column shown first.

    After the table, the memory usage of ``df`` (index included) is printed
    in a human-readable unit.
    """
    def describe_size(bytes_count):
        """Render a byte count as Bytes, KB, MB or GB."""
        if bytes_count >= 1024**3:
            return f"{bytes_count / (1024**3):.2f} GB"
        if bytes_count >= 1024**2:
            return f"{bytes_count / (1024**2):.2f} MB"
        if bytes_count >= 1024:
            return f"{bytes_count / 1024:.2f} KB"
        return f"{bytes_count} Bytes"

    ordered_rows = df.sort_values(by=column_name, ascending=ascending)

    # Put the sort column at the front; leave the order alone when the name
    # is not among the columns.
    column_order = ordered_rows.columns.tolist()
    if column_name in column_order:
        column_order.remove(column_name)
        column_order.insert(0, column_name)

    display(ordered_rows[column_order])

    total_bytes = df.memory_usage(index=True).sum()
    print(describe_size(total_bytes))



def check_column_entry_lengths(df, col_name):
    """
    Check if all entries in a specified column of a DataFrame have the same length.

    Parameters:
    - df: DataFrame to check.
    - col_name: Name of the column to check.

    Returns:
    - True if all entries have the same length as the first entry.
    - False as soon as an entry with a different length is found.
    - None if the check cannot be made: the column is missing, the frame has
      no rows, or an entry without a length is reached before any mismatch.
    """
    try:
        column = df[col_name]
        benchmark_length = len(column.iloc[0])  # Length of the first entry is the benchmark
    except (KeyError, IndexError, TypeError):
        # Missing column, empty frame, or a first entry that has no len().
        return None

    # Walk the one column directly; building a full row Series per record via
    # iterrows() is needlessly slow when only a single value is inspected.
    for entry in column:
        try:
            if len(entry) != benchmark_length:
                return False
        except TypeError:
            return None
    return True

def check_all_columns(df):
    """Run ``check_column_entry_lengths`` on every column of ``df``.

    A progress line ("<position> of <column count>") is printed per column,
    and a "Mismatch" line for every column whose entries differ in length.
    Columns for which the check returns None are skipped.

    Returns:
    - True if no checked column reported a mismatch, False otherwise.
    """
    all_consistent = True
    col_list = df.columns.to_list()
    for index, column_name in enumerate(col_list):
        print(f"{index} of {len(col_list)}")
        value = check_column_entry_lengths(df, column_name)
        if value is None:
            # The column could not be checked, so it does not affect the result.
            continue
        if not value:
            print(f"Mismatch on column name {column_name}")
        all_consistent = all_consistent and value

    return all_consistent

In [5]:
# Toggle: True rebuilds the master table from the indexed source files and
# caches it as feather; False reloads the cached feather file.
generate = False

if not generate:
    master_df = pd.read_feather('data/master_df.feather')
else:
    master_df = build_df_from_index_list(range(471))
    master_df.to_feather('data/master_df.feather')

# Show the table ordered by capture_id (ascending), followed by its size.
print_prioritize(master_df, "capture_id", True)


Unnamed: 0,capture_id,acc,customValues,groundContactCenterOfPressure,groundContactForce,groundContactTorque,groundContactWrenches,pos,probablyMissingGRF,t,tau,trial,vel
0,0,"[0.07944295234533374, -0.14233809008311724, -0...",[],"[0.19148424347104598, 1.6416103821579498e-16, ...","[-42.31021778135695, 334.30669186175777, 0.673...","[0.0, -0.5983554238454732, -3.663870272824084e...","[894.1701490259624, 113.49134275886644, 63.626...","[-0.03442652275386738, -0.05660401440999792, 1...",False,0,"[-20.409695147401738, -1052.5871785298316, -4....",0,"[0.02727797619934335, 0.019350238804365972, -0..."
2446,0,"[-1.0544206403102396, 1.4587226607515273, 1.46...",[],"[0.11385119545315059, 1.6533089567633877e-16, ...","[-9.244513409738918, 741.0123019484798, -4.965...","[0.0, 1.193563760997347, 7.308470197418391e-17...","[1991.977524804748, 20.977857396707932, 83.700...","[-0.174160415190306, -0.001910912597236232, 1....",False,2446,"[446.92949828738153, -1110.0719308182647, -176...",0,"[-0.19056346702984017, -0.07871812068550657, 0..."
2447,0,"[-0.7948493631479863, 1.0054669723706744, 1.23...",[],"[0.11351888733470222, 1.6532250567384332e-16, ...","[-6.717095255651685, 738.1488966537651, -5.414...","[0.0, 0.9838082050834023, 6.024087846651458e-1...","[1983.1388534293394, 11.468297548308843, 83.19...","[-0.17617215072518966, -0.002551111610053316, ...",False,2447,"[442.90055920899437, -1103.9386645254053, -173...",0,"[-0.20110767343294259, -0.0641308940779913, 0...."
2448,0,"[-0.5596023549767256, 0.4797983394541183, 1.16...",[],"[0.1130912497459631, 1.6533152567383156e-16, -...","[-4.310145305000052, 734.5785922609158, -5.053...","[0.0, 0.6661948711264738, 4.0792670826670983e-...","[1973.051124865101, 6.886301884664892, 82.2795...","[-0.17826283360335266, -0.003090313322298169, ...",False,2448,"[439.07308603263357, -1095.9710617722112, -170...",0,"[-0.20905616706442243, -0.054076224354284545, ..."
2449,0,"[-0.3510482711129319, -0.05928244017687745, 1....",[],"[0.11249924951677043, 1.6536236900746868e-16, ...","[-2.1418369936209523, 730.9093078577862, -4.06...","[0.0, 0.4012281455171133, 2.456813820676806e-1...","[1965.2455913187032, -10.64560322752224, 81.99...","[-0.18040874316068733, -0.0035815919879817735,...",False,2449,"[435.8380936395277, -1087.6579504607098, -166....",0,"[-0.21465219061418966, -0.049278240959743355, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730360,470,"[2.3219100162631676, 1.184392745328962, -4.594...",[],"[0.0, 0.0, 0.0, 0.997835410560576, 1.657102142...","[0.0, 0.0, 0.0, 29.75949585093685, 767.1257479...","[0.0, 0.0, 0.0, 0.0, -5.472455401201316, -3.35...","[0.4388017110161448, 6.0240015870132275, -1.24...","[0.08559806578111638, -0.026652591568600115, 1...",False,1230,"[-467.75368408094215, -1175.0609906646405, 193...",0,"[0.0680604972089549, -0.2200747521224266, 0.13..."
1730361,470,"[2.4025113610004882, 0.9732566101592548, -6.09...",[],"[0.0, 0.0, 0.0, 0.9983454415677105, 1.65852525...","[0.0, 0.0, 0.0, 27.701101923160643, 756.993248...","[0.0, 0.0, 0.0, 0.0, -5.435986178062141, -3.32...","[1.745845541960544, 4.685486746831004, -0.6647...","[0.086508605360868, -0.02873394653542214, 1.56...",False,1231,"[-462.0168359617381, -1152.2973191363935, 191....",0,"[0.09127959737158658, -0.20823082466913695, 0...."
1730362,470,"[2.2723863776713853, 0.6314221489335773, -7.19...",[],"[0.0, 0.0, 0.0, 0.9988874980972453, 1.66042666...","[0.0, 0.0, 0.0, 25.513705152366388, 746.894032...","[0.0, 0.0, 0.0, 0.0, -5.3519624677344035, -3.2...","[2.045923898367649, 5.855524553992517, -0.6134...","[0.08766126485471215, -0.030717015335645265, 1...",False,1232,"[-454.633087405962, -1129.085379706646, 190.48...",0,"[0.11530471098159148, -0.1984982585675444, 0.0..."
1730364,470,"[1.3957557260470954, -0.12344825747973884, -7....",[],"[0.0, 0.0, 0.0, 0.9995804893243598, 1.66410368...","[0.0, 0.0, 0.0, 23.027594019701965, 726.881922...","[0.0, 0.0, 0.0, 0.0, -5.162409959286373, -3.16...","[2.259753924475126, 3.625859095928727, -0.6378...","[0.09061815845730564, -0.03453253059408014, 1....",False,1234,"[-436.7658910043896, -1082.9116904515936, 191....",0,"[0.15720733531883566, -0.18983303899141943, -0..."


160.30 MB


## Data Checks

In [6]:
# Ensuring all items in each column are of identical length
# They are!

# print(check_all_columns(master_df))
# print_prioritize(master_df, 'capture_id', 1)

# for column_name in master_df.columns.to_list():
#     print(type(master_df.iloc[0][column_name]))

In [7]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

def process_dataframe(df):
    """Expand every array-valued column into one scalar column per element.

    A column is treated as array-valued when its first entry is a
    ``numpy.ndarray``. Such a column ``name`` is replaced by columns
    ``name_0 … name_{k-1}``, where ``k`` is the length of that first entry.

    Parameters:
    - df: Source DataFrame. It is not modified.

    Returns:
    - A new DataFrame holding the non-array columns in their original order,
      followed by the expanded columns grouped by source column in original
      order. For an empty ``df`` a plain copy is returned.

    Raises:
    - ValueError (from ``numpy.stack``) if the arrays within one column do
      not all share the same shape.
    """
    # The first-row probe below needs at least one row.
    if df.empty:
        return df.copy()

    scalar_columns = []
    expanded_blocks = []

    # Using tqdm for progress bar
    for column in tqdm(df.columns, desc="Processing columns"):
        col_data = df[column]
        # Positional access: the index is not guaranteed to contain label 0.
        first_entry = col_data.iloc[0]

        if not isinstance(first_entry, np.ndarray):
            scalar_columns.append(column)
            continue

        # One stack per column replaces a Python-level apply() for every
        # element position.
        stacked = np.stack(col_data.to_numpy())
        names = [f"{column}_{idx}" for idx in range(len(first_entry))]
        expanded_blocks.append(pd.DataFrame(stacked, index=df.index, columns=names))

    # Single concat at the end instead of growing the frame once per column.
    return pd.concat([df[scalar_columns], *expanded_blocks], axis=1)



In [8]:
# Toggle: True recomputes the expanded table from master_df and caches it as
# feather; False reloads the cached feather file.
generate = True

if not generate:
    expanded_df = pd.read_feather('data/expanded_df.feather')
else:
    expanded_df = process_dataframe(master_df)
    expanded_df.to_feather('data/expanded_df.feather')


Processing columns:   0%|          | 0/13 [00:00<?, ?it/s]

In [9]:
# Show the expanded table ordered by capture_id, then list its column names.
print_prioritize(expanded_df, "capture_id", True)
print(expanded_df.columns.to_list())


Unnamed: 0,capture_id,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,acc_5,...,vel_27,vel_28,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36
0,0,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
2446,0,False,2446,0,-1.054421,1.458723,1.463783,-0.424775,0.492813,-0.289940,...,-0.000018,-4.276018e-16,-2.600966e-16,-0.001459,0.002403,0.000787,-0.000292,0.000026,7.381276e-16,6.910588e-16
2447,0,False,2447,0,-0.794849,1.005467,1.239124,-0.378761,0.403067,-0.252580,...,-0.000018,-2.021298e-16,-1.251423e-16,-0.001470,0.002427,0.000795,-0.000293,0.000026,3.435991e-16,3.184652e-16
2448,0,False,2448,0,-0.559602,0.479798,1.163037,-0.339946,0.306562,-0.198702,...,-0.000018,3.391564e-17,1.091310e-17,-0.001481,0.002450,0.000802,-0.000294,0.000026,-7.242868e-17,-5.895163e-17
2449,0,False,2449,0,-0.351048,-0.059282,1.230309,-0.305596,0.214869,-0.121215,...,-0.000018,1.114771e-16,5.299588e-17,-0.001492,0.002474,0.000810,-0.000295,0.000026,-2.047832e-16,-1.747548e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730360,470,False,1230,0,2.321910,1.184393,-4.594802,0.224136,0.645754,-0.861366,...,0.000071,2.263637e-17,-5.191000e-17,0.005080,-0.008288,-0.002732,-0.000037,-0.000093,-3.791631e-18,-6.668604e-17
1730361,470,False,1231,0,2.402511,0.973257,-6.097918,0.406883,0.477909,-0.584315,...,0.000071,-9.760222e-17,2.665371e-16,0.005141,-0.008340,-0.002751,-0.000029,-0.000093,2.378403e-17,3.610971e-16
1730362,470,False,1232,0,2.272386,0.631422,-7.190776,0.556776,0.297030,-0.298500,...,0.000072,-2.182258e-16,6.013829e-16,0.005203,-0.008393,-0.002770,-0.000021,-0.000094,5.880708e-17,7.836323e-16
1730364,470,False,1234,0,1.395756,-0.123448,-7.494172,0.671359,-0.108951,0.111634,...,0.000072,8.376712e-16,-2.304551e-15,0.005328,-0.008497,-0.002808,-0.000003,-0.000095,-2.091977e-16,-2.975930e-15


2.34 GB
['probablyMissingGRF', 't', 'trial', 'capture_id', 'acc_0', 'acc_1', 'acc_2', 'acc_3', 'acc_4', 'acc_5', 'acc_6', 'acc_7', 'acc_8', 'acc_9', 'acc_10', 'acc_11', 'acc_12', 'acc_13', 'acc_14', 'acc_15', 'acc_16', 'acc_17', 'acc_18', 'acc_19', 'acc_20', 'acc_21', 'acc_22', 'acc_23', 'acc_24', 'acc_25', 'acc_26', 'acc_27', 'acc_28', 'acc_29', 'acc_30', 'acc_31', 'acc_32', 'acc_33', 'acc_34', 'acc_35', 'acc_36', 'groundContactCenterOfPressure_0', 'groundContactCenterOfPressure_1', 'groundContactCenterOfPressure_2', 'groundContactCenterOfPressure_3', 'groundContactCenterOfPressure_4', 'groundContactCenterOfPressure_5', 'groundContactForce_0', 'groundContactForce_1', 'groundContactForce_2', 'groundContactForce_3', 'groundContactForce_4', 'groundContactForce_5', 'groundContactTorque_0', 'groundContactTorque_1', 'groundContactTorque_2', 'groundContactTorque_3', 'groundContactTorque_4', 'groundContactTorque_5', 'groundContactWrenches_0', 'groundContactWrenches_1', 'groundContactWrenches_