In [41]:
import pandas as pd
import pandas as pd
from tqdm.notebook import tqdm
import json
import nimblephysics as nimble
import os
import csv
from pprint import pprint
import numpy as np
from typing import List, Tuple, Union, Dict

In [42]:
def print_prioritize(df, column_name, ascending):
    """Show `df` sorted by one column, with that column placed first.

    The sorted, re-ordered frame is rendered with `display`; afterwards the
    memory footprint of the *input* frame (index included) is printed in a
    human-readable unit.

    Args:
    - df: DataFrame to show.
    - column_name: Label to sort by; it is also moved to the front of the columns.
    - ascending: Sort direction, forwarded to `DataFrame.sort_values`.
    """
    def _format_size(num_bytes):
        """Render a byte count as Bytes, KB, MB or GB (multiples of 1024)."""
        if num_bytes < 1024:
            return f"{num_bytes} Bytes"
        for exponent, unit in ((1, "KB"), (2, "MB")):
            if num_bytes < 1024 ** (exponent + 1):
                return f"{num_bytes / 1024 ** exponent:.2f} {unit}"
        return f"{num_bytes / 1024 ** 3:.2f} GB"

    def _lead_with(labels, first):
        """Move `first` to position 0 of `labels` in place; no-op when absent."""
        if first in labels:
            labels.remove(first)
            labels.insert(0, first)
        return labels

    ordered = df.sort_values(by=column_name, ascending=ascending)
    # Put the sort column first so it is visible even when wide frames are elided.
    ordered = ordered[_lead_with(ordered.columns.tolist(), column_name)]
    display(ordered)
    print(_format_size(df.memory_usage(index=True).sum()))


In [43]:
def extract_filename_from_path(path: str) -> str:
    """
    Returns the trailing '.bin' filename of a path string.

    Only the last whitespace-separated token of the input is considered, and
    within that token only the text after the final '/'.

    Args:
    - path (str): Path string, possibly preceded by other whitespace-separated text.

    Returns:
    - str: The final '/'-separated component of the last token.

    Raises:
    - ValueError: If the input is not a string, or the extracted component
      does not end with '.bin' (this includes an empty component).
    """
    if not isinstance(path, str):
        raise ValueError("Input must be a string representation of a path.")

    # Keep only the last whitespace-delimited token; a blank input has no
    # tokens, so fall back to the raw string and let the suffix check reject it.
    tokens = path.split()
    candidate = tokens[-1] if tokens else path

    # Everything after the last '/' (or the whole token when there is none).
    _, _, filename = candidate.rpartition('/')

    # An empty component can never end with '.bin', so one check covers both cases.
    if filename.endswith('.bin'):
        return filename
    raise ValueError("The provided path does not end with a valid .bin filename.")

def associate_with_IDs(filenames: List[Tuple[str, str]], filename_lookup: Dict[str, str]) -> Dict[str, Dict[str, str]]:
    """
    Maps labeled filenames back to the IDs used in the filename lookup table.

    Args:
    - filenames: (filename, acc) pairs; only elements [0] and [1] are read.
      The filename is compared against the '.bin' basename of each lookup path.
    - filename_lookup: Mapping from ID to a path ending in a '.bin' filename.

    Returns:
    - dict: {id: {"name": <lookup path>, "acc": <acc of the matching pair>}} for
      every lookup entry whose basename equals the filename of some pair. When
      several pairs share a filename, the last one supplies "acc".

    Raises:
    - ValueError: Propagated from extract_filename_from_path when a lookup
      path does not end in a '.bin' filename (not raised for empty `filenames`).
    """
    # Nothing to match: return before parsing any lookup path, so an empty
    # input never trips over a malformed lookup entry.
    if not filenames:
        return {}

    # Parse each lookup path once instead of once per labeled filename.
    # IDs are kept in lookup order so results are inserted in the same order
    # as a nested scan over (filenames x filename_lookup) would produce.
    ids_by_basename = {}
    for filename_id, full_path in filename_lookup.items():
        basename = extract_filename_from_path(full_path)
        ids_by_basename.setdefault(basename, []).append(filename_id)

    returnable = {}
    for filename_to_find_tpl in filenames:
        for filename_id in ids_by_basename.get(filename_to_find_tpl[0], ()):
            returnable[filename_id] = {
                "name": filename_lookup[filename_id],
                "acc": filename_to_find_tpl[1]
            }

    return returnable

def _read_labeled_filenames(csv_path):
    """Read (filename, second-column value) pairs from a CSV file.

    Column 0 of each row is reduced to its '.bin' basename via
    extract_filename_from_path; column 1 is kept as the raw string
    (it is later stored under the "acc" key by associate_with_IDs).
    """
    labeled = []
    # newline='' is required by the csv module so that line breaks inside
    # quoted fields are handled by the reader rather than by open().
    with open(csv_path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            labeled.append((extract_filename_from_path(row[0]), row[1]))
    return labeled


# ID -> path table that the labeled filenames are matched against.
with open('data/filename_lookup.json', 'r') as json_file:
    filename_lookup = json.load(json_file)

bad_filenames = _read_labeled_filenames('data/Aidan - Badset.csv')
good_filenames = _read_labeled_filenames('data/Aidan - Goodset.csv')

bad_lookup = associate_with_IDs(bad_filenames, filename_lookup)
good_lookup = associate_with_IDs(good_filenames, filename_lookup)


pprint(bad_lookup)
print(len(bad_lookup.keys()))


{'106': {'acc': '0.1171875',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:d09a02cd-9c43-4b51-8bcc-3a27fa943a85/data/Healthy '
                 'Balance '
                 'Study/S12DA4/fd2031bebc75d90a02dfccb6c05c079d87b1a57a4a2f48c559f8ffff94fcac9c/fd2031bebc75d90a02dfccb6c05c079d87b1a57a4a2f48c559f8ffff94fcac9c.bin'},
 '126': {'acc': '0.2506775068',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:0851f4c9-0032-4a42-a03e-da86f0f2ac35/data/Dephy_NB01/43f6752b4bebd0333d1e7e18026b9507e9fbdff19d83bcdf9104d70153a610b5/43f6752b4bebd0333d1e7e18026b9507e9fbdff19d83bcdf9104d70153a610b5.bin'},
 '130': {'acc': '0.3070866142',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:0851f4c9-0032-4a42-a03e-da86f0f2ac35/data/Dephy_NB02/e068176dc739ca7a95ce6bf4dfec796922897ec75ef9bed9e294ea69c8accdaa/e068176dc739ca7a95ce6bf4dfec796922897ec75ef9bed9e294ea69c8accdaa.bin'},
 '178': {'ac

In [44]:
# Dump the good-set mapping, then the number of IDs it contains.
pprint(good_lookup)
print(len(good_lookup))


{'101': {'acc': '0.9219294553',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:d09a02cd-9c43-4b51-8bcc-3a27fa943a85/data/Healthy '
                 'Balance '
                 'Study/S08DP2/53ac7744fd908d6f34af9cd5e83600ae10a62bec753ab410bc00edd631263b95/53ac7744fd908d6f34af9cd5e83600ae10a62bec753ab410bc00edd631263b95.bin'},
 '104': {'acc': '0.9094332011',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:d09a02cd-9c43-4b51-8bcc-3a27fa943a85/data/Healthy '
                 'Balance '
                 'Study/S11DA1/181c97543311c9f277c56c142ab48b5abf52f141477a14bc1cec480ea9cee1e2/181c97543311c9f277c56c142ab48b5abf52f141477a14bc1cec480ea9cee1e2.bin'},
 '105': {'acc': '0.9660924819',
         'name': 'data/processed/standardized/rajagopal_no_arms/data/protected/us-west-2:d09a02cd-9c43-4b51-8bcc-3a27fa943a85/data/Healthy '
                 'Balance '
                 'Study/S11DN2/533d3eb66ddbe5e2781dfc28839147

In [66]:
generate = True
expanded_df = pd.read_feather('data/expanded_df.feather')

# Lookup keys are strings; they are converted to int because they are
# matched against the 'capture_id' column below.
good_list = list(good_lookup.keys())
bad_list = list(bad_lookup.keys())
good_list_int = [int(item) for item in good_list]
bad_list_int = [int(item) for item in bad_list]

# An ID is unlabeled only when it appears in NEITHER set. The earlier
# `not in good or not in bad` condition was true for every ID that is not in
# both sets at once, so labeled IDs leaked into the unlabeled (test) split.
labeled_ids = set(good_list_int) | set(bad_list_int)
unlabeled_list = [int(item) for item in filename_lookup if int(item) not in labeled_ids]
print(unlabeled_list, end='')
print(len(unlabeled_list))

print(bad_list_int, end='')
print(len(bad_list_int))

print(good_list_int, end='')
print(len(good_list_int))

pprint(list(filename_lookup.keys()))
pprint(len(list(filename_lookup.keys())))


# Labeled captures (good or bad) form the training split; the rest is the test split.
train_df = expanded_df[expanded_df['capture_id'].isin(good_list_int+bad_list_int)]
test_df = expanded_df[expanded_df['capture_id'].isin(unlabeled_list)]


# For each split: show every row, then only the first row of each capture.
# After the loop first_occurrences_all holds the test split's first rows.
for split_df in (train_df, test_df):
    print_prioritize(split_df,"capture_id",True)
    first_occurrences_all = split_df.drop_duplicates(subset='capture_id', keep='first')
    print_prioritize(first_occurrences_all,"capture_id",True)


[0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 21, 26, 28, 29, 30, 36, 40, 42, 49, 51, 55, 59, 60, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 90, 91, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 107, 109, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 180, 181, 182, 183, 184, 185, 187, 191, 192, 195, 196, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 

Unnamed: 0,capture_id,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,acc_5,...,vel_27,vel_28,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36
14716,4,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
17162,4,False,2446,0,-1.054421,1.458723,1.463783,-0.424775,0.492813,-0.289940,...,-0.000018,-4.276018e-16,-2.600966e-16,-0.001459,0.002403,0.000787,-0.000292,0.000026,7.381276e-16,6.910588e-16
17163,4,False,2447,0,-0.794849,1.005467,1.239124,-0.378761,0.403067,-0.252580,...,-0.000018,-2.021298e-16,-1.251423e-16,-0.001470,0.002427,0.000795,-0.000293,0.000026,3.435991e-16,3.184652e-16
17164,4,False,2448,0,-0.559602,0.479798,1.163037,-0.339946,0.306562,-0.198702,...,-0.000018,3.391564e-17,1.091310e-17,-0.001481,0.002450,0.000802,-0.000294,0.000026,-7.242868e-17,-5.895163e-17
17165,4,False,2449,0,-0.351048,-0.059282,1.230309,-0.305596,0.214869,-0.121215,...,-0.000018,1.114771e-16,5.299588e-17,-0.001492,0.002474,0.000810,-0.000295,0.000026,-2.047832e-16,-1.747548e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726681,469,False,1230,0,2.321910,1.184393,-4.594802,0.224136,0.645754,-0.861366,...,0.000071,2.263637e-17,-5.191000e-17,0.005080,-0.008288,-0.002732,-0.000037,-0.000093,-3.791631e-18,-6.668604e-17
1726682,469,False,1231,0,2.402511,0.973257,-6.097918,0.406883,0.477909,-0.584315,...,0.000071,-9.760222e-17,2.665371e-16,0.005141,-0.008340,-0.002751,-0.000029,-0.000093,2.378403e-17,3.610971e-16
1726683,469,False,1232,0,2.272386,0.631422,-7.190776,0.556776,0.297030,-0.298500,...,0.000072,-2.182258e-16,6.013829e-16,0.005203,-0.008393,-0.002770,-0.000021,-0.000094,5.880708e-17,7.836323e-16
1726685,469,False,1234,0,1.395756,-0.123448,-7.494172,0.671359,-0.108951,0.111634,...,0.000072,8.376712e-16,-2.304551e-15,0.005328,-0.008497,-0.002808,-0.000003,-0.000095,-2.091977e-16,-2.975930e-15


536.76 MB


Unnamed: 0,capture_id,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,acc_5,...,vel_27,vel_28,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36
14716,4,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
25753,7,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
55185,15,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
62543,17,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
66222,18,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699698,462,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1703377,463,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1707056,464,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1718093,467,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21


149.40 KB


Unnamed: 0,capture_id,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,acc_5,...,vel_27,vel_28,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36
0,0,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
2446,0,False,2446,0,-1.054421,1.458723,1.463783,-0.424775,0.492813,-0.289940,...,-0.000018,-4.276018e-16,-2.600966e-16,-0.001459,0.002403,0.000787,-0.000292,0.000026,7.381276e-16,6.910588e-16
2447,0,False,2447,0,-0.794849,1.005467,1.239124,-0.378761,0.403067,-0.252580,...,-0.000018,-2.021298e-16,-1.251423e-16,-0.001470,0.002427,0.000795,-0.000293,0.000026,3.435991e-16,3.184652e-16
2448,0,False,2448,0,-0.559602,0.479798,1.163037,-0.339946,0.306562,-0.198702,...,-0.000018,3.391564e-17,1.091310e-17,-0.001481,0.002450,0.000802,-0.000294,0.000026,-7.242868e-17,-5.895163e-17
2449,0,False,2449,0,-0.351048,-0.059282,1.230309,-0.305596,0.214869,-0.121215,...,-0.000018,1.114771e-16,5.299588e-17,-0.001492,0.002474,0.000810,-0.000295,0.000026,-2.047832e-16,-1.747548e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730360,470,False,1230,0,2.321910,1.184393,-4.594802,0.224136,0.645754,-0.861366,...,0.000071,2.263637e-17,-5.191000e-17,0.005080,-0.008288,-0.002732,-0.000037,-0.000093,-3.791631e-18,-6.668604e-17
1730361,470,False,1231,0,2.402511,0.973257,-6.097918,0.406883,0.477909,-0.584315,...,0.000071,-9.760222e-17,2.665371e-16,0.005141,-0.008340,-0.002751,-0.000029,-0.000093,2.378403e-17,3.610971e-16
1730362,470,False,1232,0,2.272386,0.631422,-7.190776,0.556776,0.297030,-0.298500,...,0.000072,-2.182258e-16,6.013829e-16,0.005203,-0.008393,-0.002770,-0.000021,-0.000094,5.880708e-17,7.836323e-16
1730364,470,False,1234,0,1.395756,-0.123448,-7.494172,0.671359,-0.108951,0.111634,...,0.000072,8.376712e-16,-2.304551e-15,0.005328,-0.008497,-0.002808,-0.000003,-0.000095,-2.091977e-16,-2.975930e-15


1.83 GB


Unnamed: 0,capture_id,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,acc_5,...,vel_27,vel_28,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36
0,0,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
3679,1,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
7358,2,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
11037,3,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
18395,5,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696019,461,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1710735,465,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1714414,466,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21
1721772,468,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,0.066402,...,-0.000065,-1.103162e-21,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21


520.76 KB
