In [115]:
import pandas as pd
import pandas as pd
from tqdm.notebook import tqdm
import json
import nimblephysics as nimble
import os
import csv
from pprint import pprint
import numpy as np
from typing import List, Tuple, Union, Dict

In [116]:
def print_prioritize(df, column_name, ascending):
    def human_readable_size(bytes_count):
        """Convert bytes count to human-readable format (KB, MB, GB)."""
        if bytes_count < 1024:
            return f"{bytes_count} Bytes"
        elif bytes_count < 1024**2:
            return f"{bytes_count / 1024:.2f} KB"
        elif bytes_count < 1024**3:
            return f"{bytes_count / (1024**2):.2f} MB"
        else:
            return f"{bytes_count / (1024**3):.2f} GB"
    
    def move_to_start(lst, item):
        """Move the specified item to the start of the list."""
        try:
            # Remove the item from its current position
            lst.pop(lst.index(item))
            # Insert the item at the beginning
            lst.insert(0, item)
        except ValueError:
            # Item not found in the list
            pass
        return lst

    sorted_df = df.sort_values(by=column_name, ascending=ascending)
    # Set the capture_id to be the first column in a new list
    sorted_df_cols = move_to_start(sorted_df.columns.tolist(), column_name)
    sorted_df = sorted_df[sorted_df_cols]
    display(sorted_df)
    print(human_readable_size(df.memory_usage(index=True).sum()))


In [117]:
def extract_filename_from_path(path: str) -> str:
    """
    Extracts the filename from the given path.
    
    Args:
    - path (str): The path string from which the filename needs to be extracted.

    Returns:
    - str: The filename extracted from the path.

    Raises:
    - ValueError: If the path does not end in .bin or if there's any other issue.
    """
    # Check if the path is a string
    if not isinstance(path, str):
        raise ValueError("Input must be a string representation of a path.")

    # Split the path string by space, to avoid any extraneous information.
    # Pick the last part, as it should contain the actual path
    path_parts = path.split()
    actual_path = path_parts[-1] if path_parts else path

    # Split by '/' to get individual components of the path
    components = actual_path.split('/')

    # The filename should be the last component
    filename = components[-1] if components else None

    # Check if the filename ends with .bin
    if not filename or not filename.endswith('.bin'):
        raise ValueError("The provided path does not end with a valid .bin filename.")

    return filename

def associate_with_IDs(filenames:list[str], filename_lookup:dict):
    returnable = {}
    for filename_to_find_tpl in filenames:
        for filename_id in filename_lookup:
            filename_here = filename_lookup[filename_id]
            if filename_to_find_tpl[0] == extract_filename_from_path(filename_here):
                returnable[filename_id] = {
                    "name": filename_here,
                    "acc": filename_to_find_tpl[1]
                }

    return returnable

bad_filenames = []
good_filenames = []

with open('data/filename_lookup.json', 'r') as json_file:
    filename_lookup = json.load(json_file)

with open('data/Aidan - Badset.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        bad_filenames.append((extract_filename_from_path(row[0]), row[1]))

with open('data/Aidan - Goodset.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        good_filenames.append((extract_filename_from_path(row[0]), row[1]))

bad_lookup = associate_with_IDs(bad_filenames, filename_lookup)
good_lookup = associate_with_IDs(good_filenames, filename_lookup)


KeyboardInterrupt: 

In [None]:
# pprint(good_lookup)
# print(len(good_lookup.keys()))


In [None]:
generate = True
dp = True

train_df = pd.DataFrame()
test_df = pd.DataFrame()
if(generate):
    expanded_df = pd.read_feather('data/expanded_df.feather')
    good_list = list(good_lookup.keys())
    bad_list = list(bad_lookup.keys())
    good_list_int = [int(item) for item in good_list]
    bad_list_int = [int(item) for item in bad_list]
    unlabeled_list = [int(item) for item in list(filename_lookup.keys()) if int(item) not in good_list_int and int(item) not in bad_list_int]

    #Train Dataframe
    train_df = expanded_df[expanded_df['capture_id'].isin(good_list_int+bad_list_int)].copy() #train_df is still unlabeled
    train_df['label'] = -1
    # Wrap the iterator with tqdm
    for index, row in tqdm(train_df.iterrows(), total=train_df.shape[0], desc="Building Train Dataset"):
        train_df.at[index,'frame_id'] = int(index)
        if row['capture_id'] in good_list_int:
            train_df.at[index, 'label'] = 1
        elif row['capture_id'] in bad_list_int:
            train_df.at[index, 'label'] = 0


    #Test Dataframe
    test_df = expanded_df[expanded_df['capture_id'].isin(unlabeled_list)].copy()
    # test_df['label'] = None
    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Building Test Dataset"):
        test_df.at[index,'frame_id'] = int(index)



    #Save them both TODO: Indexing
    train_df= train_df.reset_index()
    test_df=test_df.reset_index()
    train_df.to_feather('data/train_df.feather')
    test_df.to_feather('data/test_df.feather')


train_df = pd.read_feather('data/train_df.feather')
test_df = pd.read_feather('data/test_df.feather')

if(dp):
    print("Test Dataframe: ")
    print_prioritize(test_df,"capture_id",True)
    print("Unique Videos in Test Dataframe: ")
    first_occurrences_all = test_df.drop_duplicates(subset='capture_id', keep='first')
    print_prioritize(first_occurrences_all,"capture_id",True)
    print("Train Dataframe: ")
    print_prioritize(train_df,"capture_id",True)
    print("Unique Videos in Train Dataframe: ")
    first_occurrences_all = train_df.drop_duplicates(subset='capture_id', keep='first')
    print_prioritize(first_occurrences_all,"capture_id",True)




Building Train Dataset:   0%|          | 0/386295 [00:00<?, ?it/s]

Building Test Dataset:   0%|          | 0/1346514 [00:00<?, ?it/s]

Test Dataframe: 


Unnamed: 0,capture_id,index,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,...,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36,label,frame_id
0,0,0,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,0.0
2446,0,2446,False,2446,0,-1.054421,1.458723,1.463783,-0.424775,0.492813,...,-2.600966e-16,-0.001459,0.002403,0.000787,-0.000292,0.000026,7.381276e-16,6.910588e-16,,2446.0
2447,0,2447,False,2447,0,-0.794849,1.005467,1.239124,-0.378761,0.403067,...,-1.251423e-16,-0.001470,0.002427,0.000795,-0.000293,0.000026,3.435991e-16,3.184652e-16,,2447.0
2448,0,2448,False,2448,0,-0.559602,0.479798,1.163037,-0.339946,0.306562,...,1.091310e-17,-0.001481,0.002450,0.000802,-0.000294,0.000026,-7.242868e-17,-5.895163e-17,,2448.0
2449,0,2449,False,2449,0,-0.351048,-0.059282,1.230309,-0.305596,0.214869,...,5.299588e-17,-0.001492,0.002474,0.000810,-0.000295,0.000026,-2.047832e-16,-1.747548e-16,,2449.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344065,470,1730360,False,1230,0,2.321910,1.184393,-4.594802,0.224136,0.645754,...,-5.191000e-17,0.005080,-0.008288,-0.002732,-0.000037,-0.000093,-3.791631e-18,-6.668604e-17,,1730360.0
1344066,470,1730361,False,1231,0,2.402511,0.973257,-6.097918,0.406883,0.477909,...,2.665371e-16,0.005141,-0.008340,-0.002751,-0.000029,-0.000093,2.378403e-17,3.610971e-16,,1730361.0
1344067,470,1730362,False,1232,0,2.272386,0.631422,-7.190776,0.556776,0.297030,...,6.013829e-16,0.005203,-0.008393,-0.002770,-0.000021,-0.000094,5.880708e-17,7.836323e-16,,1730362.0
1344069,470,1730364,False,1234,0,1.395756,-0.123448,-7.494172,0.671359,-0.108951,...,-2.304551e-15,0.005328,-0.008497,-0.002808,-0.000003,-0.000095,-2.091977e-16,-2.975930e-15,,1730364.0


1.85 GB
Unique Videos in Test Dataframe: 


Unnamed: 0,capture_id,index,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,...,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36,label,frame_id
0,0,0,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,0.0
3679,1,3679,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,3679.0
7358,2,7358,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,7358.0
11037,3,11037,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,11037.0
14716,5,18395,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,18395.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1328119,461,1696019,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,1696019.0
1331798,465,1710735,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,1710735.0
1335477,466,1714414,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,1714414.0
1339156,468,1721772,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,,1721772.0


529.34 KB
Train Dataframe: 


Unnamed: 0,capture_id,index,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,...,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36,label,frame_id
0,4,14716,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,14716.0
2446,4,17162,False,2446,0,-1.054421,1.458723,1.463783,-0.424775,0.492813,...,-2.600966e-16,-0.001459,0.002403,0.000787,-0.000292,0.000026,7.381276e-16,6.910588e-16,0,17162.0
2447,4,17163,False,2447,0,-0.794849,1.005467,1.239124,-0.378761,0.403067,...,-1.251423e-16,-0.001470,0.002427,0.000795,-0.000293,0.000026,3.435991e-16,3.184652e-16,0,17163.0
2448,4,17164,False,2448,0,-0.559602,0.479798,1.163037,-0.339946,0.306562,...,1.091310e-17,-0.001481,0.002450,0.000802,-0.000294,0.000026,-7.242868e-17,-5.895163e-17,0,17164.0
2449,4,17165,False,2449,0,-0.351048,-0.059282,1.230309,-0.305596,0.214869,...,5.299588e-17,-0.001492,0.002474,0.000810,-0.000295,0.000026,-2.047832e-16,-1.747548e-16,0,17165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383846,469,1726681,False,1230,0,2.321910,1.184393,-4.594802,0.224136,0.645754,...,-5.191000e-17,0.005080,-0.008288,-0.002732,-0.000037,-0.000093,-3.791631e-18,-6.668604e-17,0,1726681.0
383847,469,1726682,False,1231,0,2.402511,0.973257,-6.097918,0.406883,0.477909,...,2.665371e-16,0.005141,-0.008340,-0.002751,-0.000029,-0.000093,2.378403e-17,3.610971e-16,0,1726682.0
383848,469,1726683,False,1232,0,2.272386,0.631422,-7.190776,0.556776,0.297030,...,6.013829e-16,0.005203,-0.008393,-0.002770,-0.000021,-0.000094,5.880708e-17,7.836323e-16,0,1726683.0
383850,469,1726685,False,1234,0,1.395756,-0.123448,-7.494172,0.671359,-0.108951,...,-2.304551e-15,0.005328,-0.008497,-0.002808,-0.000003,-0.000095,-2.091977e-16,-2.975930e-15,0,1726685.0


542.65 MB
Unique Videos in Train Dataframe: 


Unnamed: 0,capture_id,index,probablyMissingGRF,t,trial,acc_0,acc_1,acc_2,acc_3,acc_4,...,vel_29,vel_30,vel_31,vel_32,vel_33,vel_34,vel_35,vel_36,label,frame_id
0,4,14716,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,14716.0
3679,7,25753,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,25753.0
7358,15,55185,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,1,55185.0
11037,17,62543,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,1,62543.0
14716,18,66222,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,1,66222.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367900,462,1699698,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,1699698.0
371579,463,1703377,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,1703377.0
375258,464,1707056,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,1707056.0
378937,467,1718093,False,0,0,0.079443,-0.142338,-0.328894,-0.023894,-0.063634,...,-7.820643e-22,-0.005859,0.008296,0.002743,-0.000304,0.000092,8.313443e-21,3.007802e-21,0,1718093.0


151.86 KB


In [119]:
pd.set_option('display.max_columns', 200)  # or set it to a specific number if you want
for column_name, dtype in test_df.dtypes.items():
    print(f"Column {column_name} has data type: {dtype}")

Column index has data type: int64
Column probablyMissingGRF has data type: bool
Column t has data type: int64
Column trial has data type: int64
Column capture_id has data type: int64
Column acc_0 has data type: float64
Column acc_1 has data type: float64
Column acc_2 has data type: float64
Column acc_3 has data type: float64
Column acc_4 has data type: float64
Column acc_5 has data type: float64
Column acc_6 has data type: float64
Column acc_7 has data type: float64
Column acc_8 has data type: float64
Column acc_9 has data type: float64
Column acc_10 has data type: float64
Column acc_11 has data type: float64
Column acc_12 has data type: float64
Column acc_13 has data type: float64
Column acc_14 has data type: float64
Column acc_15 has data type: float64
Column acc_16 has data type: float64
Column acc_17 has data type: float64
Column acc_18 has data type: float64
Column acc_19 has data type: float64
Column acc_20 has data type: float64
Column acc_21 has data type: float64
Column acc_22