In [15]:
import os 
import pandas as pd
import numpy as np
import pickle
import glob
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
# Let pandas render up to 80 rows and 80 columns before truncating a displayed frame.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 80)

In [8]:
# Collect every CSV one directory level below the dataset root, skipping files
# whose name (text before the first '.') is a single character, and sort the
# result by path.
# os.path.basename is used instead of split('/') so the file name is still
# isolated correctly when glob returns OS-specific separators (e.g. '\\' on Windows).
all_files = sorted(
    path
    for path in glob.glob('../ASL-Sensor-Dataglove-Dataset/*/*.csv')
    if len(os.path.basename(path).split('.')[0]) > 1
)

In [39]:
# Collect the distinct gesture labels: each label is the CSV file name up to its
# first '.'. os.path.basename (rather than split('/')) keeps this correct when
# the paths use OS-specific separators.
label_names = sorted({os.path.basename(path).split('.')[0] for path in all_files})
print(label_names)

['bad', 'deaf', 'fine', 'good', 'goodbye', 'hello', 'hungry', 'me', 'no', 'please', 'sorry', 'thankyou', 'yes', 'you']


In [40]:
# Give every label its position in the sorted label list as an integer id.
label_map = dict(zip(label_names, range(len(label_names))))
print(label_map)

{'bad': 0, 'deaf': 1, 'fine': 2, 'good': 3, 'goodbye': 4, 'hello': 5, 'hungry': 6, 'me': 7, 'no': 8, 'please': 9, 'sorry': 10, 'thankyou': 11, 'yes': 12, 'you': 13}


In [41]:
# Read every CSV, tag its rows with the gesture label taken from the file name
# and the matching integer id, then stack all files into one frame.
all_file_list = []
for file_dir in all_files:
    # basename copes with OS-specific separators, which split('/') does not
    file_label = os.path.basename(file_dir).split('.')[0]
    all_file_list.append(
        pd.read_csv(file_dir).assign(
            gesture_label=file_label,
            gesture_label_num=label_map[file_label],
        )
    )
# concat (row indices restart per file; they are not reset here)
all_file = pd.concat(all_file_list, axis=0)
# drop timestamp
all_file = all_file.drop(columns=['timestamp'])
    

In [42]:
# Display the combined frame (pandas truncates the view according to the display options).
all_file

Unnamed: 0,user_id,flex_1,flex_2,flex_3,flex_4,flex_5,Qw,Qx,Qy,Qz,GYRx,GYRy,GYRz,ACCx,ACCy,ACCz,ACCx_body,ACCy_body,ACCz_body,ACCx_world,ACCy_world,ACCz_world,gesture_label,gesture_label_num
0,1,11.0,9.0,-7.0,-7.0,-3.0,0.268066,0.754700,0.018860,0.598450,0.091603,0.022901,0.030534,9.347803,4.825830,-1.942773,0.594556,0.638818,-0.570630,-0.539526,-0.119629,0.882861,bad,0
1,1,12.0,10.0,-6.0,-6.0,0.0,0.266846,0.754822,0.019409,0.598816,0.068702,0.030534,0.030534,9.313110,4.821045,-1.928418,0.555078,0.644800,-0.552686,-0.535937,-0.148340,0.848169,bad,0
2,1,-10.0,8.0,-5.0,-6.0,-4.0,0.265930,0.754761,0.019836,0.599243,0.045802,0.038168,0.030534,9.279614,4.785156,-1.911670,0.517993,0.617285,-0.537134,-0.523975,-0.145947,0.800317,bad,0
3,1,12.0,10.0,-3.0,-7.0,-1.0,0.265198,0.754578,0.020081,0.599854,0.022901,0.045802,0.022901,9.253296,4.718164,-1.898511,0.485693,0.559863,-0.529956,-0.509619,-0.111255,0.745288,bad,0
4,1,12.0,7.0,-7.0,-5.0,-4.0,0.264648,0.754150,0.020203,0.600586,0.015267,0.061069,0.022901,9.234156,4.603320,-1.884155,0.460571,0.453394,-0.527563,-0.485693,-0.031104,0.677100,bad,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,25,-20.0,-52.0,58.0,10.0,17.0,0.961243,0.065125,-0.182922,-0.195618,0.061069,0.030534,0.022901,3.631934,1.455884,8.842969,0.434253,-0.471338,-0.216528,0.287109,-0.592163,-0.150732,you,13
1496,25,-20.0,-53.0,60.0,41.0,19.0,0.961365,0.066345,-0.182678,-0.194885,0.083969,0.038168,0.022901,3.574512,1.574316,8.821436,0.385205,-0.373242,-0.236865,0.288306,-0.480908,-0.167480,you,13
1497,25,-17.0,-53.0,56.0,10.0,16.0,0.961426,0.067749,-0.182495,-0.194336,0.076336,0.022901,0.015267,3.424976,1.784863,8.820239,0.244043,-0.186621,-0.236865,0.233276,-0.253613,-0.177051,you,13
1498,25,-15.0,-54.0,58.0,10.0,24.0,0.961365,0.069031,-0.182556,-0.194092,0.053435,0.000000,0.000000,3.280225,2.028906,8.851343,0.102881,0.033496,-0.200977,0.175854,0.001196,-0.144751,you,13


# Chunk Time Series Data

In [46]:
chunk_size = 100
overlap_size = 50 # 50% overlap. 50% non-overlap.
# split the data into chunks, but make sure the chunk stays within the same gesture, and user
def split_data(data, chunk_size, overlap_size):
    """Cut ``data`` into fixed-length, overlapping windows per (user, gesture).

    Rows are first restricted to one ``user_id`` and one ``gesture_label``;
    windows of ``chunk_size`` consecutive rows are then taken every
    ``chunk_size - overlap_size`` rows. A trailing remainder shorter than
    ``chunk_size`` is discarded, so every returned chunk has exactly
    ``chunk_size`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``user_id`` and ``gesture_label``.
    chunk_size : int
        Number of rows in each chunk; must be positive.
    overlap_size : int
        Number of rows shared by two consecutive chunks; must be smaller
        than ``chunk_size`` so the window start always advances.

    Returns
    -------
    list of pandas.DataFrame
        One copy per chunk, with a new first column ``GLOBAL_ID`` holding a
        running chunk number (0, 1, 2, ...) in user-then-gesture order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive or ``overlap_size >= chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    if overlap_size >= chunk_size:
        # A zero stride makes range() fail and a negative one silently yields
        # no chunks at all, so reject both up front with a clear message.
        raise ValueError(
            f'overlap_size ({overlap_size}) must be smaller than chunk_size ({chunk_size})'
        )
    stride = chunk_size - overlap_size

    data_list = []

    # provide a global_id for each sample
    global_id = 0
    # The gesture list does not depend on the user, so compute it once.
    gestures = data['gesture_label'].unique()
    for user in data['user_id'].unique():
        print('user: ', user)
        user_rows = data[data['user_id'] == user]
        for gesture in gestures:
            user_gesture_data = user_rows[user_rows['gesture_label'] == gesture]
            num_rows = user_gesture_data.shape[0]
            # Stop early enough that every window is complete.
            for start in range(0, num_rows - chunk_size + 1, stride):
                user_data = user_gesture_data.iloc[start:start + chunk_size, :].copy()
                # place GLOBAL_ID at the first column
                user_data.insert(0, 'GLOBAL_ID', global_id)
                global_id += 1
                data_list.append(user_data)

    return data_list


In [69]:
# Chunk the combined frame; split_data prints each user id as it processes that user.
split_data_list = split_data(all_file, chunk_size, overlap_size)

user:  1
user:  2
user:  3
user:  4
user:  5
user:  6
user:  7
user:  8
user:  9
user:  10
user:  11
user:  12
user:  13
user:  14
user:  15
user:  16
user:  17
user:  18
user:  19
user:  20
user:  21
user:  22
user:  23
user:  24
user:  25


In [70]:
# Stack all chunk frames into one long frame; the rows of one chunk share a GLOBAL_ID.
split_data_all = pd.concat(split_data_list, axis=0)

In [71]:
# split train and test user
# A dedicated, seeded RNG makes the user split reproducible across kernel
# restarts; an unseeded random.sample picks different test users on every run.
SPLIT_SEED = 42
unique_users = split_data_all['user_id'].unique()
# use 20% of the users (rounded down) as test users
n_test_users = int(len(unique_users) * 0.2)
test_users = random.Random(SPLIT_SEED).sample(list(unique_users), n_test_users)
train_users = [user for user in unique_users if user not in test_users]


In [72]:
# create 5 fold split for train users
train_users_split = np.array_split(train_users, 5)
# create new column called cv_num to indicate which fold the data belongs to;
# rows of users that are in no fold keep the placeholder -1
split_data_all['cv_num'] = -1
# The loop variable is deliberately NOT named `train_users`: rebinding that name
# here would leave it holding only the last fold's users after the loop, and the
# full train-user list would be lost for anything that runs afterwards.
for fold_num, fold_users in enumerate(train_users_split):
    split_data_all.loc[split_data_all['user_id'].isin(fold_users), 'cv_num'] = fold_num
cols = split_data_all.columns.tolist()
# NOTE(review): rearrange_cols spells out a column order with cv_num right after
# user_id, but it is only defined here and never applied to split_data_all.
rearrange_cols = ['GLOBAL_ID', 'user_id', 'cv_num', 'gesture_label', 'gesture_label_num', 'flex_1', 'flex_2', 'flex_3', 'flex_4', 'flex_5', 'Qw', 'Qx', 'Qy', 'Qz', 'GYRx', 'GYRy', 'GYRz', 'ACCx', 'ACCy', 'ACCz',
                  'ACCx_body', 'ACCy_body', 'ACCz_body', 'ACCx_world', 'ACCy_world', 'ACCz_world']
print(cols)


['GLOBAL_ID', 'user_id', 'flex_1', 'flex_2', 'flex_3', 'flex_4', 'flex_5', 'Qw', 'Qx', 'Qy', 'Qz', 'GYRx', 'GYRy', 'GYRz', 'ACCx', 'ACCy', 'ACCz', 'ACCx_body', 'ACCy_body', 'ACCz_body', 'ACCx_world', 'ACCy_world', 'ACCz_world', 'gesture_label', 'gesture_label_num', 'cv_num']


In [79]:
# split into label and features
label_column = ['GLOBAL_ID', 'user_id', 'cv_num', 'gesture_label', 'gesture_label_num']
# GLOBAL_ID first (it keys the chunks), then every column that is not a label column
feature_columns = ['GLOBAL_ID'] + [col for col in cols if col not in label_column]

# train set: every row that was assigned to a CV fold. cv_num is -1 on all other
# rows, so this selects the training users without depending on whatever the
# name `train_users` is currently bound to.
train_data = split_data_all[split_data_all['cv_num'] != -1]
# test set
test_data = split_data_all[split_data_all['user_id'].isin(test_users)]
# one label row per chunk: keep the first row of each GLOBAL_ID
label_file = split_data_all[label_column].groupby('GLOBAL_ID').head(1)

train_feature_file = train_data[feature_columns]
test_feature_file = test_data[feature_columns]

In [83]:
# Renumber the label rows 0..n-1, discarding the old index.
label_file = label_file.reset_index(drop=True)

In [84]:
# Display the label table (one row per GLOBAL_ID).
label_file

Unnamed: 0,GLOBAL_ID,user_id,cv_num,gesture_label,gesture_label_num
0,0,1,0,bad,0
1,1,1,0,bad,0
2,2,1,0,bad,0
3,3,1,0,bad,0
4,4,1,0,bad,0
...,...,...,...,...,...
10145,10145,25,-1,you,13
10146,10146,25,-1,you,13
10147,10147,25,-1,you,13
10148,10148,25,-1,you,13


In [96]:
# convert to dictionary for easy access
def _chunks_by_global_id(feature_file):
    """Map each GLOBAL_ID to the 2-D value array of that chunk's rows.

    ``feature_file`` must have ``GLOBAL_ID`` as its first column; that column
    is dropped from the stored arrays. One groupby pass replaces building a
    full-length boolean mask for every id, which scales with
    (number of chunks) x (number of rows). ``sort=False`` keeps the keys in
    order of first appearance.
    """
    return {
        global_id: chunk.iloc[:, 1:].values
        for global_id, chunk in feature_file.groupby('GLOBAL_ID', sort=False)
    }


train_feature_file_dict = _chunks_by_global_id(train_feature_file)
test_feature_file_dict = _chunks_by_global_id(test_feature_file)

In [98]:
# Create the output folder; exist_ok avoids FileExistsError when the cell is re-run.
os.makedirs('asl_data', exist_ok=True)

In [99]:
# Write the two feature dictionaries as pickles, then the label table as CSV.
for pkl_path, payload in (
    ('asl_data/asl_train_feature_file_dict.pkl', train_feature_file_dict),
    ('asl_data/asl_test_feature_file_dict.pkl', test_feature_file_dict),
):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(payload, handle)

# index=False leaves the row index out of the CSV
label_file.to_csv('asl_data/asl_label_file.csv', index=False)