In [1]:
import os 
import pandas as pd
import numpy as np
import pickle
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
pd.options.display.max_rows = 80

In [2]:
data_dir = 'raw_data/'

## load raw data processed using matlab

In [3]:
csv_files = os.listdir(data_dir)
csv_files = [file for file in csv_files if file.endswith('.csv')]
csv_files.sort()

In [4]:
#load label file
label_file = open("processed_data/label_dict.pkl", "rb")
label_dict = pickle.load(label_file)

In [5]:
df_lists = []
for idx, file in enumerate(csv_files):
    if idx % 100 ==0:
        print(file)
    participant_file = pd.read_csv(data_dir+file,header=None)
    # Drop time stamp
    participant_file = participant_file.drop([0,4],axis=1)

    column_names= ['acc_x','acc_y','acc_z','gy_x','gy_y','gy_z','user_id','recording_idx','exercise_idx']
    participant_file.columns = column_names
    participant_file['global_file_idx'] = idx
    participant_file['label'] = participant_file['exercise_idx'].map(label_dict)
    df_lists.append(participant_file)
    
print('Complete loading')
# concat all files
total_file = pd.concat(df_lists,axis=0)
total_file = total_file.sort_values(by=['user_id','recording_idx','exercise_idx'])

participant_10_10_32.csv
participant_13_12_32.csv
participant_14_2_74.csv
participant_15_27_32.csv
participant_16_22_32.csv
participant_17_1_62.csv
participant_18_8_32.csv
participant_1_1_69.csv
participant_1_71_32.csv
participant_22_1_15.csv
participant_25_11_32.csv
participant_28_10_32.csv
participant_2_7_32.csv
participant_32_1_36.csv
participant_33_36_32.csv
participant_35_1_58.csv
participant_37_1_68.csv
participant_3_1_55.csv
participant_42_1_34.csv
participant_45_1_13.csv
participant_48_10_32.csv
participant_4_21_32.csv
participant_52_1_20.csv
participant_53_11_32.csv
participant_55_1_25.csv
participant_57_1_63.csv
participant_58_1_42.csv
participant_59_2_20.csv
participant_60_1_61.csv
participant_63_18_32.csv
participant_65_1_26.csv
participant_67_1_74.csv
participant_69_21_32.csv
participant_70_2_23.csv
participant_73_15_32.csv
participant_75_2_23.csv
participant_77_1_45.csv
participant_79_2_60.csv
participant_7_36_32.csv
participant_80_2_12.csv
participant_81_5_32.csv
partici

In [6]:
total_file.to_csv('processed_data/total_file.csv',index=False)

In [7]:
total_file = pd.read_csv('processed_data/total_file.csv')

# EDA

In [8]:
print(f"Total of {total_file.global_file_idx.max()} activity files")
print(f"Total of {len(total_file.label.unique())} unique activity files")

Total of 4686 activity files
Total of 73 unique activity files


In [9]:
print(f"Value counts for each label")
value_counts_by_exercise = dict(total_file.groupby('global_file_idx').head(1)['label'].value_counts())

Value counts for each label


In [10]:
value_counts_by_exercise

{'Non-Exercise': 2147,
 'Tap Right Device': 246,
 'Tap Left Device': 244,
 'Device on Table': 122,
 'Plank': 69,
 'Wall Squat': 62,
 'Jump Rope': 60,
 'Two-arm Dumbbell Curl (both arms, not alternating)': 47,
 'Dip': 46,
 'Shoulder Press (dumbbell)': 45,
 'V-up': 45,
 'Pushup (knee or foot variation)': 44,
 'Burpee': 44,
 'Squat (arms in front of body, parallel to ground)': 44,
 'Lunge (alternating both legs, weight optional)': 43,
 'Sit-up (hands positioned behind head)': 43,
 'Overhead Triceps Extension': 42,
 'Russian Twist': 38,
 'Seated Back Fly': 37,
 'Static stretch': 37,
 'Fast Alternating Punches': 36,
 'Arm Band Adjustment': 36,
 'Lateral Raise': 35,
 'Squat Rack Shoulder Press': 33,
 'Crunch': 33,
 'Kettlebell Swing': 32,
 'Dumbbell Squat (hands at side)': 32,
 'Static Stretch (at your own pace)': 31,
 'Dynamic Stretch (at your own pace)': 31,
 'Butterfly Sit-up': 31,
 'Chest Press (rack)': 31,
 'Medicine Ball Slam': 31,
 'Triceps Kickback (knee on bench) (label spans both a

### For this analysis, I will use activity with counts more than 24 and below 70.

In [11]:
### Morover, I will sample a subset of exercise 
keep_exercise_list = []
for k,v in value_counts_by_exercise.items():
    if (v > 24) and (v<70): 
        keep_exercise_list.append(k)

In [12]:
subset_file = total_file[total_file['label'].isin(keep_exercise_list)]
len(subset_file.global_file_idx.unique())

1796

In [13]:
subset_file

Unnamed: 0,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,user_id,recording_idx,exercise_idx,global_file_idx,label
0,-0.281651,-1.113392,-0.263325,97.198950,-79.207690,-37.931333,1,1,2,657,Arm Band Adjustment
1,-0.246586,-1.219532,-0.171332,149.248055,-71.905772,-40.862477,1,1,2,657,Arm Band Adjustment
2,-0.162572,-1.045398,-0.115079,184.728648,-66.590564,-38.083363,1,1,2,657,Arm Band Adjustment
3,-0.087621,-0.855620,-0.065408,163.429178,-64.595030,-33.841993,1,1,2,657,Arm Band Adjustment
4,-0.066858,-0.929883,-0.003323,140.445108,-63.963457,-33.275897,1,1,2,657,Arm Band Adjustment
...,...,...,...,...,...,...,...,...,...,...,...
14003500,-1.398996,-0.737366,0.813252,96.990376,164.537161,-57.745435,94,1,73,4600,Wall Ball
14003501,-1.415407,-0.591479,0.565676,110.204738,126.683723,-77.125949,94,1,73,4600,Wall Ball
14003502,-1.120440,-0.739180,0.596311,110.211897,95.945094,-102.964842,94,1,73,4600,Wall Ball
14003503,-0.834413,-0.764200,0.762249,111.750415,80.130047,-123.997280,94,1,73,4600,Wall Ball


In [14]:
# Lets check the average number of sample points per recording
counts_per_sample = subset_file.groupby('global_file_idx').count()
print(f"Min:{counts_per_sample.min()[0]}, Max:{counts_per_sample.max()[0]}, Mean:{counts_per_sample.mean()[0]}")

Min:28, Max:31169, Mean:3694.9192650334076


In [15]:
# Leave only records with at least 160 data points
globalidx_list = counts_per_sample[counts_per_sample['label']>=160].index.tolist()
second_subset_file = subset_file[subset_file['global_file_idx'].isin(globalidx_list)]

In [16]:
counts_per_sample = second_subset_file.groupby('global_file_idx').count()
print(f"Min:{counts_per_sample.min()[0]}, Max:{counts_per_sample.max()[0]}, Mean:{counts_per_sample.mean()[0]}")

Min:160, Max:31169, Mean:3709.0447177193964


In [17]:
second_subset_file

Unnamed: 0,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,user_id,recording_idx,exercise_idx,global_file_idx,label
0,-0.281651,-1.113392,-0.263325,97.198950,-79.207690,-37.931333,1,1,2,657,Arm Band Adjustment
1,-0.246586,-1.219532,-0.171332,149.248055,-71.905772,-40.862477,1,1,2,657,Arm Band Adjustment
2,-0.162572,-1.045398,-0.115079,184.728648,-66.590564,-38.083363,1,1,2,657,Arm Band Adjustment
3,-0.087621,-0.855620,-0.065408,163.429178,-64.595030,-33.841993,1,1,2,657,Arm Band Adjustment
4,-0.066858,-0.929883,-0.003323,140.445108,-63.963457,-33.275897,1,1,2,657,Arm Band Adjustment
...,...,...,...,...,...,...,...,...,...,...,...
14003500,-1.398996,-0.737366,0.813252,96.990376,164.537161,-57.745435,94,1,73,4600,Wall Ball
14003501,-1.415407,-0.591479,0.565676,110.204738,126.683723,-77.125949,94,1,73,4600,Wall Ball
14003502,-1.120440,-0.739180,0.596311,110.211897,95.945094,-102.964842,94,1,73,4600,Wall Ball
14003503,-0.834413,-0.764200,0.762249,111.750415,80.130047,-123.997280,94,1,73,4600,Wall Ball


In [18]:
final_df = second_subset_file

## Split Train And Test

In [19]:
random.seed(10)
# Split by users into train and val
unique_user_ids = final_df.user_id.unique()
print(f'There are total of {len(unique_user_ids)} number of users')
random.shuffle(unique_user_ids)

# 70% train 30% test
train_len = int(len(unique_user_ids) * 0.7)
train_ids = unique_user_ids[:train_len]
test_ids = unique_user_ids[train_len:]

print(f'Train User ID LEN: {len(train_ids)} number of users')
print(f'Test User ID LEN: {len(test_ids)} number of users')

There are total of 94 number of users
Train User ID LEN: 65 number of users
Test User ID LEN: 29 number of users


## From the train_ids, I will split the ids into 5 for 5-fold cross validation training

In [20]:
train_ids

array([17, 51, 53, 48, 14, 66, 41,  7, 75, 79, 87,  8, 91, 68, 26,  4, 69,
       12, 19, 61, 65, 64, 73, 92, 22, 71, 44, 78, 77, 57, 89, 72, 88, 15,
       11, 33, 45, 70, 82, 31, 28, 58, 85, 43, 83, 52, 24, 35, 20, 13, 80,
       81,  1, 38,  3, 25, 40, 29, 16, 56, 50, 30,  9, 76, 39])

In [21]:
fivefold_map = {}
fold_num = 0
for train_id in train_ids:
    fivefold_map[train_id] = fold_num
    fold_num +=1
    if fold_num == 5:
        fold_num=0
    
    

## Lets check the number of unique activities in both Train and Test

In [22]:
train_df = final_df[final_df['user_id'].isin(train_ids)]
test_df = final_df[final_df['user_id'].isin(test_ids)]
# The number of classes in both train and test should equal
assert(len(train_df.exercise_idx.unique()) == len(test_df.exercise_idx.unique()))
print(f"Number of unique activities in both train and test are equal, the number of unique activities are: {len(train_df.exercise_idx.unique())}")

Number of unique activities in both train and test are equal, the number of unique activities are: 53


### Lets chunk the final_df

In [23]:
final_df.loc[0,"user_id"]

1

In [24]:
def chunkdata(df,non_overlap_step,maximum_num_chunk_per_global_file=5, window_size=160):
    """
    Args:
        non_overlap_step: how much are we going to non_overlap the data when chunking? if 40 is given, it would index [0: 160, 40: 200, 80: 220]
        if you do not want to overlap at all, non_overlap_step == window_size
        maximum_num_chunk_per_global_file:  how many chunks are we going to get at most per global file?
        window_size: The size of the chunk
    """
    
    assert window_size >=160
    assert non_overlap_step <= window_size
    feature_columns = ['acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z','user_id']
    label_columns = ['user_id','recording_idx', 'exercise_idx', 'global_file_idx', 'label']

    
    df_train_lists = []
    df_label_lists = []
    
    df_copy = df.copy()
    GLOBAL_ID = 0
    
    for user_id in tqdm(df_copy.user_id.unique()):
        for global_file_id in df_copy.loc[df_copy['user_id']==user_id]['global_file_idx'].unique():
            train_data = df_copy.loc[(df_copy['user_id']==user_id)&(df_copy['global_file_idx']==global_file_id),feature_columns]
            train_label = df_copy.loc[(df_copy['user_id']==user_id)&(df_copy['global_file_idx']==global_file_id),label_columns]
            for i in range(len(train_data)):
                if i*non_overlap_step+window_size <=len(train_data):
                    data_chunk = train_data.iloc[i*non_overlap_step:i*non_overlap_step+window_size,:].copy()
                    data_chunk['GLOBAL_ID'] = GLOBAL_ID
                    label_chunk = train_label.iloc[0,:].copy()
                    label_chunk['GLOBAL_ID'] = GLOBAL_ID
                    GLOBAL_ID+=1
                    df_train_lists.append(data_chunk)
                    df_label_lists.append(label_chunk)
                if i+1 ==maximum_num_chunk_per_global_file:
                    break

                
    data = pd.concat(df_train_lists).reset_index(drop=True)
    label = pd.concat(df_label_lists,axis=1).T.reset_index(drop=True)
    
    return data,label



In [None]:
data,label = chunkdata(final_df,non_overlap_step=160, maximum_num_chunk_per_global_file=5, window_size=160)

 51%|█████     | 48/94 [00:43<00:30,  1.50it/s]

In [None]:
train_data = data[data['user_id'].isin(train_ids)]
train_label = data[data['user_id'].isin(train_ids)]
test_data = data[data['user_id'].isin(test_ids)]
test_label = data[data['user_id'].isin(test_ids)]

In [None]:
train_data

In [None]:
train_label

In [None]:
train_label['CV_VAL'] = train_label.copy().user_id.map(fivefold_map)

In [None]:
train_label

In [None]:
test_label

In [None]:
with open('processed_data/MS_train_features_chunk160_window160.pkl', 'wb') as f:
    pickle.dump(train_data, f)

In [None]:
with open('processed_data/MS_train_labels_chunk160_window160.pkl', 'wb') as f:
    pickle.dump(train_label, f)

In [None]:
with open('processed_data/MS_test_features_chunk160_window160.pkl', 'wb') as f:
    pickle.dump(test_data, f)

In [None]:
with open('processed_data/MS_test_labels_chunk160_window160.pkl', 'wb') as f:
    pickle.dump(test_label, f)