In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import os


# Read the dataset

We will read in the posture, physical activity, and behavioral pattern datasets.

In [2]:
# read the physical activity data
# The directory defaults to the path this notebook was originally written
# against; set the PAAWS_HAR_DIR environment variable to load the same file
# from a different location without editing the notebook.
har_data_dir = os.environ.get(
    "PAAWS_HAR_DIR",
    "/Users/hale/Desktop/FinalProjectF22-1/data/PAAWS/HAR",
)
physical_activity_df = pd.read_csv(os.path.join(har_data_dir, "physical_activity_df.csv"))
# preview the first five rows
physical_activity_df.head(5)

Unnamed: 0,user_id,timestamp,x_mean,x_std,x_min,x_max,x_median,x_skew,x_fft_dc,x_fft_mean,...,z_fft_IQR,z_fft_neg_count,z_fft_pos_count,z_fft_above_mean,z_fft_num_peaks,z_fft_skew,z_fft_kurtosis,z_fft_energy,z_fft_sma,label
0,11,1638232000.0,0.0,0.0,0.0,0.0,0.0,0.0,,,...,,0,0,0,0,,,,,PA_Type_VideoUnavailable
1,11,1638232000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,,,0.0,0.0,PA_Type_VideoUnavailable
2,11,1638232000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,,,0.0,0.0,PA_Type_VideoUnavailable
3,11,1638232000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,,,0.0,0.0,PA_Type_VideoUnavailable
4,11,1638232000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,,,0.0,0.0,PA_Type_VideoUnavailable


In [3]:
# list every distinct value found in the label column
label_column = physical_activity_df['label']
label_column.unique()

array(['PA_Type_VideoUnavailable', 'Standing_With_Movement',
       'Sitting_With_Movement', 'Walking', 'Puttering_Around',
       'Walking_Up_Stairs', 'Walking_Down_Stairs',
       'Kneeling_With_Movement', 'PA_Type_Other', 'Washing_Hands',
       'Walking_Fast', 'Doing_Resistance_Training_Other',
       'Synchronizing_Sensors', 'Applying_Makeup', 'Walking_Slow',
       'Walking_Treadmill', 'Folding_Clothes', 'PA_Type_Too_Complex',
       'Lying_With_Movement', 'PA_Type_Unlabeled',
       'Doing_Resistance_Training_Free_Weights', 'Loading', 'Sweeping',
       'Putting_Clothes_Away', 'Brushing_Teeth', 'Organizing_Shelf',
       'Flossing_Teeth', 'Lying_Still', 'Brushing', 'Standing_Still',
       'Sitting_Still', 'Playing_Frisbee', 'Shoveling_Mud_Snow',
       'Ironing', 'Washing_Face', 'Blowdrying_Hair',
       'Cycling_Active_Pedaling_Regular_Bicycle', 'Vacuuming',
       'Watering_Plants'], dtype=object)

# Cleaning up the Labels

In [4]:
# Merge fine-grained activity labels into coarser classes.
# Each rule is (substring to look for, label to assign). Rules run in order and
# each one sees the result of the previous rule, so "Walking_Up_Stairs" and
# "Walking_Down_Stairs" become "Stairs" before the "Walking" rule is reached.
LABEL_MERGE_RULES = [
    ("Stairs", "Stairs"),
    ("Still", "Still"),
    ("Cycl", "Cycling"),
    ("Resistance_Training", "Resistance_Training"),
    ("Brushing", "Brushing"),
    ("Washing", "Washing"),
    ("Walking", "Walking"),
]

# labels that are combined into the single label "Chores"
CHORE_LABELS = ['Sweeping', 'Ironing', 'Vacuuming', 'Loading']


def merge_label(label):
    """Return the merged label for one value of the ``label`` column.

    Parameters
    ----------
    label : object
        A raw label. Non-string values (e.g. NaN) are returned unchanged
        instead of raising on the substring checks.

    Returns
    -------
    object
        The label after every rule in ``LABEL_MERGE_RULES`` has been applied
        in order, followed by the ``CHORE_LABELS`` -> "Chores" grouping.
    """
    if not isinstance(label, str):
        return label
    for substring, merged in LABEL_MERGE_RULES:
        if substring in label:
            label = merged
    if label in CHORE_LABELS:
        label = "Chores"
    return label


# one pass over the column instead of one pass per rule
physical_activity_df['label'] = physical_activity_df['label'].apply(merge_label)

# Labels dropped from the dataset. The merged groups "Chores", "Brushing" and
# "Washing" are in this list, so every label folded into them above is removed.
# NOTE(review): the saved output of this cell still lists
# Kneeling_With_Movement although it is in removed_labels, so that output
# appears to predate the entry - re-run the cell to refresh it.
removed_labels = ['PA_Type_Too_Complex', "Synchronizing_Sensors", "PA_Type_VideoUnavailable", "PA_Type_Unlabeled", "PA_Type_Other",
                'Watering_Plants', 'Folding_Clothes', 'Applying_Makeup', 'Shoveling_Mud_Snow', 'Blowdrying_Hair', 'Playing_Frisbee',
                'Organizing_Shelf', 'Flossing_Teeth', "Chores", "Brushing", "Washing", "Putting_Clothes_Away", "Kneeling_With_Movement"]
physical_activity_df = physical_activity_df[~physical_activity_df['label'].isin(removed_labels)]

# show how many rows remain for each label
print(physical_activity_df['label'].value_counts())

Sitting_With_Movement     437029
Standing_With_Movement    155868
Walking                    41301
Lying_With_Movement        35207
Puttering_Around           33777
Still                       5288
Stairs                      2064
Resistance_Training         1091
Kneeling_With_Movement       876
Cycling                      397
Name: label, dtype: int64


In [5]:
# remove any nan values
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result must be assigned back for the rows to actually be removed.
physical_activity_df = physical_activity_df.dropna()
physical_activity_df.head()

Unnamed: 0,user_id,timestamp,x_mean,x_std,x_min,x_max,x_median,x_skew,x_fft_dc,x_fft_mean,...,z_fft_IQR,z_fft_neg_count,z_fft_pos_count,z_fft_above_mean,z_fft_num_peaks,z_fft_skew,z_fft_kurtosis,z_fft_energy,z_fft_sma,label
227,11,1638234000.0,0.005225,0.000908,0.003,0.0076,0.0052,-0.007385,4.1798,0.01128,...,0.006211,0,400,85,127,5.152292,32.432553,0.000479,0.009134,Standing_With_Movement
228,11,1638234000.0,0.004999,0.000966,0.0018,0.0072,0.005,-0.140348,3.999,0.010649,...,0.005316,0,400,86,126,4.163887,19.925474,0.000385,0.00854,Standing_With_Movement
229,11,1638234000.0,0.004934,0.000859,0.0026,0.007,0.005,-0.064244,3.9468,0.009504,...,0.004707,0,400,75,129,4.778815,25.786287,0.000507,0.008862,Standing_With_Movement
230,11,1638234000.0,0.004969,0.001127,0.0026,0.0086,0.005,0.31878,3.9756,0.012202,...,0.005544,0,400,69,127,4.590528,23.769533,0.000603,0.010098,Standing_With_Movement
231,11,1638235000.0,0.004722,0.000883,0.0026,0.0074,0.0046,0.449522,3.7776,0.009738,...,0.005176,0,400,72,129,4.574819,23.515873,0.000484,0.009048,Standing_With_Movement


In [6]:
# write the current frame to a CSV file in the working directory,
# leaving out the row index
output_csv = 'filtered_labels.csv'
physical_activity_df.to_csv(output_csv, index=False)