In [4]:
import os, sys
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.cluster as cluster
import sklearn.metrics as metrics
import sklearn.preprocessing as preprocessing
from generate_labels_summary import get_labels_synced_actigraphy_file, get_labels_columes, labels_type


Root directory:  /Users/hale/Desktop/NEU-CLASS/FinalProjectF22/code/process/../../


# Step 1: Read the filtered actigraphy data with labels

In [None]:
# Select which subject and which label family to analyse.
subject_id = 10
which_labels = 1  # presumably one of the keys of `labels_type` below; confirm against the helper

# Label-family id -> label-family name.
# NOTE: this rebinds the `labels_type` name imported from generate_labels_summary.
labels_type = dict(
    enumerate(
        (
            "PhysicalActivity",
            "BehavioralPattern",
            "HighLevelBehavioralPattern",
            "Posture",
        ),
        start=1,
    )
)

# Load the label-synced actigraphy data for the chosen subject.
subject_df = get_labels_synced_actigraphy_file(
    subject_id,
    which_labels,
    is_dominant_hand=True,
)
subject_df.head()

Remove all the label columns except the ones containing stand, sit, walk, stair, cycling, or lying.

In [None]:
# Keep the timestamp, the feature columns, and only the label columns of interest.

# Substrings (matched case-insensitively) that a label column name must contain to be kept.
# NOTE(review): the notebook text above also mentions "walk", which is not in this list; confirm.
kept_labels = ("stand", "sit", "stair", "cycling", "lying", "run")

# Timestamp plus every column whose name starts with x, y or z.
cols = subject_df.columns
original_cols = [
    col for col in cols
    if col == "timestamp" or col.startswith(("x", "y", "z"))
]

# .copy() gives an independent frame, so adding label columns below does not
# write into a slice of subject_df (avoids SettingWithCopyWarning).
new_subject_df = subject_df[original_cols].copy()

final_labels = []
for label_name in get_labels_columes(subject_df):
    lowered = label_name.lower()
    if any(word in lowered for word in kept_labels):
        new_subject_df[label_name] = subject_df[label_name]
        final_labels.append(label_name)

# Drop the rows in which every label column reported by get_labels_columes is missing.
new_subject_df = new_subject_df.dropna(
    axis=0,
    how="all",
    subset=get_labels_columes(new_subject_df),
)
new_subject_df.head()

Instead of keeping each label as its own column, change the format so that the per-label rows are concatenated into one big dataframe, in which each feature is one column and a single "class" column indicates the label.

In [None]:
# Reshape from one column per label (wide) to one row per (sample, label),
# with the label name stored in a single "class" column.

# Everything that is not a label column: the timestamp and the feature columns.
feature_cols = [col for col in new_subject_df.columns if col not in final_labels]

# One frame per label: the rows where that label equals 1, restricted to the
# non-label columns and tagged with the label name.
label_frames = [
    new_subject_df.loc[new_subject_df[label_name] == 1, feature_cols].assign(
        **{"class": label_name}
    )
    for label_name in final_labels
]

# DataFrame.append was removed in pandas 2.0; concatenate once instead of
# growing a frame inside a loop. ignore_index=True yields a clean 0..n-1 index,
# so a row that carries several labels does not produce duplicate index values.
if label_frames:
    final_subject_df = pd.concat(label_frames, ignore_index=True)
else:
    # No label matched: keep the expected columns so the summary below still works.
    final_subject_df = pd.DataFrame(columns=feature_cols + ["class"])

# print out the number of rows per class
print(final_subject_df["class"].value_counts())
final_subject_df.head()

In [None]:
# Collapse related labels into a single class name.
def _merge_class(name):
    """Return "STAIR" / "STILL" for labels containing those words, else the label unchanged."""
    lowered = name.lower()
    # "stair" is checked first, so a label containing both words becomes "STAIR".
    if "stair" in lowered:
        return "STAIR"
    # NOTE(review): the original comment said "stand", but the check is on "still"; confirm intent.
    if "still" in lowered:
        return "STILL"
    return name


final_subject_df["class"] = final_subject_df["class"].map(_merge_class)
# summary of the merged classes
final_subject_df["class"].value_counts()

From the look of it, this seems to match the summary labels file very well. I want to add one more column to indicate the hour of the timestamp, since this might be important.

In [None]:
# Add the local hour of day of each sample as a feature.
# NOTE(review): assumes `timestamp` holds Unix epoch seconds (UTC); confirm against the data source.
# pd.to_datetime(..., unit="s") on its own yields UTC-based times, so the hour
# must be converted explicitly to US Eastern time (EDT/EST) before it is read.
LOCAL_TZ = "America/New_York"
final_subject_df["hour"] = (
    pd.to_datetime(final_subject_df["timestamp"], unit="s", utc=True)
    .dt.tz_convert(LOCAL_TZ)
    .dt.hour
)

# Put the hour column in front. Selecting it by name (instead of rotating the
# last column) keeps this cell correct when it is run more than once.
cols = ["hour"] + [col for col in final_subject_df.columns if col != "hour"]
final_subject_df = final_subject_df[cols]

final_subject_df.head()

Now, we want to use PCA to reduce the dimensionality of our data, and plot it to see how well the different labels can be discriminated.

In [None]:
# Build the numeric feature matrix used by the PCA and classifier cells.

# Set the labels aside; they are re-attached after the transformation.
class_df = final_subject_df["class"]

# Everything except the timestamp and the label is a model input.
model_cols = [
    col for col in final_subject_df.columns
    if col not in ("timestamp", "class")
]
feature_matrix = final_subject_df[model_cols].to_numpy()

# NOTE(review): preprocessing.normalize rescales each ROW to unit L2 norm by
# default; it does not standardise columns. Confirm this is the intended scaling.
normalized_matrix = preprocessing.normalize(feature_matrix)

# Back to a DataFrame; the columns and index are now plain 0..n-1 integers.
final_subject_df = pd.DataFrame(normalized_matrix)
final_subject_df.head()

In [None]:
# Project the normalised features onto two principal components and plot them by class.
from sklearn.decomposition import PCA

# If a later cell has already attached "class" to final_subject_df, leave it
# out so this cell can be re-run; otherwise this is a no-op.
pca_input = final_subject_df.drop(columns="class", errors="ignore")

pca = PCA(n_components=2)
pca_final_subject_df = pd.DataFrame(pca.fit_transform(pca_input))

# Attach the labels BY POSITION. class_df may still carry the row index of the
# pre-normalisation frame, while this frame has a fresh 0..n-1 index; assigning
# the Series directly would align on index and pair rows with the wrong labels
# (or NaN), and would raise if that index contains duplicates.
pca_final_subject_df["class"] = class_df.to_numpy()

# One scatter series per class, in order of first appearance.
fig, ax = plt.subplots(figsize=(8, 8))
for label, label_df in pca_final_subject_df.groupby("class", sort=False):
    ax.scatter(label_df[0], label_df[1], label=label)
ax.set(
    xlabel="Principal component 1",
    ylabel="Principal component 2",
    title="PCA projection by class",
)
ax.legend()
plt.show()



In [None]:
# Train a random-forest classifier on the normalised features and report test accuracy.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Re-attach the labels BY POSITION. class_df may still carry the row index of
# the pre-normalisation frame, whereas final_subject_df has a fresh 0..n-1
# index; assigning the Series directly would align on index and pair rows with
# the wrong labels (or NaN). assign() returns a new frame, so the input frame
# itself is not mutated.
final_subject_df = pd.DataFrame(final_subject_df).assign(
    **{"class": class_df.to_numpy()}
)
# Drop rows whose label is missing (a safeguard; none are expected after positional assignment).
final_subject_df = final_subject_df.dropna(axis=0, how="any", subset=["class"])

features = final_subject_df.drop(columns="class")
target = final_subject_df["class"]

# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): a random split over samples from one recording may place
# near-identical neighbouring samples in both sets; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# predict the class of the held-out samples
y_pred = rf.predict(X_test)

# fraction of held-out samples classified correctly
accuracy_score(y_test, y_pred)

In [None]:
# get