# <center> BMI 500: Human Activity Recognition

## Week - 13 

In [1]:
import os
import numpy as np
import pandas as pd
from pandas import Series
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from collections import defaultdict
from itertools import product

### Pre-processing Function

In [2]:
def preprocess_data(data, kernel_size):
    p_data = np.zeros(data.shape)
    for ch in range(data.shape[1]):
        kps_seq_ch = data[:, ch]
        kps_seq_ch = Series(kps_seq_ch).rolling(kernel_size, min_periods=1, center=True).mean().to_numpy()
        p_data[:, ch] = kps_seq_ch
    return p_data

### Segmentation Function

In [3]:
def segment_data(data, win_len):
    win_len = int(30 * win_len)
    win_step = int(30 * 0.5)
    sample_windows = []

    for start_time in range(0, data.shape[0], win_step):
        end_time = start_time + win_len
        if end_time > data.shape[0]:
            end_time = data.shape[0]
            start_time = end_time - win_len
        frame = data[start_time:end_time]
        assert frame.shape[0] == win_len, (start_time, end_time, data.shape[0])
        sample_windows.append(frame)

    return np.array(sample_windows)

### Feature Extraction Function

In [4]:
def feature_extraction(sample_windows):
    N, T, D = sample_windows.shape
    feats = []
    for i in range(N):
        frame = sample_windows[i]
        feat = []
        for ch in range(D):
            frame_ch = frame[:,ch]
            # mean feature
            mean_ch = np.mean(frame_ch)
            feat.append(mean_ch)
            # std feature
            std_ch = np.std(frame_ch)
            feat.append(std_ch)
        feats.append(feat)
    feats = np.array(feats)
    return feats

### Loading all the data files from the POSE folder

In [5]:
def process_pose_data(directory):
    data_dict = defaultdict(list)
    for file_name in os.listdir(directory):
        # Extract subject number
        subject_number = int(file_name.split('_')[1][1:])
        # Extract activity label
        label_act = int(file_name.split('_')[0][1:]) - 1

        data3D = np.load(os.path.join(directory, file_name))
        data = data3D.reshape(data3D.shape[0], -1)
        
        data_prep = preprocess_data(data, 5)
        data_seg = segment_data(data_prep, 1.5)
        features = feature_extraction(data_seg)

        data_dict[subject_number].append((features, [label_act] * len(features)))
    return data_dict

In [6]:
def load_and_preprocess_data(file_path, kernel_size):
    data3D = np.load(file_path)
    data = data3D.reshape(data3D.shape[0], -1)
    return preprocess_data(data, kernel_size)

def extract_and_store_features(data, win_length, subject_number, activity_label, data_dict):
    segmented_data = segment_data(data, win_length)
    features = feature_extraction(segmented_data)
    num_segments = segmented_data.shape[0]
    data_dict[subject_number].append((features, [activity_label] * num_segments))

def process_pose_files(directory, kernel_size, win_length):
    data_dictionary = defaultdict(list)
    file_names = os.listdir(directory)

    for file_name in file_names:
        subject_number = int(file_name[5:7])
        activity_label = int(file_name[1:3]) - 1
        file_path = os.path.join(directory, file_name)

        preprocessed_data = load_and_preprocess_data(file_path, kernel_size)
        extract_and_store_features(preprocessed_data, win_length, subject_number, activity_label, data_dictionary)

    return data_dictionary

In [7]:
# Parameters
kernel_size = 15
win_length = 1.5
pose_directory = 'pose'

# Process pose data files
pose_data_dict = process_pose_files(pose_directory, kernel_size, win_length)

### Spliting the data into TRAIN, VALIDATION and TEST

In [8]:
def stack_features(data_dict, subject_range, sample_range):
    features_list = []
    labels_list = []

    for subject in subject_range:
        for sample in sample_range:
            features, labels = data_dict[subject][sample]
            features_list.append(features)
            labels_list.extend(labels)

    stacked_features = np.vstack(features_list)
    stacked_labels = np.hstack(labels_list)

    return stacked_features, stacked_labels

In [9]:
# ranges for train, validation, and test sets
train_subjects = range(1, 6)
val_subjects = range(6, 8)
test_subjects = range(8, 11)
sample_range = range(32)

# Process the data for each set
trainx, trainy = stack_features(pose_data_dict, train_subjects, sample_range)
valx, valy = stack_features(pose_data_dict, val_subjects, sample_range)
testx, testy = stack_features(pose_data_dict, test_subjects, sample_range)

In [10]:
trainx.shape, valx.shape, testx.shape

((2243, 132), (893, 132), (1156, 132))

In [11]:
trainy.shape, valy.shape, testy.shape

((2243,), (893,), (1156,))

### Nenural Network

In [12]:
def train_and_evaluate(model, train_features, train_labels, val_features, val_labels):
    model.fit(train_features, train_labels)
    val_predictions = model.predict(val_features)
    return accuracy_score(val_labels, val_predictions)

def find_hyperparameters(param_grid, trainx, trainy, valx, valy):
    best_score = 0
    best_params = {}
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        model = MLPClassifier(**param_dict)
        val_accuracy = train_and_evaluate(model, trainx, trainy, valx, valy)
        if val_accuracy > best_score:
            best_score = val_accuracy
            best_params = param_dict
    return best_score, best_params

def nn_model(best_params, trainx, trainy, valx, valy, testx, testy):
    trainx_all = np.vstack((trainx, valx))
    trainy_all = np.hstack((trainy, valy))
    best_model = MLPClassifier(**best_params)
    best_model.fit(trainx_all, trainy_all)
    test_predictions = best_model.predict(testx)
    test_accuracy = accuracy_score(testy, test_predictions)
    return best_model, test_accuracy

In [13]:
# Hyperparameters
param_grid = {
    'hidden_layer_sizes': [(100, 50), (50, ), (100, )],
    'alpha': [0.001, 0.01, 0.1],
    'max_iter': [2000],
    'solver': ['adam'],
    'learning_rate': ['adaptive'],
    'activation': ['logistic', 'relu']
}

best_score, best_params = find_hyperparameters(param_grid, trainx, trainy, valx, valy)
best_model, test_accuracy = nn_model(best_params, trainx, trainy, valx, valy, testx, testy)

print("Best Parameters:", best_params)
print("Validation Set Accuracy with Best Parameters:", best_score)
print("Test Set Accuracy with Best Parameters:", test_accuracy)

Best Parameters: {'hidden_layer_sizes': (50,), 'alpha': 0.01, 'max_iter': 2000, 'solver': 'adam', 'learning_rate': 'adaptive', 'activation': 'logistic'}
Validation Set Accuracy with Best Parameters: 0.6438969764837627
Test Set Accuracy with Best Parameters: 0.5544982698961938


### Random Forest

In [14]:
def rf_find_hyp(param_grid, trainx, trainy, valx, valy):
    best_score = 0
    best_params = {}

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        model = RandomForestClassifier(**param_dict)
        val_accuracy = train_and_evaluate(model, trainx, trainy, valx, valy)

        if val_accuracy > best_score:
            best_score = val_accuracy
            best_params = param_dict

    return best_score, best_params

def rf_model(best_params, trainx, trainy, valx, valy, testx, testy):
    trainx_all = np.vstack((trainx, valx))
    trainy_all = np.hstack((trainy, valy))
    
    best_model = RandomForestClassifier(**best_params)
    best_model.fit(trainx_all, trainy_all)

    test_predictions = best_model.predict(testx)
    test_accuracy = accuracy_score(testy, test_predictions)

    return best_model, test_accuracy

In [15]:
# Hyperparameter grid
n = 10
param_grid = {
    'n_estimators': [20, 30, 50, 100],
    'max_depth': list(np.arange(2, n+1)),
    'min_samples_leaf': list(np.arange(2, n+1))
}

best_score, best_params = rf_find_hyp(param_grid, trainx, trainy, valx, valy)
best_model, test_accuracy = rf_model(best_params, trainx, trainy, valx, valy, testx, testy)

print("Best Parameters:", best_params)
print("Validation Set Accuracy with Best Parameters:", best_score)
print("Test Set Accuracy with Best Parameters:", test_accuracy)

Best Parameters: {'n_estimators': 30, 'max_depth': 10, 'min_samples_leaf': 5}
Validation Set Accuracy with Best Parameters: 0.5487122060470325
Test Set Accuracy with Best Parameters: 0.513840830449827


## Acknowledgment
I would like to acknowledge the assistance provided by OpenAI's ChatGPT in completing my homework assignment for BMI 500 Week 13. The insights and guidance offered by ChatGPT significantly aided in understanding and solving various aspects of the assignment, particularly in the areas of code development and conceptual explanations. I have thoroughly reviewed and understood all contributions made by ChatGPT and have ensured that they align with the academic integrity standards of the course.

## References:

BMI 500 Lecture NOtes: 
https://drive.google.com/drive/folders/13fXFgqlh6m19XOXiFBn1MwsSPKOHpEps

Data Repository: 
https://www.dropbox.com/s/nzhu004aus5sho8/pose.zip?dl=0

Pandas: 
https://pandas.pydata.org/docs/reference/api/pandas.Series.rolling.html

Random Forest Classifier: 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Neural Network: 
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

Accuracy Score:
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html