In [1]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from joblib import dump, load

In [2]:
def process_files_in_directory(directory, event_name, other_features):
    column_names = ['Timestamp', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']
    #path = os.path.join(directory, 'timeseries')
    path = 'pads-parkinsons-disease-smartwatch-dataset-1.0.0/movement/timeseries/'
    file_pattern = f"{path}/*_{event_name}.txt"
    file_list = glob.glob(file_pattern)
    first_five_seconds_list = []
    rest_list = []
    
    
    for filename in tqdm(file_list, desc=f'Processing files for event {event_name}'):
        df = pd.read_csv(filename, delimiter=',')  
        df.columns = column_names
        df = df[(df['Timestamp'] > 0.5) & (df['Timestamp'] < 10.24)]
    
        base_name = os.path.basename(filename)
        unique_identifier = int(base_name.replace(f"_{event_name}.txt", ''))
    
        df['Patient_ID'] = unique_identifier
        
        label_value = other_features.loc[other_features['id'] == unique_identifier, 'label'].iloc[0]
        df['Label'] = label_value
    
    
        first_five_seconds_data = df[df['Timestamp'] <= 5]
        last_seconds_data = df[df['Timestamp'] > 5]
    
        mean_values_first_five = first_five_seconds_data[['Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
                                 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']].mean()
        mean_dict_first_five = {
            'Accelerometer_X_mean': mean_values_first_five['Accelerometer_X'],
            'Accelerometer_Y_mean': mean_values_first_five['Accelerometer_Y'],
            'Accelerometer_Z_mean': mean_values_first_five['Accelerometer_Z'],
            'Gyroscope_X_mean': mean_values_first_five['Gyroscope_X'],
            'Gyroscope_Y_mean': mean_values_first_five['Gyroscope_Y'],
            'Gyroscope_Z_mean': mean_values_first_five['Gyroscope_Z'],
            'Label': df['Label'].iloc[0],
            'Patient_ID': df['Patient_ID'].iloc[0]
        }
        first_five_df = pd.DataFrame([mean_dict_first_five])
        first_five_seconds_list.append(first_five_df)
    
        mean_values_rest = last_seconds_data[['Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
                                 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']].mean()
        mean_dict_rest = {
            'Accelerometer_X_mean': mean_values_rest['Accelerometer_X'],
            'Accelerometer_Y_mean': mean_values_rest['Accelerometer_Y'],
            'Accelerometer_Z_mean': mean_values_rest['Accelerometer_Z'],
            'Gyroscope_X_mean': mean_values_rest['Gyroscope_X'],
            'Gyroscope_Y_mean': mean_values_rest['Gyroscope_Y'],
            'Gyroscope_Z_mean': mean_values_rest['Gyroscope_Z'],
            'Label': df['Label'].iloc[0],
            'Patient_ID': df['Patient_ID'].iloc[0]
        }
        rest_df = pd.DataFrame([mean_dict_rest])
        rest_list.append(rest_df)
         
    first_five_seconds_list_df = pd.concat(first_five_seconds_list, ignore_index=True)
    rest_df = pd.concat(rest_list, ignore_index=True)
    return first_five_seconds_list_df, rest_df

In [3]:
def preprocess_and_split_data(df, part):
    df['Label'] = df['Label'].replace({1: 1, 2: 0, 0: 0})
    df.drop(columns=['Patient_ID'], inplace=True)
    
    features = np.array(df.iloc[:, :-1])
    labels = np.array(df['Label'])
    
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)
    
    train_label_counts = pd.Series(Y_train).value_counts(normalize=True)
    test_label_counts = pd.Series(Y_test).value_counts(normalize=True)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, Y_train, Y_test

In [4]:
def train_and_evaluate_xgb(X_train, Y_train, X_test, Y_test):
    # Initialize XGBoost classifier
    xgb_classifier = xgb.XGBClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.01, 0.001]
    }

    # Define recall as the scoring metric
    scorer = make_scorer(recall_score)

    # Define stratified cross-validation strategy
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform hyperparameter tuning with stratified cross-validation
    grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring=scorer, cv=stratified_cv, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # Get the best hyperparameters and the best estimator
    best_params = grid_search.best_params_
    best_xgb_classifier = grid_search.best_estimator_

    # Evaluate the best estimator using cross-validation
    cv_recall_scores = cross_val_score(best_xgb_classifier, X_train, Y_train, cv=5, scoring=scorer)

    # Make predictions on the test set
    y_pred = best_xgb_classifier.predict(X_test)

    # Calculate accuracy and recall on the test set
    accuracy = accuracy_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)

    return best_params, cv_recall_scores, accuracy, recall

In [5]:
def train_and_evaluate_rf(X_train, Y_train, X_test, Y_test):
    # Initialize Random Forest classifier
    rf_classifier = RandomForestClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Define recall as the scoring metric
    scorer = make_scorer(recall_score)

    # Define stratified cross-validation strategy
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform hyperparameter tuning with stratified cross-validation
    grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring=scorer, cv=stratified_cv, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # Get the best hyperparameters and the best estimator
    best_params = grid_search.best_params_
    best_rf_classifier = grid_search.best_estimator_

    # Evaluate the best estimator using cross-validation
    cv_recall_scores = cross_val_score(best_rf_classifier, X_train, Y_train, cv=5, scoring=scorer)

    # Make predictions on the test set
    y_pred = best_rf_classifier.predict(X_test)

    # Calculate accuracy and recall on the test set
    accuracy = accuracy_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)

    return best_params, cv_recall_scores, accuracy, recall


In [6]:
# Define a function to train and evaluate a classifier
def train_and_evaluate_classifier(X_train, Y_train, X_test, Y_test, classifier):
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    return accuracy, recall, classifier

In [7]:
directory = 'pads-parkinsons-disease-smartwatch-dataset-1.0.0/movement/timeseries/'
file_list = [file for file in os.listdir(directory) if file.endswith('.txt')]
event_names = []
# Initialize empty DataFrame to store results
results_df = pd.DataFrame(columns=['File', 'Classifier', 'Accuracy', 'Recall'])

files_starting_with_001 = [file for file in file_list if file.startswith('001')]

event_names = [file.split('001_')[1].split('.txt')[0] for file in files_starting_with_001]

for event_name in event_names:
        
    first_five, rest = process_files_in_directory(directory, event_name)
    
    # Preprocess and split first_five data
    X_train_first_five, X_test_first_five, Y_train_first_five, Y_test_first_five = preprocess_and_split_data(first_five, 'first_five')
    
    # Preprocess and split rest data
    X_train_rest, X_test_rest, Y_train_rest, Y_test_rest = preprocess_and_split_data(rest, 'rest')
    
    # Train and evaluate Random Forest classifier for first_five
    rf_classifier = RandomForestClassifier()
    rf_accuracy_first_five, rf_recall_first_five, rf_first_five_classifier = train_and_evaluate_classifier(X_train_first_five, Y_train_first_five, X_test_first_five, Y_test_first_five, rf_classifier)
    new_data_rf_first_five = pd.DataFrame({'File': [event_name], 'Classifier': 'Random Forest', 'Part': 'first_five', 'Accuracy': [rf_accuracy_first_five], 'Recall': [rf_recall_first_five]})
    # dump(rf_first_five_classifier, 'models/' + event_name + '_Random Forest_first_five.joblib')
    results_df = pd.concat([results_df, new_data_rf_first_five.reset_index(drop=True)], ignore_index=True)

    # Train and evaluate XGBoost classifier for first_five
    xgb_classifier = XGBClassifier()
    xgb_accuracy_first_five, xgb_recall_first_five, xgb_first_five_classifier = train_and_evaluate_classifier(X_train_first_five, Y_train_first_five, X_test_first_five, Y_test_first_five, xgb_classifier)
    new_data_xgb_first_five = pd.DataFrame({'File': [event_name], 'Classifier': 'XGBoost', 'Part': 'first_five', 'Accuracy': [xgb_accuracy_first_five], 'Recall': [xgb_recall_first_five]})
    # dump(xgb_first_five_classifier, 'models/' + event_name + '_XGBoost_first_five.joblib')
    results_df = pd.concat([results_df, new_data_xgb_first_five.reset_index(drop=True)], ignore_index=True)
    
    # Train and evaluate LightGBM classifier for first_five
    lgb_classifier = lgb.LGBMClassifier(verbose=-1)
    lgb_accuracy_first_five, lgb_recall_first_five, lgb_first_five_classifier = train_and_evaluate_classifier(X_train_first_five, Y_train_first_five, X_test_first_five, Y_test_first_five, lgb_classifier)
    new_data_lgb_first_five = pd.DataFrame({'File': [event_name], 'Classifier': 'LightGBM', 'Part': 'first_five', 'Accuracy': [lgb_accuracy_first_five], 'Recall': [lgb_recall_first_five]})
    # dump(lgb_first_five_classifier, 'models/' + event_name + '_LightGBM_first_five.joblib')
    results_df = pd.concat([results_df, new_data_lgb_first_five.reset_index(drop=True)], ignore_index=True)

    #Confidence ensemble for first_five
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf_first_five_classifier), 
        ('xgb', xgb_first_five_classifier), 
        ('lgb', lgb_first_five_classifier)
    ], voting='soft')
    voting_clf.fit(X_train_first_five, Y_train_first_five)
    voting_accuracy = voting_clf.score(X_test_first_five, Y_test_first_five)
    y_pred_first_five = voting_clf.predict(X_test_first_five)
    voting_recall = recall_score(y_pred_first_five, Y_test_first_five)
    ensemble_first_five = pd.DataFrame({'File': [event_name], 'Classifier': 'Ensemble', 'Part': 'first_five', 'Accuracy': [voting_accuracy], 'Recall': [voting_recall]})
    # dump(voting_clf, 'models/' + event_name + '_Ensemble_first_five.joblib')
    results_df = pd.concat([results_df, ensemble_first_five.reset_index(drop=True)], ignore_index=True)

    highest_recall = max(rf_recall_first_five, xgb_recall_first_five, lgb_recall_first_five, voting_recall)

    if highest_recall == rf_recall_first_five:
        dump(rf_first_five_classifier, 'models/' + event_name + '_first_five.joblib')
    elif highest_recall == xgb_recall_first_five:
        dump(xgb_first_five_classifier, 'models/' + event_name + '_first_five.joblib')
    elif highest_recall == lgb_recall_first_five:
        dump(lgb_first_five_classifier, 'models/' + event_name + '_first_five.joblib')
    else: 
        dump(voting_clf, 'models/' + event_name + '_first_five.joblib')

    # Train and evaluate Random Forest classifier for rest
    rf_accuracy_rest, rf_recall_rest, rf_rest_classifier = train_and_evaluate_classifier(X_train_rest, Y_train_rest, X_test_rest, Y_test_rest, rf_classifier)
    new_data_rf_rest = pd.DataFrame({'File': [event_name], 'Classifier': 'Random Forest', 'Part': 'rest', 'Accuracy': [rf_accuracy_rest], 'Recall': [rf_recall_rest]})
    # dump(rf_rest_classifier, 'models/' + event_name + '_Random Forest_rest.joblib')
    results_df = pd.concat([results_df, new_data_rf_rest.reset_index(drop=True)], ignore_index=True)

    # Train and evaluate XGBoost classifier for rest
    xgb_accuracy_rest, xgb_recall_rest, xgb_rest_classifier = train_and_evaluate_classifier(X_train_rest, Y_train_rest, X_test_rest, Y_test_rest, xgb_classifier)
    new_data_xgb_rest = pd.DataFrame({'File': [event_name], 'Classifier': 'XGBoost', 'Part': 'rest', 'Accuracy': [xgb_accuracy_rest], 'Recall': [xgb_recall_rest]})
    # dump(xgb_rest_classifier, 'models/' + event_name + '_XGBoost_rest.joblib')
    results_df = pd.concat([results_df, new_data_xgb_rest.reset_index(drop=True)], ignore_index=True)
    
    # Train and evaluate LightGBM classifier for rest
    lgb_accuracy_rest, lgb_recall_rest, lgb_rest_classifier = train_and_evaluate_classifier(X_train_rest, Y_train_rest, X_test_rest, Y_test_rest, lgb_classifier)
    new_data_lgb_rest = pd.DataFrame({'File': [event_name], 'Classifier': 'LightGBM', 'Part': 'rest', 'Accuracy': [lgb_accuracy_rest], 'Recall': [lgb_recall_rest]})
    # dump(lgb_rest_classifier, 'models/' + event_name + '_LightGBM_rest.joblib')
    results_df = pd.concat([results_df, new_data_lgb_rest.reset_index(drop=True)], ignore_index=True)

    #Confidence ensemble for rest
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf_rest_classifier), 
        ('xgb', xgb_rest_classifier), 
        ('lgb', lgb_rest_classifier)
    ], voting='soft')
    voting_clf.fit(X_train_rest, Y_train_rest)
    y_pred_first_five = voting_clf.predict(X_test_first_five)
    voting_recall = recall_score(y_pred_first_five, Y_test_first_five)
    ensemble_rest = pd.DataFrame({'File': [event_name], 'Classifier': 'Ensemble', 'Part': 'rest', 'Accuracy': [voting_accuracy], 'Recall': [voting_recall]})
    # dump(voting_clf, 'models/' + event_name + '_Ensemble_rest.joblib')
    results_df = pd.concat([results_df, ensemble_rest.reset_index(drop=True)], ignore_index=True)

    highest_recall = max(rf_recall_rest, xgb_recall_rest, lgb_recall_rest, voting_recall)

    if highest_recall == rf_recall_rest:
        dump(rf_rest_classifier, 'models/' + event_name + '_rest.joblib')
    elif highest_recall == xgb_recall_rest:
        dump(xgb_rest_classifier, 'models/' + event_name + '_rest.joblib')
    elif highest_recall == lgb_recall_rest:
        dump(lgb_rest_classifier, 'models/' + event_name + '_rest.joblib')
    else: 
        dump(voting_clf, 'models/' + event_name + '_rest.joblib')
        
print(results_df)

Processing files for event LiftHold_LeftWrist: 100%|██████████| 469/469 [00:04<00:00, 112.36it/s]
  results_df = pd.concat([results_df, new_data_rf_first_five.reset_index(drop=True)], ignore_index=True)
Processing files for event StretchHold_LeftWrist: 100%|██████████| 469/469 [00:04<00:00, 116.00it/s]
Processing files for event CrossArms_LeftWrist: 100%|██████████| 469/469 [00:03<00:00, 118.31it/s]
Processing files for event Entrainment_RightWrist: 100%|██████████| 469/469 [00:04<00:00, 109.19it/s]
Processing files for event PointFinger_LeftWrist: 100%|██████████| 469/469 [00:03<00:00, 124.75it/s]
Processing files for event DrinkGlas_RightWrist: 100%|██████████| 469/469 [00:03<00:00, 126.63it/s]
Processing files for event TouchIndex_RightWrist: 100%|██████████| 469/469 [00:03<00:00, 124.88it/s]
Processing files for event DrinkGlas_LeftWrist: 100%|██████████| 469/469 [00:03<00:00, 126.79it/s]
Processing files for event Relaxed_RightWrist: 100%|██████████| 469/469 [00:04<00:00, 111.13it

                     File     Classifier  Accuracy    Recall        Part
0      LiftHold_LeftWrist  Random Forest  0.542553  0.690909  first_five
1      LiftHold_LeftWrist        XGBoost  0.500000  0.563636  first_five
2      LiftHold_LeftWrist       LightGBM  0.425532  0.472727  first_five
3      LiftHold_LeftWrist       Ensemble  0.500000  0.574074  first_five
4      LiftHold_LeftWrist  Random Forest  0.531915  0.672727        rest
..                    ...            ...       ...       ...         ...
171  CrossArms_RightWrist       Ensemble  0.595745  0.623188  first_five
172  CrossArms_RightWrist  Random Forest  0.574468  0.763636        rest
173  CrossArms_RightWrist        XGBoost  0.553191  0.690909        rest
174  CrossArms_RightWrist       LightGBM  0.585106  0.690909        rest
175  CrossArms_RightWrist       Ensemble  0.595745  0.609375        rest

[176 rows x 5 columns]


In [8]:
results_df.to_csv('Results.csv', index = False)