In [1]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from joblib import dump, load

In [2]:
other_features = pd.read_csv('pads-parkinsons-disease-smartwatch-dataset-1.0.0/preprocessed/file_list.csv')
df = other_features[['age_at_diagnosis', 'age', 'height', 'weight', 'gender', 'handedness', 'appearance_in_kinship', 'label']]
df

Unnamed: 0,age_at_diagnosis,age,height,weight,gender,handedness,appearance_in_kinship,label
0,56,56,173,78,male,right,True,0
1,69,81,193,104,male,right,False,2
2,45,45,170,78,female,right,False,0
3,63,67,161,90,female,right,False,1
4,65,75,172,86,male,left,False,1
...,...,...,...,...,...,...,...,...
464,62,65,175,80,male,right,True,1
465,84,84,172,74,female,right,True,0
466,55,57,190,100,male,right,False,1
467,73,76,198,118,male,right,False,1


In [3]:
from sklearn.preprocessing import LabelEncoder

def preprocess_and_split_data(df):
    label_encoder = LabelEncoder()

    df['label'] = df['label'].replace({1: 1, 2: 0, 0: 0})
    df['gender'] = label_encoder.fit_transform(df['gender'])
    df['handedness'] = label_encoder.fit_transform(df['handedness'])
    df['appearance_in_kinship'] = label_encoder.fit_transform(df['appearance_in_kinship'])

    features = np.array(df.iloc[:, :-1])
    labels = np.array(df['label'])
    
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)
    
    train_label_counts = pd.Series(Y_train).value_counts(normalize=True)
    test_label_counts = pd.Series(Y_test).value_counts(normalize=True)
    
    return X_train, X_test, Y_train, Y_test

In [4]:
# Define a function to train and evaluate a classifier
def train_and_evaluate_classifier(X_train, Y_train, X_test, Y_test, classifier):
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    return accuracy, recall, classifier

In [5]:

    results_df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Recall'])

    X_train, X_test, Y_train, Y_test = preprocess_and_split_data(df)
    print(X_train)
    rf_classifier = RandomForestClassifier()
    rf_accuracy_first_five, rf_recall_first_five, rf_first_five_classifier = train_and_evaluate_classifier(X_train, Y_train, X_test, Y_test, rf_classifier)
    new_data_rf_first_five = pd.DataFrame({'Classifier': 'Random Forest', 'Accuracy': [rf_accuracy_first_five], 'Recall': [rf_recall_first_five]})
    # dump(rf_first_five_classifier, 'models/' + event_name + '_Random Forest_first_five.joblib')
    results_df = pd.concat([results_df, new_data_rf_first_five.reset_index(drop=True)], ignore_index=True)

    # Train and evaluate XGBoost classifier for first_five
    xgb_classifier = XGBClassifier()
    xgb_accuracy_first_five, xgb_recall_first_five, xgb_first_five_classifier = train_and_evaluate_classifier(X_train, Y_train, X_test, Y_test, xgb_classifier)
    new_data_xgb_first_five = pd.DataFrame({'Classifier': 'XGBoost', 'Accuracy': [xgb_accuracy_first_five], 'Recall': [xgb_recall_first_five]})
    # dump(xgb_first_five_classifier, 'models/' + event_name + '_XGBoost_first_five.joblib')
    results_df = pd.concat([results_df, new_data_xgb_first_five.reset_index(drop=True)], ignore_index=True)
    
    # Train and evaluate LightGBM classifier for first_five
    lgb_classifier = lgb.LGBMClassifier(verbose=-1)
    lgb_accuracy_first_five, lgb_recall_first_five, lgb_first_five_classifier = train_and_evaluate_classifier(X_train, Y_train, X_test, Y_test, lgb_classifier)
    new_data_lgb_first_five = pd.DataFrame({'Classifier': 'LightGBM', 'Accuracy': [lgb_accuracy_first_five], 'Recall': [lgb_recall_first_five]})
    # dump(lgb_first_five_classifier, 'models/' + event_name + '_LightGBM_first_five.joblib')
    results_df = pd.concat([results_df, new_data_lgb_first_five.reset_index(drop=True)], ignore_index=True)

    #Confidence ensemble for first_five
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf_first_five_classifier), 
        ('xgb', xgb_first_five_classifier), 
        ('lgb', lgb_first_five_classifier)
    ], voting='soft')
    voting_clf.fit(X_train, Y_train)
    voting_accuracy = voting_clf.score(X_test, Y_test)
    y_pred_first_five = voting_clf.predict(X_test)
    voting_recall = recall_score(y_pred_first_five, Y_test)
    ensemble_first_five = pd.DataFrame({'Classifier': 'Ensemble', 'Accuracy': [voting_accuracy], 'Recall': [voting_recall]})
    # dump(voting_clf, 'models/' + event_name + '_Ensemble_first_five.joblib')
    results_df = pd.concat([results_df, ensemble_first_five.reset_index(drop=True)], ignore_index=True)

    highest_recall = max(rf_recall_first_five, xgb_recall_first_five, lgb_recall_first_five, voting_recall)

    if highest_recall == rf_recall_first_five:
        dump(rf_first_five_classifier, 'models/final.joblib')
    elif highest_recall == xgb_recall_first_five:
        dump(xgb_first_five_classifier, 'models/final.joblib')
    elif highest_recall == lgb_recall_first_five:
        dump(lgb_first_five_classifier, 'models/final.joblib')
    else: 
        dump(voting_clf, 'models/final.joblib')

    results_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].replace({1: 1, 2: 0, 0: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = label_encoder.fit_transform(df['gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['handedness'] = label_encoder.fit_transform(df['handedness'])
A value is trying to be set o

[[ 68  72 178 ...   1   1   1]
 [ 48  48 172 ...   0   1   0]
 [ 64  64 170 ...   1   1   0]
 ...
 [ 73  78 172 ...   1   1   0]
 [ 37  61 182 ...   1   1   1]
 [ 45  51 183 ...   1   1   0]]


Unnamed: 0,Classifier,Accuracy,Recall
0,Random Forest,0.744681,0.890909
1,XGBoost,0.755319,0.836364
2,LightGBM,0.744681,0.854545
3,Ensemble,0.723404,0.737705


In [6]:
X_train

array([[ 68,  72, 178, ...,   1,   1,   1],
       [ 48,  48, 172, ...,   0,   1,   0],
       [ 64,  64, 170, ...,   1,   1,   0],
       ...,
       [ 73,  78, 172, ...,   1,   1,   0],
       [ 37,  61, 182, ...,   1,   1,   1],
       [ 45,  51, 183, ...,   1,   1,   0]])

In [7]:
target = [56, 56, 173, 78]
found_subarray = None
for i in range(len(X_train)):
    subarr = X_train[i, :4]  # Considering only the first 4 columns
    if np.array_equal(subarr, target):
        found_subarray = X_train[i]
        print(found_subarray)
        break

[ 56  56 173  78   1   1   1]
