# Imports

In [1]:
## General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

## Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsOneClassifier

## Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

## Metrics
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (roc_auc_score, ConfusionMatrixDisplay, 
                             precision_score, PrecisionRecallDisplay,
                             recall_score, RocCurveDisplay, f1_score,
                             accuracy_score, classification_report)

# Functions

In [2]:
def classification_metrics(y_true, y_pred, label="",
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
  """Print a classification report and draw two confusion matrices.

  Parameters
  ----------
  y_true, y_pred : array-like
      True labels and predicted labels for the same samples.
  label : str
      Text shown in the printed header (e.g. which data split is evaluated).
  output_dict : bool
      When ``True``, the report is also returned as a dictionary.
  figsize : tuple
      Size of the two-panel confusion-matrix figure.
  normalize : str or None
      Normalization mode of the right-hand confusion matrix.
  cmap : str
      Colormap of the right-hand confusion matrix.
  colorbar : bool
      Whether each confusion matrix gets a colorbar.

  Returns
  -------
  dict or None
      ``classification_report(..., output_dict=True)`` when ``output_dict``
      equals ``True``; otherwise ``None``.
  """
  ## Build the text report first, then print it under a ruled header
  text_report = classification_report(y_true, y_pred)
  rule = "-"*70
  print(rule, f" Classification Metrics: {label}", rule, sep='\n')
  print(text_report)

  ## One figure, two panels: raw counts on the left, normalized on the right
  fig, (ax_counts, ax_normalized) = plt.subplots(ncols=2, figsize=figsize)
  panels = (
      (ax_counts, None, 'gist_gray', "Raw Counts"),
      (ax_normalized, normalize, cmap, "Normalized Confusion Matrix"),
  )
  for panel_ax, panel_normalize, panel_cmap, panel_title in panels:
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                                            normalize=panel_normalize,
                                            cmap=panel_cmap,
                                            colorbar=colorbar,
                                            ax=panel_ax)
    panel_ax.set_title(panel_title)

  ## Tidy the layout and render the figure
  fig.tight_layout()
  plt.show()

  ## Hand back the dictionary form of the report only when explicitly requested
  if output_dict==True:
    return classification_report(y_true, y_pred, output_dict=True)
  return None

In [16]:
def _class_scores(model, X):
    """Return ``(scores, is_proba)`` with continuous scores for ``X``.

    ``predict_proba`` is preferred. ``decision_function`` is the fallback for
    estimators that expose no probabilities (``OneVsOneClassifier`` is one).
    ``(None, False)`` is returned when the model offers neither method.
    """
    if hasattr(model, "predict_proba"):
        return np.asarray(model.predict_proba(X)), True
    if hasattr(model, "decision_function"):
        return np.asarray(model.decision_function(X)), False
    return None, False


def _one_hot(y_true, classes):
    """Return a 0/1 indicator matrix with one column per entry of ``classes``."""
    return (np.asarray(y_true)[:, None] == np.asarray(classes)[None, :]).astype(int)


def _roc_auc(y_true, scores, classes, is_proba):
    """Compute ROC AUC from continuous scores for binary or multiclass targets."""
    if len(classes) == 2:
        ## Binary: a single score per sample for the positive (second) class
        positive = scores[:, 1] if scores.ndim == 2 else scores
        return roc_auc_score(y_true, positive)
    if is_proba:
        ## Multiclass with probabilities: one-vs-one AUC, as originally intended
        return roc_auc_score(y_true, scores, multi_class='ovo', labels=classes)
    ## Multiclass with raw decision values: rows do not sum to 1, so the
    ## multiclass code path is unavailable; use macro one-vs-rest on indicators
    return roc_auc_score(_one_hot(y_true, classes), scores, average='macro')


def _plot_training_roc(model, y_true, scores, classes):
    """Draw ROC curve(s) for the training data plus the chance baseline."""
    fig, ax = plt.subplots()
    if len(classes) == 2:
        positive = scores[:, 1] if scores.ndim == 2 else scores
        RocCurveDisplay.from_predictions(y_true, positive,
                                         pos_label=classes[1],
                                         name=model.__class__.__name__, ax=ax)
    else:
        ## One curve per class, each treated as "this class vs the rest"
        indicators = _one_hot(y_true, classes)
        for column, cls in enumerate(classes):
            RocCurveDisplay.from_predictions(indicators[:, column],
                                             scores[:, column],
                                             name=f"Class {cls} vs rest", ax=ax)
    ax.plot([0, 1], [0, 1], ls = '--', label = 'Baseline (AUC = 0.5)')
    ax.set_title("ROC Curve (Training Data)")
    ax.legend();


def evaluate_classification(model, X_train, y_train, X_test, y_test,
                         figsize=(6,4), normalize='true', output_dict = False,
                            cmap_train='Blues', cmap_test="Reds",colorbar=False):
    """Evaluate a fitted classifier on the training and test splits.

    For each split this prints a classification report and shows confusion
    matrices (via ``classification_metrics``), then prints the ROC AUC of both
    splits and plots the ROC curve(s) of the training data.

    AUC and ROC curves are computed from continuous scores (``predict_proba``
    or, failing that, ``decision_function``) rather than hard label
    predictions, so binary and multiclass targets are both supported. With
    probabilities the multiclass AUC is one-vs-one; with raw decision values
    it is the macro average of one-vs-rest AUCs.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``.
    X_train, y_train, X_test, y_test : array-like
        Training and test features / labels.
    figsize : tuple
        Figure size forwarded to ``classification_metrics``.
    normalize : str or None
        Confusion-matrix normalization forwarded to ``classification_metrics``.
    output_dict : bool
        When truthy, return the report dictionaries of both splits.
    cmap_train, cmap_test : str
        Colormaps for the training / test confusion matrices.
    colorbar : bool
        Whether the confusion matrices get a colorbar.

    Returns
    -------
    dict or None
        ``{'train': ..., 'test': ...}`` when ``output_dict`` is truthy.
    """
    ## Get predictions for training data
    y_train_pred = model.predict(X_train)

    ## Call the helper function to obtain classification metrics for training data
    results_train = classification_metrics(y_train, y_train_pred,
                                           output_dict=True, figsize=figsize,
                                           normalize=normalize,
                                           colorbar=colorbar, cmap=cmap_train,
                                           label='Training Data')
    print()
    ## Get predictions for test data
    y_test_pred = model.predict(X_test)

    ## Call the helper function to obtain classification metrics for test data
    results_test = classification_metrics(y_test, y_test_pred,
                                          output_dict=True, figsize=figsize,
                                          normalize=normalize,
                                          colorbar=colorbar, cmap=cmap_test,
                                          label='Test Data')

    ## Class labels in score-column order; fall back to the training labels
    classes = getattr(model, "classes_", None)
    if classes is None:
        classes = np.unique(y_train)

    ## Calculate AUC for both sets and visualize the ROC curve
    train_scores, is_proba = _class_scores(model, X_train)
    if train_scores is None:
        print("AUC and ROC curve skipped: model has neither predict_proba "
              "nor decision_function.")
    else:
        test_scores, _ = _class_scores(model, X_test)
        print(f"Training AUC: {_roc_auc(y_train, train_scores, classes, is_proba)}")
        print(f"Testing AUC: {_roc_auc(y_test, test_scores, classes, is_proba)}")
        _plot_training_roc(model, y_train, train_scores, classes)

    if output_dict:
        ## Bundle the per-split report dictionaries for the caller
        results_dict = {'train':results_train,
                    'test': results_test}
        return results_dict

# Load Data

In [4]:
## Read the raw, unprocessed CSV to avoid data leakage
df = pd.read_csv(filepath_or_buffer='Data/fall_data.csv')

## Summarize dtypes / non-null counts, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Distance       2039 non-null   float64
 1   Pressure       2039 non-null   float64
 2   HRV            2039 non-null   float64
 3   Sugar level    2039 non-null   float64
 4   SpO2           2039 non-null   float64
 5   Accelerometer  2039 non-null   float64
 6   Decision       2039 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 111.6 KB


Unnamed: 0,Distance,Pressure,HRV,Sugar level,SpO2,Accelerometer,Decision
0,25.54,1.0,101.396,61.08,87.77,1.0,1
1,2.595,2.0,110.19,20.207,65.19,1.0,2
2,68.067,0.0,87.412,79.345,99.345,0.0,0
3,13.09,1.0,92.266,36.18,81.545,1.0,1
4,69.43,0.0,89.48,80.0,99.99,0.0,0


In [5]:
## Lowercase every column header
df.columns = df.columns.map(str.lower)
df.head()

Unnamed: 0,distance,pressure,hrv,sugar level,spo2,accelerometer,decision
0,25.54,1.0,101.396,61.08,87.77,1.0,1
1,2.595,2.0,110.19,20.207,65.19,1.0,2
2,68.067,0.0,87.412,79.345,99.345,0.0,0
3,13.09,1.0,92.266,36.18,81.545,1.0,1
4,69.43,0.0,89.48,80.0,99.99,0.0,0


In [6]:
## Show the exact column labels; the repr makes stray whitespace visible
df.columns

Index(['distance', 'pressure', 'hrv', 'sugar level', 'spo2', 'accelerometer',
       'decision '],
      dtype='object')

In [7]:
## Add units to the measurement columns and drop the trailing space in the target name
column_renames = {
    'distance': 'distance (cm)',
    'hrv': 'hrv (bpm)',
    'sugar level': 'blood sugar level (mg/dL)',
    'decision ': 'decision',
}
df.rename(mapper=column_renames, axis='columns', inplace=True)
df.head()

Unnamed: 0,distance (cm),pressure,hrv (bpm),blood sugar level (mg/dL),spo2,accelerometer,decision
0,25.54,1.0,101.396,61.08,87.77,1.0,1
1,2.595,2.0,110.19,20.207,65.19,1.0,2
2,68.067,0.0,87.412,79.345,99.345,0.0,0
3,13.09,1.0,92.266,36.18,81.545,1.0,1
4,69.43,0.0,89.48,80.0,99.99,0.0,0


In [8]:
## Proportion of rows belonging to each target class
class_shares = df['decision'].value_counts(normalize=True)
class_shares

0    0.338401
1    0.334478
2    0.327121
Name: decision, dtype: float64

> The three classes are almost evenly split (roughly one third each), so we do not need to re-balance them.

# Model Validation & Preprocessing

## Train Test Split

In [9]:
## Separate the target from the features, then split with a fixed seed
target_col = 'decision'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
## Report how many rows ended up in each feature split
for split_name, split_frame in (('X_train', X_train), ('X_test', X_test)):
    print(f'Length of {split_name}: {len(split_frame)}')

Length of X_train: 1529
Length of X_test: 510


## Preprocessing Pipeline

In [11]:
## Select every numeric column ...
num_col = make_column_selector(dtype_include='number')

## ... and standardize those columns
scaler = StandardScaler()
num_tuple = (scaler, num_col)

## Assemble the column transformer
## (the spelling `prepocessor` is kept because the model cells reference it)
prepocessor = make_column_transformer(
    num_tuple,
    verbose_feature_names_out=False,
)
prepocessor

# Baseline Models

## Decision Tree

In [18]:
## Instantiate and fit decision tree model (wrapped one-vs-one)
## Seeded so that re-running the notebook reproduces the same fitted tree
tree = OneVsOneClassifier(DecisionTreeClassifier(random_state=42))
tree_pipe = make_pipeline(prepocessor, tree)

tree_pipe.fit(X_train, y_train)

## Random Forest

In [31]:
## Instantiate and fit random forest model (wrapped one-vs-one)
## Seeded so that re-running the notebook reproduces the same fitted forest
rf = OneVsOneClassifier(RandomForestClassifier(random_state=42))
rf_pipe = make_pipeline(prepocessor, rf)

rf_pipe.fit(X_train, y_train)

## LightGBM

In [40]:
## Instantiate and fit lightgbm model (wrapped one-vs-one)
## Seeded for consistency with the other seeded steps of the notebook
lgb = OneVsOneClassifier(LGBMClassifier(random_state=42))
lgb_pipe = make_pipeline(prepocessor, lgb)

lgb_pipe.fit(X_train, y_train)