In [3]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
train_data = pd.read_csv("train_data.csv")
X_train = train_data.drop(columns=["Activity"])
y_train = train_data["Activity"]

  train_data = pd.read_csv("train_data.csv")


In [7]:
test_data = pd.read_csv("test_data.csv")
X_test = test_data.drop(columns=["Activity"])
y_test = test_data["Activity"]

<h4>Evaluation Metrics</h4>
We compute the following evaluation metrics:

* Accuracy: Measures the overall correctness of predictions.
* Precision: Evaluates how precise the model's predictions are across all classes.
* Recall: Measures the model's ability to identify all relevant instances across all classes.
* F1 Score: A balanced measure that combines precision and recall, making it useful for imbalanced datasets.

<h4>Decision Tree Classifier</h4>
Here we evaluate the performance of the Decision Tree Classifier using 5-Fold Cross Validation. Cross-validation is a robust technique for assessing model performance as it splits the dataset into multiple folds, trains on different subsets, and evaluates on the remaining data. This process helps ensure the model's performance generalizes well across unseen data.

In [8]:
dt_model = DecisionTreeClassifier()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy = cross_val_score(dt_model, X_test, y_test, cv=kf, scoring='accuracy').mean()
precision = cross_val_score(dt_model, X_test, y_test, cv=kf, scoring='precision_macro').mean()
recall = cross_val_score(dt_model, X_test, y_test, cv=kf, scoring='recall_macro').mean()
f1 = cross_val_score(dt_model, X_test, y_test, cv=kf, scoring='f1_macro').mean()

print("Decision Tree Classifier:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")

Decision Tree Classifier:
  Accuracy: 0.5399
  Precision: 0.8101
  Recall: 0.5336
  F1 Score: 0.5740


<h4>Random Forest Classifier</h4>
Here we evaluate the performance of the Random Forest Classifier using 5-Fold Cross Validation. Random Forest is a model that builds multiple decision trees and aggregates their predictions to improve accuracy and reduce overfitting.

In [9]:
rf_model = RandomForestClassifier()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='accuracy').mean()
precision = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='precision_macro').mean()
recall = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='recall_macro').mean()
f1 = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='f1_macro').mean()

print("Random Forest Classifier:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")


Random Forest Classifier:
  Accuracy: 0.5680
  Precision: 0.8677
  Recall: 0.5589
  F1 Score: 0.6081


 <h4>Logistic Regression</h4>

 Here we evaluate the performance of the Random Forest Classifier using 5-Fold Cross Validation. Random Forest is an ensemble model that builds multiple decision trees and aggregates their predictions to improve accuracy and reduce overfitting.

In [12]:
lr_model = LogisticRegression(max_iter=1000)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy = cross_val_score(lr_model, X_test, y_test, cv=kf, scoring='accuracy').mean()
precision = cross_val_score(lr_model, X_test, y_test, cv=kf, scoring='precision_macro').mean()
recall = cross_val_score(lr_model, X_test, y_test, cv=kf, scoring='recall_macro').mean()
f1 = cross_val_score(lr_model, X_test, y_test, cv=kf, scoring='f1_macro').mean()

print("Logistic Regression:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1223, in fit
    X, y = self._validate_data(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


<h3>Leave-One-Subject-Out Cross-Validation (LOSO-CV)</h3>


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

loo = LeaveOneOut()

lr_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

lr_accuracy = cross_val_score(lr_model, X_test, y_test, cv=loo, scoring='accuracy').mean()
lr_precision = cross_val_score(lr_model, X_test, y_test, cv=loo, scoring='precision_macro').mean()
lr_recall = cross_val_score(lr_model, X_test, y_test, cv=loo, scoring='recall_macro').mean()
lr_f1 = cross_val_score(lr_model, X_test, y_test, cv=loo, scoring='f1_macro').mean()

rf_accuracy = cross_val_score(rf_model, X_test, y_test, cv=loo, scoring='accuracy').mean()
rf_precision = cross_val_score(rf_model, X_test, y_test, cv=loo, scoring='precision_macro').mean()
rf_recall = cross_val_score(rf_model, X_test, y_test, cv=loo, scoring='recall_macro').mean()
rf_f1 = cross_val_score(rf_model, X_test, y_test, cv=loo, scoring='f1_macro').mean()

dt_accuracy = cross_val_score(dt_model, X_test, y_test, cv=loo, scoring='accuracy').mean()
dt_precision = cross_val_score(dt_model, X_test, y_test, cv=loo, scoring='precision_macro').mean()
dt_recall = cross_val_score(dt_model, X_test, y_test, cv=loo, scoring='recall_macro').mean()
dt_f1 = cross_val_score(dt_model, X_test, y_test, cv=loo, scoring='f1_macro').mean()

print("Logistic Regression (Leave-One-Out CV):")
print(f"  Accuracy: {lr_accuracy:.4f}")
print(f"  Precision: {lr_precision:.4f}")
print(f"  Recall: {lr_recall:.4f}")
print(f"  F1 Score: {lr_f1:.4f}")

print("Random Forest Classifier (Leave-One-Out CV):")
print(f"  Accuracy: {rf_accuracy:.4f}")
print(f"  Precision: {rf_precision:.4f}")
print(f"  Recall: {rf_recall:.4f}")
print(f"  F1 Score: {rf_f1:.4f}")

print("Decision Tree Classifier (Leave-One-Out CV):")
print(f"  Accuracy: {dt_accuracy:.4f}")
print(f"  Precision: {dt_precision:.4f}")
print(f"  Recall: {dt_recall:.4f}")
print(f"  F1 Score: {dt_f1:.4f}")
