# Training a Classification Model for Gait Movements

This section focuses on training a machine learning model to classify gait movements, specifically in the context of physiotherapy for amputees. The classification is based on features extracted from gait cycles, aiming to support rehabilitation by identifying different movement patterns accurately.






In [104]:
# Dependencies
import warnings
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

warnings.filterwarnings("ignore")

random_seed = 70

## Loading and Balancing Data

In [105]:
df = pd.read_csv("dataset2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   average_speed_m_s        165 non-null    float64
 1   cadence_steps_per_min    165 non-null    float64
 2   average_step_length_m    165 non-null    float64
 3   average_stride_length_m  165 non-null    float64
 4   average_step_width_m     165 non-null    float64
 5   step_symmetry            165 non-null    float64
 6   left_knee_flexion_deg    162 non-null    float64
 7   right_knee_flexion_deg   162 non-null    float64
 8   average_leg_opening_deg  165 non-null    float64
 9   label                    165 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 13.0 KB


In [106]:
# Drop missing values
df = df.dropna()
df = df[(df != 0).all(axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159 entries, 2 to 164
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   average_speed_m_s        159 non-null    float64
 1   cadence_steps_per_min    159 non-null    float64
 2   average_step_length_m    159 non-null    float64
 3   average_stride_length_m  159 non-null    float64
 4   average_step_width_m     159 non-null    float64
 5   step_symmetry            159 non-null    float64
 6   left_knee_flexion_deg    159 non-null    float64
 7   right_knee_flexion_deg   159 non-null    float64
 8   average_leg_opening_deg  159 non-null    float64
 9   label                    159 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 13.7 KB


In [107]:
df.groupby("label").size()

Unnamed: 0_level_0,0
label,Unnamed: 1_level_1
1,23
2,26
3,27
4,30
5,27
6,26


This dataset is unbalanced. For coherent training, it is important to perform balacing. However, I will randomly remove some samples from each labels.

In [123]:
def balacing_data(df_unbalanced, label, n):
  array = df_unbalanced.groupby("label").size().reset_index()[0].to_numpy()

  # Check if it is balanced
  if np.all(array == array[0]):
    return df_unbalanced

  df_label = df_unbalanced[df_unbalanced["label"] == label]
  idx_to_remove = df_label.sample(n=n, random_state=random_seed).index
  df_reduced = df_unbalanced.drop(idx_to_remove).reset_index(drop=True)
  return df_reduced

df = balacing_data(df, 2, 3)
df = balacing_data(df, 3, 4)
df = balacing_data(df, 4, 7)
df = balacing_data(df, 5, 4)
df = balacing_data(df, 6, 3)
df.groupby("label").size()

Unnamed: 0_level_0,0
label,Unnamed: 1_level_1
1,23
2,23
3,23
4,23
5,23
6,23


## Preparing Data

In [124]:
X = df.drop(columns=["label"]).to_numpy()
y = df["label"].to_numpy()

In [125]:
y

array([3, 3, 1, 1, 5, 5, 6, 6, 6, 5, 4, 6, 6, 4, 1, 1, 1, 1, 1, 6, 3, 5,
       5, 5, 5, 5, 4, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 5, 6, 6, 4, 4, 3, 3,
       4, 4, 6, 1, 3, 3, 2, 2, 2, 2, 6, 6, 1, 4, 2, 2, 1, 1, 1, 1, 6, 4,
       4, 5, 2, 1, 1, 2, 2, 2, 2, 6, 6, 4, 4, 6, 6, 6, 5, 5, 5, 5, 3, 3,
       3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 5, 5, 5, 3, 3, 2, 2, 4, 4, 4, 4, 4,
       4, 3, 3, 3, 3, 3, 3, 1, 1, 4, 4, 6, 2, 2, 1, 1, 5, 5, 6, 2, 2, 5,
       5, 6, 6, 6, 6, 4])

In [126]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,54.027675,64.615385,0.86103,0.365467,0.263095,0.850416,16.014168,17.735113,27.060599
1,54.027675,88.0,1.687438,0.404609,0.307584,0.968576,16.770643,17.650891,26.998184
2,2.800726,107.889908,1.047619,0.287635,0.222265,1.728383,16.870642,24.815438,21.284787
3,2.800726,65.172414,1.231962,0.292152,0.230377,1.57542,18.91162,25.431151,21.623228
4,2.707821,53.846154,0.94862,0.368988,0.303782,0.951562,14.639505,14.22203,22.807222


In [127]:
pd.DataFrame(X).describe().loc[["std", "min", "max"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8
std,6.505841,44.519534,1.174039,0.244792,0.246954,1.133547,15.000957,14.182072,6.057117
min,0.199445,10.632911,0.491996,0.011567,0.005524,0.133768,9.32478,4.596223,0.393617
max,54.027675,224.0,7.800124,1.804169,1.728698,7.563157,108.341858,114.945949,37.254284


Since the values vary significantly, a `StandardScaler` followed by a `Normalizer` will be used in the MLP pipeline before training.

In [128]:
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

# 20% of the data will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                random_state=random_seed)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((110, 9), (28, 9), (110,), (28,))

# Random Forest Classifier

### Hyperparameters
Defining hyperparameter options

In [114]:
rf_param_grid = {
    "max_depth": (None, 5, 10),
    "min_samples_split": (2, 5),
    "criterion": ("gini", "entropy")
}

Model construction

In [115]:
rf_model = RandomForestClassifier(random_state=random_seed)

grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=kf,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1 # it uses all availables cores in parallel
)

Finding the best *hyperparameters*

In [116]:
%%time
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 263 ms, sys: 11 ms, total: 274 ms
Wall time: 3.44 s


### Result

In [117]:
best_rf_model = grid_search_rf.best_estimator_
test_pred_rf = best_rf_model.predict(X_test)
test_acc_rf = accuracy_score(y_test, test_pred_rf)

print(f"Random Forest Best Params: {grid_search_rf.best_params_}")
print(f"Random Forest Test Accuracy: {test_acc_rf:.4}")

Random Forest Best Params: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Random Forest Test Accuracy: 0.4643


# Multiple Layer Perceptron Classifier


### Hyperparameters
Defining hyperparameter options

In [118]:
mlp_param_grid = {
    "mlp__hidden_layer_sizes": ((10,), (50,), (100,), (50, 50), (100, 100)),
    "mlp__activation": ("relu", "tanh", "sigmoid"),
    "mlp__learning_rate_init": (.01, .001),
    "mlp__solver": ("lbfgs", "adam")
}

In [119]:
def add_gaussian_noise(X, y, noise_level=.05, n_copies=1):
  noisy_copies = []
  for _ in range(n_copies):
      noise = np.random.normal(loc=0.0, scale=noise_level, size=X.shape)
      noisy_X = X + noise
      noisy_copies.append(noisy_X)

  X_combined = np.vstack([X] + noisy_copies)
  y_combined = np.hstack([y] * (n_copies + 1))
  return X_combined, y_combined

  # X_train, y_train = add_gaussian_noise(X_train, y_train, n_copies=6)

Model and pipeline construction

In [129]:
mlp_model = MLPClassifier(
    max_iter=250,
    early_stopping=True,
    random_state=random_seed
)

mlp_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("normalizer", Normalizer()),
    ("mlp", mlp_model)
])

grid_search_mlp = GridSearchCV(
    estimator=mlp_pipeline,
    param_grid=mlp_param_grid,
    cv=kf,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1
)

Finding the best *hyperparameters*

In [130]:
%%time
grid_search_mlp.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
CPU times: user 1.82 s, sys: 16 ms, total: 1.83 s
Wall time: 2.17 s


### Result

In [131]:
best_mlp_model = grid_search_mlp.best_estimator_
test_pred_mlp = best_mlp_model.predict(X_test)
test_acc_mlp = accuracy_score(y_test, test_pred_mlp)

print(f"MLP Best params: {grid_search_mlp.best_params_}")
print(f"MLP Test Accuracy: {test_acc_mlp:.4}")

MLP Best params: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate_init': 0.01, 'mlp__solver': 'lbfgs'}
MLP Test Accuracy: 0.2857
