<a href="https://colab.research.google.com/github/l-isaro/Formative-2---Hidden-Markov-Models/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Human Activity Recognition with Hidden Markov Models (HMM)

This notebook loads accelerometer and gyroscope recordings collected with **Sensor Logger**, extracts features in windows, and trains **one HMM per activity**. It performs **leave-one-clip-out** evaluation and reports accuracy, sensitivity, specificity, and a confusion matrix.

In [3]:

# === Setup ===
import os, re, math, json
from pathlib import Path
import numpy as np
import pandas as pd

try:
    import hmmlearn
    from hmmlearn.hmm import GaussianHMM
except Exception:
    try:
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "hmmlearn", "--quiet"])
        from hmmlearn.hmm import GaussianHMM
    except Exception as e:
        raise ImportError("hmmlearn is required. Please install it in your environment (pip install hmmlearn).")


import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

# Seed NumPy's global random generator so repeated runs draw the same numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Directory searched for the *_acc_*.csv / *_gyro_*.csv recordings (relative to the working directory).
DATA_DIR = Path("data")

# Feature-window length in seconds; multiplied by the sampling rate to get samples per window.
WINDOW_SECONDS = 1.0
WINDOW_OVERLAP = 0.5  # fraction of a window shared with the next one (0.5 = 50% overlap)

# HMM config per-activity
# NOTE(review): presumably forwarded to GaussianHMM (n_components, covariance_type, n_iter);
# the model construction is not visible in this section -- confirm.
N_COMPONENTS = 3
COVARIANCE_TYPE = "diag"
N_ITER = 200



# Discover and pair files


In [4]:
def discover_pairs(data_dir: Path):
    """Find accelerometer/gyroscope CSV files that belong to the same clip.

    Files are matched on an ``(activity, clip)`` key derived from the file name:
    ``activity`` is the lower-cased text before the first underscore and ``clip``
    is the first run of digits that sits between an underscore and a dot
    (``"0"`` when the name contains no such run).

    Parameters
    ----------
    data_dir : Path or str
        Directory containing ``*_acc_*.csv`` and ``*_gyro_*.csv`` files.

    Returns
    -------
    list of tuple
        ``(activity, clip, acc_path, gyro_path)`` for every accelerometer file
        that has a gyroscope file with the same key. The order follows the
        lexicographically sorted accelerometer paths (so clip "10" precedes
        clip "2"). Accelerometer files without a partner are skipped.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` is not an existing directory, or if no pair is found.
    """
    data_dir = Path(data_dir)
    # Fail early with an accurate message: a missing directory would otherwise
    # surface as the "check file names" error below, which points the wrong way.
    if not data_dir.is_dir():
        raise FileNotFoundError(
            f"Data directory '{data_dir}' does not exist or is not a directory."
        )

    def clip_key(path):
        # Run the clip-number regex once and fall back to "0" when it finds nothing.
        clip_ids = re.findall(r"_(\d+)\.", path.name)
        return (path.name.split("_")[0].lower(), clip_ids[0] if clip_ids else "0")

    acc_files = sorted(data_dir.glob("*_acc_*.csv"))
    gyro_files = sorted(data_dir.glob("*_gyro_*.csv"))
    # Index gyro files by (activity, clip) so each accelerometer file is a dict lookup.
    gyro_index = {clip_key(p): p for p in gyro_files}
    pairs = []
    for a in acc_files:
        k = clip_key(a)
        g = gyro_index.get(k)
        if g is not None:
            pairs.append((k[0], k[1], a, g))
    if not pairs:
        raise FileNotFoundError("No matching *_acc_*.csv and *_gyro_*.csv pairs found. Check file names.")
    return pairs

# Run discovery once for the whole notebook; each entry of `pairs` is an
# (activity, clip, acc_path, gyro_path) tuple.
pairs = discover_pairs(DATA_DIR)
print(f"Discovered {len(pairs)} clip pairs:")
# List every matched pair so a missing or misnamed recording is easy to spot.
for act, clip, a, g in pairs:
    print(f" - {act} clip {clip}: {a.name} + {g.name}")


Discovered 40 clip pairs:
 - jumping clip 1: jumping_acc_1.csv + jumping_gyro_1.csv
 - jumping clip 10: jumping_acc_10.csv + jumping_gyro_10.csv
 - jumping clip 2: jumping_acc_2.csv + jumping_gyro_2.csv
 - jumping clip 3: jumping_acc_3.csv + jumping_gyro_3.csv
 - jumping clip 4: jumping_acc_4.csv + jumping_gyro_4.csv
 - jumping clip 5: jumping_acc_5.csv + jumping_gyro_5.csv
 - jumping clip 6: jumping_acc_6.csv + jumping_gyro_6.csv
 - jumping clip 7: jumping_acc_7.csv + jumping_gyro_7.csv
 - jumping clip 8: jumping_acc_8.csv + jumping_gyro_8.csv
 - jumping clip 9: jumping_acc_9.csv + jumping_gyro_9.csv
 - standing clip 1: standing_acc_1.csv + standing_gyro_1.csv
 - standing clip 10: standing_acc_10.csv + standing_gyro_10.csv
 - standing clip 2: standing_acc_2.csv + standing_gyro_2.csv
 - standing clip 3: standing_acc_3.csv + standing_gyro_3.csv
 - standing clip 4: standing_acc_4.csv + standing_gyro_4.csv
 - standing clip 5: standing_acc_5.csv + standing_gyro_5.csv
 - standing clip 6: st

# Load & merge a pair on `seconds_elapsed`

In [5]:

def load_sensor_csv(path: Path):
    """Read one sensor CSV and return it ordered by time.

    Column headers are stripped of surrounding whitespace and lower-cased
    before the file is checked for the ``seconds_elapsed``, ``x``, ``y`` and
    ``z`` columns.

    Parameters
    ----------
    path : Path
        Location of the CSV file; ``path.name`` is used in error messages.

    Returns
    -------
    pd.DataFrame
        All columns of the file, sorted by ``seconds_elapsed`` with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If any of the required columns is absent after normalisation.
    """
    frame = pd.read_csv(path).rename(columns=lambda header: header.strip().lower())

    # Collect every absent column so the error reports all of them at once.
    missing = []
    for wanted in ("seconds_elapsed", "x", "y", "z"):
        if wanted not in frame.columns:
            missing.append(wanted)
    if missing:
        raise ValueError(f"{path.name} is missing required columns: {missing}")

    # merge_asof needs its key sorted, so hand back time-ordered rows.
    ordered = frame.sort_values("seconds_elapsed")
    return ordered.reset_index(drop=True)

def merge_acc_gyro(acc_df: pd.DataFrame, gyro_df: pd.DataFrame):
    """Align gyroscope readings to accelerometer timestamps and estimate the rate.

    Every accelerometer row is kept and paired with the gyroscope row whose
    ``seconds_elapsed`` is closest to it. Matching on the nearest sample (rather
    than only on earlier ones) means accelerometer rows recorded before the
    first gyroscope sample still receive gyroscope values instead of NaN.

    Parameters
    ----------
    acc_df, gyro_df : pd.DataFrame
        Frames with a ``seconds_elapsed`` column. Columns present in both
        frames get the ``_acc`` / ``_gyro`` suffixes.

    Returns
    -------
    (pd.DataFrame, float)
        The merged frame (one row per accelerometer sample) and the sampling
        rate in Hz, estimated as the inverse of the median positive time step.

    Raises
    ------
    ValueError
        If no two consecutive timestamps differ, so no rate can be computed.
    """
    merged = pd.merge_asof(
        acc_df.sort_values("seconds_elapsed"),
        gyro_df.sort_values("seconds_elapsed"),
        on="seconds_elapsed",
        # The default ("backward") leaves NaN gyro values on leading accelerometer rows.
        direction="nearest",
        suffixes=("_acc", "_gyro"),
    )
    # The merged frame keeps the accelerometer rows, so these steps reflect
    # the accelerometer stream's timing.
    diffs = np.diff(merged["seconds_elapsed"].to_numpy())
    # Ignore repeated timestamps; they would drag the median step towards zero.
    diffs = diffs[diffs > 0]
    if len(diffs) == 0:
        raise ValueError("Cannot compute sampling rate; timestamps look constant.")
    sr = 1.0 / np.median(diffs)  # samples per second
    return merged, float(sr)

# Quick smoke-test on the first pair: load both files, merge them, and report
# the estimated sampling rate and merged shape before processing every clip.
act0, clip0, a0, g0 = pairs[0]
m0, sr0 = merge_acc_gyro(load_sensor_csv(a0), load_sensor_csv(g0))
print(act0, clip0, "sampling rate ~", round(sr0,2), "Hz; merged shape:", m0.shape)
# Last expression of the cell: displays the first merged rows for a visual check.
m0.head()


jumping 1 sampling rate ~ 99.64 Hz; merged shape: (1030, 9)


Unnamed: 0,time_acc,seconds_elapsed,z_acc,y_acc,x_acc,time_gyro,z_gyro,y_gyro,x_gyro
0,1761476138136281000,0.096281,-0.156107,-0.122559,-0.041694,1761476138136281000,-0.022027,-0.170773,0.013279
1,1761476138146317000,0.106317,-0.788565,-0.239894,0.000617,1761476138146317000,-0.034577,-0.284303,0.04861
2,1761476138156353000,0.116353,-0.648677,-0.224978,0.05382,1761476138156353000,-0.060462,-0.252275,0.124626
3,1761476138166389000,0.126389,-0.067947,-0.153062,0.106948,1761476138166389000,-0.07281,-0.171008,0.178924
4,1761476138176425000,0.136425,0.0949,-0.079979,0.038289,1761476138176425000,-0.053513,-0.166026,0.192915


# Harmonize sampling rates (resample)
We resample merged streams to a common target sampling rate using linear interpolation.
The target is the median sampling rate across all clips (computed after discovery).

In [6]:
def resample_to_target(merged_df: pd.DataFrame, orig_sr: float, target_sr: float):
    """Resample a merged clip onto a uniform time grid at ``target_sr`` Hz.

    The six sensor columns (``x/y/z_acc`` and ``x/y/z_gyro``) are linearly
    interpolated onto timestamps spaced ``1 / target_sr`` apart, starting at
    the clip's first timestamp and stopping before its last one.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Clip with ``seconds_elapsed`` and the six sensor columns.
    orig_sr : float
        Sampling rate currently associated with ``merged_df``, in Hz.
    target_sr : float
        Desired sampling rate in Hz; must be positive and finite.

    Returns
    -------
    (pd.DataFrame, float)
        When the two rates differ by less than 1e-6 the input frame is returned
        unchanged (all of its columns included) together with ``orig_sr``.
        Otherwise a new frame holding only ``seconds_elapsed`` and the six
        sensor columns is returned together with ``target_sr``.

    Raises
    ------
    ValueError
        If ``target_sr`` is not a positive finite number, or if the clip has
        no rows to interpolate from.
    """
    # A zero, negative or NaN rate would yield a division error or a silently
    # empty grid; reject it up front with a clear message.
    if not (np.isfinite(target_sr) and target_sr > 0):
        raise ValueError(f"target_sr must be a positive, finite rate in Hz; got {target_sr!r}")
    if abs(orig_sr - target_sr) < 1e-6:
        return merged_df, orig_sr
    t = merged_df['seconds_elapsed'].to_numpy()
    if t.size == 0:
        raise ValueError("Cannot resample a clip with no rows.")
    # Uniform grid over the clip's time span (end point excluded by np.arange).
    t_new = np.arange(t.min(), t.max(), 1.0/target_sr)
    out = pd.DataFrame({'seconds_elapsed': t_new})
    for col in ['x_acc','y_acc','z_acc','x_gyro','y_gyro','z_gyro']:
        out[col] = np.interp(t_new, t, merged_df[col].to_numpy())
    return out, target_sr

# Compute global target_sr using the first pass sampling-rate estimates:
# load and merge every discovered pair, keep only its estimated rate, and take
# the median so a few clips with unusual rates do not shift the target.
srs = []
for act, clip, a_path, g_path in pairs:
    acc_df = load_sensor_csv(a_path); gyro_df = load_sensor_csv(g_path)
    # The merged frame itself is not kept here; only the rate estimate is collected.
    m, sr = merge_acc_gyro(acc_df, gyro_df)
    srs.append(sr)
target_sr = float(np.median(srs))
print("Target sampling rate (median across clips):", round(target_sr,2), "Hz")

# Guidance for window size: aim for ~1–2 seconds to capture gait cycles; set via WINDOW_SECONDS above.
# int() truncates, so a rate just under a whole number (e.g. 99.95 Hz) is shown as 99 samples.
print("Windowing guidance: with", round(target_sr,2), "Hz, a", WINDOW_SECONDS, "s window uses ~", int(WINDOW_SECONDS*target_sr), "samples.")


Target sampling rate (median across clips): 99.95 Hz
Windowing guidance: with 99.95 Hz, a 1.0 s window uses ~ 99 samples.
