# Exploratory Data Analysis

This notebook compares Euclidean, Cosine, and USE-based similarity for movement windows.

In [2]:
import sys
sys.path.append('/content/drive/MyDrive/msc_data_analytics_thesis_project_pose_estimation')

In [12]:
# EDA Notebook: Exploratory Analysis of Engineered Exercise Datasets

# ✅ Environment Setup
from utils.setup import setup_environment
base_path = setup_environment(mount_gdrive=True)

import os
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from utils.config import ENGINEERED_DIR, EDA_DIR, EXERCISES, VISUALIZATIONS_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Environment set. Project base path: /content/drive/MyDrive/pose-estimation-research


In [4]:
def load_dataset(exercise, use_featured=False):
    path = os.path.join(ENGINEERED_DIR, exercise)

    fname_x = "X_featured.npy" if use_featured else "X.npy"
    fname_meta = "meta_featured.json" if use_featured else "meta.json"

    X = np.load(os.path.join(path, fname_x))
    y = np.load(os.path.join(path, "y.npy"))

    with open(os.path.join(path, fname_meta), "r") as f:
        meta = json.load(f)

    return X, y, meta

In [13]:
def save_plot(exercise_variant, filename):
    folder = os.path.join(VISUALIZATIONS_DIR, exercise_variant)
    os.makedirs(folder, exist_ok=True)
    plt.savefig(os.path.join(folder, filename))
    plt.close()

In [6]:
def plot_label_distribution(y, exercise):
    plt.figure(figsize=(6,4))
    sns.histplot(y, bins=20, kde=True, color='skyblue')
    plt.title(f"Degradation Score Distribution: {exercise}")
    plt.xlabel("Degradation Score")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.tight_layout()
    save_plot(exercise, "label_distribution.png")

In [7]:
def plot_sample_windows(X, y, feature_names, exercise, n=3):
    for i in range(min(n, len(X))):
        plt.figure(figsize=(10, 4))
        for f in range(X.shape[2]):
            plt.plot(X[i, :, f], label=feature_names[f])
        plt.title(f"Sample Time Series Window (y={y[i]:.2f})")
        plt.xlabel("Frame")
        plt.ylabel("Normalized Angle")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        save_plot(exercise, f"sample_window_{i+1}.png")

In [8]:
def plot_feature_correlation(X, feature_names, exercise):
    X_flat = X.reshape(-1, X.shape[2])
    df = StandardScaler().fit_transform(X_flat)
    corr = np.corrcoef(df.T)
    plt.figure(figsize=(6,5))
    sns.heatmap(corr, annot=True, xticklabels=feature_names, yticklabels=feature_names, cmap="coolwarm", fmt=".2f")
    plt.title(f"Feature Correlation Heatmap: {exercise}")
    plt.tight_layout()
    save_plot(exercise, "correlation_heatmap.png")

In [9]:
def plot_tsne_projection(X, y, exercise):
    X_avg = X.mean(axis=1)
    X_scaled = StandardScaler().fit_transform(X_avg)
    X_tsne = TSNE(n_components=2, perplexity=15, random_state=42).fit_transform(X_scaled)
    plt.figure(figsize=(6,5))
    scatter = plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap="viridis", alpha=0.8)
    plt.title(f"t-SNE Projection: {exercise}")
    plt.colorbar(scatter, label="Degradation Score")
    plt.tight_layout()
    save_plot(exercise, "tsne_projection.png")

In [14]:
for exercise in tqdm(EXERCISES, desc="📊 Running EDA", unit="exercise"):
    for use_featured in [False, True]:
        variant = "featured" if use_featured else "base"
        print(f"\n🔍 {exercise} | EDA Variant: {variant}")

        try:
            X, y, meta = load_dataset(exercise, use_featured=use_featured)
            feature_names = meta.get("feature_names", [f"f{i}" for i in range(X.shape[2])])

            plot_label_distribution(y, f"{exercise}_{variant}")
            plot_sample_windows(X, y, feature_names, f"{exercise}_{variant}")
            plot_feature_correlation(X, feature_names, f"{exercise}_{variant}")
            plot_tsne_projection(X, y, f"{exercise}_{variant}")
        except Exception as e:
            print(f"⚠️ Skipped {exercise} ({variant}) due to error: {e}")

📊 Running EDA:   0%|          | 0/4 [00:00<?, ?exercise/s]


🔍 squat | EDA Variant: base

🔍 squat | EDA Variant: featured


  c /= stddev[:, None]
  c /= stddev[None, :]
📊 Running EDA:  25%|██▌       | 1/4 [00:25<01:15, 25.09s/exercise]


🔍 bench_press | EDA Variant: base

🔍 bench_press | EDA Variant: featured


  c /= stddev[:, None]
  c /= stddev[None, :]
📊 Running EDA:  50%|█████     | 2/4 [00:53<00:53, 26.96s/exercise]


🔍 pull_ups | EDA Variant: base

🔍 pull_ups | EDA Variant: featured


  c /= stddev[:, None]
  c /= stddev[None, :]
📊 Running EDA:  75%|███████▌  | 3/4 [01:17<00:25, 25.80s/exercise]


🔍 lunges | EDA Variant: base

🔍 lunges | EDA Variant: featured


  c /= stddev[:, None]
  c /= stddev[None, :]
📊 Running EDA: 100%|██████████| 4/4 [02:09<00:00, 32.32s/exercise]
