In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A Comprehensive Analysis of MNIST Classification\n",
    "\n",
    "## Project Overview\n",
    "This project is a multi-stage investigation into classifying the MNIST handwritten digit dataset. It progresses from foundational classification techniques to advanced dimensionality reduction, comparing multiple models and data representation strategies to find the most effective and efficient approach.\n",
    "\n",
    "The analysis is presented in three parts:\n",
    "1. **Part 1: Foundational KNN Classification:** An initial exploration using the K-Nearest Neighbors (KNN) algorithm on a six-digit subset of MNIST (digits 2, 3, 4, 7, 8, and 9).\n",
    "2. **Part 2: Improving Performance with PCA:** Applying Principal Component Analysis (PCA) to the same six-digit subset and comparing KNN and Support Vector Machine (SVM) classifiers on the reduced data.\n",
    "3. **Part 3: Advanced Dimensionality Reduction:** A deep dive comparing four data representations (raw pixels, PCA, Kernel PCA, and UMAP) on the full ten-digit dataset, each evaluated with an SVM classifier."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Initial Setup: Import Libraries and Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Import our custom helper functions from the 'src' folder\n",
    "from src.data_loader import load_full_mnist_dataset\n",
    "from src.models import train_and_evaluate_svm, train_and_evaluate_knn\n",
    "from src.reduction import apply_pca, apply_kpca, apply_umap\n",
    "\n",
    "# Split configuration, kept in one place so the hold-out fraction and the seed\n",
    "# are easy to audit and tune.\n",
    "TEST_SIZE = 0.2      # fraction of samples held out for evaluation\n",
    "RANDOM_STATE = 42    # fixed seed so the split is reproducible across runs\n",
    "\n",
    "# Load the dataset through the project helper; y is used as the stratification target below.\n",
    "X, y = load_full_mnist_dataset()\n",
    "\n",
    "# A single stratified split is shared by every part of the project, so all\n",
    "# models are scored on the same held-out samples.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parts 1 & 2: KNN and SVM on a 6-Digit Subset with PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Configuration for Parts 1 & 2 ---\n",
    "selected_digits_part1 = [2, 3, 4, 7, 8, 9]  # digits kept for this part of the analysis\n",
    "KNN_NEIGHBORS = 5         # k passed to the KNN helper\n",
    "PCA_COMPONENTS_P1 = 50    # n_components passed to apply_pca for the subset\n",
    "\n",
    "# Boolean masks marking the samples whose label is one of the selected digits.\n",
    "mask_train = np.isin(y_train, selected_digits_part1)\n",
    "mask_test = np.isin(y_test, selected_digits_part1)\n",
    "\n",
    "# Fail fast when the filter matches nothing. np.isin compares against integer\n",
    "# digits, so labels stored in another dtype would yield all-False masks and the\n",
    "# models below would be handed empty arrays.\n",
    "# NOTE(review): confirm the label dtype returned by load_full_mnist_dataset.\n",
    "if not mask_train.any() or not mask_test.any():\n",
    "    raise ValueError(\n",
    "        'No samples matched selected_digits_part1; check that the label dtype '\n",
    "        'matches the integer digits in the filter (labels may be stored as strings).'\n",
    "    )\n",
    "\n",
    "X_train_p1 = X_train[mask_train]\n",
    "y_train_p1 = y_train[mask_train]\n",
    "X_test_p1 = X_test[mask_test]\n",
    "y_test_p1 = y_test[mask_test]\n",
    "\n",
    "# --- Part 1: KNN on the raw features of the six-digit subset ---\n",
    "knn_raw_results = train_and_evaluate_knn(\n",
    "    X_train_p1, y_train_p1, X_test_p1, y_test_p1, k=KNN_NEIGHBORS\n",
    ")\n",
    "print(f\"KNN Accuracy on Raw 6-Digit Subset: {knn_raw_results['accuracy']:.4f}\\n\")\n",
    "\n",
    "# --- Part 2: reduce the subset with PCA, then evaluate KNN and SVM on it ---\n",
    "X_train_p1_pca, X_test_p1_pca = apply_pca(\n",
    "    X_train_p1, X_test_p1, n_components=PCA_COMPONENTS_P1\n",
    ")\n",
    "\n",
    "knn_pca_results = train_and_evaluate_knn(\n",
    "    X_train_p1_pca, y_train_p1, X_test_p1_pca, y_test_p1, k=KNN_NEIGHBORS\n",
    ")\n",
    "print(f\"KNN Accuracy on PCA 6-Digit Subset: {knn_pca_results['accuracy']:.4f}\\n\")\n",
    "\n",
    "svm_pca_results = train_and_evaluate_svm(X_train_p1_pca, y_train_p1, X_test_p1_pca, y_test_p1)\n",
    "print(f\"SVM Accuracy on PCA 6-Digit Subset: {svm_pca_results['accuracy']:.4f}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 3: Advanced Dimensionality Reduction Comparison (Full 10 Digits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each entry pairs a display name with the reducer applied before the SVM.\n",
    "# None means the classifier is trained directly on the unreduced features.\n",
    "# The order of this list is the order of the rows in results_df.\n",
    "representations = [\n",
    "    ('Raw Pixels', None),\n",
    "    ('PCA', apply_pca),\n",
    "    ('Kernel PCA', apply_kpca),\n",
    "    ('UMAP', apply_umap),\n",
    "]\n",
    "\n",
    "final_results = []\n",
    "for method_name, reducer in representations:\n",
    "    if reducer is None:\n",
    "        # Baseline: no dimensionality reduction.\n",
    "        X_tr, X_te = X_train, X_test\n",
    "    else:\n",
    "        # Reducers are called with only the train/test features, i.e. with\n",
    "        # whatever default settings src.reduction defines.\n",
    "        X_tr, X_te = reducer(X_train, X_test)\n",
    "\n",
    "    # The same SVM helper and the same labels are used for every representation,\n",
    "    # so only the features differ between rows.\n",
    "    metrics = train_and_evaluate_svm(X_tr, y_train, X_te, y_test)\n",
    "    final_results.append({'Method': method_name, **metrics})\n",
    "\n",
    "# One row per representation, built once from the collected records.\n",
    "results_df = pd.DataFrame(final_results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Final Results Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Final Performance Comparison:\")\n",
    "display(results_df[['Method', 'accuracy', 'f1_score', 'training_time']])\n",
    "\n",
    "# The chart zooms in on the 0.9-1.0 band to make small differences visible.\n",
    "# A fixed floor would hide any bar at or below 0.9, so the floor is lowered\n",
    "# (never below 0) whenever a method scores that low.\n",
    "lowest_accuracy = results_df['accuracy'].min()\n",
    "y_floor = 0.9 if lowest_accuracy > 0.9 else max(0.0, lowest_accuracy - 0.05)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "# NOTE(review): seaborn >= 0.13 warns when `palette` is passed without `hue`;\n",
    "# confirm the installed version before switching to hue='Method', legend=False.\n",
    "sns.barplot(x='Method', y='accuracy', data=results_df, palette='viridis', ax=ax)\n",
    "ax.set_title('Final Classification Accuracy Comparison')\n",
    "ax.set_ylabel('Accuracy')\n",
    "ax.set_xlabel('Data Representation Method')\n",
    "ax.set_ylim(y_floor, 1.0)\n",
    "ax.grid(axis='y', linestyle='--')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}



NameError: name 'null' is not defined