# MNIST OCR Neural Network with Varying Training Set Sizes

This notebook:
1. Loads the provided **JCB704_MNIST_resized.csv** dataset.
2. For each selected fraction of the original data (5%, 10%, 20%, 40%, 80%):
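   - Takes a stratified subsample of that fraction (every label is sampled at the same rate, so the original class proportions are preserved).
   - Performs a stratified 80/20 train/test split on the subsample.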
   - Trains a neural network (MLP) with:
     - Two hidden layers of **128** and **64** nodes
     - **ReLU** activation in the hidden layers
     - **Sigmoid (logistic)** output layer (via `predict_proba` for the positive class)
     - **Adam** optimizer
     - **max_iter = 500**
   - Computes **precision**, **recall**, and **ROC AUC**.
3. Compares performance across fractions.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix


In [None]:
# Path to the dataset CSV; change this if the file lives somewhere else.
data_path = "JCB704_MNIST_resized.csv"
df = pd.read_csv(data_path)

# Sanity check: report the table dimensions and how many rows each label has.
print("Full dataset shape:", df.shape)
print("Label distribution:", df["labels"].value_counts(), sep="\n")


In [None]:
def _stratified_subsample(df, fraction, random_state):
    """Return ``fraction`` of the rows of every 'labels' group of ``df``.

    Each label group is sampled independently (with the same seed) and the
    pieces are concatenated in sorted label order, so the class proportions
    of ``df`` are preserved up to per-group rounding.
    """
    # Iterating over the groupby always yields sub-frames that still contain
    # the 'labels' column. ``groupby(...).apply(...)`` is avoided on purpose:
    # newer pandas releases deprecate (and later stop) passing the grouping
    # column to the applied function, which would drop 'labels' from the result.
    pieces = [
        group.sample(frac=fraction, random_state=random_state)
        for _, group in df.groupby('labels')
    ]
    if not pieces:
        raise ValueError("df contains no labelled rows to sample from.")
    return pd.concat(pieces)


def evaluate_fraction(df, fraction, test_size=0.2, random_state=42):
    """Train and score an MLP on a stratified subsample of ``df``.

    The function draws ``fraction`` of the rows of every label, splits the
    subsample into stratified train/test parts, fits an ``MLPClassifier``
    (hidden layers 128 and 64, ReLU, Adam, ``max_iter=500``) and prints a
    confusion matrix and classification report for the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a 'labels' column; every other column is used as a feature.
    fraction : float
        Share of each label group to keep, in the interval (0, 1].
    test_size : float, default 0.2
        Share of the subsample held out for testing.
    random_state : int, default 42
        Seed used for the subsampling, the split and the network.

    Returns
    -------
    dict
        Keys ``fraction``, ``subset_size``, ``precision``, ``recall`` and
        ``roc_auc``. With two labels the scores refer to the larger label
        (label 1 for 0/1 data); with more labels they are macro averages
        (one-vs-rest for ROC AUC).

    Raises
    ------
    ValueError
        If ``fraction`` is outside (0, 1] or the subsample contains fewer
        than two distinct labels.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not 0 < fraction <= 1.0:
        raise ValueError(f"fraction must be in the interval (0, 1], got {fraction!r}.")

    # Stratified subsample: same sampling rate for every label.
    df_subset = _stratified_subsample(df, fraction, random_state)

    X = df_subset.drop(columns=['labels']).values
    y = df_subset['labels'].values
    if np.unique(y).size < 2:
        raise ValueError("The subsample must contain at least two distinct labels.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # NOTE(review): features are passed to the network unscaled; scikit-learn
    # recommends scaling inputs for MLPs - consider adding it if scores look poor.
    clf = MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation='relu',      # ReLU in hidden layers
        solver='adam',          # Adam optimizer
        max_iter=500,
        random_state=random_state
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)  # columns follow the order of clf.classes_

    if len(clf.classes_) == 2:
        # The second column of predict_proba belongs to clf.classes_[1]; use the
        # same label as the positive class for precision and recall.
        positive_label = clf.classes_[1]
        precision = precision_score(y_test, y_pred, pos_label=positive_label)
        recall = recall_score(y_test, y_pred, pos_label=positive_label)
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        # More than two labels: there is no single positive class, so report
        # unweighted (macro) averages over all labels.
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        auc = roc_auc_score(
            y_test, y_proba, multi_class='ovr', average='macro', labels=clf.classes_
        )

    print(f"=== Fraction: {fraction*100:.0f}% of original data ===")
    print(f"Subset size: {len(df_subset)} (train: {len(X_train)}, test: {len(X_test)})")
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4))

    return {
        "fraction": fraction,
        "subset_size": len(df_subset),
        "precision": precision,
        "recall": recall,
        "roc_auc": auc
    }


In [None]:
# Fractions of the full dataset to evaluate (5%, 10%, 20%, 40%, 80%).
fractions = [0.05, 0.10, 0.20, 0.40, 0.80]

# Train and score one model per fraction, collecting the metric dicts.
results = []
for frac in fractions:
    results.append(evaluate_fraction(df, frac))

# Tabulate the metrics with the fraction expressed as a percentage and the
# columns in presentation order.
results_df = pd.DataFrame(results).assign(
    fraction_pct=lambda frame: frame['fraction'] * 100
)[['fraction_pct', 'subset_size', 'precision', 'recall', 'roc_auc']]
results_df


In [None]:
# Plot each metric against the subsample size so the effect of adding data is
# visible at a glance. `plt` comes from the import cell at the top of the notebook.
fig, ax = plt.subplots()
ax.plot(results_df['fraction_pct'], results_df['precision'], marker='o', label='Precision')
ax.plot(results_df['fraction_pct'], results_df['recall'], marker='o', label='Recall')
ax.plot(results_df['fraction_pct'], results_df['roc_auc'], marker='o', label='ROC AUC')
ax.set_title('MLP performance vs. subsample size')
# The x value is the share of the full dataset that was subsampled (train and
# test together), not the share used for training alone.
ax.set_xlabel('Fraction of original dataset sampled (%)')
ax.set_ylabel('Score')
ax.set_ylim(0, 1.05)  # small headroom so points at 1.0 are not clipped
ax.legend()
ax.grid(True)
plt.show()
