# Malaria Blood Cell Classification with ResNet-50 (TensorFlow/Keras)

This notebook trains and evaluates a ResNet-50 model to classify blood cell images as Parasitized vs Uninfected using a manifest-based `tf.data` pipeline (no image copying).

- Dataset path: `/Users/jitesh/Downloads/cell_images` (Parasitized/ and Uninfected/)
- Manifests: `data/manifests/train.csv`, `val.csv`, `test.csv`
- Image size: 224, Batch size: 32
- Two-phase training: first the head phase, then fine-tuning of the last ResNet block (`conv5*`)
- Metrics: Accuracy, Precision, Recall, F1, ROC-AUC; Confusion Matrix saved to `reports/figures/confusion_matrix.png`


In [None]:
# If running in a fresh environment, uncomment the next lines to install missing packages.
# %pip install --upgrade pip
# %pip install -r ../requirements.txt
import os
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from src.datasets_tf import get_datasets_from_manifests
from src.model_tf import build_resnet50

# Report the TensorFlow version in use, then the physical devices TensorFlow can see.
tf_version = tf.__version__
print(tf_version)
visible_devices = tf.config.list_physical_devices()
print(visible_devices)


## Paths and configuration

In [None]:
# --- Filesystem layout -------------------------------------------------------
# Raw image folder (holds the Parasitized/ and Uninfected/ sub-folders).
# Set the MALARIA_RAW_DIR environment variable to point at another copy of the
# dataset without editing this cell; the original location stays the default.
RAW_DIR = Path(os.environ.get('MALARIA_RAW_DIR', '/Users/jitesh/Downloads/cell_images'))
MANIFEST_DIR = Path('../data/manifests')
MODELS_DIR = Path('../models')
REPORTS_DIR = Path('../reports')
FIG_DIR = REPORTS_DIR / 'figures'

# --- Input pipeline ----------------------------------------------------------
IMG_SIZE = 224    # images are fed to the model as IMG_SIZE x IMG_SIZE x 3
BATCH_SIZE = 32
SEED = 42         # passed to manifest creation and to the dataset builder

# --- Training schedule -------------------------------------------------------
EPOCHS_HEAD = 10  # max epochs for the head phase
EPOCHS_FT = 10    # max epochs for the fine-tuning phase
LR_HEAD = 1e-4    # Adam learning rate for the head phase
LR_FT = 1e-5      # Adam learning rate for the fine-tuning phase
PATIENCE = 4      # EarlyStopping patience; ReduceLROnPlateau uses PATIENCE - 1

# Output folders must exist before checkpoints and figures are written.
# Creating FIG_DIR with parents=True also creates REPORTS_DIR.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

MODEL_OUT = MODELS_DIR / 'best_resnet50.h5'
METRICS_TXT = REPORTS_DIR / 'metrics.txt'


## Create manifests (if missing)

In [None]:
# Generate manifests unless all three split files are already present.
# Checking only train.csv would skip generation when val.csv or test.csv is
# missing, and the failure would then surface later when the datasets are built.
MANIFEST_FILES = ('train.csv', 'val.csv', 'test.csv')
missing_manifests = [name for name in MANIFEST_FILES if not (MANIFEST_DIR / name).exists()]

if missing_manifests:
    import subprocess
    import sys

    MANIFEST_DIR.mkdir(parents=True, exist_ok=True)
    print('Creating manifests...')
    # Run the generator with the same interpreter as this kernel.
    # NOTE(review): `-m src.create_manifests` needs `src` to be importable from the
    # kernel's working directory — confirm the notebook is launched from the right place.
    cmd = [
        sys.executable, '-m', 'src.create_manifests',
        '--raw_dir', str(RAW_DIR),
        '--out_dir', str(MANIFEST_DIR),
        '--val_size', '0.15',
        '--test_size', '0.15',
        '--seed', str(SEED),
    ]
    print(' '.join(cmd))
    res = subprocess.run(cmd, capture_output=True, text=True)
    print(res.stdout)
    if res.returncode != 0:
        # Show the generator's stderr before failing so the cause is visible in the cell output.
        print(res.stderr)
        raise RuntimeError('Failed to create manifests')
else:
    print('Manifests already exist at', MANIFEST_DIR)


## Build datasets from manifests

In [None]:
# Build the train/validation/test datasets (plus the class-name list) from the
# manifest folder, using the image size, batch size and seed configured above.
train_ds, val_ds, test_ds, class_names = get_datasets_from_manifests(
    str(MANIFEST_DIR),
    img_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    seed=SEED,
)
# Display the class names as the cell output.
class_names


## Build model and train (head phase)

In [None]:
# Build the network; `base` is kept so the fine-tuning cell can change which of
# its layers are trainable.
model, base = build_resnet50(input_shape=(IMG_SIZE, IMG_SIZE, 3))

# Head phase: Adam at LR_HEAD, binary cross-entropy, tracking accuracy and AUC.
head_optimizer = tf.keras.optimizers.Adam(LR_HEAD)
head_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
model.compile(optimizer=head_optimizer, loss='binary_crossentropy', metrics=head_metrics)

# All three callbacks watch validation AUC, where higher is better.
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=PATIENCE,
    restore_best_weights=True,
)
lr_schedule = ReduceLROnPlateau(
    monitor='val_auc',
    mode='max',
    factor=0.5,
    patience=max(1, PATIENCE-1),
    min_lr=1e-6,
)
checkpoint = ModelCheckpoint(
    filepath=str(MODEL_OUT),
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)
callbacks = [early_stopping, lr_schedule, checkpoint]

history_head = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS_HEAD,
    callbacks=callbacks,
)
# Cell output: whether a checkpoint file was written, and where.
MODEL_OUT.exists(), MODEL_OUT


## Fine-tune last ResNet block

In [None]:
# Unfreeze last conv block (conv5*): every layer before the first Conv2D whose
# name contains 'conv5' stays frozen; that layer and all layers after it train.
# NOTE(review): this also unfreezes the BatchNormalization layers that follow the
# first conv5 layer — confirm that updating their statistics is intended.
base.trainable = True
trainable = False
for layer in base.layers:
    if isinstance(layer, tf.keras.layers.Conv2D) and 'conv5' in layer.name:
        trainable = True
    layer.trainable = trainable

if not trainable:
    # No layer matched, so the whole backbone is still frozen and this phase
    # would only keep training the layers outside `base`.
    import warnings
    warnings.warn("No Conv2D layer with 'conv5' in its name was found in `base`; "
                  "the backbone remains fully frozen during fine-tuning.")

# Re-compile so the new trainable flags and the lower learning rate take effect.
model.compile(optimizer=tf.keras.optimizers.Adam(LR_FT),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

history_ft = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_FT, callbacks=callbacks)

# ModelCheckpoint(save_best_only=True) already maintains MODEL_OUT. Saving the
# in-memory model to the same path would replace that checkpoint with whatever
# weights the model holds when fit() returns, so store those separately.
FINAL_MODEL_OUT = MODEL_OUT.with_name('final_resnet50.h5')
model.save(str(FINAL_MODEL_OUT))
MODEL_OUT.exists(), MODEL_OUT


## Evaluate on validation and test sets

In [None]:
# Score the in-memory model on the validation split first, then on the test
# split; each result is a dict keyed by metric name.
val_metrics, test_metrics = [
    model.evaluate(split, return_dict=True) for split in (val_ds, test_ds)
]
# Cell output: both metric dicts side by side.
val_metrics, test_metrics


## Detailed metrics and confusion matrix (Test set)

In [None]:
# Collect ground truth and predicted probabilities over the test set, batch by batch,
# so labels and predictions stay aligned even if the dataset order changes between passes.
y_true = []
y_prob = []
for batch, labels in test_ds:
    y_true.extend(labels.numpy().reshape(-1).astype(int).tolist())
    # predict_on_batch handles exactly one batch; calling predict() inside a
    # Python loop sets up a full prediction pass for every batch.
    y_prob.extend(np.asarray(model.predict_on_batch(batch)).reshape(-1).tolist())

y_true = np.array(y_true)
y_prob = np.array(y_prob)
# Threshold the predicted probability at 0.5 to obtain hard labels.
y_pred = (y_prob >= 0.5).astype(int)

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
try:
    auc = roc_auc_score(y_true, y_prob)
except ValueError:
    # Raised for invalid input, e.g. when y_true contains a single class;
    # report NaN instead of aborting the rest of the report.
    auc = float('nan')

print({'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'auc': auc})

# Save metrics, one "Name: value" line each.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
metric_lines = [
    f'Accuracy: {acc}\n',
    f'Precision: {prec}\n',
    f'Recall: {rec}\n',
    f'F1: {f1}\n',
    f'ROC-AUC: {auc}\n',
]
with open(METRICS_TXT, 'w', encoding='utf-8') as f:
    f.writelines(metric_lines)
print('Saved metrics to', METRICS_TXT)

# Confusion matrix. labels=[0, 1] keeps it 2x2 even when one class never
# appears in y_true/y_pred, so it always matches the two tick labels below.
# NOTE(review): the tick labels assume label 0 is Parasitized and label 1 is
# Uninfected — confirm against `class_names` from the dataset builder.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
plt.figure(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Parasitized','Uninfected'],
            yticklabels=['Parasitized','Uninfected'])
plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
fig_path = FIG_DIR / 'confusion_matrix.png'
plt.savefig(fig_path, dpi=150)
plt.show()
print('Saved confusion matrix to', fig_path)
