# Chest X-Ray Images (Pneumonia) - Improved Model

[Dataset link](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia)

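This notebook trains a pneumonia classifier (NORMAL vs. PNEUMONIA) on chest X-ray images using an ImageNet-pretrained ResNet50 backbone with a small dense head. Training runs in two stages: the head is trained on a frozen backbone, then the last backbone layers are optionally unfrozen and fine-tuned at a lower learning rate. Data augmentation and class weights are used to counter overfitting and class imbalance.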

# 1️⃣ Setup & Import

In [None]:
!pip install kagglehub scikit-learn

import os
import random

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Set random seeds for reproducibility.
# set_random_seed seeds Python's `random`, NumPy, and TensorFlow in one call,
# so the random sample drawn in the inference cell is reproducible as well.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# 2️⃣ Download Dataset

In [None]:
# Download the dataset (kagglehub returns the local directory it was placed in)
path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
print("✅ Dataset downloaded to:", path)

# Expected layout under the download root: chest_xray/{train,val,test}
base_dir = path
train_dir = os.path.join(base_dir, 'chest_xray', 'train')
val_dir = os.path.join(base_dir, 'chest_xray', 'val')
test_dir = os.path.join(base_dir, 'chest_xray', 'test')

# Fail fast with a clear message if the layout is not what we expect,
# instead of surfacing the problem later inside flow_from_directory.
for split_dir in (train_dir, val_dir, test_dir):
    if not os.path.isdir(split_dir):
        raise FileNotFoundError(f"Expected dataset folder not found: {split_dir}")

# 3️⃣ Data Preprocessing & Augmentation

In [None]:
# Image resolution and batch size shared by all three generators
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# NOTE(review): Keras documents resnet50.preprocess_input (not 1/255 scaling)
# as the input convention for the ImageNet ResNet50 weights. The 1/255 rescale
# is kept because the inference helper at the end of this notebook uses the
# same convention; change both together if this is revisited.

# Random augmentations applied to training images only
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Validation and test images are only rescaled, never augmented
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Options common to every directory iterator
flow_options = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

# Training data is shuffled; validation/test keep file order so that
# predictions line up with `.classes` and `.filepaths`.
train_generator = train_datagen.flow_from_directory(
    train_dir, shuffle=True, **flow_options
)
validation_generator = val_datagen.flow_from_directory(
    val_dir, shuffle=False, **flow_options
)
test_generator = test_datagen.flow_from_directory(
    test_dir, shuffle=False, **flow_options
)

# 4️⃣ Build Improved Model with ResNet50

In [None]:
# Load the ImageNet-pretrained ResNet50 backbone without its classifier head.
# The input shape is derived from IMG_SIZE so it cannot drift from the
# target_size used by the data generators.
base_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=IMG_SIZE + (3,)
)

# Freeze the backbone so only the new head is trained in the first stage
base_model.trainable = False

# Classification head: pooled features -> two dense layers -> sigmoid output
model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.BatchNormalization(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

# Compile model.
# Metrics are named explicitly: unnamed metric objects get auto-numbered
# names (auc_1, precision_1, ...) when the cell is re-run or the model is
# recompiled, which silently changes the keys in the training history.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
)

model.summary()

# 5️⃣ Handle Class Imbalance

In [None]:
# Weight each class inversely to its frequency in the training set
train_labels = train_generator.classes
balanced_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels
)

# Keras expects a {class_index: weight} mapping
class_weights = {index: weight for index, weight in enumerate(balanced_weights)}
print("Class weights:", class_weights)

# 6️⃣ Training with Callbacks

In [None]:
# Number of epochs for the first (frozen-backbone) training stage
EPOCHS = 20

# Stop when validation loss stalls and roll back to the best weights seen
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 3 epochs without validation-loss improvement
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

# Keep the checkpoint with the highest validation accuracy on disk
checkpoint = callbacks.ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

callbacks_list = [early_stopping, lr_scheduler, checkpoint]

# Train the classification head with class weighting
fit_options = dict(
    epochs=EPOCHS,
    validation_data=validation_generator,
    class_weight=class_weights,
    callbacks=callbacks_list,
    verbose=1,
)
history = model.fit(train_generator, **fit_options)

# 7️⃣ Fine-tuning (Optional)

In [None]:
# Unfreeze only the top of the backbone for fine-tuning
FINE_TUNE_LAYERS = 10
base_model.trainable = True

# Freeze all layers except the last FINE_TUNE_LAYERS
for layer in base_model.layers[:-FINE_TUNE_LAYERS]:
    layer.trainable = False

# Keep BatchNormalization layers frozen even inside the unfrozen tail.
# A trainable BatchNormalization layer runs in training mode and updates its
# moving statistics, which the Keras fine-tuning guide warns can destroy what
# the pretrained backbone has learned.
for layer in base_model.layers[-FINE_TUNE_LAYERS:]:
    if isinstance(layer, layers.BatchNormalization):
        layer.trainable = False

# Recompile with a lower learning rate so the trainable flags take effect.
# Metrics are named explicitly so the history keys stay 'auc', 'precision',
# 'recall' instead of auto-numbered variants after recompiling.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
)

# Fine-tune for a few more epochs
fine_tune_history = model.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weights,
    callbacks=callbacks_list,
    verbose=1
)

# 8️⃣ Evaluation

In [None]:
# Evaluate on test set (values come back in compile order: loss, then metrics)
test_loss, test_acc, test_auc, test_precision, test_recall = model.evaluate(test_generator, verbose=1)

# Guard against 0/0 when the model predicts no positives at all
precision_recall_sum = test_precision + test_recall
if precision_recall_sum > 0:
    test_f1 = 2 * test_precision * test_recall / precision_recall_sum
else:
    test_f1 = 0.0

print(f"\n✅ Test Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"AUC: {test_auc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")

# Classification report.
# The test generator was created with shuffle=False, so predictions are in
# the same order as test_generator.classes.
y_true = test_generator.classes
y_prob = model.predict(test_generator).ravel()  # (N, 1) -> (N,)
y_pred = (y_prob > 0.5).astype(int)

# Take class names from the generator (ordered by class index) rather than
# hard-coding them, so labels cannot be swapped by mistake.
class_names = [
    name
    for name, _ in sorted(test_generator.class_indices.items(), key=lambda item: item[1])
]

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, target_names=class_names))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
print("\n📊 Confusion Matrix:")
print(cm)

# 9️⃣ Plot Training History

In [None]:
# Plot first-stage training curves: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# (axes, train key, val key, train label, val label, title, y-axis label)
panels = [
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Train Accuracy', 'Val Accuracy', 'Model Accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss',
     'Train Loss', 'Val Loss', 'Model Loss', 'Loss'),
]

for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()

# 🔟 Save Model

In [None]:
# Output locations and the size threshold used for the storage hint below
H5_MODEL_PATH = 'pneumonia_resnet50_improved.h5'
KERAS_MODEL_PATH = 'pneumonia_resnet50_improved.keras'
MAX_LOCAL_SIZE_MB = 50

# Save the model
model.save(H5_MODEL_PATH)
print(f"✅ Improved model saved as '{H5_MODEL_PATH}'")

# Also save in Keras format for better compatibility
model.save(KERAS_MODEL_PATH)
print("✅ Model also saved in Keras format")

# Check file size (`os` is already imported in the setup cell)
file_size = os.path.getsize(H5_MODEL_PATH) / (1024 * 1024)  # MB
print(f"📁 Model file size: {file_size:.2f} MB")

if file_size > MAX_LOCAL_SIZE_MB:
    print(f"⚠️ Model file > {MAX_LOCAL_SIZE_MB}MB - upload to Google Drive")
else:
    print(f"✅ Model file ≤ {MAX_LOCAL_SIZE_MB}MB - can be stored locally")

# 🎯 Test Inference

In [None]:
# Test preprocessing function (same as in utils.py)
def preprocess_pil(img, target_size=(224, 224)):
    """Turn a PIL image into a single-image batch for the model.

    The image is converted to RGB, resized to ``target_size``, cast to
    float32, and divided by 255.

    Args:
        img: Image object providing ``convert`` and ``resize`` (PIL-style).
        target_size: Size tuple passed straight to ``resize``.

    Returns:
        float32 array with a leading batch axis of length 1.
    """
    resized = img.convert("RGB").resize(target_size)
    pixels = np.asarray(resized, dtype="float32") / 255.0
    return pixels[np.newaxis, ...]

# Test on a few random test images, going through the same preprocessing
# helper that is used at deployment time.
NUM_INFERENCE_SAMPLES = 5
test_filepaths = test_generator.filepaths

# Never ask for more samples than there are files (random.sample would raise)
sample_count = min(NUM_INFERENCE_SAMPLES, len(test_filepaths))

print("🧪 Testing inference on random samples:")
for sample_index in random.sample(range(len(test_filepaths)), sample_count):
    # Use a dedicated name: `path` holds the dataset root from the download cell
    img_path = test_filepaths[sample_index]

    # The context manager closes the file handle; preprocess_pil does the
    # RGB conversion itself, so no extra convert() is needed here.
    with Image.open(img_path) as img:
        x = preprocess_pil(img)

    pred = model.predict(x, verbose=0)[0][0]
    label = 'PNEUMONIA' if pred > 0.5 else 'NORMAL'

    print(f"{os.path.basename(img_path)} → {label} ({pred:.3f})")

print("\n✅ Model ready for deployment!")