<a href="https://colab.research.google.com/github/hibames/pneumonia-detection-/blob/main/PD2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under
# that mount point. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import os
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input
import joblib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array, array_to_img, load_img


In [None]:
# Dataset root on the mounted Drive, plus one sub-folder per predefined split.
data_dir = '/content/drive/MyDrive/chest_xray'
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'val', 'test')
)


In [None]:
# Number of files the NORMAL folder should hold once augmentation has finished
# (per the original note: chosen to match the PNEUMONIA training count).
target_count = 2777

# The NORMAL training folder is both the augmentation source and its
# destination, so generated images land next to the originals.
original_dataset_path = '/content/drive/MyDrive/chest_xray/train/NORMAL'
output_path = original_dataset_path


In [None]:
# Count existing images. Only image files are counted (case-insensitive
# extension match) so that stray non-image entries in the folder do not
# inflate the number reported as "images".
current_images = [
    f for f in os.listdir(original_dataset_path)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
]
num_current = len(current_images)
print(f"Original NORMAL images: {num_current}")


In [None]:
# Image size (height, width) and batch size for the notebook.
# NOTE(review): the generator and model cells further down spell out
# (224, 224) and 32 literally instead of reading these constants.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

In [None]:
# Random geometric augmentation used to synthesise additional NORMAL images.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Prefix given to every generated file. Because output_path is the same folder
# as the source, files carrying this prefix are skipped when collecting
# sources so a re-run does not augment earlier augmentations.
# (Assumes Keras names saved files "<prefix>_<index>_<hash>.<format>" -- TODO confirm.)
save_prefix = 'aug'
image_exts = ('.jpeg', '.jpg', '.png')

# Load original images (case-insensitive extension match).
images = [
    f for f in os.listdir(original_dataset_path)
    if f.lower().endswith(image_exts) and not f.startswith(save_prefix + '_')
]
num_current = len(images)
print(f"Original NORMAL images: {num_current}")

if num_current == 0:
    # Without this guard the loop below would die on `i % num_current`.
    raise ValueError(f"No source images found in {original_dataset_path}")

i = 0
# The folder is re-listed on every pass because the generator picks the saved
# file name itself; the on-disk count is the progress measure used here.
while len(os.listdir(output_path)) < target_count:
    # Cycle through the originals so each one is used about equally often.
    img_path = os.path.join(original_dataset_path, images[i % num_current])
    img = load_img(img_path)
    x = img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch axis expected by flow()

    # Pulling a single batch triggers exactly one save into output_path.
    next(iter(datagen.flow(x, batch_size=1, save_to_dir=output_path,
                           save_prefix=save_prefix, save_format='jpeg')))

    i += 1

In [None]:
# Wrap the Keras directory iterator `train_data` as a tf.data.Dataset.
# NOTE: `train_data` is created by the flow_from_directory cell further down;
# that cell must have run before this dataset is iterated.
train_ds = tf.data.Dataset.from_generator(
    lambda: train_data,
    output_signature=(
        # Batches of RGB images: (batch, height, width, channels).
        tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
        # class_mode='binary' yields one float label per image.
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
)

In [None]:
import os
import shutil
import random

# Fixed seed so the shuffle below, and therefore the split, is repeatable.
random.seed(42)

source_base = '/content/drive/MyDrive/chest_xray/train'
classes = ['NORMAL', 'PNEUMONIA']

train_base = 'split/train'
val_base = 'split/val'
test_base = 'split/test'

train_ratio = 0.7
val_ratio = 0.15  # 15% for validation, remaining 15% for test

# NOTE(review): the NORMAL source folder also receives the 'aug' images written
# by the augmentation cell, so augmented variants of one original can land in
# different splits. Consider splitting before augmenting.

for class_name in classes:
    src_dir = os.path.join(source_base, class_name)

    # Sort first: os.listdir order is arbitrary, so shuffling the raw listing
    # would not be reproducible even with a fixed seed.
    all_files = sorted(
        f for f in os.listdir(src_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    random.shuffle(all_files)

    total = len(all_files)
    train_end = int(total * train_ratio)
    val_end = int(total * (train_ratio + val_ratio))

    train_files = all_files[:train_end]
    val_files = all_files[train_end:val_end]
    test_files = all_files[val_end:]

    # Destination folders use their own loop-local name so the module-level
    # train_dir / val_dir / test_dir paths defined earlier are not overwritten.
    for split_base, split_files in (
        (train_base, train_files),
        (val_base, val_files),
        (test_base, test_files),
    ):
        dest_dir = os.path.join(split_base, class_name)
        os.makedirs(dest_dir, exist_ok=True)
        for f in split_files:
            shutil.copy(os.path.join(src_dir, f), os.path.join(dest_dir, f))

    print(f"{class_name} - Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")


In [None]:
# One generator per split; each only rescales pixel values by 1/255.
train_datagen, val_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _ in range(3)
)

# Options common to all three directory iterators.
flow_options = dict(target_size=(224, 224), batch_size=32, class_mode='binary')

# Training and validation iterators keep the default shuffling.
train_data = train_datagen.flow_from_directory('split/train', **flow_options)
val_data = val_datagen.flow_from_directory('split/val', **flow_options)

# The test iterator keeps file order fixed so its predictions can be compared
# against labels in a consistent order.
test_data = test_datagen.flow_from_directory('split/test', shuffle=False, **flow_options)

In [None]:

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img
import numpy as np
import os

# Load pre-trained VGG16 for feature extraction: ImageNet weights, no
# classification head (include_top=False), global average pooling on the
# output, 224x224 RGB input.
vgg_model = VGG16(weights='imagenet', include_top=False, pooling='avg', input_shape=(224, 224, 3))

def extract_features(directory, batch_size=32):
    """Compute VGG16 feature vectors for every image under *directory*.

    *directory* must contain ``NORMAL`` and ``PNEUMONIA`` sub-folders. Images
    are resized to 224x224, passed through ``preprocess_input`` and then
    through the module-level ``vgg_model``.

    Args:
        directory: Folder holding the two class sub-folders.
        batch_size: Number of images sent to the network per ``predict`` call.

    Returns:
        ``(features, labels)`` as NumPy arrays with one row / entry per image;
        labels are 0 for NORMAL and 1 for PNEUMONIA.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    features = []
    labels = []

    for class_index, label in enumerate(['NORMAL', 'PNEUMONIA']):
        folder = os.path.join(directory, label)
        # Skip non-image entries (load_img would fail on them) and sort so the
        # row order does not depend on the filesystem's listing order.
        filenames = sorted(
            f for f in os.listdir(folder) if f.lower().endswith(image_exts)
        )

        # Run the network on batches instead of one predict() call per image.
        for start in range(0, len(filenames), batch_size):
            chunk = filenames[start:start + batch_size]
            batch = np.stack([
                img_to_array(load_img(os.path.join(folder, f), target_size=(224, 224)))
                for f in chunk
            ])
            batch = preprocess_input(batch)

            batch_features = vgg_model.predict(batch, verbose=0)
            # One flat vector per image, matching the original per-image flatten().
            features.extend(batch_features.reshape(len(chunk), -1))
            labels.extend([class_index] * len(chunk))

    return np.array(features), np.array(labels)

# Extract features from train and test.
# These raw (unscaled) matrices are what the Random Forest and XGBoost models
# further down are fitted on and first evaluated with.
X_train, y_train = extract_features('split/train')
X_test, y_test = extract_features('split/test')


In [None]:
# --- Extract Features ---
# The train and test features were already computed into X_train / X_test
# (with labels y_train / y_test) by the preceding cell. Reuse them instead of
# running VGG16 over the same images a second time; only the validation split
# still has to be extracted.
X_train_feat, X_test_feat = X_train, X_test
X_val_feat, y_val = extract_features('split/val')


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same transform is then applied to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_feat)
X_val_scaled = scaler.transform(X_val_feat)
X_test_scaled = scaler.transform(X_test_feat)



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Number of top-scoring features to keep; adjust to suit the dataset.
k = 20

# Score features with the ANOVA F-test on the training split, then reduce
# both splits to the same k columns.
selector = SelectKBest(score_func=f_classif, k=k).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Optional: show which column indices were kept.
selected_features = selector.get_support(indices=True)
print("Selected feature indices:", selected_features)



Selected feature indices: [  0  23  54  85 102 129 139 159 161 164 174 177 213 216 227 229 321 340
 375 394]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Both models are fitted on the raw VGG16 features (X_train), so any later
# prediction must be made on a matrix with the same columns (X_test).

# Random Forest
# random_state pins the bootstrap / feature sampling so reruns give the same
# forest (42 matches the seed used for the dataset split).
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_test_proba = rf.predict_proba(X_test)[:, 1]  # probability of class 1
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, rf_test_proba))

# XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it
# and only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_test_proba = xgb.predict_proba(X_test)[:, 1]  # probability of class 1
print("\nXGBoost:")
print(classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, xgb_test_proba))


Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       417
           1       0.97      0.96      0.97       591

    accuracy                           0.96      1008
   macro avg       0.96      0.96      0.96      1008
weighted avg       0.96      0.96      0.96      1008

Confusion Matrix:
 [[398  19]
 [ 21 570]]
ROC AUC: 0.9882672542169312


Parameters: { "use_label_encoder" } are not used.




XGBoost:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       417
           1       0.98      0.97      0.98       591

    accuracy                           0.98      1008
   macro avg       0.97      0.98      0.97      1008
weighted avg       0.98      0.98      0.98      1008

Confusion Matrix:
 [[407  10]
 [ 15 576]]
ROC AUC: 0.9950171842221655


In [None]:
import numpy as np

def binary_cross_entropy(y_true, y_pred):
    """Return the mean binary cross-entropy between labels and probabilities.

    Args:
        y_true: Array-like of 0/1 labels.
        y_pred: Array-like of predicted probabilities for the positive class.

    Returns:
        The mean of ``-(y*log(p) + (1-y)*log(1-p))`` over all elements.
    """
    epsilon = 1e-15

    # Work in float64. In float32, ``1 - epsilon`` rounds to exactly 1.0, so a
    # saturated float32 probability would survive the clip and produce
    # ``0 * log(0) = nan``. Converting here also lets plain lists be passed.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # Clip predictions to avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Compute BCE loss
    bce_loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return bce_loss

# `rf` and `xgb` were fitted on the raw feature matrix X_train, so they must be
# scored on X_test. Passing the 20-column X_test_selected raises
# "Feature shape mismatch, expected: 512, got 20".

# Compute BCE Loss for XGBoost
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
xgb_bce_loss_manual = binary_cross_entropy(y_test, y_proba_xgb)
print("Manual XGBoost Binary Cross-Entropy Loss (BCE Loss):", xgb_bce_loss_manual)

# Compute BCE Loss for Random Forest
y_proba_rf = rf.predict_proba(X_test)[:, 1]
rf_bce_loss_manual = binary_cross_entropy(y_test, y_proba_rf)
print("Manual Random Forest Binary Cross-Entropy Loss (BCE Loss):", rf_bce_loss_manual)


ValueError: Feature shape mismatch, expected: 512, got 20

In [None]:
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Pretrained VGG19 convolutional base without its classification head; the
# base is frozen so only the layers added below are trained.
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

# Custom head: pooled features -> 128-unit ReLU layer -> dropout -> a single
# sigmoid unit for the binary decision.
head_layers = [
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Assemble and compile the full network.
cnn_model = Model(inputs=base_model.input, outputs=x)
cnn_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop early (restoring the best weights) after 3 epochs without improvement,
# and multiply the learning rate by 0.2 after 2 epochs without a better val_loss.
callbacks = [
    EarlyStopping(patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1),
]

# Fit on the training iterator, validating on the validation iterator.
cnn_model.fit(
    train_data,
    validation_data=val_data,
    epochs=10,
    callbacks=callbacks,
    verbose=1,
)

# Predicted probabilities for the test iterator, thresholded at 0.5 to get
# hard 0/1 predictions.
y_pred_cnn_proba = cnn_model.predict(test_data).ravel()
y_pred_cnn = (y_pred_cnn_proba > 0.5).astype(int)


Epoch 1/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 481ms/step - accuracy: 0.5629 - loss: 0.7183 - val_accuracy: 0.7192 - val_loss: 0.5587 - learning_rate: 1.0000e-04
Epoch 2/10
[1m 70/147[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m30s[0m 391ms/step - accuracy: 0.6809 - loss: 0.5897

In [None]:
# --- EVALUATION FUNCTION ---
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score # Importing necessary functions
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print an evaluation report for one model and return its key metrics.

    Args:
        name: Label used in the printed header and in the returned record.
        y_true: Ground-truth labels.
        y_pred: Hard class predictions.
        y_proba: Scores for the positive class, used for ROC AUC.

    Returns:
        A dict with the keys 'Model', 'Accuracy', 'Precision', 'Recall',
        'F1-Score' and 'ROC AUC'.
    """
    print(f"\n{name} Evaluation")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    auc = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC: {auc:.4f}")

    # Build the record in the same key order as the report columns.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    summary = {'Model': name}
    for metric_name, scorer in label_scorers:
        summary[metric_name] = scorer(y_true, y_pred)
    summary['ROC AUC'] = auc
    return summary


In [None]:
# --- COMPARE MODELS ---
# `rf` and `xgb` were fitted on the raw feature matrix X_train, so they are
# scored on X_test here. X_test_selected has only 20 columns and would raise
# a feature shape mismatch error.
y_pred_rf_proba = rf.predict_proba(X_test)[:, 1]
y_pred_xgb_proba = xgb.predict_proba(X_test)[:, 1]

y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

# NOTE(review): the CNN predictions come from `test_data`, while y_test comes
# from extract_features. This assumes both list all NORMAL images first, then
# all PNEUMONIA images, over the same files -- confirm.
results = []
results.append(evaluate_model("Random Forest", y_test, y_pred_rf, y_pred_rf_proba))
results.append(evaluate_model("XGBoost", y_test, y_pred_xgb, y_pred_xgb_proba))
results.append(evaluate_model("CNN", y_test, y_pred_cnn, y_pred_cnn_proba))


In [None]:
# --- VISUALIZE COMPARISON ---
import pandas as pd
# One row per model, indexed by model name, one column per metric.
results_df = pd.DataFrame(results).set_index('Model')

# Grouped bar chart of every metric, with the y-axis fixed to [0, 1].
results_df.plot(
    kind='bar',
    figsize=(10,6),
    ylim=(0,1),
    title='Model Performance Comparison',
    ylabel='Score',
)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# --- PLOT ROC CURVES ---
from sklearn.metrics import roc_curve, roc_auc_score # Import roc_curve
def plot_roc_curve(y_true, y_proba, label):
    """Draw one ROC curve on the current matplotlib axes.

    The legend entry is *label* followed by the ROC AUC to two decimals.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    auc_value = roc_auc_score(y_true, y_proba)
    legend_text = f'{label} (AUC = {auc_value:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

plt.figure(figsize=(8,6))

# One curve per model, all scored against the same test labels.
model_scores = (
    (y_pred_rf_proba, 'Random Forest'),
    (y_pred_xgb_proba, 'XGBoost'),
    (y_pred_cnn_proba, 'CNN'),
)
for scores, model_label in model_scores:
    plot_roc_curve(y_test, scores, model_label)

# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--')

plt.title('ROC Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
