In [None]:

import os
import random
import shutil

import kagglehub
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Fetch the COVID-19 Radiography Database via kagglehub and remember where it landed.
dataset_path = kagglehub.dataset_download("tawsifurrahman/covid19-radiography-database")

print("Dataset path:", dataset_path)
print("Subfolders in dataset:", os.listdir(dataset_path))

# Per-class image directories, all located under one top-level dataset folder.
dataset_root = os.path.join(dataset_path, "COVID-19_Radiography_Dataset")
covid_folder = os.path.join(dataset_root, "COVID", "images")
normal_folder = os.path.join(dataset_root, "Normal", "images")

Dataset path: /root/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5
Subfolders in dataset: ['COVID-19_Radiography_Dataset']


In [None]:
# Start from a clean slate: drop any previous split so stale files cannot leak in.
for split_dir in ("train", "test"):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

# Rebuild the empty <split>/<class> layout that flow_from_directory reads later.
for class_dir in ("train/COVID", "train/Normal", "test/COVID", "test/Normal"):
    os.makedirs(class_dir, exist_ok=True)

print("'train' and 'test' folders have been reset.")

'train' and 'test' folders have been reset.


In [None]:
# Sample exactly 400 images from each class (COVID, Normal) and split them
# into train/test in a deliberately imbalanced way:
#
#   COVID : 300 train, 100 test
#   Normal: 200 train, 200 test

covid_count = 400
normal_count = 400
covid_train_count = 300    # remaining COVID images go to test
normal_train_count = 200   # remaining Normal images go to test

# Seed the shuffle so the same files are sampled on every Restart & Run All;
# without this, every run trains and evaluates on a different subset.
random.seed(42)

sampled = {}
for class_name, class_folder, class_count in (
    ("COVID", covid_folder, covid_count),
    ("Normal", normal_folder, normal_count),
):
    # os.listdir returns names in arbitrary order; sort first so the seeded
    # shuffle always starts from the same sequence.
    names = sorted(os.listdir(class_folder))
    if len(names) < class_count:
        # Fail loudly instead of silently producing a smaller split than the
        # summary below claims.
        raise ValueError(
            f"{class_name}: need {class_count} images but found only "
            f"{len(names)} in {class_folder}"
        )
    random.shuffle(names)
    sampled[class_name] = names[:class_count]

covid_images = sampled["COVID"]
normal_images = sampled["Normal"]

covid_train, covid_test = covid_images[:covid_train_count], covid_images[covid_train_count:]
normal_train, normal_test = normal_images[:normal_train_count], normal_images[normal_train_count:]

# (source folder, file names, destination folder) for each of the four subsets.
copy_jobs = [
    (covid_folder, covid_train, "train/COVID"),
    (covid_folder, covid_test, "test/COVID"),
    (normal_folder, normal_train, "train/Normal"),
    (normal_folder, normal_test, "test/Normal"),
]
for src_folder, names, dest_folder in copy_jobs:
    for img_name in names:
        shutil.copy(os.path.join(src_folder, img_name), os.path.join(dest_folder, img_name))

# Report the sizes that were actually produced rather than hard-coded numbers.
print(
    f"Train set -> {len(covid_train)} COVID + {len(normal_train)} Normal = "
    f"{len(covid_train) + len(normal_train)} images (imbalanced)."
)
print(
    f"Test set -> {len(covid_test)} COVID + {len(normal_test)} Normal = "
    f"{len(covid_test) + len(normal_test)} images (imbalanced)."
)

Train set -> 300 COVID + 200 Normal = 500 images (imbalanced).
Test set -> 100 COVID + 200 Normal = 300 images (imbalanced).


In [None]:
batch_size = 32
img_size = (224, 224)

# Options shared by all three directory iterators.
flow_kwargs = dict(target_size=img_size, batch_size=batch_size, class_mode='binary')

# Pixels are multiplied by 1/255; 20% of the 500 training images is held out
# for validation.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# 80% (400 images) used for training
train_generator = train_datagen.flow_from_directory("train", subset='training', **flow_kwargs)

# 20% (100 images) used for validation
val_generator = train_datagen.flow_from_directory("train", subset='validation', **flow_kwargs)

# The test iterator reads all 300 images in the 'test' folder (no split).
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory("test", **flow_kwargs)

print("Data generators ready (train, val, test).")

Found 400 images belonging to 2 classes.
Found 100 images belonging to 2 classes.
Found 300 images belonging to 2 classes.
Data generators ready (train, val, test).


In [None]:
# Small CNN for binary COVID-vs-Normal classification:
# three Conv/MaxPool blocks followed by a dense head with dropout.
model = models.Sequential([
    # Declare the input shape up front so the model is built immediately;
    # otherwise model.summary() below runs on an unbuilt model and cannot
    # report output shapes or parameter counts. The 3 channels match
    # flow_from_directory's default color_mode ('rgb').
    layers.Input(shape=img_size + (3,)),

    # 1st Conv block
    layers.Conv2D(32, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    # 2nd Conv block
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    # 3rd Conv block
    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    # FC layers
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [None]:
# Train on the 'training' subset and score the 'validation' subset after each epoch.
epochs = 5
fit_options = dict(validation_data=val_generator, epochs=epochs)
history = model.fit(train_generator, **fit_options)

Epoch 1/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 5s/step - accuracy: 0.5564 - loss: 0.8578 - val_accuracy: 0.6000 - val_loss: 0.6698
Epoch 2/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 4s/step - accuracy: 0.6042 - loss: 0.6764 - val_accuracy: 0.6000 - val_loss: 0.6677
Epoch 3/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 4s/step - accuracy: 0.6083 - loss: 0.6601 - val_accuracy: 0.8300 - val_loss: 0.5620
Epoch 4/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 4s/step - accuracy: 0.6677 - loss: 0.6068 - val_accuracy: 0.8200 - val_loss: 0.4174
Epoch 5/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4s/step - accuracy: 0.7900 - loss: 0.4479 - val_accuracy: 0.7500 - val_loss: 0.4871


In [None]:
# Score the trained model on the separate 'test' folder; evaluate() yields
# the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_generator)
test_loss, test_acc = test_metrics
print("Test Accuracy: {:.4f}".format(test_acc))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.8255 - loss: 0.4268
Test Accuracy: 0.8233


We selected 400 COVID and 400 Normal images in total. Then, we made an imbalanced split for training vs. testing:

Train: 300 COVID + 200 Normal = 500 images
Test: 100 COVID + 200 Normal = 300 images
Within the train folder, we further did an 80–20 split (using validation_split=0.2), so 80% went to actual training and 20% to validation. We used a 3-block CNN (32/64/128 filters) with Adam, binary crossentropy, and accuracy over 5 epochs. Finally, we evaluated the model on the 300-image test set, ensuring those images never appeared in training.

In [None]:

import os
import random
import shutil
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
import kagglehub

In [None]:
# Fetch the COVID-19 Radiography Database via kagglehub and remember where it landed.
dataset_path = kagglehub.dataset_download("tawsifurrahman/covid19-radiography-database")

print("Dataset path:", dataset_path)
print("Subfolders in dataset:", os.listdir(dataset_path))

# Build both class folders from the shared dataset root.
dataset_root = os.path.join(dataset_path, "COVID-19_Radiography_Dataset")
covid_folder, normal_folder = (
    os.path.join(dataset_root, class_dir, "images") for class_dir in ("COVID", "Normal")
)

Dataset path: /root/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5
Subfolders in dataset: ['COVID-19_Radiography_Dataset']


In [None]:
# Wipe any earlier 'train'/'test' trees so files from a previous experiment
# cannot end up in this one.
split_dirs = ("train", "test")
for split_dir in split_dirs:
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

# Recreate the empty per-class folders for both splits.
for split_dir in split_dirs:
    for class_dir in ("COVID", "Normal"):
        os.makedirs(f"{split_dir}/{class_dir}", exist_ok=True)

print("Clean 'train' and 'test' directories have been created.")

Clean 'train' and 'test' directories have been created.


In [None]:
# Sample exactly 1,000 images per class (COVID, Normal), then hold out 200 of
# each for test and keep the remaining 800 for train.

num_total = 1000    # total images per class (COVID or Normal)
num_test = 200      # from those 1,000, 200 go to test
num_train = num_total - num_test  # 800 remain for train

# Seed the shuffle so the same files are sampled on every Restart & Run All;
# without this, every run trains and evaluates on a different subset.
random.seed(42)

sampled = {}
for class_name, class_folder in (("COVID", covid_folder), ("Normal", normal_folder)):
    # os.listdir returns names in arbitrary order; sort first so the seeded
    # shuffle always starts from the same sequence.
    names = sorted(os.listdir(class_folder))
    if len(names) < num_total:
        # Fail loudly instead of silently producing a smaller split than the
        # summary below claims.
        raise ValueError(
            f"{class_name}: need {num_total} images but found only "
            f"{len(names)} in {class_folder}"
        )
    random.shuffle(names)
    sampled[class_name] = names[:num_total]

covid_images = sampled["COVID"]
normal_images = sampled["Normal"]

# The first num_test names form the test set; everything after is train.
covid_test, covid_train = covid_images[:num_test], covid_images[num_test:]
normal_test, normal_train = normal_images[:num_test], normal_images[num_test:]

# (source folder, file names, destination folder) for each of the four subsets.
copy_jobs = [
    (covid_folder, covid_test, "test/COVID"),
    (covid_folder, covid_train, "train/COVID"),
    (normal_folder, normal_test, "test/Normal"),
    (normal_folder, normal_train, "train/Normal"),
]
for src_folder, names, dest_folder in copy_jobs:
    for img_name in names:
        shutil.copy(os.path.join(src_folder, img_name), os.path.join(dest_folder, img_name))

# Report the sizes that were actually produced rather than hard-coded numbers.
print(f"Done splitting {num_total:,} images per class into {num_train} train / {num_test} test each.")
print(
    f"Test set: {len(covid_test)} COVID + {len(normal_test)} Normal = "
    f"{len(covid_test) + len(normal_test):,} images (50% each)."
)
print(
    f"Train set: {len(covid_train)} COVID + {len(normal_train)} Normal = "
    f"{len(covid_train) + len(normal_train):,} images total."
)


Done splitting 1,000 images per class into 800 train / 200 test each.
Test set: 200 COVID + 200 Normal = 400 images (50% each).
Train set: 800 COVID + 800 Normal = 1,600 images total.


In [None]:
batch_size = 32
img_size = (224, 224)

# Options shared by all three directory iterators.
flow_kwargs = dict(target_size=img_size, batch_size=batch_size, class_mode='binary')

# 80-20 split of the *training folder* for actual train vs. validation;
# pixels are multiplied by 1/255.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# 80% of 'train'
train_generator = train_datagen.flow_from_directory("train", subset='training', **flow_kwargs)

# 20% of 'train'
val_generator = train_datagen.flow_from_directory("train", subset='validation', **flow_kwargs)

# The test iterator reads every image in the 'test' folder (no split).
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory("test", **flow_kwargs)

print("Data generators ready: train (80%), val (20%), and separate test.")


Found 1280 images belonging to 2 classes.
Found 320 images belonging to 2 classes.
Found 400 images belonging to 2 classes.
Data generators ready: train (80%), val (20%), and separate test.


In [None]:
# Small CNN for binary COVID-vs-Normal classification:
# three Conv/MaxPool blocks followed by a dense head with dropout.
model = models.Sequential([
    # Declare the input shape up front so the model is built immediately;
    # otherwise model.summary() below runs on an unbuilt model and cannot
    # report output shapes or parameter counts. The 3 channels match
    # flow_from_directory's default color_mode ('rgb').
    layers.Input(shape=img_size + (3,)),

    layers.Conv2D(32, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [None]:
# Train on the 'training' subset and score the 'validation' subset after each epoch.
epochs = 5
fit_options = dict(validation_data=val_generator, epochs=epochs)
history = model.fit(train_generator, **fit_options)


Epoch 1/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 4s/step - accuracy: 0.5609 - loss: 0.9247 - val_accuracy: 0.7188 - val_loss: 0.5493
Epoch 2/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 4s/step - accuracy: 0.7875 - loss: 0.4959 - val_accuracy: 0.8281 - val_loss: 0.4074
Epoch 3/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 4s/step - accuracy: 0.8748 - loss: 0.3504 - val_accuracy: 0.8813 - val_loss: 0.3777
Epoch 4/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 4s/step - accuracy: 0.8730 - loss: 0.3229 - val_accuracy: 0.8406 - val_loss: 0.4474
Epoch 5/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 4s/step - accuracy: 0.9019 - loss: 0.2664 - val_accuracy: 0.8219 - val_loss: 0.4234


In [None]:
# Score the trained model on the separate 'test' folder; evaluate() yields
# the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_generator)
test_loss, test_acc = test_metrics
print("Test Accuracy: {:.4f}".format(test_acc))


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.8626 - loss: 0.4034
Test Accuracy: 0.8675


We sample 1,000 COVID and 1,000 Normal images from the downloaded dataset, for a total of 2,000 images.
We split them into:
Train: 800 COVID + 800 Normal = 1,600 images.
Test: 200 COVID + 200 Normal = 400 images, i.e. 50% each class.
Inside the train folder, we further do 80-20 for actual train vs. validation via validation_split=0.2.
We build a CNN with three convolutional blocks (32/64/128 filters), each followed by MaxPooling, and a fully connected top.
We train for 5 epochs using Adam + binary_crossentropy.
We evaluate on the completely separate 400-image test set.

In [None]:

import os
import random
import shutil
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
import kagglehub


In [None]:
# Fetch the COVID-19 Radiography Database via kagglehub and remember where it landed.
dataset_path = kagglehub.dataset_download("tawsifurrahman/covid19-radiography-database")

print("Dataset path:", dataset_path)
print("Contents:", os.listdir(dataset_path))

# Per-class image directories, all located under one top-level dataset folder.
dataset_root = os.path.join(dataset_path, "COVID-19_Radiography_Dataset")
covid_folder = os.path.join(dataset_root, "COVID", "images")
pneumonia_folder = os.path.join(dataset_root, "Viral Pneumonia", "images")
normal_folder = os.path.join(dataset_root, "Normal", "images")


Downloading from https://www.kaggle.com/api/v1/datasets/download/tawsifurrahman/covid19-radiography-database?dataset_version_number=5...


100%|██████████| 778M/778M [00:08<00:00, 94.4MB/s]

Extracting files...





Dataset path: /root/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5
Contents: ['COVID-19_Radiography_Dataset']


In [None]:
split_dirs = ("train", "test")
class_subdirs = ("COVID", "Pneumonia", "Normal")

# Wipe any earlier 'train'/'test' trees so files from a previous experiment
# cannot end up in this one.
for split_dir in split_dirs:
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

# Recreate both splits with one empty sub-folder per class
# (3 classes: COVID, Pneumonia, Normal).
for split_dir in split_dirs:
    for class_subdir in class_subdirs:
        os.makedirs(os.path.join(split_dir, class_subdir), exist_ok=True)

print("Fresh 'train' and 'test' folders created with 3 subfolders each.")


Fresh 'train' and 'test' folders created with 3 subfolders each.


In [None]:
def copy_images(src_folder, dest_train, dest_test, total_images=500, train_count=400, seed=None):
    """
    Copy a random sample of images from src_folder into train/test folders.

    Args:
        src_folder: directory holding the source images for one class.
        dest_train: directory that receives the training images
            (created if it does not exist).
        dest_test: directory that receives the test images
            (created if it does not exist).
        total_images: how many images to sample in total.
        train_count: how many of the sampled images go to train; the
            remaining ``total_images - train_count`` go to test.
        seed: optional seed for a private random generator. When given, the
            same files are selected on every call; when None (default) the
            module-level ``random`` state is used.

    Returns:
        Tuple ``(n_train, n_test)`` with the number of files copied into
        each destination.

    Raises:
        ValueError: if ``train_count`` is outside ``[0, total_images]`` or if
            ``src_folder`` contains fewer than ``total_images`` files.
    """
    if not 0 <= train_count <= total_images:
        raise ValueError(
            f"train_count must be between 0 and total_images ({total_images}), got {train_count}"
        )

    # os.listdir gives no ordering guarantee, so sort to make a seeded shuffle
    # reproducible. Sub-directories are skipped because shutil.copy cannot
    # copy them.
    images = sorted(
        name for name in os.listdir(src_folder)
        if os.path.isfile(os.path.join(src_folder, name))
    )
    if len(images) < total_images:
        # Previously this silently produced a short (or empty) test split.
        raise ValueError(
            f"need {total_images} images but found only {len(images)} in {src_folder}"
        )

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images)
    images = images[:total_images]  # only pick the first 'total_images'

    train_images = images[:train_count]
    test_images = images[train_count:]

    for dest_folder, names in ((dest_train, train_images), (dest_test, test_images)):
        os.makedirs(dest_folder, exist_ok=True)
        for img_name in names:
            shutil.copy(os.path.join(src_folder, img_name),
                        os.path.join(dest_folder, img_name))

    return len(train_images), len(test_images)


# Seed the module-level RNG that copy_images shuffles with, so the split is
# not re-drawn differently on every Restart & Run All.
random.seed(42)

# We'll take 500 images/class. 400 for train, 100 for test.
class_sources = {
    "COVID": covid_folder,
    "Pneumonia": pneumonia_folder,
    "Normal": normal_folder,
}
for class_name, src_folder in class_sources.items():
    copy_images(
        src_folder,
        f"train/{class_name}",
        f"test/{class_name}",
        total_images=500,
        train_count=400,
    )

print("Data copied:\n - 3 classes: COVID, Pneumonia, Normal")
print(" - 400 images each in train, 100 each in test => balanced split.")


Data copied:
 - 3 classes: COVID, Pneumonia, Normal
 - 400 images each in train, 100 each in test => balanced split.


In [None]:
batch_size = 32
img_size = (224, 224)

# Options shared by all three directory iterators; 3 classes => categorical labels.
flow_kwargs = dict(target_size=img_size, batch_size=batch_size, class_mode='categorical')

# 80-20 split of the train folder for actual training vs. validation:
# 20% of the 1,200 training images => 240 val, 960 train.
# Pixels are multiplied by 1/255.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory("train", subset='training', **flow_kwargs)
val_generator = train_datagen.flow_from_directory("train", subset='validation', **flow_kwargs)

# The test iterator reads every image in the 'test' folder (no split).
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory("test", **flow_kwargs)

print("Generators ready: train (80%), val (20%), and separate test.")


Found 960 images belonging to 3 classes.
Found 240 images belonging to 3 classes.
Found 300 images belonging to 3 classes.
Generators ready: train (80%), val (20%), and separate test.


In [None]:
# Small CNN for 3-class (COVID / Pneumonia / Normal) classification:
# three Conv/MaxPool blocks followed by a dense head with dropout.
model = models.Sequential([
    # Declare the input shape up front so the model is built immediately;
    # otherwise model.summary() below runs on an unbuilt model and cannot
    # report output shapes or parameter counts. The 3 channels match
    # flow_from_directory's default color_mode ('rgb').
    layers.Input(shape=img_size + (3,)),

    layers.Conv2D(32, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # For 3 classes, output layer = Dense(3, softmax)
    layers.Dense(3, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # multi-class classification
    metrics=['accuracy']
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train on the 'training' subset and score the 'validation' subset after each epoch.
epochs = 5
fit_options = dict(validation_data=val_generator, epochs=epochs)
history = model.fit(train_generator, **fit_options)


Epoch 1/5


  self._warn_if_super_not_called()


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 233ms/step - accuracy: 0.4844 - loss: 1.0725 - val_accuracy: 0.7583 - val_loss: 0.6253
Epoch 2/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 135ms/step - accuracy: 0.7751 - loss: 0.5087 - val_accuracy: 0.7958 - val_loss: 0.5933
Epoch 3/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 127ms/step - accuracy: 0.7880 - loss: 0.4914 - val_accuracy: 0.8417 - val_loss: 0.4954
Epoch 4/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 103ms/step - accuracy: 0.8572 - loss: 0.3652 - val_accuracy: 0.8333 - val_loss: 0.5935
Epoch 5/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 139ms/step - accuracy: 0.8785 - loss: 0.3237 - val_accuracy: 0.8208 - val_loss: 0.5383


In [None]:
# Score the trained model on the separate 'test' folder; evaluate() yields
# the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_generator)
test_loss, test_acc = test_metrics
print("Test Accuracy: {:.4f}".format(test_acc))


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 200ms/step - accuracy: 0.8422 - loss: 0.4443
Test Accuracy: 0.8600


In our three-class classification setup, the model was tasked with differentiating among COVID, Pneumonia, and Normal (healthy) X-ray images. We extracted an equal number of images for each class—ensuring the dataset remained balanced—and then split those images into training, validation, and testing sets. Specifically, we used an 80-20 split for training and validation, while keeping a separate test set (with equal representation from each class) to evaluate final performance.

The CNN architecture consisted of three convolutional layers (32, 64, and 128 filters) each followed by MaxPooling, ending with a fully connected portion (Dense layer with 128 neurons + Dropout, then a final Dense layer with 3 outputs and softmax activation). We used categorical crossentropy as the loss function for this multi-class problem and optimized via Adam. After 5 epochs, the model achieved 86% accuracy on the test set. This level of performance indicates that despite the additional complexity of distinguishing three classes, the overall learning trend remained similar to the binary classification scenario.

When comparing the three-class results with the binary setting (where we observed 86.75% accuracy on a balanced dataset), we see that the performance trends are quite similar. While adding a third class can sometimes reduce accuracy due to increased complexity, the model’s final accuracy in this experiment still closely mirrored that of the binary classification, reflecting consistent learning capacity and robust feature extraction in both scenarios.





