
# Nephros: Kidney Disease Detection

This notebook demonstrates the process of classifying kidney diseases using the Kidney CT Scan Dataset. It covers:

1. Dataset Preparation
2. Data Preprocessing
3. Model Development
4. Model Evaluation
5. Conclusions and Next Steps
    

## 1. Dataset Preparation

These first steps download the dataset. Be sure you have already downloaded your Kaggle API token (`kaggle.json`); the first cell asks you to upload it.

In [None]:
import os
import shutil

from google.colab import files

# Upload the Kaggle API token (kaggle.json) from the local machine.
# The result is assigned so the notebook does not echo the returned
# {filename: bytes} mapping -- and with it the API key -- into the cell output.
uploaded = files.upload()

# The Kaggle CLI reads its credentials from ~/.kaggle/kaggle.json, so move the
# uploaded token there and make it readable by the current user only.
if 'kaggle.json' in uploaded:
    kaggle_dir = os.path.expanduser('~/.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    token_path = os.path.join(kaggle_dir, 'kaggle.json')
    shutil.move('kaggle.json', token_path)
    os.chmod(token_path, 0o600)


In [None]:
# Download the dataset archive with the Kaggle CLI (the `!` prefix runs a shell command)
!kaggle datasets download -d anima890/kidney-ct-scan


In [None]:
# Extract the archive into ./kidney_ct_scan; -o overwrites existing files without prompting
!unzip -o kidney-ct-scan.zip -d kidney_ct_scan


In [None]:
import os

# Root folder the archive was extracted into
dataset_path = '/content/kidney_ct_scan'

# List the top-level entries to see what the archive contains
extracted_entries = os.listdir(dataset_path)
print(extracted_entries)


In [None]:
import pandas as pd

# Location of the metadata CSV inside the dataset folder (adjust the file name if it differs)
csv_path = os.path.join(dataset_path, 'kidneyData.csv')

# Load the metadata and show its first rows as a sanity check
data = pd.read_csv(csv_path)
csv_preview = data.head()
print(csv_preview)


## 2. Data Preprocessing

In the next few steps, you can inspect and explore the data. These cells also work out the directory layout so that the data can be organized properly for the later steps.

In [None]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# Locations used by the preprocessing cells
base_dir = '/content/data'  # Base directory for organized data
dataset_csv_path = '/content/kidney_ct_scan/kidneyData.csv'  # Path to the CSV file

# Read the metadata table and show its first rows
data = pd.read_csv(dataset_csv_path)
first_rows = data.head()
print(first_rows)


In [None]:
# Point the stored image paths at the directory the archive was extracted into.
# regex=False makes this a literal substring replacement regardless of the
# installed pandas version (the default for `regex` changed in pandas 2.0).
data['path'] = data['path'].str.replace('/content/data', '/content/kidney_ct_scan', regex=False)

# Verify the updated paths
print(data['path'].head())


In [None]:
import os

# Main dataset directory
dataset_path = '/content/kidney_ct_scan'

# Show what the main folder contains
main_folder_contents = os.listdir(dataset_path)
print(main_folder_contents)


In [None]:
# Print up to five file names from every sub-folder of the dataset root
for folder in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, folder)
    if not os.path.isdir(folder_path):
        continue  # plain files at the top level are ignored
    print(f"Folder: {folder}")
    print(f"Sample Files: {os.listdir(folder_path)[:5]}")  # Show first 5 files


In [None]:
# Look inside the dataset's top-level image folder
subfolder_path = os.path.join(dataset_path, 'CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone')
subfolder_entries = os.listdir(subfolder_path)
print(subfolder_entries[:10])  # first 10 entries only


In [None]:
# The folder of the same name one level down
nested_folder_path = os.path.join(subfolder_path, 'CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone')
nested_entries = os.listdir(nested_folder_path)
print(nested_entries[:10])  # first 10 items in the nested folder


In [None]:
# Update the paths to include the nested folder structure.
# An earlier cell may already have rewritten the '/content/data' prefix to
# '/content/kidney_ct_scan', in which case a pattern anchored on '/content/data'
# no longer matches and the replacement silently does nothing. Match the
# original folder name under either prefix instead.
data['path'] = data['path'].str.replace(
    r'^/content/(?:data|kidney_ct_scan)/CT KIDNEY DATASET Normal, CYST, TUMOR and STONE',
    '/content/kidney_ct_scan/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone',
    regex=True
)

# Verify updated paths
print(data['path'].head())


In [None]:
import os

# Folder whose sub-folders are listed below
nested_folder_path = '/content/kidney_ct_scan/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone'

# Print up to five file names per sub-folder
for class_name in os.listdir(nested_folder_path):
    class_folder = os.path.join(nested_folder_path, class_name)
    if not os.path.isdir(class_folder):
        continue  # only directories are reported
    print(f"Class: {class_name}")
    print(f"Sample files: {os.listdir(class_folder)[:5]}")  # Show first 5 files


Now that I have found that the data sits inside a nested folder, I move on to making sure that the data is clean (there are no corrupt or missing images).

In [None]:
import pandas as pd
import os

# Path to the nested folder
nested_folder_path = '/content/kidney_ct_scan/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone'

# Collect every file path together with its label (the name of its parent folder)
file_paths = []
labels = []

for class_name in os.listdir(nested_folder_path):
    class_folder = os.path.join(nested_folder_path, class_name)
    if os.path.isdir(class_folder):  # skip anything that is not a class folder
        for file_name in os.listdir(class_folder):
            file_paths.append(os.path.join(class_folder, file_name))
            labels.append(class_name)  # Use the folder name as the label

# Create a DataFrame with one row per file
data_cleaned = pd.DataFrame({'path': file_paths, 'Class': labels})

# Verify the new DataFrame
print(data_cleaned.head())
print(f"Number of files: {len(data_cleaned)}")


In [None]:
# Keep every recorded path that does not exist on disk
missing_files = [path for path in data_cleaned['path'] if not os.path.exists(path)]

print(f"Number of missing files: {len(missing_files)}")
print(f"Sample missing files: {missing_files[:5]}")


In [None]:
# Re-point dataset_path at the nested folder; the subset cell below passes it as its input directory
dataset_path = '/content/kidney_ct_scan/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone'


For this version, due to limited computational resources, I take a subset of the data and split it into training, validation, and test sets.

In [None]:
from PIL import Image
import shutil
import random

def create_subset(input_dir, output_dir, fraction=0.2, target_size=(224, 224), seed=None):
    """Write a resized random sample of every class folder to ``output_dir``.

    Args:
        input_dir: Directory containing one sub-folder per class.
        output_dir: Destination root; a sub-folder is created for each class.
        fraction: Share of each class's files to sample. Exactly
            ``int(file_count * fraction)`` files are taken per class.
        target_size: Size tuple passed to ``Image.resize`` for every image.
        seed: Optional seed for reproducible sampling. ``None`` (the default)
            draws from the global ``random`` state, as before.
    """
    os.makedirs(output_dir, exist_ok=True)

    # A dedicated generator keeps seeded runs independent of the global state.
    rng = random if seed is None else random.Random(seed)

    for class_name in sorted(os.listdir(input_dir)):
        class_dir = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        output_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(output_class_dir, exist_ok=True)

        # Sort so a fixed seed picks the same files whatever order os.listdir
        # returns, and skip nested directories, which Image.open cannot read.
        candidates = (os.path.join(class_dir, f) for f in os.listdir(class_dir))
        file_paths = sorted(p for p in candidates if os.path.isfile(p))
        sampled_files = rng.sample(file_paths, int(len(file_paths) * fraction))

        for file_path in sampled_files:
            with Image.open(file_path) as img:
                resized = img.resize(target_size)
                resized.save(os.path.join(output_class_dir, os.path.basename(file_path)))

# Build a resized sample containing 20% of every class folder
create_subset(
    input_dir=dataset_path,
    output_dir='/content/kidney_ct_scan_subset',
    fraction=0.2,
)


In [None]:
import os
from sklearn.model_selection import train_test_split
import shutil

def split_dataset(input_dir, output_dir, train_frac=0.6, val_frac=0.2, test_frac=0.2):
    """Copy each class folder of ``input_dir`` into train/val/test sub-trees.

    The result is ``output_dir/{train,val,test}/<class_name>/<file>``.

    Args:
        input_dir: Directory containing one sub-folder per class.
        output_dir: Destination root for the three splits.
        train_frac: Share of each class used for training.
        val_frac: Share of each class used for validation.
        test_frac: Share of each class used for testing.

    Raises:
        ValueError: If the three fractions do not sum to 1. Without this check
            the leftover share would silently be divided between val and test.
    """
    if abs(train_frac + val_frac + test_frac - 1.0) > 1e-6:
        raise ValueError(
            "train_frac, val_frac and test_frac must sum to 1 "
            f"(got {train_frac + val_frac + test_frac})"
        )

    # Ensure the output directories exist
    split_dirs = {
        'train': os.path.join(output_dir, 'train'),
        'val': os.path.join(output_dir, 'val'),
        'test': os.path.join(output_dir, 'test'),
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    # Process each class
    for class_name in sorted(os.listdir(input_dir)):
        class_dir = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        # os.listdir order is arbitrary; sort so random_state=42 yields the
        # same split on every machine.
        files = sorted(os.listdir(class_dir))

        # Split files into train and temp (val + test)
        train_files, temp_files = train_test_split(files, test_size=(1 - train_frac), random_state=42)

        # Further split temp into val and test
        val_files, test_files = train_test_split(
            temp_files,
            test_size=(test_frac / (val_frac + test_frac)),
            random_state=42
        )

        # Copy files to their respective directories
        assignments = (
            (train_files, split_dirs['train']),
            (val_files, split_dirs['val']),
            (test_files, split_dirs['test']),
        )
        for file_set, target_dir in assignments:
            class_output_dir = os.path.join(target_dir, class_name)
            os.makedirs(class_output_dir, exist_ok=True)
            for file_name in file_set:
                shutil.copy(os.path.join(class_dir, file_name), os.path.join(class_output_dir, file_name))

# Split the sampled subset 60/20/20 into train, validation and test folders
split_dataset(
    input_dir='/content/kidney_ct_scan_subset',
    output_dir='/content/kidney_ct_scan_split',
    train_frac=0.6,
    val_frac=0.2,
    test_frac=0.2,
)


## 3. Model Development

In this section, I experiment with different machine learning techniques to improve my model. I first implement transfer learning with the pre-trained MobileNetV2 model. Then, I try techniques such as fine-tuning the model by freezing and unfreezing layers of the CNN, adjusting the learning rate, and varying the batch size during training.

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation applied to training images only (pixel values are multiplied by 1/255)
augmentation = dict(
    rescale=1.0/255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation)

# Validation and test images are only rescaled
val_test_datagen = ImageDataGenerator(rescale=1.0/255)

# Settings shared by both directory iterators
flow_options = dict(target_size=(224, 224), batch_size=32, class_mode='categorical')

train_generator = train_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/train', **flow_options
)
val_generator = val_test_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/val', **flow_options
)


In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten

# ImageNet-pretrained MobileNetV2 without its classification head
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Stack a small classification head on top of the convolutional base
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(4, activation='softmax'))  # Adjust for the number of classes

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 50 epochs
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    callbacks=[early_stopping],
    epochs=50,
)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Build the test iterator without shuffling so predictions stay aligned with the labels
test_generator = val_test_datagen.flow_from_directory(
    directory='/content/kidney_ct_scan_split/test',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    shuffle=False,
)

# Predicted class = index of the highest predicted score
predictions = model.predict(test_generator)
y_pred = predictions.argmax(axis=1)
y_true = test_generator.classes

# Per-class precision/recall/F1, followed by the confusion matrix
report = classification_report(y_true, y_pred, target_names=test_generator.class_indices.keys())
print(report)
print(confusion_matrix(y_true, y_pred))


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout

# Small CNN with dropout after every block
model = Sequential()

# First convolutional block
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.3))

# Second convolutional block
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.4))

# Classification head with dropout before the final Dense layer
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))  # 4 classes

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.regularizers import l2

# Same two-block CNN, now with an explicit input shape and L2 on the hidden Dense layer
model = Sequential()

# First convolutional block
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.3))

# Second convolutional block
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.4))

# Classification head
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01)))  # L2 regularization
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training if val_loss doesn't improve for 5 epochs, keeping the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train for at most 50 epochs
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    callbacks=[early_stopping],
    epochs=50,
)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced weights: rarer classes in the training set receive larger weights
train_labels = train_generator.classes
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

# Map each class index to its weight
class_weights = {index: weight for index, weight in enumerate(balanced_weights)}

# Train with the class weights applied
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping],
    epochs=50,
)


In [None]:
# Rebuild both iterators with a smaller batch size (16 instead of 32)
small_batch_options = dict(target_size=(224, 224), batch_size=16, class_mode='categorical')

train_generator = train_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/train', **small_batch_options
)
val_generator = val_test_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/val', **small_batch_options
)

# Retrain the model, retaining the class weights
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping],
    epochs=50,
)


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Halve the learning rate after 3 epochs without val_loss improvement, down to 1e-6
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Recompile so training starts from a lower learning rate
low_lr_optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=low_lr_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Retrain the model
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr],
    epochs=50,
)


In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

# Pretrained MobileNetV2 base, frozen for the first training stage
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

# Custom classification head on top of the frozen base
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))  # 4 output classes

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for a few epochs with frozen base layers
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping],
    epochs=10,
)


In [None]:
# Make the base trainable, then re-freeze everything except its last 50 layers
base_model.trainable = True
layers_to_freeze = base_model.layers[:-50]
for layer in layers_to_freeze:
    layer.trainable = False


In [None]:
# Recompile with a smaller learning rate for fine-tuning
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Fine-tune for more epochs with early stopping and the learning rate scheduler
history_fine_tune = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr],
    epochs=20,
)


In [None]:
# Score the fine-tuned model on the test generator
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


In [None]:
# Make the base trainable, then re-freeze everything except its last 100 layers
base_model.trainable = True
layers_to_freeze = base_model.layers[:-100]
for layer in layers_to_freeze:
    layer.trainable = False

# Recompile with an even smaller learning rate
model.compile(
    optimizer=Adam(learning_rate=1e-6),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Retrain the model
history_fine_tune_more = model.fit(
    x=train_generator,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr],
    epochs=20,
)


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Path to the test dataset directory
test_dir = '/content/kidney_ct_scan_split/test'

# Test images are only rescaled, never augmented
test_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Rebuild the test iterator; shuffling stays off so the sample order is fixed for evaluation
test_generator = test_datagen.flow_from_directory(
    directory=test_dir,
    target_size=(224, 224),
    batch_size=16,
    class_mode='categorical',
    shuffle=False,
)


In [None]:
# Report loss and accuracy on the test generator
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Stronger augmentation than before, plus random brightness changes
strong_augmentation = dict(
    rescale=1.0 / 255.0,
    rotation_range=40,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**strong_augmentation)

# Training iterator built on the new augmentation settings
train_generator = train_datagen.flow_from_directory(
    directory='/content/kidney_ct_scan_split/train',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
)


In [None]:
# Validation and test images are only rescaled
val_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

val_generator = val_datagen.flow_from_directory(
    directory='/content/kidney_ct_scan_split/val',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
)

# Shuffling is disabled for the test iterator so its sample order stays fixed
test_generator = test_datagen.flow_from_directory(
    directory='/content/kidney_ct_scan_split/test',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    shuffle=False,
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Early stopping: watch val_loss, stop after 5 epochs without improvement,
# and restore the weights from the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Learning rate reduction: after 3 epochs without val_loss improvement,
# divide the learning rate by 10, never going below 1e-7
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-7, verbose=1)


In [None]:
# Train for up to 20 epochs with early stopping and learning rate reduction
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    callbacks=[early_stopping, reduce_lr],
    epochs=20,
)


In [None]:
# Absolute paths are used so these generators read the same folders as the
# other cells, whatever the notebook's working directory is.

# Training generator
train_generator = train_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/train',  # Update with your training dataset path
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

# Validation generator
val_generator = val_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/val',  # Update with your validation dataset path
    target_size=(224, 224),
    batch_size=32,  # Match batch size to training
    class_mode='categorical'
)

# Test generator
test_generator = test_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/test',  # Update with your test dataset path
    target_size=(224, 224),
    batch_size=32,  # Match batch size to training
    class_mode='categorical',
    shuffle=False  # Keep the sample order fixed so predictions line up with test_generator.classes
)


In [None]:
# Train again for up to 20 epochs, reusing the existing callbacks
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    callbacks=[early_stopping, reduce_lr],
    epochs=20,
)


In [None]:
# Report loss and accuracy on the test generator
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Divide the learning rate by 10 after 3 epochs without val_loss improvement (floor: 1e-6)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    min_lr=1e-6,
)


In [None]:
# Absolute paths are used so these generators read the same folders as the
# other cells, whatever the notebook's working directory is.

# Training generator
train_generator = train_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/train',  # Update with your training dataset path
    target_size=(224, 224),
    batch_size=64,  # Increased batch size
    class_mode='categorical'
)

# Validation generator
val_generator = val_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/val',  # Update with your validation dataset path
    target_size=(224, 224),
    batch_size=64,  # Match batch size to training
    class_mode='categorical'
)

# Test generator
test_generator = test_datagen.flow_from_directory(
    '/content/kidney_ct_scan_split/test',  # Update with your test dataset path
    target_size=(224, 224),
    batch_size=64,  # Match batch size to training
    class_mode='categorical',
    shuffle=False  # Keep the sample order fixed so predictions line up with test_generator.classes
)

In [None]:
from tensorflow.keras.optimizers import Adam

# Recompile, starting with a higher learning rate (1e-4)
restart_optimizer = Adam(learning_rate=1e-4)
model.compile(
    optimizer=restart_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Fresh early-stopping callback: 5 epochs of patience on val_loss, best weights restored
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Final training run of up to 20 epochs
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    callbacks=[reduce_lr, early_stopping],
    epochs=20,
)


## 4. Model Evaluation

I now evaluate my model using my test set. I get a test loss of 0.22 and an accuracy of around 93 percent.

In [None]:
# Final loss and accuracy on the test generator
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


Here, I save the model.

In [None]:
import os

# Directory the model is saved into (created if it doesn't exist)
save_dir = '/content/saved_model'
os.makedirs(save_dir, exist_ok=True)

# Save the model with the .keras extension
model_file = os.path.join(save_dir, 'my_model.keras')
model.save(model_file)




In [None]:
from google.colab import files

# Download the saved model file from the Colab runtime to the local machine
files.download('/content/saved_model/my_model.keras')
