# Brain Tumor Classification with CNN

This notebook implements a CNN for binary classification of MRI images as tumor (`yes`) or no tumor (`no`). The dataset has images in `yes` and `no` subfolders, with no masks provided.

**Dependencies**: TensorFlow, NumPy, Matplotlib, OpenCV, Scikit-learn

**Dataset**: MRI images in `data_dir/images/yes` and `data_dir/images/no`.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
%matplotlib inline

## 1. Dataset

Define paths to the dataset. Images are in `images/yes` and `images/no` subfolders.

In [None]:
# Dataset location -- point data_dir at the folder that contains `images/`
data_dir = 'path/to/dataset'  # Update to your dataset path
image_dir = os.path.join(data_dir, 'images')
# One sub-folder per class: `yes` holds tumor scans, `no` holds non-tumor scans
yes_dir, no_dir = (os.path.join(image_dir, cls) for cls in ('yes', 'no'))

# Image size fed to the network, and the mini-batch size used for training
IMG_HEIGHT, IMG_WIDTH = 128, 128
BATCH_SIZE = 16

## 2. Exploratory Data Analysis (EDA)

Analyze the dataset to understand the number of images in `yes` and `no` folders.

In [None]:
# Extensions treated as images. Matching is case-insensitive so that files
# such as `Y1.JPG` or `scan.jpeg` are not silently dropped from the dataset.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _list_images(folder):
    """Return the sorted full paths of the image files directly inside *folder*."""
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )


# Get list of image files. The lists are sorted because os.listdir returns
# entries in arbitrary order, which would make the seeded split further down
# differ from one machine (or run) to the next.
yes_files = _list_images(yes_dir)
no_files = _list_images(no_dir)
image_files = yes_files + no_files
labels = [1] * len(yes_files) + [0] * len(no_files)  # 1 for tumor, 0 for no tumor

# Check dataset size
print(f'Number of tumor images (yes): {len(yes_files)}')
print(f'Number of non-tumor images (no): {len(no_files)}')
print(f'Total images: {len(image_files)}')

# Fail early with a clear message instead of an IndexError on image_files[0]
if not image_files:
    raise FileNotFoundError(
        f'No {"/".join(IMAGE_EXTENSIONS)} images found under {image_dir}'
    )

# Check sample image dimensions (cv2.imread returns None for unreadable files)
sample_img = cv2.imread(image_files[0])
if sample_img is None:
    print(f'Could not read sample image: {image_files[0]}')
else:
    print(f'Sample image shape: {sample_img.shape}')

## 3. Plot

Visualize sample images from `yes` and `no` folders to understand the data distribution.

In [None]:
# Plot sample images: one row per class, so that both the `yes` and the `no`
# folder are represented (a single range(3) loop only ever reached the `no`
# images when fewer than three tumor images existed).
SAMPLES_PER_CLASS = 3
sample_rows = [('Tumor', yes_files), ('No Tumor', no_files)]

plt.figure(figsize=(10, 7))
for row, (class_title, class_files) in enumerate(sample_rows):
    for col, sample_path in enumerate(class_files[:SAMPLES_PER_CLASS]):
        img = cv2.imread(sample_path)
        if img is None:
            # Unreadable file: leave its grid cell empty rather than crash
            continue
        # OpenCV loads BGR; matplotlib expects RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(len(sample_rows), SAMPLES_PER_CLASS,
                    row * SAMPLES_PER_CLASS + col + 1)
        plt.imshow(img)
        plt.title(class_title)
        plt.axis('off')
plt.tight_layout()
plt.show()

## 4. Data Augmentation

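Define a batch generator that reads images from disk, with an `augment` option intended to increase the diversity of the training images. Note that the training cell in Section 8 fits on the in-memory arrays and does not use this generator; pass its output to `model.fit` (with `steps_per_epoch`) if you want to train from it.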

In [None]:
def create_data_generator(img_files, labels, target_size, batch_size, augment=False):
    """Build an endless generator of ``(images, labels)`` batches read from disk.

    Args:
        img_files: Sequence of image file paths.
        labels: Sequence of labels aligned index-by-index with ``img_files``.
        target_size: ``(height, width)`` of the produced images (the Keras
            convention; it is swapped to ``(width, height)`` for ``cv2.resize``).
        batch_size: Maximum number of samples per batch; the last batch of
            each pass over the files may be smaller.
        augment: When True, every image receives a random rotation, shift,
            shear, zoom and horizontal flip before it is scaled.

    Returns:
        A generator yielding ``(batch_images, batch_labels)`` NumPy arrays.
        Images are float32, divided by 255, with channels in the order
        ``cv2.imread`` delivers them (BGR). The generator never terminates,
        so the consumer has to bound it (e.g. ``steps_per_epoch``).

    Raises:
        ValueError: If ``img_files`` is empty or its length differs from
            ``labels``; also raised while iterating if an image cannot be read.
    """
    if len(img_files) != len(labels):
        raise ValueError(
            f'img_files and labels differ in length: {len(img_files)} vs {len(labels)}'
        )
    if len(img_files) == 0:
        # An empty file list would otherwise make the generator spin forever
        raise ValueError('img_files is empty')

    # Pixel scaling is done explicitly in load_image, so the augmenter carries
    # only the geometric transforms (a `rescale` here would be unused).
    datagen = None
    if augment:
        datagen = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest'
        )

    height, width = target_size

    def load_image(path):
        """Read, resize, optionally augment and scale a single image."""
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(f'Could not read image: {path}')
        # cv2.resize takes its size argument as (width, height)
        img = cv2.resize(img, (width, height)).astype(np.float32)
        if datagen is not None:
            img = datagen.random_transform(img)
        return img / 255.0

    # Custom generator for images and labels
    def flow_from_files(files, labels, batch_size, shuffle=True, seed=42):
        # A private RNG created once: the order is reproducible, differs from
        # epoch to epoch, and the global NumPy random state is left untouched.
        rng = np.random.RandomState(seed)
        indices = np.arange(len(files))
        while True:
            if shuffle:
                rng.shuffle(indices)
            for start in range(0, len(files), batch_size):
                batch_indices = indices[start:start + batch_size]
                batch_images = np.array([load_image(files[idx]) for idx in batch_indices])
                batch_labels = np.array([labels[idx] for idx in batch_indices])
                yield batch_images, batch_labels

    return flow_from_files(img_files, labels, batch_size)

## 5. Data Preprocessing

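Apply preprocessing steps: convert BGR to grayscale, apply GaussianBlur, threshold, erode, dilate, and find contours. These steps are used only to draw a contour overlay for visualization; the image returned for classification is the original image, resized to the target size and scaled to the range [0, 1].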

In [None]:
def preprocess_image(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH)):
    """Load one image and return a network-ready copy plus a contour overlay.

    Args:
        img_path: Path of the image file to read.
        target_size: ``(height, width)`` of the returned classification image.

    Returns:
        A ``(processed_img, contour_img)`` tuple. ``processed_img`` is the
        original image resized to ``target_size`` and divided by 255 (channel
        order as loaded by OpenCV, i.e. BGR). ``contour_img`` is a full-size
        copy of the original with the external contours of the thresholded
        image drawn in green; it is meant for visualization only.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f'Could not read image: {img_path}')
    # Convert BGR to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply GaussianBlur
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # Apply thresholding
    _, thresh = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY)
    # Erode and dilate
    kernel = np.ones((3, 3), np.uint8)
    eroded = cv2.erode(thresh, kernel, iterations=1)
    dilated = cv2.dilate(eroded, kernel, iterations=1)
    # Find contours
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Draw contours on original image (for visualization)
    contour_img = img.copy()
    cv2.drawContours(contour_img, contours, -1, (0, 255, 0), 2)
    # Resize to target size. cv2.resize expects (width, height), so the
    # (height, width) tuple is swapped; this matters for non-square targets.
    height, width = target_size
    processed_img = cv2.resize(img, (width, height))  # Use original image for classification
    processed_img = processed_img / 255.0
    return processed_img, contour_img

## 6. Image Loading

Load and preprocess images from `yes` and `no` folders.

In [None]:
def _model_input(path):
    """Return only the network-ready image from preprocess_image's result pair."""
    return preprocess_image(path)[0]


# Run every file through the preprocessing step and stack the results; the
# contour overlay that preprocess_image also returns is not needed here.
images = np.array([_model_input(path) for path in image_files])
labels = np.array(labels)

print(f'Loaded images shape: {images.shape}')
print(f'Loaded labels shape: {labels.shape}')

## 7. Data Splitting (Train, Test, Validation)

Split the dataset into training, validation, and test sets (70% train, 15% validation, 15% test).

In [None]:
# Split data
X_train, X_temp, y_train, y_temp = train_test_split(images, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f'Training set: {X_train.shape[0]} images')
print(f'Validation set: {X_val.shape[0]} images')
print(f'Test set: {X_test.shape[0]} images')

## 8. CNN Model Building and Training

Define and train a CNN for binary classification.

In [None]:
def cnn_model(input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)):
    """Assemble the (uncompiled) CNN used for tumor / no-tumor classification.

    The network stacks three 3x3 convolution + 2x2 max-pooling stages with
    32, 64 and 128 filters, flattens the result, and passes it through a
    128-unit dense layer, 50% dropout and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input image, given to the first layer.

    Returns:
        The ``Sequential`` model; compiling is left to the caller.
    """
    net = Sequential()

    # The first convolution stage also declares the input shape
    net.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    # Remaining convolution stages, doubling the filter count each time
    for n_filters in (64, 128):
        net.add(Conv2D(n_filters, (3, 3), activation='relu'))
        net.add(MaxPooling2D(pool_size=(2, 2)))

    # Classification head
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))
    return net

# Build the network and configure it for binary classification
model = cnn_model()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary
model.summary()

# Fit on the training split; the validation split is evaluated after each epoch
EPOCHS = 20
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

## 9. Evaluation and Visualization

Evaluate the model on the test set and visualize training/validation metrics and sample predictions.

In [None]:
# Evaluate model on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Save model
model.save('brain_tumor_cnn_model.h5')

# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
for position, (metric, metric_title) in enumerate(
        [('accuracy', 'Accuracy'), ('loss', 'Loss')], start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric], label=f'Training {metric_title}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric_title}')
    plt.title(f'Model {metric_title}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_title)
    plt.legend()
plt.tight_layout()
plt.show()

# Visualize sample predictions. The count is capped by the test-set size so
# that a very small test split does not raise an IndexError.
n_samples = min(3, len(X_test))
predictions = model.predict(X_test[:n_samples])
plt.figure(figsize=(10, 5))
for i in range(n_samples):
    plt.subplot(1, n_samples, i+1)
    # The arrays hold OpenCV's BGR channel order; reverse it to RGB for display
    plt.imshow(X_test[i][..., ::-1])
    # predictions has one column (the sigmoid output); index it as a scalar
    pred_label = 'Tumor' if predictions[i, 0] > 0.5 else 'No Tumor'
    true_label = 'Tumor' if y_test[i] == 1 else 'No Tumor'
    plt.title(f'Pred: {pred_label}\nTrue: {true_label}')
    plt.axis('off')
plt.tight_layout()
plt.show()