# Data Preparation for Spine Degeneration Classification

This notebook prepares spine X-ray images for deep learning model training.

## Steps:
1. Configure dataset paths and image parameters
2. Load and explore the dataset
3. Visualize class distribution
4. Calculate class weights for imbalanced data
5. Set up data augmentation and create data generators
6. Visualize augmented training images

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

# Show the installed TensorFlow version (handy when reproducing this notebook)
print(f"TensorFlow version: {tf.__version__}")

## 1. Dataset Configuration

In [None]:
# Dataset root and its three split sub-folders
DATASET_DIR = Path('../dataset')
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    DATASET_DIR / split for split in ('train', 'val', 'test')
)

# Batch size and image size handed to the data generators
BATCH_SIZE = 32
IMG_SIZE = (224, 224)

# Human-readable severity labels, ordered by grade index
CLASS_LABELS = ['Healthy', 'Doubtful', 'Minimal', 'Moderate', 'Severe']

# Class sub-folder names: one 'grade_<index>' per label, in the same order
CLASS_NAMES = [f'grade_{grade}' for grade in range(len(CLASS_LABELS))]

## 2. Explore Dataset

In [None]:
# File suffixes treated as images. Intended to match the formats Keras'
# directory iterator loads, so counts agree with `generator.samples`.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')


def count_images(directory, class_names=None, extensions=IMAGE_EXTENSIONS):
    """Count image files in each class folder of one dataset split.

    Only regular files located directly inside a class folder whose suffix
    (case-insensitive) is listed in ``extensions`` are counted. Stray files
    such as ``Thumbs.db`` or ``notes.txt``, hidden files and sub-folders are
    ignored, so they cannot inflate the counts or skew the class weights.

    Args:
        directory: Split directory (``str`` or ``Path``) containing one
            sub-folder per class.
        class_names: Names of the class sub-folders to look for. Defaults to
            the notebook-level ``CLASS_NAMES``.
        extensions: Iterable of file suffixes, including the leading dot,
            that count as images.

    Returns:
        dict mapping every requested class name to its image count, in the
        order of ``class_names``. A class whose folder does not exist maps
        to 0.
    """
    directory = Path(directory)
    if class_names is None:
        class_names = CLASS_NAMES
    suffixes = {ext.lower() for ext in extensions}

    counts = {}
    for class_name in class_names:
        class_dir = directory / class_name
        if class_dir.is_dir():
            counts[class_name] = sum(
                1
                for entry in class_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in suffixes
            )
        else:
            counts[class_name] = 0
    return counts

# Tally the images of every split (one dict of per-class counts each)
train_counts = count_images(TRAIN_DIR)
val_counts = count_images(VAL_DIR)
test_counts = count_images(TEST_DIR)

divider = "=" * 50
print("Dataset Distribution:")
print(divider)

# One row per class; each split column takes its dict's values in insertion order
df = pd.DataFrame({'Class': CLASS_LABELS}).assign(
    Train=list(train_counts.values()),
    Validation=list(val_counts.values()),
    Test=list(test_counts.values()),
)
df['Total'] = df[['Train', 'Validation', 'Test']].sum(axis=1)

print(df)
print(divider)
print(f"Total images: {df['Total'].sum()}")

## 3. Visualize Class Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar chart: per-class image counts, one bar per split
df_plot = df.set_index('Class')[['Train', 'Validation', 'Test']]
df_plot.plot(kind='bar', ax=axes[0], color=['#3498db', '#2ecc71', '#e74c3c'])
axes[0].set_title('Image Distribution by Split', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Severity Grade', fontsize=12)
axes[0].set_ylabel('Number of Images', fontsize=12)
axes[0].legend(title='Split')
axes[0].grid(axis='y', alpha=0.3)
plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=45, ha='right')

# Pie chart: share of each class over all splits.
# Index by class instead of using groupby: groupby sorts its keys
# alphabetically, which would pair the wedge sizes with the wrong
# labels and colours (both are listed in grade order).
total_per_class = df.set_index('Class')['Total']
colors = ['#2ecc71', '#f39c12', '#3498db', '#e74c3c', '#9b59b6']
axes[1].pie(total_per_class, labels=list(total_per_class.index), autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Overall Class Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()

# Make sure the output folder exists; savefig does not create it
figure_path = Path('../assets/data.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Calculate Class Weights

Since the dataset may be imbalanced, we calculate class weights to give more importance to underrepresented classes during training.

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class training image counts, in CLASS_NAMES order
train_samples = np.array(list(train_counts.values()))

# 'balanced' weights are n_samples / (n_classes * class_count), so every class
# needs at least one training image. Fail early with an actionable message;
# scikit-learn would also raise ValueError here, but a less specific one.
empty_classes = [name for name, n in zip(CLASS_NAMES, train_samples) if n == 0]
if empty_classes:
    raise ValueError(
        f"No training images found for classes {empty_classes}; "
        f"check the class folders under {TRAIN_DIR}."
    )

# Rebuild a label vector from the counts (class index repeated count times)
# because compute_class_weight expects one label per sample.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(CLASS_NAMES)),
    y=np.repeat(np.arange(len(CLASS_NAMES)), train_samples)
)

# Keyed by class index, as expected by the `class_weight` argument of Keras `fit`
class_weight_dict = dict(enumerate(class_weights))

print("Class Weights:")
for class_name, weight in zip(CLASS_LABELS, class_weights):
    print(f"  {class_name:12s}: {weight:.3f}")

# Save class weights for later use.
# np.save pickles the dict into a 0-d object array, so reload it with
# np.load(path, allow_pickle=True).item()
weights_path = Path('../src/class_weights.npy')
weights_path.parent.mkdir(parents=True, exist_ok=True)
np.save(weights_path, class_weight_dict)

## 5. Data Augmentation Setup

In [None]:
# Training data generator with augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    brightness_range=[0.9, 1.1],
    fill_mode='nearest'
)

# Validation and test generators (no augmentation, only rescaling)
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by all three splits.
# `classes=CLASS_NAMES` pins the folder -> label-index mapping instead of
# letting Keras infer it from whichever sub-folders happen to exist. This keeps
# the one-hot labels aligned with CLASS_LABELS and the class weights even when
# a split is missing a class folder or contains an unrelated one.
flow_kwargs = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
)

# Create generators: only the training split is shuffled, so validation and
# test predictions stay in file order.
train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR, shuffle=True, **flow_kwargs
)

val_generator = val_test_datagen.flow_from_directory(
    VAL_DIR, shuffle=False, **flow_kwargs
)

test_generator = val_test_datagen.flow_from_directory(
    TEST_DIR, shuffle=False, **flow_kwargs
)

## 6. Visualize Augmented Images

In [None]:
# Get a batch of (images, one-hot labels) from the training generator
sample_batch = next(train_generator)
sample_images = sample_batch[0]
sample_labels = sample_batch[1]

# Plot some augmented images
fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.ravel()

# Hide the frame of every panel up front so that unused panels stay blank
for ax in axes:
    ax.axis('off')

# A batch can hold fewer images than there are panels (small dataset or a
# small BATCH_SIZE), so never index past the end of the batch.
n_show = min(len(axes), len(sample_images))
for i in range(n_show):
    axes[i].imshow(sample_images[i])
    label_idx = np.argmax(sample_labels[i])
    axes[i].set_title(f'Grade {label_idx}: {CLASS_LABELS[label_idx]}', fontsize=11)

plt.suptitle('Sample Augmented Training Images', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 7. Summary

- ✅ Dataset loaded and explored
- ✅ Class distribution visualized
- ✅ Class weights calculated for imbalanced data
- ✅ Data augmentation configured
- ✅ Data generators created

**Next Steps:**
- Train models using notebooks: `02_model_xception.ipynb`, `02_model_resnet50.ipynb`, `02_model_inception_resnet_v2.ipynb`
- Create ensemble model: `02_ensemble_models.ipynb`
- Evaluate on test set: `03_best_model_on_test.ipynb`

In [None]:
# Closing report: split sizes as seen by the generators plus the key settings
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "DATA PREPARATION COMPLETE!",
    banner,
    f"Total training images: {train_generator.samples}",
    f"Total validation images: {val_generator.samples}",
    f"Total test images: {test_generator.samples}",
    f"Number of classes: {len(CLASS_NAMES)}",
    f"Image size: {IMG_SIZE}",
    f"Batch size: {BATCH_SIZE}",
    banner,
]
print("\n".join(summary_lines))