---
# PART 1: SETUP & DATA PREPARATION
---

## 1.1) Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive; every path configured later in the
# notebook lives under this mount point.
drive.mount(mountpoint='/content/drive')
print('âœ“ Google Drive mounted successfully')

## 1.2) Imports

In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from PIL import Image
import sys
import io
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import AdamW
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Report the runtime up front so a missing GPU is noticed before training.
gpu_devices = tf.config.list_physical_devices("GPU")
print(f'TensorFlow version: {tf.__version__}')
print(f'GPU Available: {gpu_devices}')

## 1.3) Configure Paths
**Update BASE_PATH to match your Google Drive structure**

In [None]:
# Project root on Drive - UPDATE THIS to match your Google Drive structure
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/KaggleChallenge2'

# Inputs: everything is read from <BASE_PATH>/dataset
DATA_ROOT = os.path.join(BASE_PATH, 'dataset')
TRAIN_DATA_PATH, TEST_DATA_PATH, LABELS_FILE = (
    os.path.join(DATA_ROOT, leaf)
    for leaf in ('train_data', 'test_data', 'train_labels.csv')
)

# Outputs: preprocessed arrays and trained models
PREPROCESSED_DIR, MODELS_DIR = (
    os.path.join(BASE_PATH, leaf) for leaf in ('preprocessed_data', 'models')
)

# Make sure both output directories exist. exist_ok=True means an existing
# directory is reused as-is (its contents are NOT cleared).
for out_dir in (PREPROCESSED_DIR, MODELS_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Log file: stdout/stderr are tee'd to this file as well as the notebook output
LOG_FILE = os.path.join(BASE_PATH, 'pipeline_log.txt')

class Tee(io.TextIOBase):
    """Text stream that duplicates everything written to it onto several streams.

    Every ``write`` and ``flush`` is forwarded, in construction order, to each
    of the streams passed to the constructor.
    """

    def __init__(self, *streams):
        # Kept as the tuple of targets, in the order they were given.
        self.streams = streams

    def write(self, data):
        """Send ``data`` to every target stream; return ``len(data)``."""
        for target in self.streams:
            target.write(data)
        return len(data)

    def flush(self):
        """Flush every target stream."""
        for target in self.streams:
            target.flush()

# Remember the notebook's real stdout the first time this cell runs. Without
# this, re-running the cell would wrap the previous Tee inside a new Tee, so
# every line would be printed twice and still be sent to the stale log handle.
if 'ORIGINAL_STDOUT' not in globals():
    ORIGINAL_STDOUT = sys.stdout

# Close the log handle left open by an earlier run of this cell, if any.
if 'log_fh' in globals() and not log_fh.closed:
    log_fh.close()

# 'w' starts a fresh log on every run; buffering=1 is line-buffered so the file
# stays current during long training runs. UTF-8 is set explicitly because the
# pipeline prints non-ASCII characters.
log_fh = open(LOG_FILE, 'w', buffering=1, encoding='utf-8')
tee_stream = Tee(ORIGINAL_STDOUT, log_fh)
sys.stdout = tee_stream
sys.stderr = tee_stream  # stderr is merged into the same notebook + file stream
print(f'Logging to: {LOG_FILE}')
print(f'Run started: {datetime.now()}')

print('âœ“ Paths configured:')
print(f'  Dataset: {DATA_ROOT}')
print(f'  Preprocessed: {PREPROCESSED_DIR}')
print(f'  Models: {MODELS_DIR}')

## 1.4) Configuration Parameters

In [None]:
# --- Preprocessing ---
TARGET_SIZE = (260, 260)  # EfficientNetB2 optimal size
VAL_SPLIT = 0.30          # fraction of the labelled data held out for validation
RANDOM_STATE = 42         # seed for the train/validation split

# --- Training ---
BATCH_SIZE = 16
EPOCHS_PHASE1 = 18        # Frozen backbone
EPOCHS_PHASE2 = 20        # Fine-tuning
INITIAL_LR = 1e-3         # learning rate for phase 1
FINETUNE_LR = 1e-4        # learning rate for phase 2
UNFREEZE_LAYERS = 120     # Number of layers to unfreeze for fine-tuning
WEIGHT_DECAY = 1e-4
LABEL_SMOOTHING = 0.05

# Echo the settings that matter most so they end up in the run log.
print('âœ“ Configuration:')
for config_line in (
    f'  Image size: {TARGET_SIZE}',
    f'  Validation split: {VAL_SPLIT*100}%',
    f'  Batch size: {BATCH_SIZE}',
    f'  Phase 1 epochs: {EPOCHS_PHASE1} (frozen)',
    f'  Phase 2 epochs: {EPOCHS_PHASE2} (fine-tune)',
    f'  Label smoothing: {LABEL_SMOOTHING}',
    f'  Weight decay: {WEIGHT_DECAY}',
):
    print(config_line)

## 1.5) Load Labels

In [None]:
# Read the training labels table from LABELS_FILE
labels_df = pd.read_csv(LABELS_FILE)

banner = '=' * 60
print(banner)
print('TRAINING LABELS')
print(banner)
print(f'\nDataFrame shape: {labels_df.shape}')
print(f'\nFirst 5 rows:')
print(labels_df.head())

# Per-class sample counts; the largest/smallest ratio summarises the imbalance
print(f'\n\nClass distribution:')
class_counts = labels_df['label'].value_counts()
print(class_counts)
balance_ratio = class_counts.max() / class_counts.min()
print(f'\nClass balance ratio: {balance_ratio:.2f}:1')

## 1.6) Encode Labels

In [None]:
# Map each string label to an integer id (stored in a new 'label_encoded' column)
label_encoder = LabelEncoder()
labels_df['label_encoded'] = label_encoder.fit_transform(labels_df['label'])

# classes_ is indexed by encoded id, so class_names[i] is the name of class i
class_names = label_encoder.classes_
num_classes = len(class_names)

banner = '=' * 60
print(banner)
print('LABEL ENCODING')
print(banner)
print(f'Number of classes: {num_classes}')
print(f'\nClass mapping:')
# One pass over the encoded column gives the sample count of every class id
samples_per_class = np.bincount(labels_df['label_encoded'], minlength=num_classes)
for idx, (name, count) in enumerate(zip(class_names, samples_per_class)):
    print(f'  {idx}: {name} ({count} samples)')

## 1.7) Train/Validation Split

In [None]:
# Hold out VAL_SPLIT of the labelled rows for validation. Stratifying on the
# encoded label keeps class proportions the same in both parts; the fixed
# random_state makes the split repeatable.
train_df, val_df = train_test_split(
    labels_df,
    random_state=RANDOM_STATE,
    stratify=labels_df['label_encoded'],
    test_size=VAL_SPLIT,
)

banner = '=' * 60
print(banner)
print('TRAIN/VALIDATION SPLIT')
print(banner)
print(f'Training samples: {len(train_df)}')
print(f'Validation samples: {len(val_df)}')

# Show the class distribution of each part
for part_name, part_df in (('Training', train_df), ('Validation', val_df)):
    print(f'\n\n{part_name} class distribution:')
    print(part_df['label'].value_counts())

## 1.8) Define Preprocessing Function

In [None]:
def load_and_preprocess_image(image_path, target_size=TARGET_SIZE):
    """
    Load one image from disk and prepare it as EfficientNet input.

    The file is opened with PIL, converted to 3-channel RGB, resized to
    ``target_size`` with bilinear resampling, turned into a float32 array and
    passed through ``tf.keras.applications.efficientnet.preprocess_input``.

    Args:
        image_path: Path of the image file to read.
        target_size: Size tuple handed to ``PIL.Image.resize``.
            Defaults to ``TARGET_SIZE``.

    Returns:
        ``(image, True)`` on success, where ``image`` is the preprocessed
        float32 array; ``(None, False)`` if any step raises. The error is
        printed rather than propagated, so callers must check the flag.
    """
    try:
        # The context manager closes the underlying file as soon as the pixels
        # have been decoded, instead of leaving one open handle per image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB').resize(target_size, Image.BILINEAR)

        # Convert to array
        image = np.array(image, dtype=np.float32)

        preprocess_input = tf.keras.applications.efficientnet.preprocess_input
        image = preprocess_input(image)

        return image, True
    except Exception as e:
        # Best-effort loader: report the bad file and let the caller count it.
        print(f'Error loading {image_path}: {e}')
        return None, False

print('âœ“ Preprocessing function defined')

## 1.9) Preprocess Training Data
This will take a few minutes...

In [None]:
print('\n' + '='*60)
print('PREPROCESSING TRAINING DATA')
print('='*60)

# Pre-allocate the image tensor and label vector, plus a mask recording which
# rows were actually filled from a successfully loaded image.
X_train = np.zeros((len(train_df), *TARGET_SIZE, 3), dtype=np.float32)
y_train = np.zeros(len(train_df), dtype=np.int32)
train_loaded_mask = np.zeros(len(train_df), dtype=bool)

failed_count = 0

# Walk the two needed columns directly (cheaper than building a Series per row)
train_samples = zip(train_df['sample_index'], train_df['label_encoded'])
for array_idx, (sample_name, label_id) in enumerate(
        tqdm(train_samples, total=len(train_df), desc='Processing')):
    image_path = os.path.join(TRAIN_DATA_PATH, sample_name)
    image, success = load_and_preprocess_image(image_path)

    if success:
        X_train[array_idx] = image
        y_train[array_idx] = label_id
        train_loaded_mask[array_idx] = True
    else:
        failed_count += 1

# Drop rows whose image could not be loaded. Left in place they would be
# all-zero images carrying label 0, i.e. wrong training examples for class 0.
if failed_count:
    X_train = X_train[train_loaded_mask]
    y_train = y_train[train_loaded_mask]

print(f'\nâœ“ Training data preprocessed')
print(f'  Successful: {len(train_df) - failed_count}')
print(f'  Failed: {failed_count}')
print(f'  X_train shape: {X_train.shape}')
print(f'  y_train shape: {y_train.shape}')

## 1.10) Preprocess Validation Data

In [None]:
print('\n' + '='*60)
print('PREPROCESSING VALIDATION DATA')
print('='*60)

# Pre-allocate the image tensor and label vector, plus a mask recording which
# rows were actually filled from a successfully loaded image.
X_val = np.zeros((len(val_df), *TARGET_SIZE, 3), dtype=np.float32)
y_val = np.zeros(len(val_df), dtype=np.int32)
val_loaded_mask = np.zeros(len(val_df), dtype=bool)

failed_count = 0

# Validation images live in the training folder too: val_df is a slice of the
# labelled training set.
val_samples = zip(val_df['sample_index'], val_df['label_encoded'])
for array_idx, (sample_name, label_id) in enumerate(
        tqdm(val_samples, total=len(val_df), desc='Processing')):
    image_path = os.path.join(TRAIN_DATA_PATH, sample_name)
    image, success = load_and_preprocess_image(image_path)

    if success:
        X_val[array_idx] = image
        y_val[array_idx] = label_id
        val_loaded_mask[array_idx] = True
    else:
        failed_count += 1

# Drop rows whose image could not be loaded. Left in place they would be
# all-zero images carrying label 0 and would distort the validation metrics.
if failed_count:
    X_val = X_val[val_loaded_mask]
    y_val = y_val[val_loaded_mask]

print(f'\nâœ“ Validation data preprocessed')
print(f'  Successful: {len(val_df) - failed_count}')
print(f'  Failed: {failed_count}')
print(f'  X_val shape: {X_val.shape}')
print(f'  y_val shape: {y_val.shape}')

## 1.11) Convert Labels to One-Hot & Save Data
**One-hot encoding is needed for label smoothing**

In [None]:
# One-hot versions of the integer labels (the training loss uses label
# smoothing, which needs categorical targets)
y_train_onehot = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val_onehot = keras.utils.to_categorical(y_val, num_classes=num_classes)

print('\n' + '='*60)
print('SAVING PREPROCESSED DATA')
print('='*60)

# Write each array to <PREPROCESSED_DIR>/<name>.npy (existing files are overwritten)
arrays_to_save = (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
)
for array_name, array in arrays_to_save:
    np.save(os.path.join(PREPROCESSED_DIR, f'{array_name}.npy'), array)

# Facts about this preprocessing run, stored next to the arrays
metadata = dict(
    target_size=TARGET_SIZE,
    num_classes=num_classes,
    class_names=class_names.tolist(),
    train_size=len(train_df),
    val_size=len(val_df),
    val_split=VAL_SPLIT,
    random_state=RANDOM_STATE,
)

# Pickle the metadata and the fitted label encoder
for pickle_name, payload in (('metadata.pkl', metadata),
                             ('label_encoder.pkl', label_encoder)):
    with open(os.path.join(PREPROCESSED_DIR, pickle_name), 'wb') as f:
        pickle.dump(payload, f)

print('âœ“ All data saved to:', PREPROCESSED_DIR)
for saved_file in ('X_train.npy', 'X_val.npy', 'y_train.npy', 'y_val.npy',
                   'metadata.pkl', 'label_encoder.pkl'):
    print(f'  - {saved_file}')

print(f'\nâœ“ Labels converted to one-hot encoding')
print(f'  y_train_onehot shape: {y_train_onehot.shape}')
print(f'  y_val_onehot shape: {y_val_onehot.shape}')

---
# PART 2: MODEL BUILDING & TRAINING
---

## 2.1) Calculate Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate 'balanced' class weights for the imbalanced dataset.
# compute_class_weight returns one weight per entry of `classes`, in that order.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_train
)
# Key each weight by its actual class id. dict(enumerate(...)) keys by position
# and would assign weights to the wrong classes if any class id were missing
# from y_train.
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(present_classes, class_weights)
}

print('='*60)
print('CLASS WEIGHTS (for imbalanced data)')
print('='*60)
for class_idx, weight in class_weights_dict.items():
    class_name = class_names[class_idx]
    count = np.sum(y_train == class_idx)
    print(f'{class_name:20s} (class {class_idx}): weight={weight:.3f}, count={count}')

## 2.2) Build Model with Data Augmentation

In [None]:
def build_model(input_shape, num_classes):
    """
    Assemble the EfficientNetB2 transfer-learning classifier.

    Graph, in order: input -> random augmentation layers -> EfficientNetB2
    with ImageNet weights (no top, average pooling, ``trainable = False``) ->
    BatchNormalization -> Dense(256, relu) -> Dropout(0.5) -> Dense(128, relu)
    -> Dropout(0.3) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of a single input image (no batch dimension).
        num_classes: Number of units in the softmax output layer.

    Returns:
        ``(model, base_model)``: the full Keras model and the EfficientNetB2
        model object, returned separately so its layers can be unfrozen later.
    """
    inputs = layers.Input(shape=input_shape)

    # Data augmentation (applied during training only). Each layer is created
    # and applied in turn, in the order listed.
    augmentation_specs = (
        (layers.RandomFlip, ("horizontal_and_vertical",)),
        (layers.RandomRotation, (0.12,)),
        (layers.RandomZoom, (0.12,)),
        (layers.RandomContrast, (0.15,)),
        (layers.RandomTranslation, (0.05, 0.05)),
        (layers.RandomBrightness, (0.1,)),
    )
    augmented = inputs
    for layer_cls, layer_args in augmentation_specs:
        augmented = layer_cls(*layer_args)(augmented)

    # Pre-trained backbone, fed by the augmented tensor and frozen for now
    base_model = tf.keras.applications.EfficientNetB2(
        input_tensor=augmented,
        weights='imagenet',
        include_top=False,
        pooling='avg',
    )
    base_model.trainable = False

    # Classification head: BatchNorm, then two Dense+Dropout stages
    features = layers.BatchNormalization()(base_model.output)
    for units, drop_rate in ((256, 0.5), (128, 0.3)):
        features = layers.Dense(units, activation='relu')(features)
        features = layers.Dropout(drop_rate)(features)
    outputs = layers.Dense(num_classes, activation='softmax')(features)

    return keras.Model(inputs=inputs, outputs=outputs), base_model

# Seed Python, NumPy and TensorFlow before any layer is created, so that weight
# initialisation, random augmentation and batch shuffling are repeatable from
# run to run (RANDOM_STATE was otherwise only used for the data split).
keras.utils.set_random_seed(RANDOM_STATE)

# Build model
input_shape = (*TARGET_SIZE, 3)
model, base_model = build_model(input_shape, num_classes)

print('='*60)
print('MODEL ARCHITECTURE')
print('='*60)
print(f'Input shape: {input_shape}')
print(f'Number of classes: {num_classes}')
print(f'Total parameters: {model.count_params():,}')
print(f'Base model (EfficientNetB2) layers: {len(base_model.layers)}')
print(f'\nâœ“ Model built successfully')

## 2.3) Phase 1 - Train with Frozen Backbone

In [None]:
print('\n' + '='*60)
print('PHASE 1: TRAINING WITH FROZEN BACKBONE')
print('='*60)

# Optimizer and loss for the frozen-backbone phase. CategoricalCrossentropy is
# used (with one-hot targets) so label smoothing can be applied.
phase1_optimizer = AdamW(learning_rate=INITIAL_LR, weight_decay=WEIGHT_DECAY)
phase1_loss = keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTHING)
model.compile(optimizer=phase1_optimizer, loss=phase1_loss, metrics=['accuracy'])

# Callbacks:
#  - checkpoint: keep only the weights with the best val_accuracy so far
#  - early stopping: stop after 5 epochs without val_loss improvement and
#    restore the best weights
#  - LR schedule: halve the learning rate after 3 stagnant epochs (floor 1e-7)
phase1_checkpoint = ModelCheckpoint(
    filepath=os.path.join(MODELS_DIR, 'best_model_phase1.h5'),
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
phase1_early_stop = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)
phase1_lr_schedule = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1
)
callbacks_phase1 = [phase1_checkpoint, phase1_early_stop, phase1_lr_schedule]

# Fit on the one-hot labels; class weights compensate for class imbalance
history_phase1 = model.fit(
    X_train,
    y_train_onehot,
    validation_data=(X_val, y_val_onehot),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS_PHASE1,
    callbacks=callbacks_phase1,
    class_weight=class_weights_dict,
    verbose=1,
)

print('\nâœ“ Phase 1 training complete')

## 2.4) Phase 2 - Fine-tune Top Layers

In [None]:
print('\n' + '='*60)
print('PHASE 2: FINE-TUNING TOP LAYERS')
print('='*60)

# Unfreeze the top UNFREEZE_LAYERS layers of the base model.
base_model.trainable = True
total_layers = len(base_model.layers)
# Clamp at 0: if UNFREEZE_LAYERS exceeded the layer count, a negative slice
# bound would freeze almost every layer instead of none.
freeze_until = max(0, total_layers - UNFREEZE_LAYERS)

for layer in base_model.layers[:freeze_until]:
    layer.trainable = False

# Keep BatchNormalization layers frozen inside the unfrozen range as well.
# Trainable BN layers would re-estimate their moving statistics on the small
# fine-tuning batches and can undo what the pre-trained backbone learned.
frozen_bn_count = 0
for layer in base_model.layers[freeze_until:]:
    if isinstance(layer, layers.BatchNormalization):
        layer.trainable = False
        frozen_bn_count += 1

print(f'Total base model layers: {total_layers}')
print(f'Freezing layers: 0 to {freeze_until-1}')
print(f'Unfreezing layers: {freeze_until} to {total_layers-1} ({total_layers - freeze_until} layers)')
print(f'  BatchNormalization layers kept frozen in that range: {frozen_bn_count}')

# Recompile so the new trainable flags take effect, with the lower fine-tuning
# learning rate (CategoricalCrossentropy is used for label smoothing).
model.compile(
    optimizer=AdamW(learning_rate=FINETUNE_LR, weight_decay=WEIGHT_DECAY),
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTHING),
    metrics=['accuracy']
)

print(f'Trainable parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}')

# Callbacks for phase 2: checkpoint the best val_accuracy, stop early on
# stagnant val_loss (restoring the best weights), halve the LR on plateaus.
callbacks_phase2 = [
    ModelCheckpoint(
        os.path.join(MODELS_DIR, 'best_model.h5'),
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    ),
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-8,
        verbose=1
    )
]

# Train with one-hot encoded labels
history_phase2 = model.fit(
    X_train, y_train_onehot,
    validation_data=(X_val, y_val_onehot),
    epochs=EPOCHS_PHASE2,
    batch_size=BATCH_SIZE,
    class_weight=class_weights_dict,
    callbacks=callbacks_phase2,
    verbose=1
)

print('\nâœ“ Phase 2 training complete')

## 2.5) Save Final Model & Training Info
**This will overwrite existing model files**

In [None]:
print('\n' + '='*60)
print('SAVING MODEL & TRAINING INFO')
print('='*60)

# Persist the model as it stands after phase 2 (overwrites an existing file).
# best_model.h5 was already written by the phase 2 checkpoint callback.
final_model_path = os.path.join(MODELS_DIR, 'final_model.h5')
model.save(final_model_path)
print('âœ“ Final model saved to: models/final_model.h5')
print('âœ“ Best model saved to: models/best_model.h5')

# Configured hyper-parameters plus the per-epoch metric histories of both phases
training_info = dict(
    class_names=class_names.tolist(),
    num_classes=num_classes,
    input_shape=input_shape,
    batch_size=BATCH_SIZE,
    epochs_phase1=EPOCHS_PHASE1,
    epochs_phase2=EPOCHS_PHASE2,
    initial_lr=INITIAL_LR,
    finetune_lr=FINETUNE_LR,
    history_phase1=history_phase1.history,
    history_phase2=history_phase2.history,
)

training_info_path = os.path.join(MODELS_DIR, 'training_info.pkl')
with open(training_info_path, 'wb') as f:
    pickle.dump(training_info, f)

print('âœ“ Training info saved to: models/training_info.pkl')

## 2.6) Evaluate on Validation Set

In [None]:
print('\n' + '='*60)
print('FINAL VALIDATION EVALUATION')
print('='*60)

# Reload the checkpoint written by the phase 2 ModelCheckpoint callback
best_model_path = os.path.join(MODELS_DIR, 'best_model.h5')
best_model = keras.models.load_model(best_model_path)

# Overall loss and accuracy on the validation set
val_loss, val_accuracy = best_model.evaluate(X_val, y_val_onehot, verbose=0)

print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)')

# Predicted class id for every validation image
y_val_pred = best_model.predict(X_val, verbose=0)
y_val_pred_classes = y_val_pred.argmax(axis=1)

# Per-class accuracy: of the samples whose true class is i, the fraction
# predicted as i. Classes with no validation samples are skipped.
print(f'\nPer-class accuracy:')
for class_idx in range(num_classes):
    mask = (y_val == class_idx)
    count = mask.sum()
    if not count > 0:
        continue
    class_hits = (y_val_pred_classes[mask] == class_idx)
    class_acc = class_hits.mean()
    class_name = class_names[class_idx]
    print(f'  {class_name:20s}: {class_acc:.4f} ({class_acc*100:.2f}%) - {count} samples')

## 2.7) Plot Training History

In [None]:
# Number of epochs phase 1 actually ran. EarlyStopping can end it before
# EPOCHS_PHASE1, so the configured value would put the "Fine-tuning starts"
# marker in the wrong place.
phase1_epochs_run = len(history_phase1.history['loss'])

# Concatenate the per-epoch metrics of both phases into one timeline
all_history = {
    metric: history_phase1.history[metric] + history_phase2.history[metric]
    for metric in ('loss', 'val_loss', 'accuracy', 'val_accuracy')
}

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One panel per metric: train curve, validation curve, and a vertical marker
# at the first fine-tuning epoch.
panels = (
    (axes[0], 'loss', 'Loss'),
    (axes[1], 'accuracy', 'Accuracy'),
)
for ax, metric, title in panels:
    ax.plot(all_history[metric], label=f'Train {title}', linewidth=2)
    ax.plot(all_history[f'val_{metric}'], label=f'Val {title}', linewidth=2)
    ax.axvline(x=phase1_epochs_run, color='red', linestyle='--', label='Fine-tuning starts')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    ax.set_title(f'Training & Validation {title}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(MODELS_DIR, 'training_history.png'), dpi=150, bbox_inches='tight')
plt.show()

print('\nâœ“ Training history plot saved to: models/training_history.png')

---
# PART 3: GENERATE PREDICTIONS
---

## 3.1) Load Test Image List

In [None]:
# Collect the test image filenames (those starting with 'img_') in sorted
# order; this order defines the row order of the predictions and submission.
test_files = sorted(
    entry for entry in os.listdir(TEST_DATA_PATH) if entry.startswith('img_')
)

banner = '=' * 60
print(banner)
print('TEST DATA INFO')
print(banner)
print(f'Total test images: {len(test_files)}')
print(f'First 5: {test_files[:5]}')
print(f'Last 5: {test_files[-5:]}')

## 3.2) Preprocess Test Images
This may take a few minutes...

In [None]:
print('\n' + '='*60)
print('PREPROCESSING TEST IMAGES')
print('='*60)

# One pre-allocated row per test file, in test_files order
X_test = np.zeros((len(test_files), *TARGET_SIZE, 3), dtype=np.float32)

failed_count = 0

# Fill row idx from file idx. A file that fails to load keeps its all-zero
# row, so it still gets a prediction and a line in the submission.
for idx, filename in enumerate(tqdm(test_files, desc='Processing')):
    image_path = os.path.join(TEST_DATA_PATH, filename)
    image, success = load_and_preprocess_image(image_path)

    if not success:
        failed_count += 1
        continue
    X_test[idx] = image

print(f'\nâœ“ Test data preprocessed')
print(f'  Successful: {len(test_files) - failed_count}')
print(f'  Failed: {failed_count}')
print(f'  X_test shape: {X_test.shape}')

## 3.3) Make Predictions

In [None]:
print('\n' + '='*60)
print('MAKING PREDICTIONS')
print('='*60)

# Class probabilities from the best checkpoint, then the most likely class id
predictions = best_model.predict(X_test, batch_size=BATCH_SIZE, verbose=1)
predicted_classes = predictions.argmax(axis=1)

# Map class ids back to the original label names
predicted_labels = label_encoder.inverse_transform(predicted_classes)

total_predictions = len(predicted_labels)
print(f'\nâœ“ Predictions complete')
print(f'  Total predictions: {total_predictions}')

# How often each label was predicted, as a count and a share of all predictions
print(f'\nPrediction distribution:')
unique, counts = np.unique(predicted_labels, return_counts=True)
for label, count in zip(unique, counts):
    percentage = (count / total_predictions) * 100
    print(f'  {label}: {count} ({percentage:.1f}%)')

## 3.4) Create & Save Submission File
**This will overwrite existing submission.csv**

In [None]:
# One row per test file, in test_files order, paired with its predicted label
submission_df = pd.DataFrame(dict(sample_index=test_files, label=predicted_labels))

# Write the CSV without the index column (overwrites an existing file)
submission_path = os.path.join(BASE_PATH, 'submission.csv')
submission_df.to_csv(submission_path, index=False)

print('\n' + '='*60)
print('SUBMISSION FILE CREATED')
print('='*60)
print(f'âœ“ Saved to: {submission_path}')
# Preview both ends of the file
for which_end, preview in (('First', submission_df.head(10)),
                           ('Last', submission_df.tail(10))):
    print(f'\n{which_end} 10 rows:')
    print(preview)
print(f'\nTotal rows: {len(submission_df)}')
print(f'\nâœ“ Ready to submit to Kaggle!')

---
# PIPELINE COMPLETE! 🎉
---

**Next Steps:**
1. Download `submission.csv` from your Google Drive
2. Submit to Kaggle
3. Check your F1-score on the leaderboard

**Expected Results:**
- Validation Accuracy: >50%
- Kaggle F1-Score: >0.35 (improvement from 0.2544)

**Files Generated (all in your Drive):**
- `preprocessed_data/`: X_train.npy, X_val.npy, y_train.npy, y_val.npy, metadata.pkl, label_encoder.pkl
- `models/`: best_model_phase1.h5, best_model.h5, final_model.h5, training_info.pkl, training_history.png
- `submission.csv`: Ready for Kaggle submission
- `pipeline_log.txt`: Complete execution log