In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss


import matplotlib.pyplot as plt
import seaborn as sns


from itertools import combinations
import random

from PIL import Image


import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models # Keep for potential future use
from torch.optim.lr_scheduler import CosineAnnealingLR # Import scheduler

import collections
from collections.abc import Callable, Sequence
from concurrent import futures
from datetime import datetime
from functools import partial
import math
import multiprocessing
import os
from typing import Any, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from PIL import Image
import scipy
import seaborn as sns
import skimage as ski
from matplotlib import pyplot as plt
import matplotlib.patheffects as pe
from tqdm.notebook import tqdm


In [None]:
# Root folder of the competition data. File names are appended directly to this
# string, so the trailing slash must be kept.
# Alternative dataset location, kept for reference:
# path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
path = "/kaggle/input/inundata-mapping-floods-in-south-africa/"

# Tabular inputs.
train = pd.read_csv(f"{path}Train.csv")
test = pd.read_csv(f"{path}Test.csv")
submission = pd.read_csv(f"{path}SampleSubmission.csv")

# NumPy .npz archive shipped with the competition data.
images = np.load(f"{path}composite_images.npz")

display(train.head(), test.head())

In [None]:
def get_location(value):
  """Return the first two underscore-separated tokens of ``value``, joined by ``_``.

  Raises IndexError when ``value`` contains fewer than two tokens.
  """
  tokens = value.split("_")
  return f"{tokens[0]}_{tokens[1]}"

def get_event_id(value):
  """Return the fourth underscore-separated token of ``value``.

  Raises IndexError when ``value`` contains fewer than four tokens.
  """
  tokens = value.split("_")
  return tokens[3]
# Add the derived key columns to both frames (modifies `train` and `test` in place).
for df in (train, test):
  df['location_id'] = df['event_id'].apply(get_location)
  df['event'] = df['event_id'].apply(get_event_id)

# Distinct locations per split, and how many of them the two splits share.
train_locations = set(train['location_id'])
test_locations = set(test['location_id'])
print(len(train_locations), len(test_locations))
print(len(train_locations & test_locations))
print(len(images))
display(train.head(), test.head())

In [None]:
# Number of event rows recorded for each location.
train.groupby('location_id')['event_id'].count()

In [None]:
# Separate the training data into train and validation groups: an 80/20 split,
# stratified on the 'label' column, with a fixed seed so the partition is repeatable.
# NOTE(review): `train_grouped` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is created elsewhere.
# Confirm where it comes from. FloodEventDataset reads the 'location_id' and
# 'label' columns of the frames produced here.
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    train_grouped, test_size=0.2, random_state=42, stratify=train_grouped['label']
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

In [None]:
# ---- Notebook-wide configuration ----
IMG_SIZE = (224, 224)  # Standard input size for many CNNs
BATCH_SIZE = 32        # samples per DataLoader batch
EPOCHS = 20            # training epochs

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Image-type names; the classifier builds one feature extractor per entry.
IMAGE_TYPES = ["moisture-stress", "nir11slope"]

In [None]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset

class FloodEventDataset(Dataset):
    """Per-location dataset: one image from every image-type folder plus a label.

    Expected layout on disk: ``<root_dir>/<type_folder>/<location_id>.png``, with
    the same file name present in every type folder.

    Args:
        root_dir: Directory that contains the image-type sub-folders.
        labels_df: DataFrame with a ``location_id`` column and a ``label`` column.
            If a location id appears more than once, the last row wins.
        transform: Optional callable applied to each PIL image. It must return
            tensors of identical shape, because the per-type results are
            combined with ``torch.stack``.
        type_folders: Optional sequence of sub-folder names, one per image type.
            Defaults to ``DEFAULT_TYPE_FOLDERS``.

    Raises:
        ValueError: If ``type_folders`` is empty.
    """

    DEFAULT_TYPE_FOLDERS = ("moisture-stress", "nir11slope")
    _EXTENSION = ".png"

    def __init__(self, root_dir, labels_df, transform=None, type_folders=None):
        self.root_dir = root_dir
        # Despite the name, this is a plain dict: str(location_id) -> label.
        label_by_location = labels_df.set_index('location_id')['label'].to_dict()
        self.labels_df = {str(k): v for k, v in label_by_location.items()}
        self.transform = transform

        if type_folders is None:
            type_folders = self.DEFAULT_TYPE_FOLDERS
        self.type_folders = list(type_folders)
        if not self.type_folders:
            raise ValueError("type_folders must contain at least one folder name")

        # The first folder is the reference listing; the other folders are
        # assumed to hold files with the same names.
        folder_path = os.path.join(root_dir, self.type_folders[0])

        # os.listdir returns entries in arbitrary order; sorting makes sample
        # indices reproducible across runs and machines.
        self.event_filenames = sorted(
            f for f in os.listdir(folder_path)
            if f.endswith(self._EXTENSION) and self._location_id(f) in self.labels_df
        )

    @classmethod
    def _location_id(cls, filename):
        """Strip the ``.png`` suffix; used for both filtering and label lookup."""
        return filename[:-len(cls._EXTENSION)]

    def __len__(self):
        return len(self.event_filenames)

    def __getitem__(self, idx):
        """Return ``(stacked_images, label)`` for the sample at position ``idx``.

        ``stacked_images`` stacks the transformed image of every type folder
        along a new leading dimension; ``label`` is a float32 scalar tensor.
        """
        filename = self.event_filenames[idx]
        images = []

        for folder in self.type_folders:
            img_path = os.path.join(self.root_dir, folder, filename)
            # Close the file handle promptly and force a 3-channel image so
            # every type yields the same channel count regardless of PNG mode.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")
            if self.transform:
                img = self.transform(img)
            images.append(img)

        # Shape: (len(type_folders), C, H, W) when the transform returns (C, H, W).
        stacked_images = torch.stack(images)
        label = torch.tensor(self.labels_df[self._location_id(filename)], dtype=torch.float32)
        return stacked_images, label


In [None]:
# Separate the training data into train and validation groups (80/20 split,
# stratified on 'label', fixed seed).
# NOTE(review): this cell repeats, statement for statement, the split performed
# in an earlier cell of this notebook; with the same `random_state` it should
# reproduce the same partition. Consider removing one of the two.
# NOTE(review): `train_grouped` is not defined in any cell visible in this
# notebook. Confirm where it is created.
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    train_grouped, test_size=0.2, random_state=42, stratify=train_grouped['label']
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

In [None]:
from torchvision import transforms

# Preprocessing shared by the train and validation datasets: resize every image
# to the configured input size, then convert it to a tensor.
# NOTE(review): no mean/std normalisation is applied here, while the model's
# feature extractors load torchvision's pretrained ResNet18 weights. Confirm
# whether the omission is intentional.
transform = transforms.Compose([
    transforms.Resize(IMG_SIZE),  # use the shared constant instead of a repeated literal
    transforms.ToTensor()
])

# `train_df` must provide the columns FloodEventDataset reads: 'location_id' and 'label'.
train_dataset = FloodEventDataset(root_dir='/kaggle/input', labels_df=train_df, transform=transform)
print(len(train_dataset))


In [None]:
# Validation counterpart of the training dataset: same image root and transform,
# but labelled by (and therefore restricted to) the locations in `val_df`.
validation_dataset = FloodEventDataset(
    root_dir='/kaggle/input',
    labels_df=val_df,
    transform=transform,
)
print(len(validation_dataset))

In [None]:
# Sanity check: fetch one sample directly from the dataset (no DataLoader is
# involved here) and inspect what __getitem__ returns.
imgs, label = validation_dataset[0]

print(imgs.shape)  # expected: torch.Size([2, 3, 224, 224]) — (image types, channels, height, width)
print(label)       # expected: tensor(0.) or tensor(1.)

In [None]:
# image classification model

class FloodClassifier(nn.Module):
    """Binary classifier over a stack of images, one image per image type.

    Each image type has its own frozen ResNet18 feature extractor. The per-type
    feature vectors are concatenated and fed to a small fully connected head
    that ends in a sigmoid, so ``forward`` returns probabilities in ``[0, 1]``.

    Args:
        num_image_types: Number of image types, which is also the number of
            feature extractors and the expected size of dimension 1 of the input.
        pretrained: Whether to load torchvision's pretrained ResNet18 weights.

    Raises:
        ValueError: If ``num_image_types`` is smaller than 1.
    """

    def __init__(self, num_image_types=5, pretrained=True):
        super().__init__()
        if num_image_types < 1:
            raise ValueError("num_image_types must be at least 1")

        # Feature extractor (using ResNet18), one per image type
        self.feature_extractors = nn.ModuleList([
            self._create_feature_extractor(pretrained) for _ in range(num_image_types)
        ])

        # Calculate feature size (ResNet18 produces 512 features)
        feature_size = 512 * num_image_types

        # Classification layers (layer order is kept so state_dict keys are unchanged)
        self.classifier = nn.Sequential(
            nn.Linear(feature_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def _create_feature_extractor(self, pretrained):
        """Build a frozen ResNet18 without its final fully connected layer."""
        if hasattr(models, "ResNet18_Weights"):
            # Newer torchvision: the `pretrained` flag is deprecated in favour of `weights`.
            weights = models.ResNet18_Weights.DEFAULT if pretrained else None
            model = models.resnet18(weights=weights)
        else:
            # Older torchvision releases only understand `pretrained`.
            model = models.resnet18(pretrained=pretrained)

        # Remove final classification layer
        feature_extractor = nn.Sequential(*list(model.children())[:-1])

        # Freeze the feature extractor
        for param in feature_extractor.parameters():
            param.requires_grad = False
        feature_extractor.eval()

        return feature_extractor

    def train(self, mode=True):
        """Set the training mode, keeping the frozen extractors in eval mode.

        Freezing parameters does not stop BatchNorm layers from updating their
        running statistics in training mode, so the extractors are forced back
        to eval mode whenever the module's mode changes.
        """
        super().train(mode)
        self.feature_extractors.eval()
        return self

    def forward(self, x):
        """Return flood probabilities of shape ``[batch_size, 1]``.

        Args:
            x: Tensor of shape ``[batch_size, num_image_types, channels, height, width]``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional or its second dimension
                differs from the number of feature extractors.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected a 5-D input [batch, types, C, H, W], got {x.dim()} dimensions"
            )
        num_imgs = x.shape[1]
        if num_imgs != len(self.feature_extractors):
            raise ValueError(
                f"input has {num_imgs} image types but the model was built "
                f"for {len(self.feature_extractors)}"
            )

        # Run the i-th image type through its own extractor and flatten
        # [batch_size, 512, 1, 1] -> [batch_size, 512].
        features = [
            torch.flatten(extractor(x[:, i]), start_dim=1)
            for i, extractor in enumerate(self.feature_extractors)
        ]

        # Concatenate all features: [batch_size, 512 * num_image_types]
        combined_features = torch.cat(features, dim=1)

        # Apply classifier
        return self.classifier(combined_features)


In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually seen, or
        ``(0.0, 0.0)`` when the loader yields nothing.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(is_training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device).view(-1, 1)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # The criterion returns a batch mean; re-weight it by batch size so
            # a smaller final batch does not skew the epoch average.
            loss_sum += loss.item() * batch_size
            predicted = (outputs >= 0.5).float()
            correct += (predicted == labels).sum().item()
            seen += batch_size

    if seen == 0:
        return 0.0, 0.0
    return loss_sum / seen, correct / seen


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20, device=None):
    """Train ``model`` and restore the weights of the best validation epoch.

    Args:
        model: Module whose output is a probability per sample, shape ``[batch, 1]``.
        train_loader: Iterable of ``(inputs, labels)`` batches used for optimisation.
        val_loader: Iterable of ``(inputs, labels)`` batches used for model selection.
        criterion: Loss taking ``(outputs, labels)`` and returning a scalar tensor.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device that batches are moved to. Defaults to the notebook-level
            ``DEVICE`` constant when omitted.

    Returns:
        ``(model, history)`` where ``history`` maps ``'train_loss'``,
        ``'val_loss'``, ``'train_acc'`` and ``'val_acc'`` to per-epoch lists.
        ``model`` carries the weights of the epoch with the lowest validation
        loss. If no epoch produced a comparable loss (for example
        ``num_epochs=0``), the current weights are left untouched.
    """
    import copy  # local import keeps this cell self-contained

    if device is None:
        device = DEVICE

    # Track best model
    best_val_loss = float('inf')
    best_model_weights = None

    # For tracking metrics
    history = {
        'train_loss': [], 'val_loss': [],
        'train_acc': [], 'val_acc': []
    }

    for epoch in range(num_epochs):
        epoch_loss, epoch_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_epoch_loss, val_epoch_acc = _run_epoch(model, val_loader, criterion, device)

        # Save metrics
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(val_epoch_loss)
        history['train_acc'].append(epoch_acc)
        history['val_acc'].append(val_epoch_acc)

        print(f'Epoch {epoch+1}/{num_epochs} - '
              f'Loss: {epoch_loss:.4f} - Acc: {epoch_acc:.4f} - '
              f'Val Loss: {val_epoch_loss:.4f} - Val Acc: {val_epoch_acc:.4f}')

        # Save best model. state_dict() returns tensors that share storage with
        # the live parameters, so a shallow copy would be overwritten by later
        # epochs; a deep copy takes a real snapshot.
        if val_epoch_loss < best_val_loss:
            best_val_loss = val_epoch_loss
            best_model_weights = copy.deepcopy(model.state_dict())

    # Load best model weights (nothing to restore if no epoch qualified)
    if best_model_weights is not None:
        model.load_state_dict(best_model_weights)

    return model, history

In [None]:
def plot_training_history(history):
    """Draw train-vs-validation curves for accuracy (left) and loss (right).

    ``history`` must provide the keys ``'train_acc'``, ``'val_acc'``,
    ``'train_loss'`` and ``'val_loss'``, each holding one value per epoch.
    """
    # (train key, validation key, metric name shown in labels and titles)
    panels = (
        ('train_acc', 'val_acc', 'Accuracy'),
        ('train_loss', 'val_loss', 'Loss'),
    )

    plt.figure(figsize=(12, 4))
    for position, (train_key, val_key, metric) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history[train_key], label=f'Train {metric}')
        plt.plot(history[val_key], label=f'Validation {metric}')
        plt.title(f'Model {metric}')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
def train_flood_classifier(train_dataset, val_dataset):
    """Train a FloodClassifier on the given datasets, plot the curves and save it.

    Args:
        train_dataset: Dataset of ``(stacked_images, label)`` samples used for
            optimisation; batches are shuffled.
        val_dataset: Dataset of the same form used for validation and for
            selecting the best epoch.

    Returns:
        The trained model, already moved to ``DEVICE``.

    Image preprocessing is whatever transform the datasets were built with;
    no additional transform is applied here.
    """
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    # Use the dataset passed by the caller, not a notebook-level global.
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize model: one feature extractor per configured image type
    model = FloodClassifier(num_image_types=len(IMAGE_TYPES), pretrained=True)
    model = model.to(DEVICE)

    # Set up loss function and optimizer. BCELoss matches the sigmoid output of
    # the model; only parameters that still require gradients are optimised
    # (the feature extractors are frozen).
    criterion = nn.BCELoss()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=0.0001)

    # Train model
    model, history = train_model(
        model, train_loader, val_loader, criterion, optimizer, num_epochs=EPOCHS
    )

    # Plot training history
    plot_training_history(history)

    # Save model
    torch.save(model.state_dict(), '/kaggle/working/flood_classifier_pytorch.pth')
    print("Model saved as 'flood_classifier_pytorch.pth'")

    return model

In [None]:
# Train the classifier on the train/validation datasets and keep the fitted model.
model = train_flood_classifier(train_dataset, validation_dataset)

In [None]:
def _dataset_location_ids(dataset, fallback_df):
    """Return location ids in the order in which ``dataset`` yields its samples."""
    filenames = getattr(dataset, 'event_filenames', None)
    if filenames is not None:
        # The dataset's own file list defines the sample order; drop the extension.
        return [os.path.splitext(name)[0] for name in filenames]
    # Without a file list, assume the frame's rows follow the dataset order.
    return fallback_df['location_id'].tolist()


def _predict_split(model, dataset, location_df, split_name, batch_size, device):
    """Score every sample of ``dataset`` and return one row per sample."""
    # shuffle=False so predictions stay aligned with the dataset's sample order.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    probs = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs.to(device))
            probs.extend(outputs.cpu().numpy().flatten())
            true_labels.extend(labels.cpu().numpy())

    location_ids = _dataset_location_ids(dataset, location_df)
    if len(location_ids) != len(probs):
        raise ValueError(
            f"{split_name}: got {len(probs)} predictions for {len(location_ids)} location ids"
        )

    return pd.DataFrame({
        'location_id': location_ids,
        'flood_probability': probs,
        'predicted_label': (np.array(probs) >= 0.5).astype(int),
        'true_label': true_labels,
        'dataset': split_name
    })


def generate_prediction_csv(model, train_dataset, train_df, validation_dataset, val_df, csv_output_path="flood_predictions.csv", random_state=42, batch_size=None, device=None):
    """Predict on the train and validation datasets and write the results to CSV.

    Args:
        model: Trained module returning one probability per sample.
        train_dataset: Dataset of ``(inputs, label)`` samples for the train split.
        train_df: Frame with a ``location_id`` column; only consulted when
            ``train_dataset`` has no ``event_filenames`` attribute.
        validation_dataset: Dataset for the validation split.
        val_df: Same role as ``train_df``, for the validation split.
        csv_output_path: Destination of the combined CSV.
        random_state: Unused; kept so existing calls keep working.
        batch_size: Inference batch size. Defaults to the notebook-level ``BATCH_SIZE``.
        device: Device that inputs are moved to. Defaults to the notebook-level ``DEVICE``.

    Returns:
        DataFrame with columns ``location_id``, ``flood_probability``,
        ``predicted_label`` (threshold 0.5), ``true_label`` and ``dataset``
        (``'train'`` or ``'validation'``): train rows first, then validation.

    Raises:
        ValueError: If the number of predictions for a split differs from the
            number of location ids available for it.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if device is None:
        device = DEVICE

    # Make predictions in inference mode (the model is left in eval mode)
    model.eval()

    predicted_train_df = _predict_split(
        model, train_dataset, train_df, 'train', batch_size, device
    )
    predicted_val_df = _predict_split(
        model, validation_dataset, val_df, 'validation', batch_size, device
    )

    # Combine and save to CSV
    combined_df = pd.concat([predicted_train_df, predicted_val_df], ignore_index=True)
    combined_df.to_csv(csv_output_path, index=False)

    # Report accuracy at the 0.5 threshold for each split
    train_accuracy = np.mean(
        predicted_train_df['predicted_label'].to_numpy() == predicted_train_df['true_label'].to_numpy()
    )
    val_accuracy = np.mean(
        predicted_val_df['predicted_label'].to_numpy() == predicted_val_df['true_label'].to_numpy()
    )

    print(f"Train accuracy: {train_accuracy:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

    return combined_df

In [None]:
# Score both splits with the trained model and write the combined predictions
# to the Kaggle working directory.
predictions_df = generate_prediction_csv(model, train_dataset, train_df, validation_dataset, val_df, "/kaggle/working/flood_predictions.csv")