In [1]:
import os
import gc
from glob import glob
import sys
import math
import time
import random
import shutil
from pathlib import Path
from typing import Dict, List
from scipy.stats import entropy
from scipy.signal import butter, lfilter, freqz
from contextlib import contextmanager
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from tqdm.auto import tqdm
from functools import partial
import cv2
from PIL import Image
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, OneCycleLR, CosineAnnealingLR, CosineAnnealingWarmRestarts
from sklearn.preprocessing import LabelEncoder
from torchvision.transforms import v2
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
import albumentations as A
from albumentations import (Compose, Normalize, Resize, RandomResizedCrop, HorizontalFlip, VerticalFlip, ShiftScaleRotate, Transpose)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform
import timm
from scipy import optimize
from scipy.optimize import minimize_scalar
import warnings 
warnings.filterwarnings('ignore')
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from matplotlib import pyplot as plt
import joblib
VERSION=16

  check_for_updates()


In [4]:
class CFG:
    """Static configuration for the ensembling stage: input file locations, seed, model count."""

    # Kaggle input roots
    data_path = "/kaggle/input/flood-prediction/"
    original_data_path = "/kaggle/input/flood-prediction/data/"

    # Stage 3 out-of-fold (OOF) predictions on the training set
    fastai_oof_file = f"{data_path}fastai_train_with_oof.csv"
    gbdt_oof_file = f"{data_path}gbdt_train_with_oof.csv"

    # Stage 3 predictions on the test set
    fastai_test_file = f"{data_path}fastai_test_with_oof.csv"
    gbdt_test_file = f"{data_path}gbdt_test_with_oof.csv"

    # Stage 2 per-location static flood probability (train OOF / test)
    stage2_train_file = f"{data_path}train_with_cv_results_accuracy_0_8931.csv"
    stage2_test_file = f"{data_path}test_with_cv_results_accuracy_0_8931.csv"

    # Sample submission template (adjust the path if the dataset layout differs)
    submission_file = f"{original_data_path}SampleSubmission.csv"

    # Seed passed to seed_everything() by default
    seed = 2024
    # Number of base models being blended: FastAI, LGB, CATT
    n_ensemble_models = 3

def seed_everything(seed=CFG.seed):
    """Seed the random sources this notebook relies on.

    Seeds Python's `random`, sets PYTHONHASHSEED, and seeds NumPy's legacy
    global generator.

    Args:
        seed: Integer seed; defaults to CFG.seed.

    Note:
        PyTorch is deliberately not seeded because this stage does not use it
        directly. If that changes, also call torch.manual_seed(seed) and
        torch.cuda.manual_seed(seed), and set
        torch.backends.cudnn.deterministic = True /
        torch.backends.cudnn.benchmark = False.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

seed_everything()


In [5]:

def _read_and_report(csv_path, tag):
    """Read one CSV into a DataFrame and print its column names under `tag`."""
    frame = pd.read_csv(csv_path)
    print(f"{tag} columns: {frame.columns.tolist()}")
    return frame


# Stage 3 out-of-fold predictions on the training set
# (the GBDT file carries both the LGB and the CATT columns)
print("Loading OOF prediction files...")
fastai_oof = _read_and_report(CFG.fastai_oof_file, "FastAI OOF")
gbdt_oof = _read_and_report(CFG.gbdt_oof_file, "GBDT OOF")

# Stage 3 predictions on the test set
print("\nLoading Test prediction files...")
fastai_test = _read_and_report(CFG.fastai_test_file, "FastAI Test")
gbdt_test = _read_and_report(CFG.gbdt_test_file, "GBDT Test")

# Stage 2 per-location static probabilities (OOF for train, predictions for test)
print("\nLoading Stage 2 static probability results...")
stage2_train = _read_and_report(CFG.stage2_train_file, "Stage 2 Train")
stage2_test = _read_and_report(CFG.stage2_test_file, "Stage 2 Test")

# Submission template
print("\nLoading Sample Submission...")
submission_df = pd.read_csv(CFG.submission_file)


Loading OOF prediction files...
FastAI OOF columns: ['event_id', 'location_id', 'event_t', 'flood_probability', 'label', 'oof_fastai']
GBDT OOF columns: ['event_id', 'location_id', 'event_t', 'flood_probability', 'label', 'oof_lgb', 'oof_catt', 'ensemble_preds']

Loading Test prediction files...
FastAI Test columns: ['event_id', 'location_id', 'event_t', 'flood_probability', 'label', 'fastai_preds']
GBDT Test columns: ['event_id', 'location_id', 'event_t', 'flood_probability', 'label', 'lgb_preds', 'catt_preds', 'ensemble_preds']

Loading Stage 2 static probability results...
Stage 2 Train columns: ['location_id', 'label', 'fold', 'image_path', 'oof_preds', 'oof_correct_prob', 'predicted_class']
Stage 2 Test columns: ['location_id', 'event_id_counts', 'image_path', 'predicted_prob', 'predicted_class']

Loading Sample Submission...


In [6]:
# --- Prepare OOF Data ---
# Start from the FastAI OOF frame (keys, label, FastAI prediction) and attach the
# LGB / CATT OOF predictions by event_id.
# `validate=` makes pandas raise MergeError on duplicate keys instead of silently
# multiplying rows, which would misalign every score computed downstream.
oof_df = fastai_oof[['event_id', 'location_id', 'event_t', 'label', 'oof_fastai']].copy()
oof_df = pd.merge(
    oof_df,
    gbdt_oof[['event_id', 'oof_lgb', 'oof_catt']],
    on='event_id',
    how='left',
    validate='one_to_one',
)

# Attach the Stage 2 per-location probability as `static_prob`.
# NOTE(review): train uses `oof_correct_prob` while test uses `predicted_prob`.
# If `oof_correct_prob` is the probability of the *true* class it depends on the
# label and does not mean the same thing as the test column, so a threshold tuned
# on OOF may not transfer to test — confirm against the Stage 2 notebook.
stage2_train_prob = stage2_train[['location_id', 'oof_correct_prob']].rename(columns={'oof_correct_prob': 'static_prob'})
oof_df = pd.merge(oof_df, stage2_train_prob, on='location_id', how='left', validate='many_to_one')

# Deterministic row order: by location, then by time step within the location
oof_df = oof_df.sort_values(by=['location_id', 'event_t']).reset_index(drop=True)

print("Prepared OOF DataFrame head:")
display(oof_df.head())
print(f"OOF DataFrame shape: {oof_df.shape}")
print(f"OOF NaNs check:\n{oof_df.isnull().sum()}")


# --- Prepare Test Data ---
# Same construction as above, using the test-set prediction columns.
test_df = fastai_test[['event_id', 'location_id', 'event_t', 'fastai_preds']].copy()
test_df = pd.merge(
    test_df,
    gbdt_test[['event_id', 'lgb_preds', 'catt_preds']],
    on='event_id',
    how='left',
    validate='one_to_one',
)

# In the Stage 2 test file the probability column is named 'predicted_prob'
stage2_test_prob = stage2_test[['location_id', 'predicted_prob']].rename(columns={'predicted_prob': 'static_prob'})
test_df = pd.merge(test_df, stage2_test_prob, on='location_id', how='left', validate='many_to_one')

# Sorted by event_id (string order) so it lines up with the sorted sample submission
test_df = test_df.sort_values(by='event_id').reset_index(drop=True)

print("\nPrepared Test DataFrame head:")
display(test_df.head())
print(f"Test DataFrame shape: {test_df.shape}")
print(f"Test NaNs check:\n{test_df.isnull().sum()}")

# Locations missing from the Stage 2 files come out of the left merge as NaN;
# fall back to the per-split median so the threshold comparison stays defined.
median_static_prob_oof = oof_df['static_prob'].median()
median_static_prob_test = test_df['static_prob'].median()
oof_df['static_prob'] = oof_df['static_prob'].fillna(median_static_prob_oof)
test_df['static_prob'] = test_df['static_prob'].fillna(median_static_prob_test)
print("\nNaN check after fill:")
print(f"OOF NaNs: {oof_df['static_prob'].isnull().sum()}")
print(f"Test NaNs: {test_df['static_prob'].isnull().sum()}")


Prepared OOF DataFrame head:


Unnamed: 0,event_id,location_id,event_t,label,oof_fastai,oof_lgb,oof_catt,static_prob
0,id_05v6zjuaf300_X_0,id_05v6zjuaf300,0,0.0,0.000132,1.178418e-07,1.751966e-07,0.9997
1,id_05v6zjuaf300_X_1,id_05v6zjuaf300,1,0.0,0.000133,1.635817e-07,8.539053e-07,0.9997
2,id_05v6zjuaf300_X_2,id_05v6zjuaf300,2,0.0,0.000126,1.500422e-07,4.935893e-07,0.9997
3,id_05v6zjuaf300_X_3,id_05v6zjuaf300,3,0.0,0.000117,1.439613e-07,3.372869e-07,0.9997
4,id_05v6zjuaf300_X_4,id_05v6zjuaf300,4,0.0,0.000127,1.144397e-07,3.060345e-07,0.9997


OOF DataFrame shape: (492020, 8)
OOF NaNs check:
event_id       0
location_id    0
event_t        0
label          0
oof_fastai     0
oof_lgb        0
oof_catt       0
static_prob    0
dtype: int64

Prepared Test DataFrame head:


Unnamed: 0,event_id,location_id,event_t,fastai_preds,lgb_preds,catt_preds,static_prob
0,id_066zz28m11mr_X_0,id_066zz28m11mr,0,0.000165,6.188e-08,4.520225e-08,0.000834
1,id_066zz28m11mr_X_1,id_066zz28m11mr,1,0.000167,6.776189e-08,2.96491e-08,0.000834
2,id_066zz28m11mr_X_10,id_066zz28m11mr,10,0.000157,7.236391e-08,3.108074e-08,0.000834
3,id_066zz28m11mr_X_100,id_066zz28m11mr,100,0.000176,8.779468e-06,5.077424e-07,0.000834
4,id_066zz28m11mr_X_101,id_066zz28m11mr,101,0.000178,4.431223e-06,1.325167e-07,0.000834


Test DataFrame shape: (163520, 7)
Test NaNs check:
event_id        0
location_id     0
event_t         0
fastai_preds    0
lgb_preds       0
catt_preds      0
static_prob     0
dtype: int64

NaN check after fill:
OOF NaNs: 0
Test NaNs: 0


In [8]:

# Label column and the OOF prediction column of each base model
target_col = 'label'
model_oof_cols = ['oof_fastai', 'oof_lgb', 'oof_catt']

# Clipped OOF predictions per model, keyed by column name (reused by later cells)
oof_preds_arrays = {}
print("Individual Model OOF Log Loss Scores:")
for col in model_oof_cols:
    # Keep probabilities strictly inside (0, 1) so the log loss stays finite
    preds = np.clip(oof_df[col].values, 1e-15, 1 - 1e-15)
    score = log_loss(oof_df[target_col].values, preds)
    oof_preds_arrays[col] = preds
    print(f"- {col}: {score:.6f}")

# Equal-weight blend of the clipped per-model predictions as a baseline
uniform_ensemble_oof = np.mean(list(oof_preds_arrays.values()), axis=0)
uniform_ensemble_score = log_loss(
    oof_df[target_col].values,
    np.clip(uniform_ensemble_oof, 1e-15, 1 - 1e-15),
)
print(f"\nUniform Ensemble OOF Log Loss Score: {uniform_ensemble_score:.6f}")



Individual Model OOF Log Loss Scores:
- oof_fastai: 0.002653
- oof_lgb: 0.002428
- oof_catt: 0.002672

Uniform Ensemble OOF Log Loss Score: 0.002307


In [9]:

# Demonstration threshold for per-location normalization (optimized in a later cell)
normalization_threshold_demo = 0.7
# Lower bound for a usable divisor, also used as the clipping margin
epsilon = 1e-8

print(f"\nLogloss before normalizing (Uniform Ensemble): {uniform_ensemble_score:.6f}")

# Scratch frame: keys, static probability, label, the uniform blend and its per-location sum
temp_oof_df = oof_df[['location_id', 'static_prob', target_col]].copy()
temp_oof_df['uniform_ensemble'] = uniform_ensemble_oof
temp_oof_df['oof_sum_prob'] = temp_oof_df.groupby('location_id')['uniform_ensemble'].transform('sum')

# Locations with at least one row whose static probability reaches the threshold
locations_to_normalize = temp_oof_df.loc[
    temp_oof_df['static_prob'] >= normalization_threshold_demo, 'location_id'
].unique()
mask = temp_oof_df['location_id'].isin(locations_to_normalize)
# Only divide where the per-location sum is safely above zero
valid_divisor_mask = mask & (temp_oof_df['oof_sum_prob'] > epsilon)

# Default to the raw blend; rescale selected rows so each selected location sums to 1
temp_oof_df['ensemble_oof_norm'] = temp_oof_df['uniform_ensemble']
temp_oof_df.loc[valid_divisor_mask, 'ensemble_oof_norm'] = (
    temp_oof_df.loc[valid_divisor_mask, 'uniform_ensemble']
    / temp_oof_df.loc[valid_divisor_mask, 'oof_sum_prob']
)

# Keep probabilities inside (0, 1) before scoring
temp_oof_df['ensemble_oof_norm'] = np.clip(temp_oof_df['ensemble_oof_norm'], epsilon, 1 - epsilon)

normalized_uniform_score = log_loss(
    temp_oof_df[target_col].values,
    temp_oof_df['ensemble_oof_norm'].values,
)
print(f"Logloss after normalizing (Uniform Ensemble, Threshold={normalization_threshold_demo}): {normalized_uniform_score:.6f}")

# Release the scratch frame
del temp_oof_df
gc.collect()




Logloss before normalizing (Uniform Ensemble): 0.002307
Logloss after normalizing (Uniform Ensemble, Threshold=0.7): 0.002252


21

In [10]:

# Inputs for the weight search: ground-truth labels plus the clipped OOF
# prediction array of each model, in `model_oof_cols` order
labels = oof_df[target_col].values
ens = [oof_preds_arrays[name] for name in model_oof_cols]
n_models = len(ens)

# Objective function for the optimizer
def objective(weights, predictions, true_labels):
    """Log loss of the weighted blend of `predictions`, used as the optimizer objective.

    The weights are rescaled to sum to 1 before blending. Candidates that cannot
    be rescaled into a valid convex combination (any negative or non-finite
    weight, or a non-positive sum) are rejected with a large penalty. Nelder-Mead
    is unconstrained and may propose such points; renormalizing them would
    produce arbitrary, possibly huge, mixed-sign coefficients.

    Args:
        weights: One raw weight per model (array-like).
        predictions: Sequence of per-model prediction arrays, in the same order
            as `weights`.
        true_labels: Ground-truth labels, same length as each prediction array.

    Returns:
        The log loss of the blended, clipped predictions, or a penalty value
        larger than any attainable log loss when the weights are infeasible.
    """
    # With predictions clipped to [1e-15, 1 - 1e-15] the log loss cannot exceed
    # -ln(1e-15) ~= 34.5, so this is strictly worse than any feasible candidate.
    infeasible_loss = 1e6

    weights = np.asarray(weights, dtype=float)
    if not np.all(np.isfinite(weights)) or np.any(weights < 0):
        return infeasible_loss
    weight_sum = weights.sum()
    if weight_sum <= 0:
        return infeasible_loss

    # Rescale to a convex combination
    weights = weights / weight_sum

    # Accumulate the weighted blend
    ensemble_preds = np.zeros_like(true_labels, dtype=float)
    for weight, model_preds in zip(weights, predictions):
        ensemble_preds += weight * model_preds

    # Clip for numerical stability, then score
    ensemble_preds = np.clip(ensemble_preds, 1e-15, 1 - 1e-15)
    return log_loss(true_labels, ensemble_preds)

# Start the search from the uniform blend
initial_weights = np.array([1.0 / n_models] * n_models)

# Per-weight [0, 1] box. Not passed to the optimizer below; kept for
# bound-aware methods such as 'L-BFGS-B' or 'SLSQP'.
bounds = [(0.0, 1.0) for _ in range(n_models)]

print(f"\nOptimizing weights for {n_models} models...")

# Nelder-Mead is run here without bounds or constraints; `objective` rescales
# every candidate so the weights sum to 1 before scoring it.
nelder_mead_options = {'disp': True, 'maxiter': 1000, 'adaptive': True}
result = optimize.minimize(
    objective,
    initial_weights,
    args=(ens, labels),  # forwarded to objective(weights, predictions, true_labels)
    method='Nelder-Mead',
    options=nelder_mead_options,
)

# The optimizer returns raw weights; rescale them the same way `objective` does
best_oof_score_optimized = result.fun
optimized_weights = result.x / np.sum(result.x)

print(f"\nOptimization successful: {result.success}")
print(f"Optimized Weights: {optimized_weights}")
print(f"Sum of Weights: {np.sum(optimized_weights)}")
print(f"Optimized Ensemble OOF Log Loss: {best_oof_score_optimized:.6f}")




Optimizing weights for 3 models...
Optimization terminated successfully.
         Current function value: 0.002301
         Iterations: 57
         Function evaluations: 105

Optimization successful: True
Optimized Weights: [0.35456422 0.43411358 0.21132219]
Sum of Weights: 0.9999999999999999
Optimized Ensemble OOF Log Loss: 0.002301


In [11]:

# Test prediction columns, listed in the same model order as `model_oof_cols`
# so each optimized weight is applied to the matching model
model_test_cols = ['fastai_preds', 'lgb_preds', 'catt_preds']
test_preds_arrays = [test_df[col].values for col in model_test_cols]

# Blend the test predictions with the OOF-optimized weights
weighted_test_preds = np.zeros_like(test_preds_arrays[0], dtype=float)
for weight, model_preds in zip(optimized_weights, test_preds_arrays):
    weighted_test_preds += weight * model_preds

# Keep the blend next to its keys for the normalization step
test_df['ensemble_label'] = weighted_test_preds

print("Weighted ensemble predictions calculated for test set.")
display(test_df[['event_id', 'ensemble_label']].head())


Weighted ensemble predictions calculated for test set.


Unnamed: 0,event_id,ensemble_label
0,id_066zz28m11mr_X_0,5.9e-05
1,id_066zz28m11mr_X_1,5.9e-05
2,id_066zz28m11mr_X_10,5.6e-05
3,id_066zz28m11mr_X_100,6.6e-05
4,id_066zz28m11mr_X_101,6.5e-05


In [12]:

# --- Optimize Normalization Threshold (Optional but Recommended) ---
# The threshold is tuned on OOF data only.

# Rebuild the OOF blend with the optimized weights (same clipped arrays as `ens`)
optimized_oof_preds = np.zeros_like(labels, dtype=float)
for weight, model_preds in zip(optimized_weights, ens):
    optimized_oof_preds += weight * model_preds

# Scratch frame consumed by the threshold search
temp_opt_oof_df = oof_df[['location_id', 'static_prob', target_col]].copy()
temp_opt_oof_df['opt_ensemble'] = optimized_oof_preds

# Define the scoring function again for the optimized OOF preds
def calculate_normalized_logloss_opt(threshold, df, pred_col='opt_ensemble',
                                     label_col='label', epsilon=1e-8):
    """Log loss after per-location normalization of the predictions at `threshold`.

    For every location with at least one row whose `static_prob` is >= threshold,
    the predictions are divided by their per-location sum (so they sum to 1
    within that location), provided the sum exceeds `epsilon`. All other rows
    keep their original prediction. Predictions are then clipped to
    [epsilon, 1 - epsilon] and scored.

    The input frame is not modified, so callers do not need to pass a copy.

    Args:
        threshold: Minimum `static_prob` for a location to be normalized.
        df: DataFrame with columns 'location_id', 'static_prob', `pred_col`
            and `label_col`.
        pred_col: Column holding the ensemble predictions.
        label_col: Column holding the ground-truth labels.
        epsilon: Minimum usable divisor and clipping margin.

    Returns:
        The log loss of the normalized, clipped predictions.

    Note:
        The result depends on `threshold` only through which locations are
        selected, so it is piecewise constant in `threshold`.
    """
    location_sum = df.groupby('location_id')[pred_col].transform('sum')

    locations_to_normalize = df.loc[df['static_prob'] >= threshold, 'location_id'].unique()
    # Normalize only selected locations whose divisor is safely above zero
    valid_divisor_mask = df['location_id'].isin(locations_to_normalize) & (location_sum > epsilon)

    # Work on a private copy of the prediction column instead of writing into `df`
    preds_norm = df[pred_col].astype(float).copy()
    preds_norm.loc[valid_divisor_mask] = (
        preds_norm.loc[valid_divisor_mask] / location_sum.loc[valid_divisor_mask]
    )
    preds_norm = np.clip(preds_norm, epsilon, 1 - epsilon)
    return log_loss(df[label_col], preds_norm)

print("\nOptimizing normalization threshold on OPTIMIZED OOF ensemble...")


def _threshold_loss(candidate_threshold):
    """Score one candidate threshold on a fresh copy of the scratch OOF frame."""
    return calculate_normalized_logloss_opt(candidate_threshold, temp_opt_oof_df.copy())


# Bounded scalar search over the threshold, restricted to [0.01, 0.99]
opt_thresh_result = minimize_scalar(
    _threshold_loss,
    bounds=(0.01, 0.99),
    method='bounded',
)

best_final_oof_score_normalized = opt_thresh_result.fun
best_threshold_final = opt_thresh_result.x

print(f"Optimal Normalization Threshold (from optimized OOF): {best_threshold_final:.4f}")
print(f"Best OOF Log Loss (Optimized Ensemble + Optimized Norm): {best_final_oof_score_normalized:.6f}")

# Release the scratch frame
del temp_opt_oof_df
gc.collect()


# --- Apply Normalization to Test Predictions using the optimized threshold ---
print(f"\nApplying normalization to TEST predictions using threshold: {best_threshold_final:.4f}")

# Per-location sum of the blended test predictions (column keeps its 'oof_' name)
test_df['oof_sum_prob'] = test_df.groupby('location_id')['ensemble_label'].transform('sum')

# Locations with at least one row whose static probability reaches the threshold
locations_to_normalize_test = test_df.loc[
    test_df['static_prob'] >= best_threshold_final, 'location_id'
].unique()
mask_test = test_df['location_id'].isin(locations_to_normalize_test)
# `epsilon` is the module-level value defined in the normalization demo cell
valid_divisor_mask_test = mask_test & (test_df['oof_sum_prob'] > epsilon)

# Default to the weighted blend; rescale the selected rows by their location sum
test_df['final_label'] = test_df['ensemble_label']
test_df.loc[valid_divisor_mask_test, 'final_label'] = (
    test_df.loc[valid_divisor_mask_test, 'ensemble_label']
    / test_df.loc[valid_divisor_mask_test, 'oof_sum_prob']
)

# Final clip for submission
test_df['final_label'] = np.clip(test_df['final_label'], epsilon, 1 - epsilon)

print("Normalization applied to test predictions.")
display(test_df[['event_id', 'ensemble_label', 'final_label']].head())



Optimizing normalization threshold on OPTIMIZED OOF ensemble...
Optimal Normalization Threshold (from optimized OOF): 0.6863
Best OOF Log Loss (Optimized Ensemble + Optimized Norm): 0.002241

Applying normalization to TEST predictions using threshold: 0.6863
Normalization applied to test predictions.


Unnamed: 0,event_id,ensemble_label,final_label
0,id_066zz28m11mr_X_0,5.9e-05,5.9e-05
1,id_066zz28m11mr_X_1,5.9e-05,5.9e-05
2,id_066zz28m11mr_X_10,5.6e-05,5.6e-05
3,id_066zz28m11mr_X_100,6.6e-05,6.6e-05
4,id_066zz28m11mr_X_101,6.5e-05,6.5e-05


In [13]:

# Sort the sample submission by event_id so it can be compared row-by-row with
# test_df, which was sorted the same way when it was built
submission_df = submission_df.sort_values('event_id').reset_index(drop=True)

# Submission payload: the FINAL normalized predictions under the expected column name
final_submission = test_df[['event_id', 'final_label']].rename(columns={'final_label': 'label'})

# Verify alignment before writing
if not submission_df['event_id'].equals(final_submission['event_id']):
    print("Error: event_id mismatch between sample submission and predictions!")
    # Fallback: re-align the predictions onto the sample submission's event_ids
    submission_final = pd.merge(submission_df[['event_id']], final_submission, on='event_id', how='left')
    # A left merge leaves NaN for every event_id without a prediction; writing
    # those out would produce an invalid submission, so stop here instead.
    n_missing = int(submission_final['label'].isnull().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} event_id(s) in the sample submission have no prediction; "
            "refusing to write an incomplete submission."
        )
else:
    submission_final = final_submission

print("\nFinal Submission DataFrame head:")
display(submission_final.head())

# Save to /kaggle/working/
output_filename = "/kaggle/working/submission_adapted_ensemble.csv"
submission_final.to_csv(output_filename, index=False)

print(f"\nSubmission file saved to {output_filename}")




Final Submission DataFrame head:


Unnamed: 0,event_id,label
0,id_066zz28m11mr_X_0,5.9e-05
1,id_066zz28m11mr_X_1,5.9e-05
2,id_066zz28m11mr_X_10,5.6e-05
3,id_066zz28m11mr_X_100,6.6e-05
4,id_066zz28m11mr_X_101,6.5e-05



Submission file saved to /kaggle/working/submission_adapted_ensemble.csv


# End of assignment