# Water Quality Prediction: Benchmark Notebook (CatBoost Version)

## Challenge Overview
Welcome to the EY AI & Data Challenge 2026!...

## Load In Dependencies

In [None]:
# Install the notebook's dependencies by shelling out from the kernel.
# NOTE(review): no versions are pinned on these lines; requirements.txt is not
# visible here — confirm it pins what the notebook needs.
!pip install uv
!uv pip install -r requirements.txt
# catboost is installed on its own line, separately from requirements.txt.
!pip install catboost

In [None]:
import snowflake
from snowflake.snowpark.context import get_active_session
# Handle to the notebook's active Snowpark session; the last cell uses it to
# run the statement that uploads the submission file.
session = get_active_session()

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides warnings raised by the libraries imported below.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import display
import xarray as xr
import rioxarray as rxr
import rasterio
from rasterio.windows import Window
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial import cKDTree
from sklearn.metrics import r2_score, mean_squared_error

from catboost import CatBoostRegressor

import pystac_client
import planetary_computer as pc
from odc.stac import stac_load
from pystac.extensions.eo import EOExtension as eo
from datetime import date
from tqdm import tqdm
import os

## Load Data

In [None]:
# Read the training tables from CSV files in the working directory.
Water_Quality_df = pd.read_csv("water_quality_training_dataset.csv")
landsat_train_features = pd.read_csv("landsat_features_training_200m.csv")
Terraclimate_df = pd.read_csv("terraclimate_features_training_full.csv")
# NOTE(review): urbanization_df is loaded and cleaned here but is not passed to
# combine_three_datasets in the visible cells — confirm whether it should be merged.
urbanization_df = pd.read_csv("urbanization_train.csv")
# Despite the name, these are the three target columns the models are trained
# on, not categorical features. The list is reused for the validation table.
cat_cols = ["Total Alkalinity", "Electrical Conductance", "Dissolved Reactive Phosphorus"]
# Remove the target columns from the urbanization table; errors='ignore' skips
# any that are not present.
urbanization_df = urbanization_df.drop(columns=cat_cols, errors='ignore')

In [None]:
def combine_three_datasets(dataset1, dataset2, dataset3):
    """Place three tables side by side and keep one copy of each column name.

    The frames are concatenated along the column axis, so rows are matched
    by index label. When a column name occurs more than once, only its first
    (left-most) occurrence is kept.

    Returns a new DataFrame.
    """
    side_by_side = pd.concat((dataset1, dataset2, dataset3), axis="columns")
    first_occurrence = ~side_by_side.columns.duplicated()
    return side_by_side.loc[:, first_occurrence]

# Join the water-quality samples with the Landsat and TerraClimate feature
# tables column-wise; rows are matched by index, not by a key column.
wq_data = combine_three_datasets(Water_Quality_df, landsat_train_features, Terraclimate_df)

## Feature Engineering & Alignment (Train & Validation)

In [None]:
def classify_region(lat, lon):
    """Map a latitude/longitude pair to a coarse region label.

    Parameters
    ----------
    lat, lon : anything accepted by ``float()``
        Coordinates in decimal degrees.

    Returns
    -------
    str
        One of "West_Coast", "South_Coast", "East_Coast", "Eastern_Cape",
        "Interior", "Northern_Arid", or "Unknown" when either coordinate
        cannot be converted to a number or is NaN.

    The bounding boxes are tested in order and the first match wins, so the
    order of the branches matters where boxes overlap.
    """
    try:
        lat, lon = float(lat), float(lon)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as "Unknown"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "Unknown"

    # NaN fails every comparison below and would otherwise fall through to
    # the final catch-all label.
    if np.isnan(lat) or np.isnan(lon):
        return "Unknown"

    if lon < 20.5 and lat <= -32.0:
        return "West_Coast"
    elif 20.5 <= lon <= 27.5 and -34.8 <= lat <= -32.0:
        return "South_Coast"
    elif lon >= 29.0 and -31.0 <= lat <= -26.5:
        return "East_Coast"
    elif 24.0 <= lon < 29.0 and -34.0 <= lat < -30.5:
        return "Eastern_Cape"
    elif lat > -29.0:
        return "Interior"
    else:
        return "Northern_Arid"

def get_season(month):
    """Return the season name for a month number.

    December-February is "Summer", March-May "Autumn", June-August "Winter".
    Every other value, including September-November and anything outside
    1-12, yields "Spring".
    """
    season_table = (
        ("Summer", (12, 1, 2)),
        ("Autumn", (3, 4, 5)),
        ("Winter", (6, 7, 8)),
    )
    for season_name, season_months in season_table:
        if month in season_months:
            return season_name
    return "Spring"

def preprocess_features(df, is_train=True, train_columns=None):
    """Build the model feature matrix from a merged sample table.

    A combined region/season label is derived from ``Latitude``,
    ``Longitude`` and ``Sample Date`` (parsed day-first) and one-hot encoded
    into ``RS_*`` columns. The coordinate, date and target columns are then
    removed. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Latitude``, ``Longitude`` and ``Sample Date``. Any
        existing ``season`` / ``region`` columns are overwritten and dropped.
    is_train : bool, default True
        Together with ``train_columns`` this selects the return shape.
    train_columns : list of str, optional
        Column order to align to when ``is_train`` is False. Missing columns
        are added filled with 0 and extra columns are discarded.

    Returns
    -------
    pandas.DataFrame
        When ``is_train`` is False and ``train_columns`` is given.
    (pandas.DataFrame, list of str)
        Otherwise: the features and their column names. Note this is also
        what is returned for ``is_train=False`` with no ``train_columns``.
    """
    df_processed = df.copy()
    df_processed['season'] = pd.to_datetime(df_processed['Sample Date'], dayfirst=True).dt.month.apply(get_season)
    # Iterate the two coordinate columns directly instead of a row-wise
    # DataFrame.apply(axis=1): no per-row Series is built, and an empty frame
    # yields an empty list rather than whatever apply returns for zero rows.
    df_processed['region'] = [
        classify_region(lat, lon)
        for lat, lon in zip(df_processed['Latitude'], df_processed['Longitude'])
    ]
    df_processed['Region_Season'] = df_processed['region'] + '_' + df_processed['season']
    df_processed = df_processed.drop(columns=['season', 'region'])
    df_processed = pd.get_dummies(df_processed, columns=['Region_Season'], prefix='RS')

    # Identifiers and the three targets must not reach the model as features.
    cols_to_drop = ['Latitude', 'Longitude', 'Sample Date', 'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
    cols_to_drop_existing = [c for c in cols_to_drop if c in df_processed.columns]
    X = df_processed.drop(columns=cols_to_drop_existing)

    if not is_train and train_columns is not None:
        # Align to the training layout so fitted scalers/models see the same
        # columns in the same order.
        X = X.reindex(columns=train_columns, fill_value=0)
        return X
    return X, X.columns.tolist()

## Model Helper Functions (CatBoost)

In [None]:
def split_data(X, y, test_size=0.3, random_state=42):
    """Randomly split features and target into train and test parts.

    Thin wrapper around ``train_test_split``; returns its result unchanged,
    in the order ``X_train, X_test, y_train, y_test``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return partitions

def scale_data(X_train, X_test):
    """Standardize both splits using statistics learned from the train split.

    Returns ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler`` is
    the fitted ``StandardScaler``, so the same transform can be applied to
    other data later.
    """
    scaler = StandardScaler()
    # Fit on the training split only; the test split is transformed with the
    # training statistics.
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled, scaler

def train_model(X_train_scaled, y_train):
    """Fit and return a ``CatBoostRegressor`` with fixed hyperparameters."""
    hyperparams = {
        "iterations": 500,        # number of boosting iterations (akin to n_estimators)
        "learning_rate": 0.05,    # learning rate
        "depth": 6,               # tree depth; 6 is usually a strong setting for CatBoost
        "l2_leaf_reg": 3,         # L2 regularization to guard against overfitting
        "loss_function": "RMSE",  # loss function
        "random_seed": 42,
        "verbose": False,         # keep training from printing progress output
    }
    regressor = CatBoostRegressor(**hyperparams)
    regressor.fit(X_train_scaled, y_train)
    return regressor

def evaluate_model(model, X_scaled, y_true, dataset_name="Test"):
    """Predict on ``X_scaled``, print R² and RMSE, and return them.

    Returns ``(y_pred, r2, rmse)``. ``dataset_name`` only labels the printed
    report.
    """
    predictions = model.predict(X_scaled)
    r2_value = r2_score(y_true, predictions)
    rmse_value = np.sqrt(mean_squared_error(y_true, predictions))

    report_lines = (
        f"\n{dataset_name} Evaluation:",
        f"R²: {r2_value:.3f}",
        f"RMSE: {rmse_value:.3f}",
    )
    for line in report_lines:
        print(line)

    return predictions, r2_value, rmse_value

In [None]:
def run_pipeline(X, y, param_name="Parameter"):
    """Split, scale, train, evaluate and plot for a single target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the feature importances.
    y : array-like
        Target values aligned with ``X``.
    param_name : str
        Target name used in printed headers, the plot title and the results.

    Returns
    -------
    (model, scaler, results, top25)
        The fitted model, the fitted scaler, a one-row DataFrame of train and
        test R² / RMSE, and a Series with the 25 largest feature importances.
    """
    print(f"\n{'='*60}")
    print(f"Training Model for {param_name}")
    print(f"{'='*60}")

    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)
    model = train_model(X_train_scaled, y_train)

    # Predictions are returned by evaluate_model but only the metrics are used.
    _train_pred, r2_train, rmse_train = evaluate_model(model, X_train_scaled, y_train, "Train")
    _test_pred, r2_test, rmse_test = evaluate_model(model, X_test_scaled, y_test, "Test")

    importances = pd.Series(model.feature_importances_, index=X.columns)
    top_features = importances.nlargest(25)
    # Ascending order so the largest bar ends up at the top of the barh chart.
    plot_order = top_features.sort_values()

    fig, ax = plt.subplots(figsize=(9, 7))
    bars = ax.barh(plot_order.index, plot_order.values, color="steelblue", edgecolor="white")
    for bar, importance in zip(bars, plot_order.values):
        label_y = bar.get_y() + bar.get_height() / 2
        ax.text(importance + 0.001, label_y, f"{importance:.4f}", va="center", ha="left", fontsize=8)
    ax.set_xlabel("Feature Importance")
    ax.set_title(f"Top 25 Feature Importances — {param_name}")
    plt.tight_layout()
    plt.show()

    metrics_row = {
        "Parameter": param_name,
        "R2_Train": r2_train,
        "RMSE_Train": rmse_train,
        "R2_Test": r2_test,
        "RMSE_Test": rmse_test,
    }
    return model, scaler, pd.DataFrame([metrics_row]), top_features

## Train the Models

In [None]:
# Fill missing numeric values with each column's median, computed over the
# whole table (i.e. before run_pipeline makes its train/test split).
# NOTE(review): this also fills any gaps in the three target columns below, so
# rows with a missing label would be trained on an imputed value — confirm.
wq_data = wq_data.fillna(wq_data.median(numeric_only=True))

# One target series per water-quality parameter.
y_TA  = wq_data['Total Alkalinity']
y_EC  = wq_data['Electrical Conductance']
y_DRP = wq_data['Dissolved Reactive Phosphorus']

# train_feature_cols records the feature layout so the validation data can be
# aligned to it later.
X, train_feature_cols = preprocess_features(wq_data, is_train=True)

# The same feature matrix is used for all three targets; each call does its own
# split, scaling, training, evaluation and importance plot.
model_TA,  scaler_TA,  results_TA,  top25_TA  = run_pipeline(X, y_TA,  "Total Alkalinity")
model_EC,  scaler_EC,  results_EC,  top25_EC  = run_pipeline(X, y_EC,  "Electrical Conductance")
model_DRP, scaler_DRP, results_DRP, top25_DRP = run_pipeline(X, y_DRP, "Dissolved Reactive Phosphorus")

In [None]:
# Stack the three one-row metric tables into a single summary, one row per target.
results_summary = pd.concat([results_TA, results_EC, results_DRP], ignore_index=True)
display(results_summary)

## Validation & Submission

In [None]:
# Load the submission template and the validation feature tables from CSV.
test_file = pd.read_csv("submission_template.csv")
landsat_val_features = pd.read_csv("landsat_features_validation_200m.csv")
Terraclimate_val_df = pd.read_csv("terraclimate_features_validation_full.csv")
# NOTE(review): urbanization_val_df is loaded and cleaned but is not passed to
# combine_three_datasets below — confirm whether it should be merged.
urbanization_val_df = pd.read_csv("urbanization_val.csv")
urbanization_val_df = urbanization_val_df.drop(columns=cat_cols, errors='ignore')

val_data = combine_three_datasets(test_file, landsat_val_features, Terraclimate_val_df)
# NOTE(review): gaps are filled with medians of the validation table itself,
# not the medians used for the training table — confirm this is intended.
val_data = val_data.fillna(val_data.median(numeric_only=True))

# Use the unified preprocess function, aligning validation columns to the training columns
submission_val_data = preprocess_features(val_data, is_train=False, train_columns=train_feature_cols)
display(submission_val_data.head())

In [None]:
# Each target has its own fitted scaler and model, so the validation features
# are transformed and predicted once per target.
X_sub_scaled_TA = scaler_TA.transform(submission_val_data)
pred_TA_submission = model_TA.predict(X_sub_scaled_TA)

X_sub_scaled_EC = scaler_EC.transform(submission_val_data)
pred_EC_submission = model_EC.predict(X_sub_scaled_EC)

X_sub_scaled_DRP = scaler_DRP.transform(submission_val_data)
pred_DRP_submission = model_DRP.predict(X_sub_scaled_DRP)

In [None]:
# Assemble the submission table: identifying columns are copied from the
# template and the three target columns hold the model predictions.
# Predictions are paired with template rows by position.
submission_df = pd.DataFrame({
    'Latitude': test_file['Latitude'].values,
    'Longitude': test_file['Longitude'].values,
    'Sample Date': test_file['Sample Date'].values,
    'Total Alkalinity': pred_TA_submission,
    'Electrical Conductance': pred_EC_submission,
    'Dissolved Reactive Phosphorus': pred_DRP_submission
})
display(submission_df.head())

In [None]:
# Write the submission to a local CSV (without the index column), then run a
# PUT statement through the Snowpark session that points at the workspace path.
# NOTE(review): the /tmp path and file name are hard-coded in two places and
# must be kept in sync by hand when the version suffix changes.
submission_df.to_csv("/tmp/submission_v10.csv", index=False)
session.sql("""
    PUT file:///tmp/submission_v10.csv
    'snow://workspace/USER$.PUBLIC."EY-AI-and-Data-Challenge"/versions/live/'
    AUTO_COMPRESS=FALSE
    OVERWRITE=TRUE
""").collect()
print("File saved! Refresh the browser to see the files in the sidebar")