# Improved Water Quality Prediction Models

This notebook implements a systematic experiment to improve water quality prediction
over the baseline benchmark, addressing overfitting, feature redundancy, and model diversity.

## Phases
1. Fix evaluation methodology (5-fold CV)
2. Feature selection + spatial clustering
3. Target variable transformation (log1p)
4. Improved missing value handling
5. Model diversity and tuning (RF, XGBoost, LightGBM)

## Benchmark Results (to beat)
| Parameter | R² Test |
|---|---|
| Total Alkalinity | 0.546 |
| Electrical Conductance | 0.585 |
| Dissolved Reactive Phosphorus | 0.529 |

## Step 1: Load Dependencies

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, cross_validate, KFold, GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Plot defaults applied to every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Experiment-wide settings: the seed passed to every splitter/estimator, the
# number of CV folds, and the three target columns modelled independently.
RANDOM_STATE = 42
N_FOLDS = 5
TARGETS = [
    'Total Alkalinity',
    'Electrical Conductance',
    'Dissolved Reactive Phosphorus',
]

print('Libraries loaded.')

## Step 2: Load & Merge Data

In [None]:
# Read every input table from the working directory.
wq_df = pd.read_csv('water_quality_training_dataset.csv')
landsat_train = pd.read_csv('landsat_features_training_enhanced.csv')
landsat_val = pd.read_csv('landsat_features_validation_enhanced.csv')
tc_train = pd.read_csv('terraclimate_features_training.csv')
tc_val = pd.read_csv('terraclimate_features_validation.csv')
submission_template = pd.read_csv('submission_template.csv')

# Report (rows, columns) per table; captions carry their own padding so the
# shapes line up in the output.
for caption, frame in (
    ('Water quality:    ', wq_df),
    ('Landsat train:    ', landsat_train),
    ('Landsat val:      ', landsat_val),
    ('TerraClimate:     ', tc_train),
    ('Submission:       ', submission_template),
):
    print(f'{caption}{frame.shape}')

In [None]:
def build_frame(wq, landsat, terraclimate):
    """Combine three row-aligned tables side by side into one DataFrame.

    The tables are matched by row position, not by index label, so each one is
    re-indexed from 0 before concatenation. When a column name appears in more
    than one table, only its first occurrence (left to right) is kept.

    Parameters
    ----------
    wq, landsat, terraclimate : pandas.DataFrame
        Tables with one row per sample, all in the same row order.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation with duplicate column names removed.

    Raises
    ------
    ValueError
        If the tables do not all have the same number of rows; a positional
        join would otherwise pad the shorter tables with NaN rows silently.
    """
    frames = [wq, landsat, terraclimate]
    row_counts = [len(frame) for frame in frames]
    if len(set(row_counts)) != 1:
        raise ValueError(
            f'build_frame expects row-aligned tables, got row counts {row_counts}'
        )

    # Drop the incoming index labels so concat pairs rows by position.
    df = pd.concat([frame.reset_index(drop=True) for frame in frames], axis=1)
    df = df.loc[:, ~df.columns.duplicated()]
    return df

def add_temporal_features(df):
    """Return a copy of ``df`` with float ``Month`` and ``Year`` columns.

    Both columns are derived from ``df['Sample Date']``, parsed day-first.
    Values that cannot be parsed become NaN in both new columns. The input
    frame is left untouched.
    """
    parsed = pd.to_datetime(df['Sample Date'], dayfirst=True, errors='coerce')
    return df.assign(
        Month=parsed.dt.month.astype(float),
        Year=parsed.dt.year.astype(float),
    )

# Merge the three training tables, then derive Month/Year from 'Sample Date'.
train_df = add_temporal_features(build_frame(wq_df, landsat_train, tc_train))
print(f'Merged training shape: {train_df.shape}')
print(f'Columns: {train_df.columns.tolist()}')

## Step 3: Handle Missing Values (Phase 4)

1,085 rows (~11.6%) have all Landsat bands as NaN. Instead of naive median fill,
we add a `landsat_missing` indicator and then impute with median so the model
can learn to treat these rows differently.

In [None]:
landsat_cols = ['nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI',
                'NDVI', 'NDWI', 'NDSI_water', 'NDTI', 'Turbidity_Index',
                'Chlorophyll_Proxy', 'BSI', 'SWIR22_NIR_ratio', 'SWIR16_NIR_ratio',
                'Green_NIR_ratio', 'SWIR22_Green_ratio', 'SWIR16_Green_ratio',
                'log_nir', 'log_green', 'log_swir16', 'log_swir22',
                'nir_squared', 'swir22_squared']

# Compute the indicator only once. After imputation 'nir' has no NaNs left, so
# re-running this cell would otherwise overwrite the flag with all zeros.
if 'landsat_missing' not in train_df.columns:
    train_df['landsat_missing'] = train_df['nir'].isnull().astype(float)

missing_before = train_df.isnull().sum()
print('Missing values before imputation:')
print(missing_before[missing_before > 0].to_string())
print(f'\nlandsat_missing == 1 count: {int(train_df["landsat_missing"].sum())}')

# Labels must never be imputed: a row without a measured target carries no
# supervision, and a median-filled label would be trained on as if it were real.
rows_before = len(train_df)
train_df = train_df.dropna(subset=TARGETS).reset_index(drop=True)
rows_dropped = rows_before - len(train_df)
if rows_dropped:
    print(f'\nDropped {rows_dropped} rows with a missing target value.')

# Median-impute the numeric feature columns only.
feature_medians = train_df.drop(columns=TARGETS).median(numeric_only=True)
train_df = train_df.fillna(feature_medians)
print(f'\nMissing after imputation: {train_df.isnull().sum().sum()}')

## Step 4: Spatial Clustering + Feature Engineering (Phase 2)

KMeans on Latitude/Longitude to create spatial clusters that proxy for
watershed/catchment identity.

In [None]:
N_CLUSTERS = 8

# Fit KMeans on the raw coordinates; the fitted `kmeans` object is reused later
# to assign validation rows to the same clusters.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
coords = train_df[['Latitude', 'Longitude']]
train_df['Cluster'] = kmeans.fit_predict(coords).astype(float)

# Map view of the cluster assignment.
fig, ax = plt.subplots(figsize=(8, 8))
scatter = ax.scatter(
    train_df['Longitude'],
    train_df['Latitude'],
    c=train_df['Cluster'],
    cmap='tab10',
    s=5,
    alpha=0.5,
)
ax.set(
    xlabel='Longitude',
    ylabel='Latitude',
    title=f'Spatial Clusters (k={N_CLUSTERS})',
)
fig.colorbar(scatter, ax=ax, label='Cluster')
fig.tight_layout()
plt.show()

cluster_counts = train_df['Cluster'].value_counts().sort_index()
print(f'Cluster distribution:\n{cluster_counts}')

## Step 5: Define Feature Sets

**Baseline** (4 features): identical to benchmark.  
**Curated** (~12 features): removes redundant pairs, adds spatial + missing indicator.

In [None]:
# Benchmark feature set, kept identical to the baseline for a like-for-like comparison.
BASELINE_FEATURES = ['swir22', 'NDMI', 'MNDWI', 'pet']

# Baseline features plus location, spatial cluster, a few spectral indices,
# seasonality and the Landsat-missing indicator.
CURATED_FEATURES = [
    'swir22', 'NDMI', 'MNDWI', 'pet',
    'Latitude', 'Longitude', 'Cluster',
    'NDVI', 'BSI', 'Chlorophyll_Proxy',
    'Month', 'landsat_missing',
]

ALL_ENHANCED_FEATURES = [
    'nir', 'green', 'swir16', 'swir22',
    'NDMI', 'MNDWI', 'NDVI', 'NDSI_water',
    'Chlorophyll_Proxy', 'BSI',
    'SWIR22_NIR_ratio', 'SWIR16_NIR_ratio',
    'Green_NIR_ratio', 'SWIR22_Green_ratio',
    'pet', 'Month', 'Year',
    'Latitude', 'Longitude', 'Cluster',
    'landsat_missing',
]

# Requested features that are absent from train_df are dropped below. Report
# them first so a typo or a missing input column does not go unnoticed.
for set_name, requested in (
    ('CURATED_FEATURES', CURATED_FEATURES),
    ('ALL_ENHANCED_FEATURES', ALL_ENHANCED_FEATURES),
):
    absent = [c for c in requested if c not in train_df.columns]
    if absent:
        print(f'WARNING: {set_name} is missing columns not found in train_df: {absent}')

CURATED_FEATURES = [c for c in CURATED_FEATURES if c in train_df.columns]
ALL_ENHANCED_FEATURES = [c for c in ALL_ENHANCED_FEATURES if c in train_df.columns]

print(f'Baseline features ({len(BASELINE_FEATURES)}): {BASELINE_FEATURES}')
print(f'Curated features  ({len(CURATED_FEATURES)}): {CURATED_FEATURES}')
print(f'All enhanced      ({len(ALL_ENHANCED_FEATURES)}): {ALL_ENHANCED_FEATURES}')

## Step 6: Cross-Validation Pipeline (Phase 1)

Replace the single 70/30 split with 5-fold CV. Report mean +/- std for R² and RMSE.

In [None]:
def rmse_scorer(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Scorer form of the metric: sign-flipped so that larger is better.
neg_rmse_scorer = make_scorer(rmse_scorer, greater_is_better=False)

def cv_evaluate(model, X, y, cv=None, label='', target_name=''):
    """Cross-validate ``model`` once and summarise R² and RMSE across folds.

    Both metrics are computed from the same fitted fold models in a single
    ``cross_validate`` call, so each fold is fit only once and the two metrics
    always refer to identical train/test splits.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    X, y : array-like
        Feature matrix and target vector.
    cv : cross-validation splitter, optional
        Defaults to a shuffled ``KFold`` with ``N_FOLDS`` splits seeded with
        ``RANDOM_STATE``.
    label, target_name : str
        Free-text identifiers copied into the result and the printed line.

    Returns
    -------
    dict
        Keys ``Model``, ``Target``, ``R2_mean``, ``R2_std``, ``RMSE_mean``,
        ``RMSE_std``; the numeric values are rounded to 4 decimals.
    """
    if cv is None:
        cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    fold_scores = cross_validate(
        model, X, y, cv=cv,
        scoring={'r2': 'r2', 'neg_rmse': neg_rmse_scorer},
        n_jobs=-1,
    )
    r2_scores = fold_scores['test_r2']
    # The scorer is sign-flipped (greater is better); undo that for reporting.
    rmse_scores = -fold_scores['test_neg_rmse']

    result = {
        'Model': label,
        'Target': target_name,
        'R2_mean': round(r2_scores.mean(), 4),
        'R2_std': round(r2_scores.std(), 4),
        'RMSE_mean': round(rmse_scores.mean(), 4),
        'RMSE_std': round(rmse_scores.std(), 4),
    }
    print(f'  {label:30s} | {target_name:35s} | '
          f'R²: {result["R2_mean"]:.4f} ± {result["R2_std"]:.4f} | '
          f'RMSE: {result["RMSE_mean"]:.2f} ± {result["RMSE_std"]:.2f}')
    return result

## Step 7: Baseline Experiment (Reproduce Benchmark with CV)

In [None]:
banner = '=' * 110
print(banner)
print('BASELINE MODEL (swir22, NDMI, MNDWI, pet) — 5-fold CV')
print(banner)

# Reference random forest; later cells reuse this same configuration.
baseline_rf = RandomForestRegressor(
    n_estimators=200,
    max_features='sqrt',
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# One CV run per target on the 4 benchmark features.
baseline_results = [
    cv_evaluate(
        baseline_rf,
        train_df[BASELINE_FEATURES].values,
        train_df[target].values,
        label='Baseline RF',
        target_name=target,
    )
    for target in TARGETS
]

baseline_df = pd.DataFrame(baseline_results)
print('\nBaseline Summary:')
baseline_df

## Step 8: Curated Features + RF (Phase 2 test)

In [None]:
banner = '=' * 110
print(banner)
print('CURATED FEATURES + RF — 5-fold CV')
print(banner)

# Same forest as the baseline; only the feature set changes.
curated_results = [
    cv_evaluate(
        baseline_rf,
        train_df[CURATED_FEATURES].values,
        train_df[target].values,
        label='Curated RF',
        target_name=target,
    )
    for target in TARGETS
]

curated_df = pd.DataFrame(curated_results)
print('\nCurated Summary:')
curated_df

## Step 9: Target Transformation — log1p (Phase 3)

DRP is heavily right-skewed (15% outliers by IQR, mean 43.5 vs median 20).
EC has high variance. log1p compresses extreme values, expm1 reverses.

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, clone

class LogTransformedRegressor(BaseEstimator, RegressorMixin):
    """Regressor wrapper that trains on ``log1p(y)`` and predicts on the original scale.

    ``base_estimator`` is never fitted itself: ``fit`` trains a clone of it,
    stored as ``estimator_``. ``predict`` applies ``expm1`` to that clone's
    output to undo the transform.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator

    def fit(self, X, y):
        """Fit a fresh clone of ``base_estimator`` on ``log1p(y)``; returns ``self``."""
        log_target = np.log1p(y)
        fitted = clone(self.base_estimator)
        fitted.fit(X, log_target)
        self.estimator_ = fitted
        return self

    def predict(self, X):
        """Predict with the fitted clone and map the result back through ``expm1``."""
        log_prediction = self.estimator_.predict(X)
        return np.expm1(log_prediction)

banner = '=' * 110
print(banner)
print('CURATED FEATURES + log1p RF — 5-fold CV')
print(banner)

# Baseline forest trained on log1p(target); predictions are scored on the raw scale.
log_rf = LogTransformedRegressor(baseline_rf)

log_results = [
    cv_evaluate(
        log_rf,
        train_df[CURATED_FEATURES].values,
        train_df[target].values,
        label='Curated log1p RF',
        target_name=target,
    )
    for target in TARGETS
]

log_df = pd.DataFrame(log_results)
print('\nlog1p Transform Summary:')
log_df

## Step 10: Model Diversity — XGBoost & LightGBM (Phase 5)

Gradient boosting models with built-in regularization handle feature
redundancy more gracefully than RF.

In [None]:
# Gradient-boosted trees with row/column subsampling and L1/L2 regularisation.
xgb_model = XGBRegressor(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.7,
    reg_alpha=0.1, reg_lambda=1.0,
    random_state=RANDOM_STATE, n_jobs=-1, verbosity=0
)

# LightGBM only applies `subsample` when bagging is enabled, i.e. when
# `subsample_freq` > 0. The default is 0, which would make subsample=0.8 a
# no-op; subsample_freq=1 re-samples the rows at every boosting iteration.
lgbm_model = LGBMRegressor(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, subsample_freq=1, colsample_bytree=0.7,
    reg_alpha=0.1, reg_lambda=1.0,
    random_state=RANDOM_STATE, n_jobs=-1, verbose=-1
)

# Display name -> unfitted estimator, in the order they are evaluated.
models = {
    'RF':      baseline_rf,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model,
}

banner = '=' * 110
print(banner)
print('MODEL COMPARISON — Curated features, 5-fold CV (raw targets)')
print(banner)

# Every model family is scored on every target, model-major order.
comparison_results = [
    cv_evaluate(
        model,
        train_df[CURATED_FEATURES].values,
        train_df[target].values,
        label=f'Curated {model_name}',
        target_name=target,
    )
    for model_name, model in models.items()
    for target in TARGETS
]

comparison_df = pd.DataFrame(comparison_results)
print('\nModel Comparison Summary:')
comparison_df

## Step 11: Model Diversity with log1p Targets

In [None]:
# The same three model families, each wrapped to train on log1p(target).
log_models = {
    'log1p RF':      LogTransformedRegressor(baseline_rf),
    'log1p XGBoost': LogTransformedRegressor(xgb_model),
    'log1p LightGBM': LogTransformedRegressor(lgbm_model),
}

banner = '=' * 110
print(banner)
print('MODEL COMPARISON — Curated features, 5-fold CV (log1p targets)')
print(banner)

# Every wrapped model is scored on every target, model-major order.
log_comparison_results = [
    cv_evaluate(
        model,
        train_df[CURATED_FEATURES].values,
        train_df[target].values,
        label=f'Curated {model_name}',
        target_name=target,
    )
    for model_name, model in log_models.items()
    for target in TARGETS
]

log_comparison_df = pd.DataFrame(log_comparison_results)
print('\nlog1p Model Comparison Summary:')
log_comparison_df

## Step 12: Hyperparameter Tuning (Phase 5 continued)

RandomizedSearchCV is run for each candidate model family (XGBoost, LightGBM and RF, all with log1p targets); the best-scoring tuned model is kept per target.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

def _boosting_search_space():
    """Return a fresh search space shared by the wrapped gradient-boosting models.

    Keys carry the ``base_estimator__`` prefix because the boosters are tuned
    through ``LogTransformedRegressor``. ``uniform(loc, scale)`` samples from
    ``[loc, loc + scale]``.
    """
    return {
        'base_estimator__n_estimators': randint(200, 800),
        'base_estimator__max_depth': randint(3, 10),
        'base_estimator__learning_rate': uniform(0.01, 0.19),
        'base_estimator__subsample': uniform(0.6, 0.4),
        'base_estimator__colsample_bytree': uniform(0.4, 0.6),
        'base_estimator__reg_alpha': uniform(0, 2),
        'base_estimator__reg_lambda': uniform(0.5, 2.5),
    }


xgb_param_dist = _boosting_search_space()

lgbm_param_dist = _boosting_search_space()
# LightGBM ignores `subsample` unless bagging is switched on with
# subsample_freq > 0. Pin it to 1 so the sampled `subsample` values actually
# change the fitted model instead of being an inert search dimension.
lgbm_param_dist['base_estimator__subsample_freq'] = [1]

rf_param_dist = {
    'base_estimator__n_estimators': randint(100, 500),
    'base_estimator__max_depth': [10, 15, 20, 30, None],
    'base_estimator__min_samples_leaf': randint(1, 10),
    'base_estimator__max_features': ['sqrt', 'log2', 0.3, 0.5, 0.8],
}

# Splitter shared by every search so all candidates are scored on the same folds.
cv_inner = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# (display name, unfitted log1p-wrapped estimator, search space) per model family.
candidate_configs = [
    ('log1p XGBoost', LogTransformedRegressor(xgb_model), xgb_param_dist),
    ('log1p LightGBM', LogTransformedRegressor(lgbm_model), lgbm_param_dist),
    ('log1p RF', LogTransformedRegressor(baseline_rf), rf_param_dist),
]

best_models = {}      # target -> (winning family name, refit best estimator)
tuning_results = []   # one record per (target, family) search

rule = '=' * 80
for target in TARGETS:
    print(f'\n{rule}\nTuning for: {target}\n{rule}')

    X = train_df[CURATED_FEATURES].values
    y = train_df[target].values

    # Running winner across families; the strict '>' below keeps the earlier
    # family when two scores tie.
    best_name, best_est, best_score = '', None, -np.inf

    for name, base_model, param_dist in candidate_configs:
        search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_dist,
            n_iter=30,
            scoring='r2',
            cv=cv_inner,
            refit=True,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
        search.fit(X, y)
        score = search.best_score_
        print(f'  {name:20s} best CV R²: {score:.4f}')

        tuning_results.append({
            'Target': target,
            'Model': name,
            'Best_R2_CV': round(score, 4),
            'Best_Params': str(search.best_params_),
        })

        if score > best_score:
            best_name, best_est, best_score = name, search.best_estimator_, score

    best_models[target] = (best_name, best_est)
    print(f'  >>> Best for {target}: {best_name} (R²={best_score:.4f})')

tuning_df = pd.DataFrame(tuning_results)
print('\n' + rule)
print('Tuning Summary:')
tuning_df[['Target', 'Model', 'Best_R2_CV']]

## Step 13: Final Comparison — All Approaches

In [None]:
# One summary row per target for the tuned winner. Only the CV R² mean is
# available from the search, so the remaining metrics are left as NaN.
tuned_rows = []
for target in TARGETS:
    name = best_models[target][0]
    trow = tuning_df[(tuning_df['Target'] == target) & (tuning_df['Model'] == name)].iloc[0]
    tuned_rows.append({
        'Model': f'Tuned {name}',
        'Target': target,
        'R2_mean': trow['Best_R2_CV'],
        'R2_std': np.nan,
        'RMSE_mean': np.nan,
        'RMSE_std': np.nan,
        'Experiment': '4. Tuned best model',
    })

# Assemble all experiments with a single concat instead of growing the frame
# inside the loop.
all_results = pd.concat([
    baseline_df.assign(Experiment='1. Baseline (4 feat)'),
    curated_df.assign(Experiment='2. Curated RF (12 feat)'),
    log_df.assign(Experiment='3. Curated log1p RF'),
    pd.DataFrame(tuned_rows),
], ignore_index=True)

print('\nFull Comparison:')
pivot = all_results.pivot_table(index='Target', columns='Experiment', values='R2_mean')
print(pivot.to_string())

# One horizontal bar panel per target, experiments sorted by CV R².
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for i, target in enumerate(TARGETS):
    subset = all_results[all_results['Target'] == target].sort_values('R2_mean')
    axes[i].barh(subset['Experiment'], subset['R2_mean'])
    axes[i].set_xlabel('R² (CV mean)')
    axes[i].set_title(target)
    axes[i].set_xlim(0, max(0.8, subset['R2_mean'].max() + 0.05))
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## Step 14: Permutation Feature Importance (Best Models)

In [None]:
from sklearn.model_selection import train_test_split

fig, axes = plt.subplots(1, 3, figsize=(18, 7))

for i, target in enumerate(TARGETS):
    name, tuned_model = best_models[target]
    X = train_df[CURATED_FEATURES].values
    y = train_df[target].values

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=RANDOM_STATE)

    # Fit an unfitted copy with the same hyperparameters. Fitting the stored
    # estimator directly would replace the tuned model held in `best_models`
    # with one trained on only 70% of the data.
    model = clone(tuned_model)
    model.fit(X_tr, y_tr)

    # Importance is measured on the held-out 30%.
    perm_imp = permutation_importance(
        model, X_te, y_te, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)

    # Sort ascending by mean importance so the strongest feature is plotted on top.
    sorted_idx = perm_imp.importances_mean.argsort()
    axes[i].boxplot(
        perm_imp.importances[sorted_idx].T,
        vert=False,
        labels=np.array(CURATED_FEATURES)[sorted_idx])
    axes[i].set_title(f'{target}\n({name})')
    axes[i].set_xlabel('Permutation Importance')

plt.tight_layout()
plt.savefig('feature_importance_enhanced.png', dpi=150, bbox_inches='tight')
plt.show()

## Step 15: Generate Submission Predictions

In [None]:
# Build the validation frame with the same steps as training: merge, temporal
# features, missing indicator, train-median imputation, cluster assignment.
val_keys = submission_template[['Latitude', 'Longitude', 'Sample Date']]
val_df = add_temporal_features(build_frame(val_keys, landsat_val, tc_val))

# Flag rows without Landsat data before the NaNs are filled in.
val_df['landsat_missing'] = val_df['nir'].isnull().astype(float)

# Impute with medians computed on the training frame, not on validation data.
train_medians = train_df.median(numeric_only=True)
val_df = val_df.fillna(train_medians)

# Reuse the KMeans model fitted on the training coordinates.
val_coords = val_df[['Latitude', 'Longitude']]
val_df['Cluster'] = kmeans.predict(val_coords).astype(float)

print(f'Validation shape: {val_df.shape}')
print(f'Missing after impute: {val_df[CURATED_FEATURES].isnull().sum().sum()}')

In [None]:
X_train_full = train_df[CURATED_FEATURES].values
X_val = val_df[CURATED_FEATURES].values

# Refit each target's winning model on all training rows, then predict the
# validation rows; predictions are floored at zero.
predictions = {}
for target in TARGETS:
    name, model = best_models[target]
    model.fit(X_train_full, train_df[target].values)
    preds = np.clip(model.predict(X_val), 0, None)
    predictions[target] = preds
    print(f'{target:35s} — {name:20s} — '
          f'mean={preds.mean():.2f}, std={preds.std():.2f}, '
          f'min={preds.min():.2f}, max={preds.max():.2f}')

# Key columns copied from the template, followed by one column per target.
submission_df = pd.DataFrame({
    **{col: submission_template[col].values
       for col in ('Latitude', 'Longitude', 'Sample Date')},
    **{target: predictions[target] for target in TARGETS},
})

submission_df.to_csv('submission.csv', index=False)
print(f'\nSubmission saved: submission.csv ({submission_df.shape})')
submission_df.head(10)

## Step 16: Submission Distribution Sanity Check

In [None]:
# Overlay the training target distribution with the predicted one, per target,
# to spot predictions that fall far outside the range seen in training.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, target in zip(axes, TARGETS):
    ax.hist(train_df[target].values, bins=50, alpha=0.5, label='Training', density=True)
    ax.hist(predictions[target], bins=30, alpha=0.7, label='Predicted', density=True)
    ax.set(title=target, xlabel='Value', ylabel='Density')
    ax.legend()
fig.tight_layout()
fig.savefig('submission_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

print('Done. All phases complete.')