
# 03 Hydrologic Model Development & Evaluation

In this notebook we build a baseline machine-learning model to predict whether a heavy precipitation event will occur the following day (our proxy for a flood event). We use the engineered features from `data/processed/analytical_data.csv` as inputs and define the binary target `Flood_Event_Imminent` (1 if next day's total precipitation exceeds 10 mm, 0 otherwise).

Although more sophisticated time-series models (e.g., LSTM or Transformer networks) could be employed, we begin with a simpler ensemble model – a Random Forest classifier – to establish a baseline. The dataset is split chronologically to avoid data leakage: the earliest 80% of records are used for training and the most recent 20% for testing.  

Model performance is evaluated using accuracy, precision, recall, F1-score and ROC–AUC.  The trained model is saved to the `models/` directory for future use.


In [1]:

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib

# Ensure models and reports directories exist
os.makedirs('models', exist_ok=True)
os.makedirs('reports', exist_ok=True)

# Load processed analytical data
analytical_df = pd.read_csv('data/processed/analytical_data.csv')

# Define feature columns and target
target_col = 'Flood_Event_Imminent'
exclude_cols = ['Date', 'Next_Day_Precip', target_col]
feature_cols = [col for col in analytical_df.columns if col not in exclude_cols]

X = analytical_df[feature_cols]
# Keep only numeric columns
X = X.select_dtypes(include=['number'])
y = analytical_df[target_col]

# Fill missing values
X = X.fillna(0)

# Chronological train-test split
split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Train Random Forest classifier
clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
print('ROC AUC:', auc)

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Save model
model_path = 'models/hydrologic_model.pkl'
joblib.dump(clf, model_path)
print(f'Saved trained model to {model_path}')

# Save performance summary to markdown
report_lines = []
report_lines.append('# Hydrologic Model Performance Summary')
report_lines.append(f'- Accuracy: {accuracy:.4f}')
report_lines.append(f'- Precision: {precision:.4f}')
report_lines.append(f'- Recall: {recall:.4f}')
report_lines.append(f'- F1-score: {f1:.4f}')
report_lines.append(f'- ROC AUC: {auc:.4f}')
report_lines.append('')
report_lines.append('## Confusion Matrix')
report_lines.append(str(cm))
report_lines.append('')
report_lines.append('## Classification Report')
report_lines.append(classification_report(y_test, y_pred))

perf_path = 'reports/model_performance_summary.md'
with open(perf_path, 'w') as f:
    f.write('\n'.join(report_lines))
print(f'Saved model performance summary to {perf_path}')


Accuracy: 0.9724137931034482
Precision: 0.0
Recall: 0.0
F1-score: 0.0
ROC AUC: 0.6640070921985816
Confusion Matrix:
[[141   0]
 [  4   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       141
           1       0.00      0.00      0.00         4

    accuracy                           0.97       145
   macro avg       0.49      0.50      0.49       145
weighted avg       0.95      0.97      0.96       145

Saved trained model to models/hydrologic_model.pkl
Saved model performance summary to reports/model_performance_summary.md


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
