In [49]:
# necessary import
import pickle # to manipulate models
import zipfile # to manipulate .zip file
import xgboost as xgb # for gradient boosting classifier
import numpy as np # for matrices and numerical manipulations
import pandas as pd # for dataframes
import matplotlib.pyplot as plt # for plots
import seaborn as sns # for visualizing data
import matplotlib
import sklearn # for machine learning algorithms

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, accuracy_score
import joblib  # Using joblib instead of pickle for compression
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [10]:
# # System versions
# print("Platform:", sys.platform)
# print("Python version:", sys.version)
# print("---" * 47)

# Libraries versions
print("matplotlib version:", matplotlib.__version__)
print("seaborn version:", sns.__version__)
print("xgboost version:", xgb.__version__)
print("sklearn version:", sklearn.__version__)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)

matplotlib version: 3.10.7
seaborn version: 0.13.2
xgboost version: 3.1.1
sklearn version: 1.6.1
pandas version: 2.3.3
numpy version: 2.2.6


In [91]:
weather = pd.read_csv("../data/merged_aez_weather.csv")

In [92]:
weather.columns = weather.columns.str.lower().str.strip()

In [93]:
numerical_weather = list(weather.dtypes[(weather.dtypes == 'int64') | (weather.dtypes == 'float64')].index)
categorical_weather = list(weather.dtypes[weather.dtypes == 'object'].index)

## Check unique AEZs (different climatic zones)

In [94]:
weather.aez.unique()

array(['Highlands (Humid)', 'Upper Midlands (High Potential)',
       'Lower Midlands (Semi-Arid)', 'Coastal Lowlands (Humid)',
       'Arid Lowlands (Arid)'], dtype=object)

In [17]:
weather.aez.value_counts()

aez
Highlands (Humid)                  11992
Upper Midlands (High Potential)    11992
Lower Midlands (Semi-Arid)         11992
Coastal Lowlands (Humid)           11992
Arid Lowlands (Arid)               11992
Name: count, dtype: int64

## Feature Engineering
- Convert date and extract temporal features By converting the date column into a datetime format, we can extract useful features such as: month, day of year, week number. These features allow the model to learn seasonal and periodic weather behavior.

- Cyclical encoding for month (captures seasonality better)-> Months follow a circular pattern (December → January).
Using cyclical encoding: month_sin = sin(2π * month / 12), month_cos = cos(2π * month / 12). This represents seasonality smoothly and avoids misleading jumps (e.g., from 12 to 1). Imagine months arranged in a circle like a clock, instead of hard coding january:1 and December = 12, which makes the model think that january is very far from december while in reality December is next to january in terms of seasonality.

- Create lagged features PER AEZ (different zones have different patterns) -> Rainfall patterns differ across Agro-Ecological Zones (AEZs).To capture zone-specific temporal dependencies, lag features were generated separately for each AEZ: Rainfall 1 day ago, Rainfall 3 days ago, Rainfall 7 days ago, Temperature/RH lags, etc. This helps the model learn short-term weather persistence and trends within each zone.

- Rolling statistics PER AEZ -> Rolling windows were computed to provide smoothed historical context: Rolling mean (e.g., 7-day or 14-day averages). Rolling standard deviation (captures variability). Again computed by AEZ, because climate behavior differs between zones. These features give the model aggregated context instead of only instantaneous readings.

- Fill NaN in std columns (first few rows) -> Rolling statistics produce NaN values for the first few rows (e.g., first 6 records in a 7-day window). These NaNs were filled with 0 for standard deviation: Standard deviation of a single value is effectively 0,Ensures the model receives consistent numeric data.

- Drop rows with NaN from lagged features -> Lag features require previous observations. The first several rows for each AEZ do not have enough historical data, so these rows are dropped to avoid: Missing values. Incorrect model training. This preserves data quality.

- Binary classification target: Will it rain? (>1mm threshold) -> A classification label was defined: 1 = Yes, it will rain (precipitation > 1 mm), 0 = No, it will not rain. This enables classification models to predict the probability of rainfall, while a separate regressor predicts the amount. Choosing a 1 mm threshold avoids false “rain” labels for negligible drizzle.

In [19]:

# Convert date and extract temporal features
weather['date'] = pd.to_datetime(weather['date'])
weather['month'] = weather['date'].dt.month
weather['day_of_year'] = weather['date'].dt.dayofyear
weather['year'] = weather['date'].dt.year
weather['week_of_year'] = weather['date'].dt.isocalendar().week.astype(int)

In [21]:
# Cyclical encoding for month (captures seasonality better)
weather['month_sin'] = np.sin(2 * np.pi * weather['month'] / 12)
weather['month_cos'] = np.cos(2 * np.pi * weather['month'] / 12)

In [23]:
# Create lagged features PER AEZ (different zones have different patterns)
for lag in [1, 3, 7, 14]:
    weather[f'rainfall_lag_{lag}'] = weather.groupby('aez')['prectotcorr'].shift(lag)
    weather[f'temp_lag_{lag}'] = weather.groupby('aez')['t2m'].shift(lag)
    weather[f'humidity_lag_{lag}'] = weather.groupby('aez')['rh2m'].shift(lag)

In [None]:
# Rolling statistics PER AEZ
for window in [7, 14, 30]: #roll over 7, 14, and 30 days (ie 1,2,3 and 2,3,4 and 3,4,5 if roll is 3)
    weather[f'rainfall_{window}d_avg'] = (
        weather.groupby('aez')['prectotcorr']
        .rolling(window, min_periods=1).mean()
        .reset_index(level=0, drop=True)
    )
    weather[f'rainfall_{window}d_std'] = (
        weather.groupby('aez')['prectotcorr']
        .rolling(window, min_periods=1).std()
        .reset_index(level=0, drop=True)
    )
    weather[f'temp_{window}d_avg'] = (
        weather.groupby('aez')['t2m']
        .rolling(window, min_periods=1).mean()
        .reset_index(level=0, drop=True)
    )

In [28]:
# Fill NaN in std columns (first few rows)
weather = weather.fillna(0)
# Drop rows with NaN from lagged features
weather = weather.dropna()

weather.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59960 entries, 0 to 59959
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               59960 non-null  datetime64[ns]
 1   prectotcorr        59960 non-null  float64       
 2   t2m                59960 non-null  float64       
 3   rh2m               59960 non-null  float64       
 4   allsky_sfc_sw_dwn  59960 non-null  float64       
 5   aez                59960 non-null  object        
 6   month              59960 non-null  int32         
 7   day_of_year        59960 non-null  int32         
 8   year               59960 non-null  int32         
 9   week_of_year       59960 non-null  int64         
 10  month_sin          59960 non-null  float64       
 11  month_cos          59960 non-null  float64       
 12  rainfall_lag_1     59960 non-null  float64       
 13  temp_lag_1         59960 non-null  float64       
 14  humidi

In [30]:
# Binary classification target: Will it rain? (>1mm threshold)
weather['will_rain'] = (weather['prectotcorr'] > 1).astype(int)

In [32]:
weather.shape

(59960, 34)

In [33]:
weather.will_rain.value_counts(normalize=True)

will_rain
0    0.538442
1    0.461558
Name: proportion, dtype: float64

##  ENCODING LOCATION (AEZ) - KEY FEATURE FOR KENYA'S DIVERSE CLIMATE
- To make the AEZ information usable by machine learning models, it must be encoded into numerical form. Two encoding approaches were used:
- Label encoding for AEZ -> Label encoding converts each AEZ category into a unique integer eg coastal -1 highlands -2
- One-Hot encoding for AEZ (better for tree-based models) -> One-hot encoding creates one binary column per AEZ.


In [36]:
# Label encoding for AEZ
le_aez = LabelEncoder()
weather['aez_encoded'] = le_aez.fit_transform(weather['aez'])

print(f"AEZ encoding mapping:")
for i, aez in enumerate(le_aez.classes_):
    print(f"  {aez}: {i}")

AEZ encoding mapping:
  Arid Lowlands (Arid): 0
  Coastal Lowlands (Humid): 1
  Highlands (Humid): 2
  Lower Midlands (Semi-Arid): 3
  Upper Midlands (High Potential): 4


In [38]:
# One-Hot encoding for AEZ (better for tree-based models)
ohe_aez = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
aez_onehot = ohe_aez.fit_transform(weather[['aez']])
aez_onehot_df = pd.DataFrame(
    aez_onehot, 
    columns=[f'aez_{cat}' for cat in ohe_aez.categories_[0]],
    index=weather.index
)
weather = pd.concat([weather, aez_onehot_df], axis=1)

print(f"One-hot encoded AEZ columns: {aez_onehot_df.columns.tolist()}")


One-hot encoded AEZ columns: ['aez_Arid Lowlands (Arid)', 'aez_Coastal Lowlands (Humid)', 'aez_Highlands (Humid)', 'aez_Lower Midlands (Semi-Arid)', 'aez_Upper Midlands (High Potential)']


In [44]:
weather.columns = weather.columns.str.lower()

## # FEATURE SELECTION -> Focusing on What Truly Drives Weather & Crop Predictions
- In weather modeling and crop suitability analysis, selecting the right features is essential for capturing underlying climate behavior, reducing noise, and improving model accuracy.

- Features including location (AEZ) - CRITICAL for Kenya's diverse climate -> Because of these strong climatic differences, AEZ becomes one of the most important features in predicting rainfall and recommending crops. Without AEZ, the model would mix together entirely different climate behaviors and lose accuracy.

- Add one-hot encoded AEZ columns -> To allow machine-learning models (especially tree-based models like RandomForest and GradientBoosting) to correctly understand AEZ categories, each AEZ is converted into its own binary column using one-hot encoding. Prevents the model from assuming any ordering between AEZ categories, Gives each zone equal opportunity to influence the prediction, Allows the model to learn relationships like: “Coastal areas tend to get more rainfall in March–May”, “Semi-arid zones experience strong dry seasons”.

In [43]:
base_features = [
    # Climate variables
    'T2M', 'RH2M', 'ALLSKY_SFC_SW_DWN',
    # Temporal features
    'month', 'day_of_year', 'week_of_year',
    'month_sin', 'month_cos',
    # Location feature (encoded)
    'AEZ_encoded',
    # Lagged features
    'rainfall_lag_1', 'rainfall_lag_3', 'rainfall_lag_7', 'rainfall_lag_14',
    'temp_lag_1', 'temp_lag_3', 'temp_lag_7',
    'humidity_lag_1', 'humidity_lag_3', 'humidity_lag_7',
    # Rolling statistics
    'rainfall_7d_avg', 'rainfall_14d_avg', 'rainfall_30d_avg',
    'rainfall_7d_std', 'rainfall_14d_std',
    'temp_7d_avg', 'temp_14d_avg'
]
base_features = [feat.lower() for feat in base_features]

In [45]:
# Add one-hot encoded AEZ columns
aez_columns = [col for col in weather.columns if col.startswith('aez_') and col != 'aez_encoded']
feature_cols = base_features + aez_columns

print(f"Total features: {len(feature_cols)}")
print(f"Features:\n{feature_cols}")

X = weather[feature_cols]
y_classification = weather['will_rain']
y_regression = weather['prectotcorr']

Total features: 36
Features:
['t2m', 'rh2m', 'allsky_sfc_sw_dwn', 'month', 'day_of_year', 'week_of_year', 'month_sin', 'month_cos', 'aez_encoded', 'rainfall_lag_1', 'rainfall_lag_3', 'rainfall_lag_7', 'rainfall_lag_14', 'temp_lag_1', 'temp_lag_3', 'temp_lag_7', 'humidity_lag_1', 'humidity_lag_3', 'humidity_lag_7', 'rainfall_7d_avg', 'rainfall_14d_avg', 'rainfall_30d_avg', 'rainfall_7d_std', 'rainfall_14d_std', 'temp_7d_avg', 'temp_14d_avg', 'aez_arid lowlands (arid)', 'aez_coastal lowlands (humid)', 'aez_highlands (humid)', 'aez_lower midlands (semi-arid)', 'aez_upper midlands (high potential)', 'aez_arid lowlands (arid)', 'aez_coastal lowlands (humid)', 'aez_highlands (humid)', 'aez_lower midlands (semi-arid)', 'aez_upper midlands (high potential)']


 ## TRAIN-TEST SPLIT
 - How this train_test_solit works:
 Step 1: Split X and y_classification together. This gives: X_train_class, X_test_class, y_train_class, y_test_class, Step 2: Split X and y_regression together. This gives: X_train_reg,X_test_reg,y_train_reg,y_test_reg
 
 - feature scaling
 -> Scaling ensures that all numerical features have a consistent range. This is especially important for algorithms that are sensitive to the magnitude of inputs. Scaling puts everything on the same scale, usually 0 to 1 or with mean 0 and variance 1. Why scaling matters: Some features (e.g., temperature) might be around ~20 Others (e.g., solar radiation) might be ~500 Others (e.g., rainfall) can be 0–100+ Without scaling: Large-magnitude features dominate smaller ones. Gradient-based models learn poorly. Convergence becomes slow

In [47]:
X_train, X_test, y_class_train, y_class_test = train_test_split(
    X, y_classification, test_size=0.2, random_state=42, stratify=y_classification
)
_, _, y_reg_train, y_reg_test = train_test_split(
    X, y_regression, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)

(47968, 46) (11992, 46)


In [48]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### RAINFALL CLASSIFICATION MODELS — Predicting “Will it Rain?”
- This classification is based on weather predictors such as temperature, humidity, month, solar radiation, and location (AEZ). Multiple  machine-learning models were evaluated to capture both linear and nonlinear patterns in Kenya’s diverse climate.

- 1. Logistic Regression -> Logistic Regression provides a strong baseline for classification. It assumes a linear relationship between the input features and the probability of rainfall.
- 2. Random Forest Classifier -> Random Forest is an ensemble of many decision trees.
Each tree learns different aspects of rainfall behavior, and the forest combines them to produce a stable prediction.
- 3. XGBoost Classifier -> XGBoost is a state-of-the-art boosting algorithm that builds trees sequentially, with each tree correcting the errors of the previous ones.
- 4. K-Fold Cross Validation -> Because weather datasets may contain noise, seasonality, and localized climate behavior, K-Fold Cross Validation (CV) is used to reliably estimate model performance.

In [50]:
# 1. Logistic Regression
classification_results = {}

log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
log_reg.fit(X_train_scaled, y_class_train)
y_pred_log = log_reg.predict(X_test_scaled)
y_pred_proba_log = log_reg.predict_proba(X_test_scaled)[:, 1]

acc_log = accuracy_score(y_class_test, y_pred_log)
auc_log = roc_auc_score(y_class_test, y_pred_proba_log)
classification_results['Logistic Regression'] = {'accuracy': acc_log, 'auc': auc_log}
print(f"Accuracy: {acc_log:.4f}")
print(f"AUC-ROC: {auc_log:.4f}")

Accuracy: 0.8716
AUC-ROC: 0.9412


## Random Classifier

In [51]:
rf_class = RandomForestClassifier(
    n_estimators=100, 
    max_depth=15, 
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)


In [54]:
rf_class.fit(X_train_scaled, y_class_train)
y_pred_rf = rf_class.predict(X_test_scaled)
y_pred_proba_rf = rf_class.predict_proba(X_test_scaled)[:, 1]

In [55]:
acc_rf = accuracy_score(y_class_test, y_pred_rf)
auc_rf = roc_auc_score(y_class_test, y_pred_proba_rf)
classification_results['Random Forest'] = {'accuracy': acc_rf, 'auc': auc_rf}
print(f"Accuracy: {acc_rf:.4f}")
print(f"AUC-ROC: {auc_rf:.4f}")

Accuracy: 0.8772
AUC-ROC: 0.9537


## Xgboost

In [56]:
xgb_class = XGBClassifier(
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric='auc'
)

In [57]:
xgb_class.fit(X_train_scaled, y_class_train)
y_pred_xgb = xgb_class.predict(X_test_scaled)
y_pred_proba_xgb = xgb_class.predict_proba(X_test_scaled)[:, 1]


In [58]:
acc_xgb = accuracy_score(y_class_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_class_test, y_pred_proba_xgb)
classification_results['XGBoost'] = {'accuracy': acc_xgb, 'auc': auc_xgb}
print(f"Accuracy: {acc_xgb:.4f}")
print(f"AUC-ROC: {auc_xgb:.4f}")

Accuracy: 0.8924
AUC-ROC: 0.9621


In [59]:
# kfold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_acc = cross_val_score(xgb_class, X_train_scaled, y_class_train, cv=kfold, scoring='accuracy')
cv_scores_auc = cross_val_score(xgb_class, X_train_scaled, y_class_train, cv=kfold, scoring='roc_auc')
print(f"CV Accuracy: {cv_scores_acc.mean():.4f} (+/- {cv_scores_acc.std():.4f})")
print(f"CV AUC-ROC: {cv_scores_auc.mean():.4f} (+/- {cv_scores_auc.std():.4f})")


CV Accuracy: 0.8933 (+/- 0.0029)
CV AUC-ROC: 0.9637 (+/- 0.0017)


## RAINFALL REGRESSION MODELS (How much rain?)
- Because rainfall amounts can vary widely across Kenya’s climatological zones and seasons, multiple regression models were evaluated to capture both simple and complex relationships.

1. Linear Regression -> Linear Regression serves as a simple baseline model for predicting rainfall amount based on features such as temperature, humidity, solar radiation, month, and AEZ.

2. Random Forest Regressor -> Random Forest Regressor builds many decision trees and averages their predictions.
This allows it to capture nonlinear patterns that naturally occur in weather systems.

3. XGBoost Regressor -> XGBoost is a powerful gradient boosting framework known for top-tier accuracy.

4. K-Fold Cross Validation for regression -> ensures that model evaluation is stable, fair, and robust.


### Linear regression

In [60]:
regression_results = {}

lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_reg_train)
y_pred_lin = lin_reg.predict(X_test_scaled)
y_pred_lin = np.maximum(0, y_pred_lin)  # Ensure non-negative

rmse_lin = np.sqrt(mean_squared_error(y_reg_test, y_pred_lin))
r2_lin = r2_score(y_reg_test, y_pred_lin)
regression_results['Linear Regression'] = {'rmse': rmse_lin, 'r2': r2_lin}
print(f"RMSE: {rmse_lin:.4f}")
print(f"R² Score: {r2_lin:.4f}")


RMSE: 5.0885
R² Score: 0.0002


In [61]:
rf_reg = RandomForestRegressor(
    n_estimators=100, 
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

In [62]:
rf_reg.fit(X_train_scaled, y_reg_train)
y_pred_rf_reg = rf_reg.predict(X_test_scaled)

In [63]:
rmse_rf = np.sqrt(mean_squared_error(y_reg_test, y_pred_rf_reg))
r2_rf = r2_score(y_reg_test, y_pred_rf_reg)
regression_results['Random Forest'] = {'rmse': rmse_rf, 'r2': r2_rf}
print(f"RMSE: {rmse_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")

RMSE: 5.1060
R² Score: -0.0067


### Xgboost

In [64]:
xgb_reg = XGBRegressor(
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

In [65]:
xgb_reg.fit(X_train_scaled, y_reg_train)
y_pred_xgb_reg = xgb_reg.predict(X_test_scaled)

In [66]:
rmse_xgb = np.sqrt(mean_squared_error(y_reg_test, y_pred_xgb_reg))
r2_xgb = r2_score(y_reg_test, y_pred_xgb_reg)
regression_results['XGBoost'] = {'rmse': rmse_xgb, 'r2': r2_xgb}
print(f"RMSE: {rmse_xgb:.4f}")
print(f"R² Score: {r2_xgb:.4f}")

RMSE: 5.1320
R² Score: -0.0170


### Kfold

In [67]:
cv_rmse = cross_val_score(xgb_reg, X_train_scaled, y_reg_train, cv=kfold, 
                          scoring='neg_root_mean_squared_error')
cv_r2 = cross_val_score(xgb_reg, X_train_scaled, y_reg_train, cv=kfold, scoring='r2')
print(f"CV RMSE: {-cv_rmse.mean():.4f} (+/- {cv_rmse.std():.4f})")
print(f"CV R²: {cv_r2.mean():.4f} (+/- {cv_r2.std():.4f})")

CV RMSE: 5.1638 (+/- 0.1980)
CV R²: -0.0217 (+/- 0.0038)


### Feature importance analysis  which features influence the predictions the most.
- Check AEZ feature importance

In [78]:
# Feature importance analysis  which features influence the predictions the most.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_class.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features (XGBoost Classifier):")
print(feature_importance.head(15).to_string(index=False))

# Check AEZ feature importance
aez_features = feature_importance[
    feature_importance['feature'].str.contains('AEZ', case=False)
]
print(f"\nAEZ (Location) Feature Importance:")
print(aez_features.to_string(index=False))


Top 15 Most Important Features (XGBoost Classifier):
                            feature  importance
                     rainfall_lag_1    0.298141
                    rainfall_7d_avg    0.135368
                    rainfall_7d_std    0.088868
                               rh2m    0.069336
                     humidity_lag_1    0.030174
                          month_cos    0.029336
                     humidity_lag_3    0.027136
       aez_coastal lowlands (humid)    0.023056
                  allsky_sfc_sw_dwn    0.021648
                          month_sin    0.020490
                     humidity_lag_7    0.019283
                     rainfall_lag_3    0.017921
              aez_highlands (humid)    0.017686
                        aez_encoded    0.016234
aez_upper midlands (high potential)    0.014561

AEZ (Location) Feature Importance:
                            feature  importance
       aez_coastal lowlands (humid)    0.023056
              aez_highlands (humid)    0.01768

## SELECTING THE BEST Model

### Classification results

In [79]:
for model, metrics in classification_results.items():
    print(f"  {model}: Accuracy={metrics['accuracy']:.4f}, AUC={metrics['auc']:.4f}")


  Logistic Regression: Accuracy=0.8716, AUC=0.9412
  Random Forest: Accuracy=0.8772, AUC=0.9537
  XGBoost: Accuracy=0.8924, AUC=0.9621


### Regression results

In [80]:
for model, metrics in regression_results.items():
    print(f"  {model}: RMSE={metrics['rmse']:.4f}, R²={metrics['r2']:.4f}")

  Linear Regression: RMSE=5.0885, R²=0.0002
  Random Forest: RMSE=5.1060, R²=-0.0067
  XGBoost: RMSE=5.1320, R²=-0.0170


### selecting the best model

In [81]:
# Select best models
best_classifier = max(classification_results.items(), key=lambda x: x[1]['auc'])
best_regressor = min(regression_results.items(), key=lambda x: x[1]['rmse'])
print(f"\nBest Classifier: {best_classifier[0]} (AUC: {best_classifier[1]['auc']:.4f})")
print(f"Best Regressor: {best_regressor[0]} (RMSE: {best_regressor[1]['rmse']:.4f})")


Best Classifier: XGBoost (AUC: 0.9621)
Best Regressor: Linear Regression (RMSE: 5.0885)


## Saving with joblib compression of 5

In [84]:
import os
os.makedirs('../../models', exist_ok=True)

In [87]:

# Save with joblib compression=5 for smaller file sizes (suitable for Render)
joblib.dump(xgb_class, '../models/rainfall_classifier.joblib', compress=5)
joblib.dump(xgb_reg, '../models/rainfall_regressor.joblib', compress=5)
joblib.dump(scaler, '../models/scaler_rainfall.joblib', compress=5)
joblib.dump(le_aez, '../models/aez_label_encoder.joblib', compress=5)
joblib.dump(ohe_aez, '../models/aez_onehot_encoder.joblib', compress=5)
joblib.dump(feature_cols, '../models/rainfall_feature_columns.joblib', compress=5)

['../models/rainfall_feature_columns.joblib']