In [1]:
import os
os.makedirs("models", exist_ok=True)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib
from tqdm.auto import tqdm
tqdm.pandas()
import xgboost as xgb
from catboost import CatBoostRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_csv(r"C:\Ibrahim\Personal\University Stuff\Machine Learning\Project\ML Irrigation Project\data\processed\final_cleaned_district_dataset.csv")

TARGETS = ["Irrigation_Area", "Crop_Yield"]
ID_COLS = ["District", "Year"]   #keep for reference, not use as features

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,District,Year,Avg_Rainfall,Avg_Temperature,Crop_Yield,Irrigation_Area,Rain_Next_1,Rain_Next_24,Rain_Next_7d
0,2,PK203,1983,3.331681,19.611718,2973.113131,118.704918,10.591415,2.269813,2.383304
1,3,PK203,1984,2.419928,19.920437,2973.113131,118.704918,11.6739,2.423514,2.544689
2,4,PK203,1985,2.637431,20.242908,2973.113131,118.704918,11.46534,2.390116,2.509621
3,5,PK203,1986,2.657394,19.578992,2973.113131,118.704918,11.417544,2.382497,2.501622
4,6,PK203,1987,2.810428,20.42715,2973.113131,118.704918,11.513817,2.397244,2.517106


In [4]:
#minimal cleaning
print("shape:", df.shape)
print(df.columns.tolist())
print(df.isna().sum())

#drop exact duplicates (if any)
df = df.drop_duplicates().reset_index(drop=True)

#ensure int
df['Year'] = df['Year'].astype(int)

shape: (1260, 10)
['Unnamed: 0', 'District', 'Year', 'Avg_Rainfall', 'Avg_Temperature', 'Crop_Yield', 'Irrigation_Area', 'Rain_Next_1', 'Rain_Next_24', 'Rain_Next_7d']
Unnamed: 0         0
District           0
Year               0
Avg_Rainfall       0
Avg_Temperature    0
Crop_Yield         0
Irrigation_Area    0
Rain_Next_1        0
Rain_Next_24       0
Rain_Next_7d       0
dtype: int64


In [5]:
#select features that are likely useful for ML
FEATURE_COLS = [c for c in df.columns if c not in ID_COLS + TARGETS]

print("candidate features:", FEATURE_COLS)

ml_df = df[ID_COLS + FEATURE_COLS + TARGETS].copy()
ml_df.shape

candidate features: ['Unnamed: 0', 'Avg_Rainfall', 'Avg_Temperature', 'Rain_Next_1', 'Rain_Next_24', 'Rain_Next_7d']


(1260, 10)

In [6]:
#create X, y splits per target
def get_xy_for_target(df, target, test_size=0.2, random_state=42):
    #drop rows where target is NaN
    sub = df.dropna(subset=[target]).copy()
    X = sub[FEATURE_COLS].copy()
    y = sub[target].values
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [8]:
#preprocessing
X_train, X_test, y_train, y_test = get_xy_for_target(df, "Irrigation_Area")

#auto-detect numeric vs categorical
numeric_features = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

#numeric transformer: impute -> scale
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

#categorical transformer: impute -> one-hot
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, cat_features)
])

In [9]:
#model builder

def make_model_pipeline(estimator, preprocessor=preprocessor):
    return Pipeline([
        ("preproc", preprocessor),
        ("model", estimator)
    ])

#instantiate estimators
estimators = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "XGBoost": xgb.XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, tree_method='hist'),
    "CatBoost": CatBoostRegressor(iterations=1000, verbose=0, random_seed=42)
}

In [12]:
#train all models for each target, save results + models
results = []

for target in TARGETS:
    print(f"\n--- Training for target: {target} ---")
    X_train, X_test, y_train, y_test = get_xy_for_target(ml_df, target)
    for name, est in estimators.items():
        print("Training:", name)
        pipe = make_model_pipeline(est)
        pipe.fit(X_train, y_train)

        #predictions & metrics
        y_pred = pipe.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        #save pipeline
        model_fname = f"models/{target.replace(' ','_')}_{name}.joblib"
        joblib.dump(pipe, model_fname)

        print(f" -> {name} RMSE:{rmse:.4f} MAE:{mae:.4f} R2:{r2:.4f}  saved: {model_fname}")
        results.append({
            "target": target, "model": name, "rmse": rmse, "mae": mae, "r2": r2, "file": model_fname
        })

#show leaderboard
res_df = pd.DataFrame(results).sort_values(["target","rmse"])
display(res_df)
res_df.to_csv("models/training_results.csv", index=False)



--- Training for target: Irrigation_Area ---
Training: LinearRegression
 -> LinearRegression RMSE:0.9805 MAE:0.5816 R2:-0.0369  saved: models/Irrigation_Area_LinearRegression.joblib
Training: RandomForest
 -> RandomForest RMSE:0.0575 MAE:0.0927 R2:0.9392  saved: models/Irrigation_Area_RandomForest.joblib
Training: XGBoost
 -> XGBoost RMSE:0.0582 MAE:0.1119 R2:0.9384  saved: models/Irrigation_Area_XGBoost.joblib
Training: CatBoost
 -> CatBoost RMSE:0.0538 MAE:0.1217 R2:0.9431  saved: models/Irrigation_Area_CatBoost.joblib

--- Training for target: Crop_Yield ---
Training: LinearRegression
 -> LinearRegression RMSE:154.6094 MAE:7.5438 R2:-0.0510  saved: models/Crop_Yield_LinearRegression.joblib
Training: RandomForest
 -> RandomForest RMSE:7.7725 MAE:0.9620 R2:0.9472  saved: models/Crop_Yield_RandomForest.joblib
Training: XGBoost
 -> XGBoost RMSE:8.5500 MAE:1.3293 R2:0.9419  saved: models/Crop_Yield_XGBoost.joblib
Training: CatBoost
 -> CatBoost RMSE:7.9231 MAE:1.5150 R2:0.9461  saved: m

Unnamed: 0,target,model,rmse,mae,r2,file
5,Crop_Yield,RandomForest,7.772453,0.961988,0.947166,models/Crop_Yield_RandomForest.joblib
7,Crop_Yield,CatBoost,7.923137,1.515031,0.946141,models/Crop_Yield_CatBoost.joblib
6,Crop_Yield,XGBoost,8.550042,1.329312,0.94188,models/Crop_Yield_XGBoost.joblib
4,Crop_Yield,LinearRegression,154.609364,7.543814,-0.050977,models/Crop_Yield_LinearRegression.joblib
3,Irrigation_Area,CatBoost,0.05376,0.121738,0.94315,models/Irrigation_Area_CatBoost.joblib
1,Irrigation_Area,RandomForest,0.057456,0.092745,0.939242,models/Irrigation_Area_RandomForest.joblib
2,Irrigation_Area,XGBoost,0.05824,0.111939,0.938412,models/Irrigation_Area_XGBoost.joblib
0,Irrigation_Area,LinearRegression,0.980503,0.581625,-0.036856,models/Irrigation_Area_LinearRegression.joblib


In [13]:
#load a saved model and run a test inference
pipe = joblib.load("models/Irrigation_Area_XGBoost.joblib")
sample_X = X_test.iloc[:5]  # from previously defined X_test in last loop
print("sample preds:", pipe.predict(sample_X))

sample preds: [118.49821 118.19231 118.72395 118.24513 120.22445]
