In [2]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [3]:
# Read the full dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Final_Dataset_after_temperature.csv")

# Show (rows, columns) as a quick sanity check on the load.
print("Original Dataset Shape:", df.shape)


Original Dataset Shape: (173026, 8)


In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [5]:
# Remove the production column so it is not carried into the feature set.
# NOTE(review): presumably dropped because it would leak the yield target - confirm.
df = df.drop("Production_in_tons", axis=1)

# Row count should be unchanged; column count drops by one.
print("After Cleaning Shape:", df.shape)


After Cleaning Shape: (173026, 7)


In [6]:
# Outlier Removal (2% - 98%)
# Keep only rows whose yield lies strictly between the 2nd and 98th
# percentiles; rows exactly equal to either cut-off are removed as well.
q_low = df["Yield_ton_per_hec"].quantile(0.02)
q_high = df["Yield_ton_per_hec"].quantile(0.98)

in_range = (df["Yield_ton_per_hec"] > q_low) & (df["Yield_ton_per_hec"] < q_high)

# .copy() makes the filtered frame an independent object, so adding new
# columns to `df` afterwards does not raise pandas' SettingWithCopyWarning.
df = df.loc[in_range].copy()

print("After Outlier Removal Shape:", df.shape)

After Outlier Removal Shape: (166104, 7)


In [7]:
# Log Transformation of Target
# Yield_log = log(1 + Yield_ton_per_hec); the model is trained on this
# column and predictions are converted back with np.expm1.
# assign() returns a new frame rather than writing into the filtered frame
# in place, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
df = df.assign(Yield_log=np.log1p(df["Yield_ton_per_hec"]))

In [8]:
# Feature Engineering
# Derived numeric columns, appended in this order:
#   Rainfall_Temp - product of rainfall and temperature (interaction term)
#   Rainfall_sq   - rainfall squared
#   Temp_sq       - temperature squared
#   Area_log      - log(1 + Area_in_hectares)
# assign() builds a new frame in one step instead of four in-place column
# writes on the filtered frame, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
df = df.assign(
    Rainfall_Temp=df["rainfall"] * df["temperature"],
    Rainfall_sq=df["rainfall"] ** 2,
    Temp_sq=df["temperature"] ** 2,
    Area_log=np.log1p(df["Area_in_hectares"]),
)


In [9]:
# Features: every column except the raw yield and its log-transformed copy.
X = df.drop(columns=["Yield_ton_per_hec", "Yield_log"])
# Target: the log-transformed yield.
y = df["Yield_log"]

# Columns that get one-hot encoded; every remaining column of X is passed
# to the model unchanged.
categorical_cols = ["State_Name", "Crop_Type", "Crop"]
numeric_cols = [name for name in X.columns if name not in categorical_cols]

In [10]:
# Preprocessing Pipeline
# "cat": one-hot encode the categorical columns; categories not seen during
#        fit are ignored at transform time rather than raising.
# "num": forward the remaining columns untouched.
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
num_step = ("num", "passthrough", numeric_cols)

preprocessor = ColumnTransformer(transformers=[cat_step, num_step])


In [11]:
# Model Definition (XGBoost)
# Hyperparameters are gathered in one mapping so they can be read at a glance.
xgb_params = {
    "n_estimators": 600,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,
    "reg_lambda": 2,
    "random_state": 42,
    "tree_method": "hist",
}
model = XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTraining Model... Please wait.")

# Fit the preprocessing steps and the regressor on the training portion only.
pipeline.fit(X_train, y_train)



Training Model... Please wait.


In [13]:
# Evaluation
# The pipeline was trained on log1p(yield), so its predictions are on the
# log scale.
y_pred_log = pipeline.predict(X_test)

# Convert back from log scale so the metrics are in the original yield units.
y_pred = np.expm1(y_pred_log)
y_actual = np.expm1(y_test)

r2 = r2_score(y_actual, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))

# The emoji and the superscript two were stored as mis-decoded bytes
# (mojibake); they are restored to the intended characters here.
print("\n===============================")
print("📊 Model Performance Results")
print("===============================")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 4))


📊 Model Performance Results
R² Score: 0.9006
RMSE: 2.0489


In [14]:
# Persist the fitted pipeline (preprocessing + model) to disk.
# The location is defined once so the directory that is created, the file
# that is written and the path that is reported cannot drift apart.
MODEL_DIR = "models"
MODEL_PATH = f"{MODEL_DIR}/crop_yield_pipeline.pkl"

# exist_ok=True lets the cell be re-run when the directory already exists.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)

# The emoji were stored as mis-decoded bytes (mojibake); they are restored
# to the intended characters here.
print("\n✅ Model Saved Successfully!")
print("📁 Saved at:", MODEL_PATH)


✅ Model Saved Successfully!
📁 Saved at: models/crop_yield_pipeline.pkl
