In [12]:
# ============================================================
# 1) IMPORTS
# ============================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from xgboost import XGBRegressor

# ============================================================
# 2) LOAD DATA
# ============================================================
df = pd.read_csv("crop_yield.csv")   

df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [2]:
# ============================================================
# 3) CHECK INFO
# ============================================================
print(df.info())
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB
None
          Crop_Year          Area    Production  Annual_Rainfall  \
count  19689.000000  1.968900e+04  1.968900e+04     19689.000000   
mean    2009.127584  1.799266e+05  1.643594e+07      1437.755177   
std        6.498099  7.328287e+05  2.630568e+08       816.909589   


In [3]:
# ============================================================
# 4) FEATURE / TARGET SPLIT
# ============================================================
X = df.drop("Yield", axis=1)
y = df["Yield"]


In [4]:
# ============================================================
# 5) IDENTIFY COLUMN TYPES
# ============================================================
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

categorical_cols, numeric_cols


(['Crop', 'Season', 'State'],
 ['Crop_Year',
  'Area',
  'Production',
  'Annual_Rainfall',
  'Fertilizer',
  'Pesticide'])

In [5]:
# ============================================================
# 6) PREPROCESSING PIPELINE
#    - One-Hot Encode categoricals
#    - Scale numeric features
# ============================================================

from sklearn.preprocessing import OneHotEncoder

preprocess = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols),
    ]
)


In [6]:
# ============================================================
# 7) TRAIN-TEST SPLIT
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [13]:
# ============================================================
# 6.1) LASSO REGRESSION (L1) + GRIDSEARCHCV
# ============================================================

lasso_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", Lasso(max_iter=10000))
])

lasso_params = {
    "model__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

lasso_grid = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=lasso_params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

lasso_grid.fit(X_train, y_train)
print("Best Lasso Params:", lasso_grid.best_params_)


Best Lasso Params: {'model__alpha': 0.1}


In [14]:
# ============================================================
# 6.2) RIDGE REGRESSION (L2) + GRIDSEARCHCV
# ============================================================

ridge_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", Ridge(max_iter=10000))
])

ridge_params = {
    "model__alpha": [0.01, 0.1, 1, 10, 50, 100]
}

ridge_grid = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

ridge_grid.fit(X_train, y_train)
print("Best Ridge Params:", ridge_grid.best_params_)


Best Ridge Params: {'model__alpha': 0.1}


In [15]:
# ============================================================
# 6.3) XGBOOST REGRESSOR + GRIDSEARCHCV
# ============================================================

xgb_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        eval_metric="rmse"
    ))
])

xgb_params = {
    "model__n_estimators": [200, 300, 500],
    "model__max_depth": [4, 6, 8],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__subsample": [0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 1.0],
}

xgb_grid = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=xgb_params,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

xgb_grid.fit(X_train, y_train)
print("Best XGBoost Params:", xgb_grid.best_params_)


Best XGBoost Params: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 200, 'model__subsample': 0.7}


In [16]:
# ============================================================
# 7) LINEAR REGRESSION (NO HYPERPARAMETERS)
# ============================================================

lr_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

lr_pipe.fit(X_train, y_train)


In [17]:
# ============================================================
# 8) EVALUATION FUNCTION
# ============================================================

def evaluate(model, name):
    y_pred = model.predict(X_test)
    mae  = mean_absolute_error(y_test, y_pred)
    mse  = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)

    return [name, mae, mse, rmse, r2]


In [18]:
# ============================================================
# 9) EVALUATE ALL MODELS
# ============================================================

results = []

results.append(evaluate(lr_pipe, "Linear Regression"))
results.append(evaluate(lasso_grid.best_estimator_, "Lasso (tuned)"))
results.append(evaluate(ridge_grid.best_estimator_, "Ridge (tuned)"))
results.append(evaluate(xgb_grid.best_estimator_, "XGBoost (tuned)"))

results_df = pd.DataFrame(results, columns=["Model", "MAE", "MSE", "RMSE", "R²"])
results_df


Unnamed: 0,Model,MAE,MSE,RMSE,R²
0,Linear Regression,62.980417,158461.89358,398.072724,0.802229
1,Lasso (tuned),60.614591,158399.962742,397.994928,0.802306
2,Ridge (tuned),63.033578,158360.03827,397.944768,0.802356
3,XGBoost (tuned),12.302151,31531.81665,177.572004,0.960646


In [19]:
results_df.sort_values(by="RMSE")


Unnamed: 0,Model,MAE,MSE,RMSE,R²
3,XGBoost (tuned),12.302151,31531.81665,177.572004,0.960646
2,Ridge (tuned),63.033578,158360.03827,397.944768,0.802356
1,Lasso (tuned),60.614591,158399.962742,397.994928,0.802306
0,Linear Regression,62.980417,158461.89358,398.072724,0.802229


In [20]:
# ============================================================
# FUNCTION: Predict using best model & export to CSV
# ============================================================

import pandas as pd
import numpy as np

def predict_and_export(input_data, output_file="predictions_output.csv"):
    """
    Accepts input in: array, DataFrame, or CSV path.
    Automatically uses the best model based on RMSE (from results_df).
    Saves predictions to a CSV file.
    """
    
    # --------------------------------------------------------
    # 1) Detect input format
    # --------------------------------------------------------
    if isinstance(input_data, str):
        # CSV path
        df_input = pd.read_csv(input_data)
    
    elif isinstance(input_data, pd.DataFrame):
        df_input = input_data.copy()
    
    elif isinstance(input_data, (list, np.ndarray)):
        # Convert array → DataFrame with same column order as training data X
        df_input = pd.DataFrame(input_data, columns=X.columns)
    
    else:
        raise ValueError("Unsupported input format.")
    
    # --------------------------------------------------------
    # 2) Identify BEST MODEL (lowest RMSE)
    # --------------------------------------------------------
    best_model_name = results_df.sort_values(by="RMSE").iloc[0]["Model"]
    print("Using best model:", best_model_name)

    if best_model_name == "Linear Regression":
        best_model = lr_pipe

    elif best_model_name == "Lasso (tuned)":
        best_model = lasso_grid.best_estimator_

    elif best_model_name == "Ridge (tuned)":
        best_model = ridge_grid.best_estimator_

    elif best_model_name == "XGBoost (tuned)":
        best_model = xgb_grid.best_estimator_
    
    # --------------------------------------------------------
    # 3) Make predictions  
    # --------------------------------------------------------
    predictions = best_model.predict(df_input)

    # --------------------------------------------------------
    # 4) Save to CSV
    # --------------------------------------------------------
    output_df = df_input.copy()
    output_df["Predicted_Yield"] = predictions

    output_df.to_csv(output_file, index=False)
    print(f"Predictions saved to: {output_file}")

    return output_df


In [21]:
sample_df = pd.DataFrame({
    "Crop": ["Arecanut"],
    "Crop_Year": [2020],
    "Season": ["Whole Year"],
    "State": ["Assam"],
    "Area": [70000],
    "Production": [56000],
    "Annual_Rainfall": [2000],
    "Fertilizer": [7000000],
    "Pesticide": [20000]
})

predict_and_export(sample_df)


Using best model: XGBoost (tuned)
Predictions saved to: predictions_output.csv


Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Predicted_Yield
0,Arecanut,2020,Whole Year,Assam,70000,56000,2000,7000000,20000,1.161688
