In [14]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [15]:
# Read the raw minimum-support-price table and preview its first rows.
csv_path = "minimum-support-prices.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,id,year,crop,season,min_support_price
0,0,2022-2023,Paddy - Common,Kharif,2040.0
1,1,2022-2023,Paddy - Grade 'A',Kharif,2060.0
2,2,2022-2023,Jowar - Hybrid,Kharif,2970.0
3,3,2022-2023,Jowar - Maldandi,Kharif,2990.0
4,4,2022-2023,Bajra,Kharif,2350.0


In [16]:
# Show column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 736 non-null    int64  
 1   year               736 non-null    object 
 2   crop               736 non-null    object 
 3   season             736 non-null    object 
 4   min_support_price  736 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 28.9+ KB


In [17]:
# Split the "YYYY-YYYY" label in `year` into two integer feature columns.
# The guard keeps this cell safe to re-run: once `year` has been replaced,
# a second execution is a no-op instead of raising KeyError.
if "year" in df.columns:
    year_parts = df["year"].astype(str).str.split("-", expand=True)
    df = df.assign(
        # Explicit int64 so the later dtype-based column selection picks these up
        # regardless of the platform's default integer width.
        start_year=year_parts[0].astype("int64"),
        end_year=year_parts[1].astype("int64"),
    ).drop(columns="year")

df.head()

Unnamed: 0,id,crop,season,min_support_price,start_year,end_year
0,0,Paddy - Common,Kharif,2040.0,2022,2023
1,1,Paddy - Grade 'A',Kharif,2060.0,2022,2023
2,2,Jowar - Hybrid,Kharif,2970.0,2022,2023
3,3,Jowar - Maldandi,Kharif,2990.0,2022,2023
4,4,Bajra,Kharif,2350.0,2022,2023


In [18]:
# Separate the regression target from the predictors; `id` is dropped together
# with the target so it is not used as a feature.
target_col = "min_support_price"
y = df[target_col]
X = df.drop(columns=[target_col, "id"])


In [19]:
# Group feature columns by dtype so each group can get its own preprocessing.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

categorical_cols, numeric_cols


(['crop', 'season'], ['start_year', 'end_year'])

In [20]:
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numeric ones.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
numeric_step = ("num", StandardScaler(), numeric_cols)

preprocess = ColumnTransformer(transformers=[categorical_step, numeric_step])


In [21]:
# Hold out 20% of the rows for final evaluation; the seed fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [22]:
# Lasso: search the penalty strength `alpha` with 5-fold cross-validation,
# scoring each candidate by negative mean squared error.
lasso_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", Lasso(max_iter=10000)),
])

lasso_params = {"model__alpha": [0.001, 0.01, 0.1, 1, 10, 50]}

lasso_grid = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=lasso_params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
lasso_grid.fit(X_train, y_train)

lasso_grid.best_params_


{'model__alpha': 0.1}

In [23]:
# Ridge: search the penalty strength `alpha` with 5-fold cross-validation,
# scoring each candidate by negative mean squared error.
ridge_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", Ridge(max_iter=10000)),
])

ridge_params = {"model__alpha": [0.01, 0.1, 1, 10, 50, 100]}

ridge_grid = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
ridge_grid.fit(X_train, y_train)

ridge_grid.best_params_


{'model__alpha': 0.1}

In [24]:
# XGBoost: exhaustive grid over five hyperparameters with three values each
# (3**5 = 243 candidates), each cross-validated on 3 folds.
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    eval_metric="rmse",
)
xgb_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", xgb_model),
])

xgb_params = {
    "model__n_estimators": [200, 300, 500],
    "model__max_depth": [4, 6, 8],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__subsample": [0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 1.0],
}

xgb_grid = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=xgb_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

xgb_grid.best_params_


{'model__colsample_bytree': 1.0,
 'model__learning_rate': 0.1,
 'model__max_depth': 4,
 'model__n_estimators': 500,
 'model__subsample': 0.7}

In [25]:
# Untuned ordinary least squares pipeline, fitted directly on the training split.
lr_steps = [
    ("preprocess", preprocess),
    ("model", LinearRegression()),
]
lr_pipe = Pipeline(steps=lr_steps)

lr_pipe.fit(X_train, y_train)


In [26]:
def evaluate(model, name):
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, pred)
    return [name, mae, mse, rmse, r2]

# Score every fitted model on the hold-out split and rank them by RMSE.
fitted_models = [
    (lr_pipe, "Linear Regression"),
    (lasso_grid.best_estimator_, "Lasso (tuned)"),
    (ridge_grid.best_estimator_, "Ridge (tuned)"),
    (xgb_grid.best_estimator_, "XGBoost (tuned)"),
]
results = [evaluate(estimator, label) for estimator, label in fitted_models]

metric_columns = ["Model", "MAE", "MSE", "RMSE", "R2"]
results_df = pd.DataFrame(results, columns=metric_columns)

results_df.sort_values("RMSE")


Unnamed: 0,Model,MAE,MSE,RMSE,R2
3,XGBoost (tuned),136.880159,39060.342962,197.636897,0.989674
0,Linear Regression,514.76522,490206.114644,700.147209,0.870406
1,Lasso (tuned),514.579293,490549.856535,700.392645,0.870315
2,Ridge (tuned),513.798382,490860.539899,700.614402,0.870233


In [27]:
# Map each results-table label to its fitted estimator. A dict lookup fails
# loudly (KeyError) if a label ever drifts out of sync with the table, whereas
# an if/elif chain without `else` would leave `best_model` undefined or stale.
fitted_by_name = {
    "Linear Regression": lr_pipe,
    "Lasso (tuned)": lasso_grid.best_estimator_,
    "Ridge (tuned)": ridge_grid.best_estimator_,
    "XGBoost (tuned)": xgb_grid.best_estimator_,
}

# The model with the lowest RMSE in the results table wins.
# NOTE(review): the table is computed on the hold-out split, so picking the
# winner from it makes that split's score for the winner optimistic.
best_model_name = results_df.sort_values("RMSE").iloc[0]["Model"]
best_model = fitted_by_name[best_model_name]

print("Best Model:", best_model_name)


Best Model: XGBoost (tuned)


In [28]:
def msp_predict(input_data, output_file="msp_predictions.csv", model=None):
    """
    Predict minimum support prices for new rows and optionally save them.

    Parameters
    ----------
    input_data : str, os.PathLike, pandas.DataFrame, list or numpy.ndarray
        A CSV path, a DataFrame containing at least the training feature
        columns (``X.columns``), or row-wise values listed in the same
        order as ``X.columns``.
    output_file : str or None, default "msp_predictions.csv"
        Destination CSV for the input rows plus the prediction column.
        Pass ``None`` to skip writing a file.
    model : object with a ``predict`` method, optional
        Fitted model to use. Defaults to the notebook-level ``best_model``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input rows with an added
        ``predicted_min_support_price`` column.

    Raises
    ------
    ValueError
        If ``input_data`` has an unsupported type, or is missing one of the
        feature columns the model was trained on.
    """
    # Handle input types
    if isinstance(input_data, (str, os.PathLike)):
        df_input = pd.read_csv(input_data)
    elif isinstance(input_data, pd.DataFrame):
        df_input = input_data.copy()
    elif isinstance(input_data, (list, np.ndarray)):
        df_input = pd.DataFrame(input_data, columns=X.columns)
    else:
        raise ValueError("Unsupported input format.")

    # Fail early with a readable message instead of an error from deep
    # inside the preprocessing pipeline.
    feature_cols = list(X.columns)
    missing_cols = [col for col in feature_cols if col not in df_input.columns]
    if missing_cols:
        raise ValueError(f"Input is missing required column(s): {missing_cols}")

    # Predict on exactly the training features, in training order; any extra
    # columns in the input are carried through to the output untouched.
    estimator = best_model if model is None else model
    preds = estimator.predict(df_input[feature_cols])

    # Output CSV
    out = df_input.copy()
    out["predicted_min_support_price"] = preds

    if output_file is not None:
        out.to_csv(output_file, index=False)
        print(f"Saved: {output_file}")

    return out


In [29]:
df_new = pd.DataFrame({
    "crop": ["Paddy - Common"],
    "season": ["Kharif"],
    "start_year": [2022],
    "end_year": [2023]
})

msp_predict(df_new)


Saved: msp_predictions.csv


Unnamed: 0,crop,season,start_year,end_year,predicted_min_support_price
0,Paddy - Common,Kharif,2022,2023,2103.403809
