In [1]:
import numpy as np
import pandas as pd
import joblib
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv("data/market_data.csv", parse_dates=["Arrival_Date"], dayfirst=True)

In [3]:
df["year"]        = df["Arrival_Date"].dt.year
df["month"]       = df["Arrival_Date"].dt.month
df["day_of_week"] = df["Arrival_Date"].dt.dayofweek
df["day_of_month"]= df["Arrival_Date"].dt.day

In [4]:
feature_cols = [
    "year", "month", "day_of_week", "day_of_month",
    "State", "District", "Commodity", "Variety", "Grade"
]
X = df[feature_cols]
y = df["Modal Price"]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [6]:
numeric_feats = ["year", "month", "day_of_week", "day_of_month"]
cat_feats     = ["State", "District", "Commodity", "Variety", "Grade"]

In [7]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_feats),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_feats),
])

In [8]:
models = {
    "KNN": (
        KNeighborsRegressor(),
        {"reg__n_neighbors": [3,5,7,9,11]}
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=42),
        {"reg__max_depth": [None,5,10,20], "reg__min_samples_leaf": [1,5,10]}
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=42, n_jobs=-1),
        {"reg__n_estimators": [100,200], "reg__max_depth": [None,10,20], "reg__min_samples_leaf": [1,5]}
    ),
    "AdaBoost": (
        AdaBoostRegressor(random_state=42),
        {"reg__n_estimators": [50,100,200], "reg__learning_rate": [0.01,0.1,1]}
    ),
    "SVR": (
        SVR(),
        {"reg__C": [0.1,1,10], "reg__gamma": ["scale","auto"], "reg__kernel": ["rbf"]}
    ),
    "Linear": (
        LinearRegression(),
        {}
    ),
    "Ridge": (
        Ridge(random_state=42),
        {"reg__alpha": [0.1,1,10,50]}
    ),
    "Lasso": (
        Lasso(random_state=42, max_iter=5000),
        {"reg__alpha": [0.001,0.01,0.1,1]}
    ),
    "XGBoost": (
        XGBRegressor(random_state=42, n_jobs=-1),
        {"reg__n_estimators": [100,200], "reg__learning_rate": [0.01,0.05,0.1], "reg__max_depth": [3,6,10]}
    ),
    "CatBoost": (
        CatBoostRegressor(random_seed=42, silent=True),
        {"reg__iterations": [200,500], "reg__depth": [4,6,8], "reg__learning_rate": [0.05,0.1]}
    ),
}


In [9]:
results = []
best_rmse = np.inf
best_model = None
best_name = None

In [10]:
for name, (estimator, param_grid) in models.items():
    pipe = Pipeline([
        ("pre", preprocessor),
        ("reg", estimator)
    ])
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter=10,
        cv=3,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        random_state=42,
        verbose=0
    )
    search.fit(X_train, y_train)
    preds = search.best_estimator_.predict(X_test)
    
    rmse = sqrt(mean_squared_error(y_test, preds))
    mae  = mean_absolute_error(y_test, preds)
    r2   = r2_score(y_test, preds)
    
    results.append({
        "Model": name,
        "Best_Params": search.best_params_,
        "Test_RMSE": rmse,
        "Test_MAE": mae,
        "Test_R2": r2 
    })
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = search.best_estimator_
        best_name = name

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [11]:
results_df = pd.DataFrame(results).sort_values("Test_RMSE").reset_index(drop=True)
print(results_df)

          Model                                        Best_Params  \
0  RandomForest  {'reg__n_estimators': 200, 'reg__min_samples_l...   
1         Ridge                                  {'reg__alpha': 1}   
2         Lasso                                  {'reg__alpha': 1}   
3        Linear                                                 {}   
4      CatBoost  {'reg__learning_rate': 0.1, 'reg__iterations':...   
5  DecisionTree  {'reg__min_samples_leaf': 1, 'reg__max_depth':...   
6       XGBoost  {'reg__n_estimators': 200, 'reg__max_depth': 6...   
7           KNN                            {'reg__n_neighbors': 3}   
8      AdaBoost  {'reg__n_estimators': 100, 'reg__learning_rate...   
9           SVR  {'reg__kernel': 'rbf', 'reg__gamma': 'scale', ...   

     Test_RMSE     Test_MAE   Test_R2  
0  2382.981529   755.776502  0.731381  
1  2409.122979  1010.666334  0.725455  
2  2434.287950  1018.008570  0.719690  
3  2447.023217  1010.041885  0.716749  
4  2449.771835  1058.807126  

In [12]:
print(f"\nBest model: {best_name} with RMSE = {best_rmse:.2f}")


Best model: RandomForest with RMSE = 2382.98


In [13]:
joblib.dump(best_model, 'market_price_model.pkl') 

['market_price_model.pkl']