In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("../data/processed/fe_data.csv")  # Adjusted path
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,id,district,market,commodity,variety,grade,min_price,max_price,modal_price,date,...,month_cos,commodity_mean,commodity_std,commodity_price_zscore,market_avg_price,state_avg_price,market_price_deviation,lag_1,lag_7,pct_change_7
0,349,Kolkata,Mechua,Mango,Other,Small,5000.0,6000.0,6000.0,2024-08-18,...,-0.5,4509.215305,2570.158349,0.580036,11219.129403,4370.801956,-5219.129403,8000.0,9000.0,-0.333296
1,2827,Jalpaiguri,Belacoba,Brinjal,Other,FAQ,3300.0,3400.0,3400.0,2024-08-18,...,-0.5,2317.446171,1712.154131,0.632276,2758.549967,4370.801956,641.450033,3900.0,3300.0,0.030294
2,949,Kolkata,Mechua,Apple,Apple,Small,12000.0,15000.0,14000.0,2024-08-18,...,-0.5,9002.859968,3338.386223,1.496873,11219.129403,4370.801956,2780.870597,18000.0,15000.0,-0.066662
3,1793,Jalpaiguri,Belacoba,Green Chilli,Green Chilly,FAQ,10500.0,11000.0,10700.0,2024-08-18,...,-0.5,3710.494545,2285.883067,3.057683,2758.549967,4370.801956,7941.450033,6300.0,10400.0,0.028843
4,10382,Jalpaiguri,Jalpaiguri Sadar,Brinjal,Round/Long,FAQ,3800.0,4000.0,3900.0,2024-08-18,...,-0.5,2317.446171,1712.154131,0.924306,2743.373606,4370.801956,1156.626394,3400.0,4000.0,-0.024994


In [4]:
df.columns

Index(['id', 'district', 'market', 'commodity', 'variety', 'grade',
       'min_price', 'max_price', 'modal_price', 'date', 'state', 'target',
       'price_spread', 'price_spread_ratio', 'modal_to_min_ratio',
       'modal_to_max_ratio', 'month', 'week', 'month_sin', 'month_cos',
       'commodity_mean', 'commodity_std', 'commodity_price_zscore',
       'market_avg_price', 'state_avg_price', 'market_price_deviation',
       'lag_1', 'lag_7', 'pct_change_7'],
      dtype='object')

In [5]:
drop_cols = [
    'id',
    'date',
    'modal_price',
    'min_price',
    'max_price',
    'commodity_mean',
    'commodity_std'
]


In [6]:
X = df.drop(columns=drop_cols + ['target'])
y = df['target']


In [7]:
from category_encoders import TargetEncoder

cat_cols = [
    'district',
    'market',
    'commodity',
    'variety',
    'grade',
    'state'
]

encoder = TargetEncoder(cols=cat_cols)
X[cat_cols] = encoder.fit_transform(X[cat_cols], y)


In [8]:
# take first row (you can change index)
row = X.iloc[0]
row


district                      8.768969
market                        9.224355
commodity                     8.251624
variety                       8.113949
grade                         8.335243
state                         8.117831
price_spread               1000.000000
price_spread_ratio            0.166639
modal_to_min_ratio            1.199760
modal_to_max_ratio            0.999833
month                         8.000000
week                         33.000000
month_sin                    -0.866025
month_cos                    -0.500000
commodity_price_zscore        0.580036
market_avg_price          11219.129403
state_avg_price            4370.801956
market_price_deviation    -5219.129403
lag_1                      8000.000000
lag_7                      9000.000000
pct_change_7                 -0.333296
Name: 0, dtype: float64

In [9]:
y.iloc[0]

np.float64(8.699681400989514)

In [10]:
df = df.sort_values('date')

split_date = df['date'].quantile(0.95)

X_train = X[df['date'] < split_date]
y_train = y[df['date'] < split_date]

X_val = X[df['date'] >= split_date]
y_val = y[df['date'] >= split_date]


In [11]:
X_train.head()

Unnamed: 0,district,market,commodity,variety,grade,state,price_spread,price_spread_ratio,modal_to_min_ratio,modal_to_max_ratio,...,week,month_sin,month_cos,commodity_price_zscore,market_avg_price,state_avg_price,market_price_deviation,lag_1,lag_7,pct_change_7
0,8.768969,9.224355,8.251624,8.113949,8.335243,8.117831,1000.0,0.166639,1.19976,0.999833,...,33,-0.866025,-0.5,0.580036,11219.129403,4370.801956,-5219.129403,8000.0,9000.0,-0.333296
1,7.521221,7.51625,7.565934,8.113949,8.056598,8.117831,100.0,0.029403,1.029991,0.999706,...,33,-0.866025,-0.5,0.632276,2758.549967,4370.801956,641.450033,3900.0,3300.0,0.030294
2,8.768969,9.224355,9.04481,9.016923,8.335243,8.117831,3000.0,0.21427,1.166569,0.933271,...,33,-0.866025,-0.5,1.496873,11219.129403,4370.801956,2780.870597,18000.0,15000.0,-0.066662
3,7.521221,7.51625,8.088133,8.058764,8.056598,8.117831,500.0,0.046725,1.018951,0.972639,...,33,-0.866025,-0.5,3.057683,2758.549967,4370.801956,7941.450033,6300.0,10400.0,0.028843
4,7.521221,7.523284,7.565934,7.406769,8.056598,8.117831,200.0,0.051269,1.026046,0.974756,...,33,-0.866025,-0.5,0.924306,2743.373606,4370.801956,1156.626394,3400.0,4000.0,-0.024994


In [22]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3308
[LightGBM] [Info] Number of data points in the train set: 1012846, number of used features: 21
[LightGBM] [Info] Start training from score 8.108349
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l2: 0.00605572
[100]	valid_0's l2: 0.00173564
[150]	valid_0's l2: 0.00121817
[200]	valid_0's l2: 0.000943737
[250]	valid_0's l2: 0.000793302
[300]	valid_0's l2: 0.000695016
[350]	valid_0's l2: 0.000627902
[400]	valid_0's l2: 0.000584415
[450]	valid_0's l2: 0.000544839
[500]	valid_0's l2: 0.000517288
[550]	valid_0's l2: 0.000491518
[600]	valid_0's l2: 0.000475702
[650]	valid_0's l2: 0.000454464
[700]	valid_0's l2: 0.000443021
[750]	valid_0's l2: 0.00042955
[800]	valid_0's l2: 0.000418906
[850]	valid_0's l2: 0.00041078
[900]	valid_0's l2: 0.000404494
[950]	valid_0's l2: 0.000

In [24]:
from sklearn.metrics import mean_squared_error

rmse_log = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

rmse_log

np.float64(0.019798517607987193)

In [26]:
import numpy as np

y_pred_price = np.expm1(model.predict(X_val))
y_true_price = np.expm1(y_val)

rmse_price = np.sqrt(mean_squared_error(y_true_price, y_pred_price))

rmse_price

np.float64(77.76135980539306)

In [None]:
from sklearn.metrics import mean_absolute_error

mae_price = mean_absolute_error(y_true_price, y_pred_price)
mae_price


Model artifacts saved successfully!


Checking the how accurate modal is 

In [28]:
# Pick one random sample from validation data
sample_idx = X_val.sample(1, random_state=42).index[0]

X_sample = X_val.loc[[sample_idx]]
y_true_log = y_val.loc[sample_idx]


In [29]:
y_pred_log = model.predict(X_sample)[0]

y_pred_log, y_true_log


(np.float64(7.832027098812179), np.float64(7.836369760545124))

In [30]:
y_pred_price = np.expm1(y_pred_log)
y_true_price = np.expm1(y_true_log)

y_pred_price, y_true_price


(np.float64(2519.0325543431823), np.float64(2530.0))

In [31]:
print(f"Predicted Price : ₹{y_pred_price:.2f}")
print(f"Actual Price    : ₹{y_true_price:.2f}")
print(f"Absolute Error  : ₹{abs(y_pred_price - y_true_price):.2f}")


Predicted Price : ₹2519.03
Actual Price    : ₹2530.00
Absolute Error  : ₹10.97


In [32]:
for idx in X_val.sample(5, random_state=1).index:
    X_s = X_val.loc[[idx]]
    y_t = np.expm1(y_val.loc[idx])
    y_p = np.expm1(model.predict(X_s)[0])

    print(f"Actual: ₹{y_t:.2f} | Predicted: ₹{y_p:.2f} | Error: ₹{abs(y_t - y_p):.2f}")


Actual: ₹2247.00 | Predicted: ₹2225.97 | Error: ₹21.03
Actual: ₹4150.00 | Predicted: ₹4148.94 | Error: ₹1.06
Actual: ₹7520.00 | Predicted: ₹7645.93 | Error: ₹125.93
Actual: ₹2050.00 | Predicted: ₹2040.66 | Error: ₹9.34
Actual: ₹7200.00 | Predicted: ₹7217.77 | Error: ₹17.77


In [34]:
import joblib

joblib.dump(model, "../models/trained_model.pkl")


['../models/trained_model.pkl']

In [35]:
joblib.dump(encoder, "../models/target_encoder.pkl")


['../models/target_encoder.pkl']

In [37]:
feature_columns = X_train.columns.tolist()

joblib.dump(feature_columns, "../models/feature_columns.pkl")


['../models/feature_columns.pkl']

In [38]:
import json
from datetime import datetime

metadata = {
    "model_type": "LightGBM Regressor",
    "target": "log1p(modal_price)",
    "rmse_log": float(rmse_log),
    "rmse_price": float(rmse_price),
    "mae_price": 34.14,
    "training_rows": int(len(X_train)),
    "validation_rows": int(len(X_val)),
    "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

with open("../models/model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)


In [41]:
import os
os.listdir("../models")


['feature_columns.pkl',
 'model_metadata.json',
 'README.md',
 'target_encoder.pkl',
 'trained_model.pkl']

In [43]:
model_test = joblib.load("../models/trained_model.pkl")
encoder_test = joblib.load("../models/target_encoder.pkl")
features_test = joblib.load("../models/feature_columns.pkl")

print("Model, encoder, and features loaded successfully")


Model, encoder, and features loaded successfully
