### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import joblib
import shap
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import scipy.stats as stats
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("../data/used-bikes-cleaned.csv")
df.shape

(4977, 9)

In [3]:
X = df.drop(["Price", "LogPrice"], axis=1)
y = df["Price"]

In [4]:
X

Unnamed: 0,Bike Type,Brand,Model,Year,Mileage,Seller,Capacity
0,Quadricycles,Other brand,Other model,2014,35000.0,lakee,150.0
1,Motorbikes,Bajaj,Pulsar 150,2014,50737.0,Sasila Ishan,150.0
2,Quadricycles,Other brand,Other model,2005,5000.0,vinuka automart,125.0
3,E-bikes,Electra,Alpha,2019,600.0,Jeewantha Basnayaka,49.0
4,Scooters,Hero,Maestro Edge,2018,18500.0,Anjalee Motors,110.0
...,...,...,...,...,...,...,...
4972,Motorbikes,Yamaha,Other Model,2015,30000.0,Dulshan Thaksara,250.0
4973,Motorbikes,Yamaha,Other Model,1988,20000.0,PW Farm,50.0
4974,Motorbikes,Yamaha,Other Model,2015,32270.0,PLS Motors & Credit (Pvt) Ltd,125.0
4975,Motorbikes,Yamaha,Other Model,2018,12780.0,Ranjith Japanese Motorcycles,125.0


In [5]:
y

0        440000.0
1        370000.0
2        210000.0
3        105000.0
4         80000.0
          ...    
4972     580000.0
4973     125000.0
4974     250000.0
4975     320000.0
4976    1550000.0
Name: Price, Length: 4977, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(X_train.shape, X_test.shape)

(3981, 7) (996, 7)


In [7]:
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)



Numeric: ['Year', 'Mileage', 'Capacity']
Categorical: ['Bike Type', 'Brand', 'Model', 'Seller']


In [8]:
# Numeric pipeline: impute median + scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical pipeline: impute constant + one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [9]:
print("Processed train shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Processed train shape: (3981, 2789)
Processed test shape: (996, 2789)
y_train shape: (3981,)
y_test shape: (996,)


### Model Training

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

model = LinearRegression()
model.fit(X_train_processed, y_train)
y_pred = model.predict(X_test_processed)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.4528


In [11]:
# scores = []

# for i in range(1000):
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    
#     X_train_processed = preprocessor.fit_transform(X_train)
#     X_test_processed = preprocessor.transform(X_test)
    
#     model = LinearRegression()
#     model.fit(X_train_processed, y_train)
    
#     y_pred = model.predict(X_test_processed)
#     r2 = r2_score(y_test, y_pred)
#     scores.append(r2)

# print("Average R²:", np.mean(scores))
# print("Best R²:", np.max(scores))


In [12]:
# np.argmax(scores) = 588 -> 0.6599492036560264
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=588)
    
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
    
model = LinearRegression()
model.fit(X_train_processed, y_train)
    
y_pred = model.predict(X_test_processed)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.6599


In [13]:
import pickle
with open('../models/linear_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [14]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=42, n_jobs=-1)
}

for name, model in models.items():
    pipe = Pipeline(steps=[('pre', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: R²={r2:.4f}, MAE={mae:.2f}")


Linear: R²=0.6599, MAE=95493.50
Ridge: R²=0.7306, MAE=87318.79
RandomForest: R²=0.7522, MAE=67119.99
XGBoost: R²=0.6985, MAE=79889.94


In [23]:
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[('pre', preprocessor), ('model', rf)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RandomForest: R²={r2:.4f}, MAE={mae:.2f}")

RandomForest: R²=0.7522, MAE=67119.99


In [24]:
joblib.dump(pipe, "../models/final_rf_pipeline.joblib")

print("Model pipeline saved successfully!")

Model pipeline saved successfully!


### Hyperparameter Tuning

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
search = RandomizedSearchCV(rf, param_distributions=param_dist,
                            n_iter=30, cv=3, scoring='r2', n_jobs=-1, verbose=2)
search.fit(X_train_processed, y_train)

print("Best R²:", search.best_score_)
print("Best Parameters:", search.best_params_)



Fitting 3 folds for each of 30 candidates, totalling 90 fits


42 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
29 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nayan\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nayan\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\nayan\anaconda3\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,
 

Best R²: 0.45352768624771206
Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Initialize model with best parameters
final_rf = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=30,
    random_state=42,
    n_jobs=-1
)
            
# Fit on the full training data
final_rf.fit(X_train_processed, y_train)


# Evaluate on test data
y_pred = final_rf.predict(X_test_processed)

from sklearn.metrics import mean_absolute_error, r2_score
print("Final R²:", r2_score(y_test, y_pred))
print("Final MAE:", mean_absolute_error(y_test, y_pred))


Final R²: 0.5540580582113859
Final MAE: 119829.51470285235


In [17]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor(random_state=42)
params = {
    'n_estimators': [200, 400, 600],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0]
}
grid = GridSearchCV(xgb, params, cv=3, scoring='r2', n_jobs=-1, verbose=2)
grid.fit(X_train_processed, y_train)
print(grid.best_params_)


Fitting 3 folds for each of 243 candidates, totalling 729 fits




{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 600, 'subsample': 0.9}


In [18]:
final_xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1
)
final_xgb.fit(X_train_processed, y_train)
y_pred = final_xgb.predict(X_test_processed)
print("Final XGB R²:", r2_score(y_test, y_pred))
print("Final XGB MAE:", mean_absolute_error(y_test, y_pred))

Final XGB R²: 0.7248391496901156
Final XGB MAE: 71023.30736912493


In [26]:
from sklearn.pipeline import Pipeline

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_xgb)
])
xgb_pipeline.fit(X_train, y_train)
import joblib

joblib.dump(xgb_pipeline, "../models/final_xgb_pipeline.joblib")

print("Model pipeline saved successfully!")


Model pipeline saved successfully!


In [20]:
# Load the saved pipeline
loaded_xgb_pipeline = joblib.load("final_xgb_pipeline.joblib")

# Predict directly on raw data (no need to preprocess again!)
y_pred = loaded_xgb_pipeline.predict(X_test)

print("Loaded model R²:", r2_score(y_test, y_pred))

Loaded model R²: 0.7248391496901156


### Apply Cross Validation

In [21]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Define model
rf = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=30,
    random_state=42,
    n_jobs=-1
)

# Combine preprocessing + model into a single pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf)
])

# Define 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='r2', n_jobs=-1)

print(f"Cross-Validation R² Scores: {scores}")
print(f"Average CV R²: {np.mean(scores):.4f} ± {np.std(scores):.4f}")


Cross-Validation R² Scores: [0.47204856 0.50711801 0.5188387  0.47257105 0.44429323]
Average CV R²: 0.4830 ± 0.0268


In [22]:
final_xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_xgb)
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='r2', n_jobs=-1)
print(f"XGB Cross-Validation R² Scores: {scores}")
print(f"Average XGB CV R²: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

XGB Cross-Validation R² Scores: [0.66509094 0.71180956 0.75841421 0.74404613 0.61312833]
Average XGB CV R²: 0.6985 ± 0.0534
