Library Imports and Data Loading 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Step 1: Load the Dataset
data = pd.read_csv(r"E:\Price_Prediction_Model\car_price_data.csv")  # Replace with your dataset's file path

# Step 2: View the Dataset
print(data.head())

   car   brand         model damage location                    part  \
0  yes  toyota  toyota camry    yes    front            front bumper   
1  yes  toyota  toyota camry    yes    front            front bumper   
2  yes  toyota  toyota camry    yes    front            front bumper   
3  yes  toyota  toyota camry    yes     side  left and right fenders   
4  yes  toyota  toyota camry    yes     side  left and right fenders   

   severity  price  
0       low   46.0  
1  moderate   75.0  
2      high  250.0  
3       low   48.5  
4  moderate   70.0  


 Display Dataset Information

In [2]:
print("\nDataset Info:")
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   car       145 non-null    object 
 1   brand     145 non-null    object 
 2   model     145 non-null    object 
 3   damage    145 non-null    object 
 4   location  145 non-null    object 
 5   part      145 non-null    object 
 6   severity  145 non-null    object 
 7   price     145 non-null    float64
dtypes: float64(1), object(7)
memory usage: 9.2+ KB


Drop 'brand' column if it exists

In [3]:
# Drop 'brand' column if it exists
if 'brand' in data.columns:
    data.drop(columns=['brand'], inplace=True)
    print("'brand' column dropped successfully.")
else:
    print("'brand' column does not exist in the dataset.")

'brand' column dropped successfully.


Handle Outliers in Target (Price)

Outliers in the target variable can distort the performance of regression models.

 We'll use the Interquartile Range (IQR) method to identify and remove outliers in the 'price' column.

In [16]:
# Step 1: Handle Outliers in Target
q1 = data['price'].quantile(0.25)
q3 = data['price'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
data = data[(data['price'] >= lower_bound) & (data['price'] <= upper_bound)]

Log transformation is commonly used to stabilize variance and make the distribution of the target variable more normal.

One-hot encoding converts categorical variables into binary (0 or 1) columns for each unique category.

In [5]:
# Step 2: Log Transform the Target
data['price'] = np.log1p(data['price'])

# Step 3: One-Hot Encoding for Categorical Variables
data = pd.get_dummies(data, columns=['car', 'model', 'damage', 'location', 'part', 'severity'], drop_first=True)

Separate the dataset into the input features (X) and the target variable (y)

Split the data into training and testing sets

In [18]:
# Step 5: Split Data into Features and Target
X = data.drop(columns=['price'])
y = data['price']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Initialize Models with Default Parameters

In [7]:
# Step 6: Initialize Models with Default Parameters
xgb_model = XGBRegressor(random_state=42)
lgb_model = LGBMRegressor(random_state=42)
cat_model = CatBoostRegressor(silent=True, random_state=42)

# Step 7: Hyperparameter Tuning for Each Model
# Hyperparameter tuning aims to find the best combination of hyperparameters to improve model performance.
# We use RandomizedSearchCV for faster searching over a large parameter space.
# Tuning XGBoost

xgb_param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_dist,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)
xgb_random_search.fit(X_train, y_train)
xgb_best = xgb_random_search.best_estimator_
print("Best Parameters for XGBoost:", xgb_random_search.best_params_)

Best Parameters for XGBoost: {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.6}


Tuning LightGBM

In this step, we perform hyperparameter tuning for the LightGBM model using RandomizedSearchCV.

We define a parameter distribution to explore a range of values for different hyperparameters.

In [8]:
# Tuning LightGBM
lgb_param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0]
}

lgb_random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_param_dist,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)
lgb_random_search.fit(X_train, y_train)
lgb_best = lgb_random_search.best_estimator_
print("Best Parameters for LightGBM:", lgb_random_search.best_params_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 108, number of used features: 9
[LightGBM] [Info] Start training from score 5.088926
Best Parameters for LightGBM: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1}


Tuning CatBoost

In this step, we perform hyperparameter tuning for the CatBoost model using RandomizedSearchCV.

We define a parameter distribution for various hyperparameters of the CatBoost model to search over.

In [9]:
# Tuning CatBoost
cat_param_dist = {
    'iterations': [100, 200, 500],
    'depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0]
}

cat_random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=cat_param_dist,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)
cat_random_search.fit(X_train, y_train)
cat_best = cat_random_search.best_estimator_
print("Best Parameters for CatBoost:", cat_random_search.best_params_)

Best Parameters for CatBoost: {'subsample': 0.8, 'learning_rate': 0.1, 'iterations': 200, 'depth': 3}


stacking_model.fit(X_train, y_train): This trains the stacking regressor on the training data, where each base model is trained on X_train, and their predictions are used by the final estimator (Linear Regression) to make the final prediction.

In [10]:
# Step 8: Stacking Regressor with Best Models
stacking_model = StackingRegressor(
    estimators=[
        ('xgb', xgb_best),
        ('lgb', lgb_best),
        ('cat', cat_best)
    ],
    final_estimator=LinearRegression()
)

# Train Stacking Model
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 108, number of used features: 9
[LightGBM] [Info] Start training from score 5.088926
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 86, number of used features: 9
[LightGBM] [Info] Start training from score 5.118553
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 86, number of used features: 7
[LightGBM] [Info] Start training from score 

In [11]:
# Step 9: Evaluate the Model
y_pred = stacking_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nStacking Regressor Results:")
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")


Stacking Regressor Results:
Mean Squared Error: 0.08230608209057413
R-squared Score: 0.9587304535788264


In [12]:
# Step 10: Reverse Log Transformation of Predictions (if needed)
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred)

# Final Evaluation with Actual Target Scale
final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_r2 = r2_score(y_test_actual, y_pred_actual)

print("\nFinal Results on Original Scale:")
print(f"Mean Squared Error: {final_mse}")
print(f"R-squared Score: {final_r2}")


Final Results on Original Scale:
Mean Squared Error: 14412.374744670662
R-squared Score: 0.7751123031262204


In [13]:
import pickle

# Save the  model to a file
with open('Price_Prediction_model.pkl', 'wb') as file:
    pickle.dump(stacking_model, file)