In [296]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from statsmodels import api as sm
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Link to data set: https://www.kaggle.com/datasets/asinow/car-price-dataset

# Preprocessing

In [297]:
data = pd.read_csv("car_price_dataset.csv")

In [298]:
data.shape

(10000, 10)

In [299]:
data.head()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [300]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB


#### Step 1: Check for missing values

In [301]:
print("\nMissing values in the dataset:")
data.isnull().sum()


Missing values in the dataset:


Brand           0
Model           0
Year            0
Engine_Size     0
Fuel_Type       0
Transmission    0
Mileage         0
Doors           0
Owner_Count     0
Price           0
dtype: int64

#### Step 2: Handling missing values

Data contains No missing Value. No need to handle missing values.

#### Step 3 Setting up for Encoding Categorical Columns.


In [302]:
categoricals = data.select_dtypes(include=['object']).columns.tolist()
categoricals

['Brand', 'Model', 'Fuel_Type', 'Transmission']

All the categorical columns are Nominal.

In [303]:
for each in categoricals:
    print(each, len(data[each].unique().tolist()))

Brand 10
Model 30
Fuel_Type 4
Transmission 3


Grouping rare categories to avoid overfitting and reduce cardinality.

In [304]:
threshold = 25
data[categoricals] = data[categoricals].apply(
    lambda each: each.where(each.isin(each.value_counts().nlargest(threshold).index), "Other"))

In [305]:
for each in categoricals:
    print(each, len(data[each].unique().tolist()))

Brand 10
Model 26
Fuel_Type 4
Transmission 3


In [306]:
data.head()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,Other,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


#### Step 4: Applying One-Hot Encoding to nominal columns

In [307]:
data = pd.get_dummies(data, columns=categoricals, drop_first=True)

Dropped the first column to avoid  <h5>Dummy Variable Trap</h5>

In [308]:
X = data.drop(columns=['Price']) # All columns except Target column
y = data['Price'] # Target column

#### Step 5: Split data into train and test sets

In [309]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Step 6: Standardize numerical features

In [310]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# Applying  Multiple Linear Regression

In [311]:
Results = {}

In [312]:
cols =  data.columns.tolist()
cols

['Year',
 'Engine_Size',
 'Mileage',
 'Doors',
 'Owner_Count',
 'Price',
 'Brand_BMW',
 'Brand_Chevrolet',
 'Brand_Ford',
 'Brand_Honda',
 'Brand_Hyundai',
 'Brand_Kia',
 'Brand_Mercedes',
 'Brand_Toyota',
 'Brand_Volkswagen',
 'Model_5 Series',
 'Model_A3',
 'Model_A4',
 'Model_Accord',
 'Model_Camry',
 'Model_Civic',
 'Model_Corolla',
 'Model_E-Class',
 'Model_Elantra',
 'Model_Equinox',
 'Model_Explorer',
 'Model_Fiesta',
 'Model_Focus',
 'Model_Golf',
 'Model_Impala',
 'Model_Malibu',
 'Model_Optima',
 'Model_Other',
 'Model_Passat',
 'Model_Q5',
 'Model_RAV4',
 'Model_Rio',
 'Model_Sonata',
 'Model_Tiguan',
 'Model_Tucson',
 'Fuel_Type_Electric',
 'Fuel_Type_Hybrid',
 'Fuel_Type_Petrol',
 'Transmission_Manual',
 'Transmission_Semi-Automatic']

### All Variables (No Feature Selection)

In [313]:
regressor =  LinearRegression()

In [314]:
regressor.fit(X_train, y_train)

In [315]:
y_pred = regressor.predict(X_test)

In [316]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
Results['All Variables'] = [mse, r2]
mse, r2

(4213.587770470056, 0.9995413937681955)

In [317]:
X_train

array([[-0.6553775 ,  0.17613484, -1.33393191, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [-0.22042852,  1.56362082,  0.87031347, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [-0.80036049, -0.95119752, -1.57078105, ..., -0.56945944,
        -0.71514348,  1.42207221],
       ...,
       [-0.80036049,  1.3034672 , -1.10112397, ...,  1.7560513 ,
        -0.71514348,  1.42207221],
       [ 0.64946944,  1.21674933,  1.33525966, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [ 1.3743844 , -1.12463327,  0.76531146, ..., -0.56945944,
        -0.71514348,  1.42207221]])

In [318]:
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
cols.remove('Price')
cols.insert(0, 'intercept')

In [319]:
X_train_const

array([[ 1.        , -0.6553775 ,  0.17613484, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [ 1.        , -0.22042852,  1.56362082, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [ 1.        , -0.80036049, -0.95119752, ..., -0.56945944,
        -0.71514348,  1.42207221],
       ...,
       [ 1.        , -0.80036049,  1.3034672 , ...,  1.7560513 ,
        -0.71514348,  1.42207221],
       [ 1.        ,  0.64946944,  1.21674933, ..., -0.56945944,
        -0.71514348,  1.42207221],
       [ 1.        ,  1.3743844 , -1.12463327, ..., -0.56945944,
        -0.71514348,  1.42207221]])

In [320]:
model_all = sm.OLS(y_train, X_train_const).fit()
y_pred_all = model_all.predict(X_test_const)

In [321]:
mse_all = mean_squared_error(y_test, y_pred_all)
r2_all = r2_score(y_test, y_pred_all)
model_all.summary(), mse_all, r2_all

(<class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:                  Price   R-squared:                       0.999
 Model:                            OLS   Adj. R-squared:                  0.999
 Method:                 Least Squares   F-statistic:                 2.529e+05
 Date:                Sun, 09 Mar 2025   Prob (F-statistic):               0.00
 Time:                        21:59:26   Log-Likelihood:                -47360.
 No. Observations:                8000   AIC:                         9.480e+04
 Df Residuals:                    7961   BIC:                         9.507e+04
 Df Model:                          38                                         
 Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
 ---------------------------------------------------------------------

### Backward Elimination

Iteratively remove features with p-value > 0.05

In [322]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

X_opt = X_train_const.copy()

while True:
    model = sm.OLS(y_train, X_opt).fit()
    p_values = model.pvalues

    max_p_value = p_values.max()
    if max_p_value > 0.05:  
        feature_to_remove = p_values.idxmax()
        X_opt.drop(columns=[feature_to_remove], inplace=True)
    else:
        break  


In [323]:
selected_indices =  X_opt.columns.tolist()
selected_indices = selected_indices[1:]
selected_columns = [cols[each] for each in selected_indices]
print('Selected Columns from backward elimination:', selected_columns)

Selected Columns from backward elimination: ['intercept', 'Year', 'Engine_Size', 'Model_Camry', 'Model_Optima', 'Model_Tucson', 'Fuel_Type_Electric', 'Fuel_Type_Petrol', 'Transmission_Manual']


In [324]:
model_backward = sm.OLS(y_train, X_opt).fit()
y_pred_backward = model_backward.predict(X_test_const[X_opt.columns]) 

mse = mean_squared_error(y_test, y_pred_backward)
r2  = r2_score(y_test, y_pred_backward)
Results['Backward Elimination'] = [mse, r2] 

In [325]:
print(model_backward.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.069e+06
Date:                Sun, 09 Mar 2025   Prob (F-statistic):               0.00
Time:                        21:59:29   Log-Likelihood:                -47368.
No. Observations:                8000   AIC:                         9.476e+04
Df Residuals:                    7990   BIC:                         9.483e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       8828.7453      1.009   8748.344      0.0

### Forward Selection

In [326]:
from sklearn.feature_selection import f_regression
selected_features = ['const']
remaining_features = list(X_train.columns)
best_score = 0
while remaining_features:
    scores = {}
    for feature in remaining_features:
        temp_features = selected_features + [feature]
        model = sm.OLS(y_train, X_train_const[temp_features]).fit()
        scores[feature] = model.rsquared
    best_feature = max(scores, key=scores.get)
    if scores[best_feature] > best_score:
        best_score = scores[best_feature]
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
    else:
        break

X_train_forward = X_train_const[selected_features]
X_test_forward = X_test_const[selected_features]


In [327]:
model_forward = sm.OLS(y_train, X_train_forward).fit()
y_pred_forward = model_forward.predict(X_test_forward)

mse_forward = mean_squared_error(y_test, y_pred_forward)
r2_forward = r2_score(y_test, y_pred_forward)

In [328]:
Results['Forward Selection'] = [mse_forward, r2_forward]

In [329]:
selected_features  = selected_features[1:]
Selected_Columns = [cols[int(each)] for each in selected_features]
print('Selected Columns from forward selection:', Selected_Columns)

Selected Columns from forward selection: ['intercept', 'Engine_Size', 'Year', 'Model_Tucson', 'Fuel_Type_Electric', 'Fuel_Type_Petrol', 'Transmission_Manual', 'Model_Camry', 'Model_Optima', 'Brand_Hyundai', 'Model_Q5', 'Model_A4', 'Brand_Chevrolet', 'Brand_Kia', 'Mileage', 'Model_Tiguan', 'Brand_Volkswagen', 'Model_Malibu', 'Model_RAV4', 'Model_Accord', 'Model_A3', 'Model_Equinox', 'Owner_Count', 'Model_Elantra', 'Model_Explorer', 'Model_5 Series', 'Doors', 'Model_E-Class', 'Brand_Ford', 'Model_Corolla', 'Model_Focus', 'Brand_Toyota', 'Model_Impala', 'Model_Passat', 'Brand_BMW', 'Fuel_Type_Hybrid', 'Model_Other', 'Brand_Honda']


In [330]:
model_forward.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.999
Method:,Least Squares,F-statistic:,252900.0
Date:,"Sun, 09 Mar 2025",Prob (F-statistic):,0.0
Time:,22:00:00,Log-Likelihood:,-47360.0
No. Observations:,8000,AIC:,94800.0
Df Residuals:,7961,BIC:,95070.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8828.7452,1.010,8741.789,0.000,8826.765,8830.725
0,2058.8882,1.012,2034.781,0.000,2056.905,2060.872
2,-1721.8599,1.013,-1700.247,0.000,-1723.845,-1719.875
1,1144.3550,1.012,1130.306,0.000,1142.370,1146.340
39,870.7602,1.240,702.113,0.000,868.329,873.191
40,424.8263,1.234,344.208,0.000,422.407,427.246
42,-704.7733,1.170,-602.251,0.000,-707.067,-702.479
43,-702.0938,1.172,-599.138,0.000,-704.391,-699.797
19,3.5251,2.003,1.760,0.078,-0.401,7.451

0,1,2,3
Omnibus:,15469.448,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26555788.503
Skew:,15.341,Prob(JB):,0.0
Kurtosis:,283.581,Cond. No.,10.1


### Bidirectional Elimination

In [333]:
def bidirectional_elimination(X=X_train_const, y=y_train, significance_level=0.05):
    selected_features = []
    remaining_features = list(X.columns)
    remaining_features.remove('const')  # Exclude intercept from selection

    best_score = 0  # Track the best model score

    while remaining_features:
        # Forward Selection: Add the most significant feature
        best_feature = None
        best_pval = float('inf')
        
        for feature in remaining_features:
            temp_features = selected_features + [feature]
            X_temp = sm.add_constant(X[temp_features])
            model = sm.OLS(y, X_temp).fit()
            pval = model.pvalues[feature]

            if pval < best_pval:
                best_pval = pval
                best_feature = feature
                
        if best_pval < significance_level:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            
            # Backward Elimination: Remove insignificant features while ensuring score improvement
            while len(selected_features) > 0:
                X_temp = sm.add_constant(X[selected_features])
                model = sm.OLS(y, X_temp).fit()
                pvalues = model.pvalues.drop('const')
                
                worst_pval_feature = pvalues.idxmax()
                worst_pval = pvalues.max()
                
                if worst_pval > significance_level and model.rsquared_adj > best_score: #Check if removal improves score
                    selected_features.remove(worst_pval_feature)
                    best_score = model.rsquared_adj  # Update best score
                else:
                    break
        else:
            break

    return selected_features

In [334]:
selected_indices = bidirectional_elimination()
selected_columns = [cols[int(each)] for each in selected_indices]
print("Selected Features:", selected_columns)

Selected Features: ['intercept', 'Year', 'Engine_Size', 'Model_Tucson', 'Fuel_Type_Electric', 'Fuel_Type_Petrol', 'Transmission_Manual', 'Model_Camry', 'Model_Optima']


In [335]:
X_train_bidirectional = X_train_const[selected_indices]
X_test_bidirectional = X_test_const[selected_indices]

In [336]:
model_bidirectional = sm.OLS(y_train, X_train_bidirectional).fit()
y_pred_bidirectional = model_bidirectional.predict(X_test_bidirectional)


In [337]:
mse_bidirectional = mean_squared_error(y_test, y_pred_bidirectional)
r2_bidirectional = r2_score(y_test, y_pred_bidirectional)
Results['Bidirectional Elimination'] = [mse_bidirectional, r2_bidirectional]

In [338]:
model_bidirectional.summary()

0,1,2,3
Dep. Variable:,Price,R-squared (uncentered):,0.112
Model:,OLS,Adj. R-squared (uncentered):,0.111
Method:,Least Squares,F-statistic:,111.6
Date:,"Sun, 09 Mar 2025",Prob (F-statistic):,5.13e-198
Time:,22:02:06,Log-Likelihood:,-84038.0
No. Observations:,8000,AIC:,168100.0
Df Residuals:,7991,BIC:,168200.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
0.0,2058.8663,98.851,20.828,0.000,1865.093,2252.640
1.0,1144.3533,98.874,11.574,0.000,950.534,1338.172
2.0,-1721.8677,98.887,-17.412,0.000,-1915.712,-1528.023
39.0,870.6904,104.905,8.300,0.000,665.049,1076.332
40.0,424.7570,104.928,4.048,0.000,219.072,630.442
42.0,-704.7793,114.306,-6.166,0.000,-928.848,-480.711
43.0,-702.0780,114.397,-6.137,0.000,-926.327,-477.829
19.0,2.7806,99.179,0.028,0.978,-191.636,197.197
31.0,2.3320,99.170,0.024,0.981,-192.067,196.731

0,1,2,3
Omnibus:,15488.124,Durbin-Watson:,0.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26712196.783
Skew:,15.382,Prob(JB):,0.0
Kurtosis:,284.407,Cond. No.,1.75


#  Results

In [339]:
for i,j in Results.items():
    print(f'{i}: MSE: {j[0]} R2: {j[1]}')

All Variables: MSE: 4213.587770470056 R2: 0.9995413937681955
Backward Elimination: MSE: 4177.848329952681 R2: 0.9995452836432936
Forward Selection: MSE: 4213.587770469663 R2: 0.9995413937681955
Bidirectional Elimination: MSE: 77911807.57864165 R2: -7.4799089120990665


## Key Findings:
- The models using all variables, backward elimination, and forward selection perform exceptionally well, with R² values close to 1, indicating that the models explain almost all the variance in the target variable.

- The bidirectional elimination approach did not perform well, suggesting that this method may not be appropriate for this dataset or that the implementation needs further refinement.