In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score   

In [3]:
# Load the car price dataset from a CSV file located relative to the working directory
df = pd.read_csv('Car_Price_Prediction.csv')

In [4]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006,4.1,98385,Electric,Manual,25760.290347
3,Honda,Model B,2015,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004,3.4,138482,Petrol,Automatic,21021.386657


In [5]:
# Preview the last rows of the raw data
df.tail()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
995,Toyota,Model D,2002,1.9,5445,Petrol,Manual,22765.597091
996,Honda,Model B,2020,3.1,149112,Diesel,Manual,30392.575567
997,Ford,Model C,2008,1.9,195387,Petrol,Automatic,16446.892292
998,Toyota,Model A,2003,4.4,246,Petrol,Automatic,27396.156708
999,Audi,Model C,2020,2.1,31153,Diesel,Automatic,31620.864907


In [6]:
# (rows, columns) of the dataset
df.shape

(1000, 8)

In [7]:
# Column labels of the dataset
df.columns

Index(['Make', 'Model', 'Year', 'Engine Size', 'Mileage', 'Fuel Type',
       'Transmission', 'Price'],
      dtype='object')

In [8]:
# Data type of each column
df.dtypes

Make             object
Model            object
Year              int64
Engine Size     float64
Mileage           int64
Fuel Type        object
Transmission     object
Price           float64
dtype: object

In [9]:
# Per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Make          1000 non-null   object 
 1   Model         1000 non-null   object 
 2   Year          1000 non-null   int64  
 3   Engine Size   1000 non-null   float64
 4   Mileage       1000 non-null   int64  
 5   Fuel Type     1000 non-null   object 
 6   Transmission  1000 non-null   object 
 7   Price         1000 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB


In [10]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,Year,Engine Size,Mileage,Price
count,1000.0,1000.0,1000.0,1000.0
mean,2010.688,2.7983,97192.487,25136.61553
std,6.288577,1.024137,59447.31576,5181.401368
min,2000.0,1.0,56.0,6704.953524
25%,2005.0,1.9,44768.75,21587.87837
50%,2011.0,2.8,94411.5,25189.325247
75%,2016.0,3.7,148977.75,28806.368974
max,2021.0,4.5,199867.0,41780.504635


In [11]:
# Number of distinct values in each column
df.nunique()

Make               5
Model              5
Year              22
Engine Size       36
Mileage          997
Fuel Type          3
Transmission       2
Price           1000
dtype: int64

In [12]:
# Distinct values present in the Make column
df['Make'].unique()

array(['Honda', 'Ford', 'BMW', 'Audi', 'Toyota'], dtype=object)

In [13]:
# Number of rows for each Make
df['Make'].value_counts()

Make
Ford      225
Audi      212
Honda     198
Toyota    187
BMW       178
Name: count, dtype: int64

In [14]:
# One-hot encode Fuel Type into boolean indicator columns:
# Fuel Type_Diesel, Fuel Type_Electric, Fuel Type_Petrol.
# drop_first=False keeps all three levels, so the indicators always sum to 1
# (unlike the Make/Model encoding further down, which drops a reference level).
df = pd.get_dummies(data=df, columns=["Fuel Type"], drop_first=False)

In [15]:
# Inspect the frame after one-hot encoding Fuel Type
df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Transmission,Price,Fuel Type_Diesel,Fuel Type_Electric,Fuel Type_Petrol
0,Honda,Model B,2015,3.9,74176,Manual,30246.207931,False,False,True
1,Ford,Model C,2014,1.7,94799,Automatic,22785.747684,False,True,False
2,BMW,Model B,2006,4.1,98385,Manual,25760.290347,False,True,False
3,Honda,Model B,2015,2.6,88919,Automatic,25638.003491,False,True,False
4,Honda,Model C,2004,3.4,138482,Automatic,21021.386657,False,False,True


In [16]:
# Binary-encode Transmission: 1 when the value is "Automatic", 0 otherwise.
# The new Transmission_Auto column is appended and the original text column removed.
df = (
    df.assign(Transmission_Auto=df["Transmission"].eq("Automatic").astype(int))
      .drop(columns="Transmission")
)

df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Price,Fuel Type_Diesel,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Auto
0,Honda,Model B,2015,3.9,74176,30246.207931,False,False,True,0
1,Ford,Model C,2014,1.7,94799,22785.747684,False,True,False,1
2,BMW,Model B,2006,4.1,98385,25760.290347,False,True,False,0
3,Honda,Model B,2015,2.6,88919,25638.003491,False,True,False,1
4,Honda,Model C,2004,3.4,138482,21021.386657,False,False,True,1


In [17]:
# One-hot encode Make and Model.
# drop_first=True omits the first level of each column (Audi and Model A),
# which then serves as the reference category.
df = pd.get_dummies(data=df, columns=["Make", "Model"], drop_first=True)

In [18]:
# Inspect the frame after all categorical columns have been encoded
df.head()

Unnamed: 0,Year,Engine Size,Mileage,Price,Fuel Type_Diesel,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Auto,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model B,Model_Model C,Model_Model D,Model_Model E
0,2015,3.9,74176,30246.207931,False,False,True,0,False,False,True,False,True,False,False,False
1,2014,1.7,94799,22785.747684,False,True,False,1,False,True,False,False,False,True,False,False
2,2006,4.1,98385,25760.290347,False,True,False,0,True,False,False,False,True,False,False,False
3,2015,2.6,88919,25638.003491,False,True,False,1,False,False,True,False,True,False,False,False
4,2004,3.4,138482,21021.386657,False,False,True,1,False,False,True,False,False,True,False,False


In [19]:
# Separate the target (Price) from the predictor columns
y = df["Price"]
X = df.drop(columns=["Price"])

In [None]:
# Train-test split: hold out 20% of the rows for evaluation.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)

In [21]:
# Standardize every feature column (numeric and dummy alike) so that the
# linear models weigh them on a comparable scale.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Linear Regression: ordinary least squares on the standardized features
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Test-set metrics. RMSE is not reported here: the earlier
# mean_squared_error(..., squared=False) call was left disabled.
print("Linear Regression Results")
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=y_pred_lr))
print("R²  :", r2_score(y_true=y_test, y_pred=y_pred_lr))


Linear Regression Results
MAE : 1640.477742191896
R²  : 0.8408448457658892


In [23]:
# Multiple Linear Regression
# Same estimator and same standardized feature matrix as the Linear Regression
# cell, so the fitted model and its metrics are identical to that cell's.
mlr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_mlr = mlr.predict(X_test_scaled)

# Test-set metrics (RMSE is not reported; its original call was left disabled).
print("Multiple Linear Regression Results")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_mlr))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_mlr))


Multiple Linear Regression Results
MAE: 1640.477742191896
R²: 0.8408448457658892


In [24]:
# Polynomial Regression (degree 2)
# Expands the standardized features with squared and pairwise-interaction
# terms (no constant column), then fits ordinary least squares on the
# expanded matrix to capture non-linear relationships.

poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

poly_reg = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)

# Test-set metrics (RMSE is not reported; its original call was left disabled).
print("Polynomial Regression Results")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_poly))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_poly))


Polynomial Regression Results
MAE: 1819.1203570011398
R²: 0.8016851554474702


In [25]:
# Ridge Regression
# Linear model with an L2 penalty (alpha=1.0) that shrinks coefficients
# to reduce overfitting.

ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)

# Test-set metrics (RMSE is not reported; its original call was left disabled).
print("Ridge Regression Results")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_ridge))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_ridge))


Ridge Regression Results
MAE: 1639.6090212902002
R²: 0.8409488124529934


In [26]:
# Lasso Regression
# Linear model with an L1 penalty (alpha=0.01), which can drive some
# coefficients exactly to zero. max_iter is raised to 10000 iterations.

lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)

# Test-set metrics (RMSE is not reported; its original call was left disabled).
print("Lasso Regression Results")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_lasso))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_lasso))


Lasso Regression Results
MAE: 1640.470012727073
R²: 0.8408462415530787


In [27]:
# ElasticNet Regression
# Linear model mixing L1 and L2 penalties; l1_ratio=0.5 weights them equally.

enet = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000).fit(X_train_scaled, y_train)
y_pred_enet = enet.predict(X_test_scaled)

# Test-set metrics (RMSE is not reported; its original call was left disabled).
print("ElasticNet Regression Results")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_enet))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_enet))


ElasticNet Regression Results
MAE: 1637.046172431581
R²: 0.8412402488074994


In [28]:
# Collect the test-set metrics of every fitted model in one dictionary,
# keyed by model name. Each entry holds MAE and R2 computed against y_test.
# RMSE is not included: its mean_squared_error(..., squared=False) calls
# were left disabled in this notebook.

results = {
    model_name: {
        "MAE": mean_absolute_error(y_test, predicted),
        "R2": r2_score(y_test, predicted),
    }
    for model_name, predicted in [
        ("Linear Regression", y_pred_lr),
        ("Multiple Linear Regression", y_pred_mlr),
        ("Polynomial Regression", y_pred_poly),
        ("Ridge Regression", y_pred_ridge),
        ("Lasso Regression", y_pred_lasso),
        ("ElasticNet Regression", y_pred_enet),
    ]
}

# Transpose so that each model is a row and each metric a column
results_df = pd.DataFrame(results).T
print(results_df)


                                    MAE        R2
Linear Regression           1640.477742  0.840845
Multiple Linear Regression  1640.477742  0.840845
Polynomial Regression       1819.120357  0.801685
Ridge Regression            1639.609021  0.840949
Lasso Regression            1640.470013  0.840846
ElasticNet Regression       1637.046172  0.841240
