In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df = pd.read_csv(r"used_cars_data.csv")

In [3]:
df.drop_duplicates(inplace=True)

In [4]:
df.isna().sum()

Location                0
Year                    0
Kilometers_Driven       1
Fuel_Type               0
Transmission            0
Owner_Type              0
Seats                  53
New_Price               0
Price                1232
mileage_num            83
engine_num             46
power_num             175
Brand                   0
Model                   0
dtype: int64

In [5]:
df["Seats"] = df.groupby(["Brand", "Model"])["Seats"].transform(lambda x: x.fillna(x.median()))

In [7]:
df[df["Seats"].isna()]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,New_Price,Price,mileage_num,engine_num,power_num,Brand,Model
2369,Chennai,2008,56000.0,Petrol,Manual,Second,,7.88,1.5,19.5,1061.0,,maruti,estilo
3882,Kolkata,2010,40000.0,Petrol,Manual,Second,,7.88,2.5,19.5,1061.0,,maruti,estilo
5893,Chennai,2008,51000.0,Petrol,Manual,Second,,7.88,1.75,19.5,1061.0,,maruti,estilo


In [8]:
df["Seats"] = df["Seats"].fillna(5.0) # or median

In [11]:
df.isna().sum()

Location                0
Year                    0
Kilometers_Driven       1
Fuel_Type               0
Transmission            0
Owner_Type              0
Seats                   0
New_Price               0
Price                1232
mileage_num            83
engine_num             46
power_num             175
Brand                   0
Model                   0
dtype: int64

In [13]:
col_list = ["mileage_num", "engine_num", "power_num", "Kilometers_Driven"]

for col in col_list:
    df[col] = df[col].fillna(df[col].median())

df.isna().sum()

Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Seats                   0
New_Price               0
Price                1232
mileage_num             0
engine_num              0
power_num               0
Brand                   0
Model                   0
dtype: int64

In [14]:
df = df[df["Price"].notna()].copy()

df.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Seats                0
New_Price            0
Price                0
mileage_num          0
engine_num           0
power_num            0
Brand                0
Model                0
dtype: int64

# Data Preparation

In [17]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,New_Price,Price,mileage_num,engine_num,power_num,Brand,Model
0,Mumbai,2010,72000.0,CNG,Manual,First,5.0,5.51,1.75,26.6,998.0,58.16,maruti,wagon
1,Pune,2015,41000.0,Diesel,Manual,First,5.0,16.06,12.5,19.67,1582.0,126.2,hyundai,creta
2,Chennai,2011,46000.0,Petrol,Manual,First,5.0,8.61,4.5,18.2,1199.0,88.7,honda,jazz
3,Chennai,2012,87000.0,Diesel,Manual,First,7.0,11.27,6.0,20.77,1248.0,88.76,maruti,ertiga
4,Coimbatore,2013,40670.0,Diesel,Automatic,Second,5.0,53.14,17.74,15.2,1968.0,140.8,audi,a4


In [16]:
x = df.drop(["Price"], axis=1)
y = df["Price"]

In [18]:
# one hot encoding
x = pd.get_dummies(x, columns=x.select_dtypes(include=["object", "category"]).columns.tolist(), drop_first=True)

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  x = pd.get_dummies(x, columns=x.select_dtypes(include=["object", "category"]).columns.tolist(), drop_first=True)


In [19]:
x.head()

Unnamed: 0,Year,Kilometers_Driven,Seats,New_Price,mileage_num,engine_num,power_num,Location_Bangalore,Location_Chennai,Location_Coimbatore,...,Model_xenon,Model_xf,Model_xj,Model_xuv300,Model_xuv500,Model_xylo,Model_yeti,Model_z4,Model_zen,Model_zest
0,2010,72000.0,5.0,5.51,26.6,998.0,58.16,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2015,41000.0,5.0,16.06,19.67,1582.0,126.2,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2011,46000.0,5.0,8.61,18.2,1199.0,88.7,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,2012,87000.0,7.0,11.27,20.77,1248.0,88.76,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2013,40670.0,5.0,53.14,15.2,1968.0,140.8,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [20]:
x = x.astype(float)

In [21]:
x.head()

Unnamed: 0,Year,Kilometers_Driven,Seats,New_Price,mileage_num,engine_num,power_num,Location_Bangalore,Location_Chennai,Location_Coimbatore,...,Model_xenon,Model_xf,Model_xj,Model_xuv300,Model_xuv500,Model_xylo,Model_yeti,Model_z4,Model_zen,Model_zest
0,2010.0,72000.0,5.0,5.51,26.6,998.0,58.16,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015.0,41000.0,5.0,16.06,19.67,1582.0,126.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011.0,46000.0,5.0,8.61,18.2,1199.0,88.7,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2012.0,87000.0,7.0,11.27,20.77,1248.0,88.76,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013.0,40670.0,5.0,53.14,15.2,1968.0,140.8,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

print(f"train size: {x_train.shape}")
print(f"test size: {x_test.shape}")

train size: (4212, 264)
test size: (1806, 264)


In [23]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [25]:
# metrics
from sklearn.metrics import root_mean_squared_error, r2_score
y_train_pred = lin_reg.predict(x_train)
y_test_prod = lin_reg.predict(x_test)

print(f"Train rmse: {root_mean_squared_error(y_train, y_train_pred)}")
print(f"Test rmse: {root_mean_squared_error(y_test, y_test_prod)}")

Train rmse: 4.072542045829786
Test rmse: 5.259614722268179
