In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import re

In [2]:
# Read the used-car listings, then discard the "S.No." column.
df = pd.read_csv("used_cars_data.csv")
df = df.drop(columns=["S.No."])

In [3]:
# Count the missing values in every column, largest count first.
df.isna().sum().sort_values(ascending=False)

New_Price            6247
Price                1234
Seats                  53
Engine                 46
Power                  46
Mileage                 2
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
dtype: int64

In [4]:
# Working hypothesis: the "Name" and "Location" columns are not needed for
# this study, so both are removed.
df = df.drop(columns=["Name", "Location"])

In [5]:
# "New_Price" has by far the most missing values, so the whole column is
# dropped; then the per-column missing counts are shown again.
df = df.drop("New_Price", axis="columns")
df.isna().sum().sort_values(ascending=False)

Price                1234
Seats                  53
Engine                 46
Power                  46
Mileage                 2
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
dtype: int64

In [6]:
# "Price" is the variable we aim to predict, so rows where it is missing
# cannot be used; drop exactly those rows.
df = df.dropna(axis=0, subset=["Price"])
# Only "Price" is guaranteed complete after this step; the other columns are
# re-counted here.
df.isna().sum().sort_values(ascending=False)


Seats                42
Engine               36
Power                36
Mileage               2
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Price                 0
dtype: int64

In [7]:
# Discard every row that still has a missing value in any column, then
# confirm the per-column missing counts.
df = df.dropna()
df.isna().sum().sort_values(ascending=False)

Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [8]:
cols_with_units = ["Mileage", "Engine", "Power"]

# Inspect which unit suffixes occur in each "<number> <unit>" column before the
# suffix is stripped away.
# NOTE(review): if the printed frequencies show more than one unit for a column
# (e.g. kmpl and km/kg for Mileage), the bare numbers are not on a single scale
# -- decide whether to convert or to keep the unit as a feature.
for col in cols_with_units:
    # Vectorised split on spaces; the second token is the unit. This avoids a
    # Python-level lambda call for every row of the frame.
    sep = df[col].str.split(" ").str[1]
    units = sep.unique()
    # to_dict() yields plain Python ints, so the printed mapping stays readable.
    freq = sep.value_counts().to_dict()
    print(units, freq)

['kmpl' 'km/kg'] {'kmpl': 5909, 'km/kg': 66}
['CC'] {'CC': 5975}
['bhp'] {'bhp': 5975}


In [9]:
# Extract the numeric part of the columns with units and convert to float
def split_it(value):
    """Return the first number found in ``value`` as a float.

    Intended for strings of the form "<number> <unit>" such as "18.2 kmpl".

    Parameters
    ----------
    value : str or scalar
        Raw cell content. Non-string input is converted with ``float`` when
        possible.

    Returns
    -------
    float
        The first run of digits (with an optional decimal part) in ``value``,
        or ``np.nan`` when there is none (for example "null bhp") or when a
        non-string input cannot be converted.
    """
    if not isinstance(value, str):
        # Already numeric, or a missing-value marker such as None / NaN.
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    # Digits with an optional fractional part; a stray run of dots can never be
    # captured, so float() below always receives a valid literal.
    match = re.search(r"\d+(?:\.\d+)?", value)
    if match is None:
        # No numeric content at all: treat the entry as missing instead of
        # failing on an unassigned variable.
        return np.nan
    return float(match.group())

In [10]:
# Replace every "<number> <unit>" column by its numeric part as float, echoing
# the column name once it has been converted. Entries that split_it maps to NaN
# become missing values in the converted column.
for col in cols_with_units:
    converted = df[col].map(split_it)
    df[col] = converted.astype("float")
    print(col)

Mileage
Engine
Power


In [11]:
# Check dtypes and non-null counts now that the unit columns have been
# converted to float.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5975 entries, 0 to 7250
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               5975 non-null   int64  
 1   Kilometers_Driven  5975 non-null   int64  
 2   Fuel_Type          5975 non-null   object 
 3   Transmission       5975 non-null   object 
 4   Owner_Type         5975 non-null   object 
 5   Mileage            5975 non-null   float64
 6   Engine             5975 non-null   float64
 7   Power              5872 non-null   float64
 8   Seats              5975 non-null   float64
 9   Price              5975 non-null   float64
dtypes: float64(5), int64(2), object(3)
memory usage: 513.5+ KB


In [12]:
# Model building: separate the predictors from the target column "Price".
X = df.drop(columns=["Price"])
y = df["Price"]

In [13]:
# One-hot encode the categorical predictors, dropping the first level of each
# to avoid perfectly collinear dummy columns alongside the intercept.
# The dtype is pinned to an integer type: newer pandas versions default to
# boolean dummies, and mixing bool with float columns gives the design matrix
# an object dtype that statsmodels' OLS rejects.
X = pd.get_dummies(
    X,
    columns=["Fuel_Type", "Transmission", "Owner_Type"],
    drop_first=True,
    dtype=np.uint8,
)
X.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,2012,85000,0.0,2179.0,115.0,5.0,1,0,0,0,0,1,0
1,2010,125000,0.0,2179.0,115.0,5.0,1,0,0,0,0,1,0
2,2012,119203,0.0,2179.0,115.0,5.0,1,0,0,0,0,0,0
3,2013,72000,0.0,2179.0,115.0,5.0,1,0,0,0,0,0,0
4,2013,46000,0.0,2179.0,115.0,5.0,1,0,0,0,0,1,0


In [14]:
# Impute missing "Power" values with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the X["Power"] selection is chained assignment, which pandas deprecates and
# which leaves X unchanged under copy-on-write.
# NOTE(review): the median is computed on all rows, before the train/test
# split -- confirm whether it should be derived from the training rows only.
power_median = X["Power"].median()
X["Power"] = X["Power"].fillna(power_median)

In [15]:
# Let us build a linear regression model using statsmodels.
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Add the constant (intercept) column before splitting, so the train and test
# design matrices both carry it.
X = sm.add_constant(X)
# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit ordinary least squares on the training split and print its summary.
olsmod_train = sm.OLS(y_train, X_train)
ols_train = olsmod_train.fit()
print(ols_train.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.696
Model:                            OLS   Adj. R-squared:                  0.695
Method:                 Least Squares   F-statistic:                     733.3
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        20:44:42   Log-Likelihood:                -13525.
No. Observations:                4182   AIC:                         2.708e+04
Df Residuals:                    4168   BIC:                         2.717e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

In [16]:
# Refitting OLS on the test rows produces a second, independent model: its
# summary describes the in-sample fit on those rows, not how the model trained
# above generalises. The objects are kept so that any later cell relying on
# them keeps working.
olsmod_test = sm.OLS(y_test, X_test)
ols_test = olsmod_test.fit()
print(ols_test.summary())

# Held-out evaluation: score the model fitted on the training split against
# the unseen test rows.
y_test_pred = ols_train.predict(X_test)
test_resid = y_test - y_test_pred
test_rmse = float(np.sqrt((test_resid ** 2).mean()))
test_r2 = float(1 - (test_resid ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum())
print(f"Held-out RMSE: {test_rmse:.3f} | held-out R-squared: {test_r2:.3f}")

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.716
Method:                 Least Squares   F-statistic:                     348.8
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        20:44:55   Log-Likelihood:                -5765.9
No. Observations:                1793   AIC:                         1.156e+04
Df Residuals:                    1779   BIC:                         1.164e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 