<a href="https://colab.research.google.com/github/ibshafique/mlops_with_poridhi/blob/main/prerequisite_projects/Car_Price/car_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites


Importing the related libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Importing the dataset.

In [2]:
train_url = "https://raw.githubusercontent.com/ibshafique/mlops_with_poridhi/refs/heads/main/prerequisite_projects/Car_Price/dataset/car_price.csv"
train_df = pd.read_csv(train_url)

# Data Insights

In [3]:
train_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [4]:
train_df.shape

(8128, 13)

In [5]:
train_df.isnull().sum()

Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0
mileage,221
engine,221


In [6]:
train_df.duplicated().sum()

1202

# Data Cleaning

The 'torque' values donot have much relation with the price of a car. So we are dropping this column.

In [7]:
train_df = train_df.drop(columns=['torque'])

In the previous section we saw there are some rows with missing values, so we are dropping those rows.

We are also dropping the duplicated values of rows.

In [8]:
train_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
train_df.shape

(6718, 12)

In [9]:
train_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


Now we will be extracting the values from these columns:

i. name
ii. mileage
iii. engine
iv. max_power

In [10]:
train_df['name'] = train_df['name'].str.split(' ').str[0]
train_df['mileage'] = train_df['mileage'].str.split(' ').str[0]
train_df['engine'] = train_df['engine'].str.split(' ').str[0]
train_df['max_power'] = train_df['max_power'].str.split(' ').str[0]
train_df.head()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


In [11]:
print(train_df['name'].unique())
print('')
print(train_df['fuel'].unique())
print('')
print(train_df['seller_type'].unique())
print('')
print(train_df['transmission'].unique())

['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata' 'Chevrolet' 'Datsun' 'Jeep' 'Mercedes-Benz' 'Mitsubishi' 'Audi'
 'Volkswagen' 'BMW' 'Nissan' 'Lexus' 'Jaguar' 'Land' 'MG' 'Volvo' 'Daewoo'
 'Kia' 'Fiat' 'Force' 'Ambassador' 'Ashok' 'Isuzu' 'Opel']

['Diesel' 'Petrol' 'LPG' 'CNG']

['Individual' 'Dealer' 'Trustmark Dealer']

['Manual' 'Automatic']


The columns 'name', 'fuel', 'transmission_type' and 'owner' are objects.
We will convert them to integers.

In [12]:
train_df['name'] = train_df['name'].replace({'Maruti': 1 , 'Skoda': 2, 'Honda': 3, 'Hyundai': 4, 'Toyota': 5, 'Ford': 6, 'Renault': 7,
                                             'Mahindra': 8 , 'Tata': 9 , 'Chevrolet': 10, 'Datsun': 11, 'Jeep': 12, 'Mercedes-Benz': 13,
                                             'Mitsubishi': 14, 'Audi': 15, 'Volkswagen': 16, 'BMW': 17, 'Nissan': 18, 'Lexus': 19,
                                             'Jaguar': 20, 'Land': 21, 'MG': 22, 'Volvo': 23, 'Daewoo': 24, 'Kia': 25, 'Fiat': 26, 'Force': 27,
                                             'Ambassador': 28, 'Ashok': 29, 'Isuzu': 30, 'Opel': 31})

train_df['transmission'] = train_df['transmission'].replace({'Manual': 1, 'Automatic': 2})

train_df['seller_type'] = train_df['seller_type'].replace({'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3})

train_df['fuel'] = train_df['fuel'].replace({'Diesel': 1, 'Petrol': 2, 'LPG': 3, 'CNG': 4})

train_df['owner'] = train_df['owner'].replace({'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 5})


  train_df['name'] = train_df['name'].replace({'Maruti': 1 , 'Skoda': 2, 'Honda': 3, 'Hyundai': 4, 'Toyota': 5, 'Ford': 6, 'Renault': 7,
  train_df['transmission'] = train_df['transmission'].replace({'Manual': 1, 'Automatic': 2})
  train_df['seller_type'] = train_df['seller_type'].replace({'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3})
  train_df['fuel'] = train_df['fuel'].replace({'Diesel': 1, 'Petrol': 2, 'LPG': 3, 'CNG': 4})
  train_df['owner'] = train_df['owner'].replace({'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 5})


In [13]:
train_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2014,450000,145500,1,1,1,1,23.4,1248,74.0,5.0
1,2,2014,370000,120000,1,1,1,2,21.14,1498,103.52,5.0
2,3,2006,158000,140000,2,1,1,3,17.7,1497,78.0,5.0
3,4,2010,225000,127000,1,1,1,1,23.0,1396,90.0,5.0
4,1,2007,130000,120000,2,1,1,1,16.1,1298,88.2,5.0
