Importing Libraries

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

Data Collection and Processing

In [3]:
# Read the car listings from the CSV file into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='car data.csv')

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Dimensions of the dataframe as a (rows, columns) tuple.
df.shape

(301, 9)

In [6]:
# Print a per-column summary (non-null counts and dtypes) plus memory usage,
# which shows which columns are non-numeric and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [7]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [9]:
# Show how many rows fall into each category of the three text-valued columns.
for column in ('Fuel_Type', 'Seller_Type', 'Transmission'):
    print(df[column].value_counts())

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64
Dealer        195
Individual    106
Name: Seller_Type, dtype: int64
Manual       261
Automatic     40
Name: Transmission, dtype: int64


Encoding the Categorical Data

In [10]:
# Replace each text-valued categorical column with integer codes.
# LabelEncoder numbers the categories in sorted order of their labels
# (e.g. for Fuel_Type: CNG -> 0, Diesel -> 1, Petrol -> 2).
#
# A separate fitted encoder is kept per column. Refitting one shared encoder
# overwrites its classes_ on every call, which would leave no way to decode
# the Fuel_Type / Seller_Type codes or to encode new rows consistently.
#
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# one-hot encoding may suit a linear model better -- confirm before changing,
# since it would alter the feature columns used downstream.
categorical_columns = ['Fuel_Type', 'Seller_Type', 'Transmission']
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# `le` is kept for backward compatibility; as before, it ends up fitted on
# the last column encoded (Transmission).
le = encoders['Transmission']

In [11]:
# Preview the frame again to confirm the categorical columns now hold integer codes.
df.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,2,0,1,0
1,sx4,2013,4.75,9.54,43000,1,0,1,0
2,ciaz,2017,7.25,9.85,6900,2,0,1,0
3,wagon r,2011,2.85,4.15,5200,2,0,1,0
4,swift,2014,4.6,6.87,42450,1,0,1,0


Splitting the Features and Target

In [12]:
# Target: the price each car sold for.
y = df['Selling_Price']
# Features: every other column except the car name.
x = df.drop(columns=['Car_Name', 'Selling_Price'])

Splitting Training and Test data

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

Model Training

1. Linear Regression

In [14]:
# Create a new, unfitted LinearRegression estimator with default settings
# (nothing is loaded from disk; training happens in the next cell).
lrg = LinearRegression()

In [15]:
# Fit the linear model on the training split.
lrg.fit(X=x_train, y=y_train)

LinearRegression()

In [16]:
# For a regressor, .score() returns the coefficient of determination (R^2).
# Comparing the two values shows how well the fit generalises to unseen rows.
train_r2 = lrg.score(x_train, y_train)
test_r2 = lrg.score(x_test, y_test)
print("Score of training data :", train_r2)
print("Score of test data :", test_r2)

Score of training data : 0.8863492418513185
Score of test data : 0.8191909743727058


Model Evaluation

In [17]:
# Predicted selling prices for the rows the model was trained on.
y_pred_tr = lrg.predict(X=x_train)

In [18]:
# Coefficient of determination (R^2) on the training set.
# Note: despite the "Error" wording in the printed label, R^2 is a
# goodness-of-fit score -- higher is better, with 1.0 being a perfect fit.
error_score = metrics.r2_score(y_true=y_train, y_pred=y_pred_tr)
print("R squared Error : ", error_score)

R squared Error :  0.8863492418513185


Evaluating the model on Test data

In [19]:
# Predicted selling prices for the held-out test rows.
y_pred_ts = lrg.predict(X=x_test)

In [20]:
# Coefficient of determination (R^2) on the held-out test set.
# Note: despite the "Error" wording in the printed label, R^2 is a
# goodness-of-fit score -- higher is better, with 1.0 being a perfect fit.
error_score = metrics.r2_score(y_true=y_test, y_pred=y_pred_ts)
print("R squared Error : ", error_score)

R squared Error :  0.8191909743727058
