In [240]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt

In [185]:
# Read the raw car listings from the CSV file into a DataFrame.
csv_path = "car_data.csv"
data = pd.read_csv(csv_path)
# Preview the first five rows (five is also head()'s default).
data.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [186]:
# Dimensions of the raw frame as a (rows, columns) tuple.
data.shape

(301, 9)

In [187]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [188]:
# Print the distinct values found in each of these columns, one array per line.
for column_name in ('Seller_Type', 'Owner', 'Transmission', 'Fuel_Type'):
    print(data[column_name].unique())

['Dealer' 'Individual']
[0 1 3]
['Manual' 'Automatic']
['Petrol' 'Diesel' 'CNG']


In [189]:
# Count missing values per column (isna() is the same method as isnull()).
data.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [190]:
# List the column labels of the raw frame.
data.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [191]:
# Keep every column except Car_Name for modelling.
# .copy() gives df its own data, so the column assignments made in later
# cells modify df directly and do not raise SettingWithCopyWarning.
df = data[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
           'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [192]:
# Reference year against which each car's age is measured.
# assign() builds a new frame rather than setting a column in place, so it is
# safe even when df is a selection taken from another frame
# (no SettingWithCopyWarning), and re-running the cell gives the same result.
df = df.assign(Current_Year=2022)
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Year
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2022
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2022
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2022
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2022
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2022


In [193]:
# Car age in years: Current_Year minus the Year column, computed row-wise.
car_age = df['Current_Year'].sub(df['Year'])
df['Number_of_Years'] = car_age
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Year,Number_of_Years
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2022,8
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2022,9
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2022,5
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2022,11
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2022,8


In [194]:
# Year and Current_Year are redundant once Number_of_Years exists, so remove both.
# columns=[...] is equivalent to passing the labels with axis=1;
# inplace=True modifies df itself (the frame held in `data` is not affected).
df.drop(columns=['Year', 'Current_Year'], inplace=True)
df.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Number_of_Years
0,3.35,5.59,27000,Petrol,Dealer,Manual,0,8
1,4.75,9.54,43000,Diesel,Dealer,Manual,0,9
2,7.25,9.85,6900,Petrol,Dealer,Manual,0,5
3,2.85,4.15,5200,Petrol,Dealer,Manual,0,11
4,4.6,6.87,42450,Diesel,Dealer,Manual,0,8


In [195]:
# One-hot encode the string-valued columns. drop_first=True omits the first
# level of each encoded column so the dummies are not perfectly collinear
# (the "dummy variable trap").
df = pd.get_dummies(data=df, drop_first=True)
df.tail(n=5)

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Number_of_Years,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
296,9.5,11.6,33988,0,6,1,0,0,1
297,4.0,5.9,60000,0,7,0,1,0,1
298,3.35,11.0,87934,0,13,0,1,0,1
299,11.5,12.5,9000,0,5,1,0,0,1
300,5.3,5.9,5464,0,6,0,1,0,1


In [196]:
# Pairwise correlation matrix of all columns (corr() defaults to Pearson).
df.corr()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Number_of_Years,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
Selling_Price,1.0,0.878983,0.029187,-0.088344,-0.236141,0.552339,-0.540571,-0.550724,-0.367128
Present_Price,0.878983,1.0,0.203647,0.008057,0.047584,0.473306,-0.465244,-0.51203,-0.348715
Kms_Driven,0.029187,0.203647,1.0,0.089216,0.524342,0.172515,-0.172874,-0.101419,-0.16251
Owner,-0.088344,0.008057,0.089216,1.0,0.182104,-0.053469,0.055687,0.124269,-0.050316
Number_of_Years,-0.236141,0.047584,0.524342,0.182104,1.0,-0.064315,0.059959,0.039896,-0.000394
Fuel_Type_Diesel,0.552339,0.473306,0.172515,-0.053469,-0.064315,1.0,-0.979648,-0.350467,-0.098643
Fuel_Type_Petrol,-0.540571,-0.465244,-0.172874,0.055687,0.059959,-0.979648,1.0,0.358321,0.091013
Seller_Type_Individual,-0.550724,-0.51203,-0.101419,0.124269,0.039896,-0.350467,0.358321,1.0,0.06324
Transmission_Manual,-0.367128,-0.348715,-0.16251,-0.050316,-0.000394,-0.098643,0.091013,0.06324,1.0


In [225]:
# Select the target by name rather than by position, so the split stays
# correct even if the column order of df changes.
X = df.drop(columns=['Selling_Price'])  # independent features (all other columns)
y = df['Selling_Price']                 # dependent variable (target)

In [238]:
# Hold out 25% of the rows for testing and train on the remaining 75%;
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Fit a linear regression on the training portion.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(x_test)

# Put actual and predicted values side by side (indexed like y_test).
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1

Unnamed: 0,Actual,Predicted
223,8.25,7.468068
150,0.50,-0.310058
226,5.25,4.355680
296,9.50,8.878597
52,18.00,13.889428
...,...,...
46,2.65,3.080367
158,0.48,1.739862
230,6.15,6.830769
179,0.31,-1.714758


In [241]:
# R^2 (coefficient of determination) of the predictions on the test set;
# 1.0 would be a perfect fit.
score = r2_score(y_true=y_test, y_pred=y_pred)
score

0.8639362504421002