## Import Python Libraries 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression  
from sklearn.linear_model import Lasso
from sklearn import metrics   


## Read Dataset

In [2]:
df = pd.read_csv('CAR DETAILS.csv')

df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
df.shape

(4340, 8)

In [4]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [6]:
# check Missing value present in dataset 

df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [8]:
# check Duplicates prsent in dataset 

df.duplicated().sum()

763

In [9]:
# Handling of duplicates 

df.drop_duplicates(keep='first') 

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [10]:
df.shape

(4340, 8)

In [11]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


## Encoding catgorical data 

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

### Encoding 'fuel type' column

In [13]:
df['fuel'].value_counts()

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [14]:
df['fuel'] = encoder.fit_transform(df['fuel'])

In [15]:
df['fuel'].value_counts()

fuel
1    2153
4    2123
0      40
3      23
2       1
Name: count, dtype: int64

### Encoding 'Seller_type' column 

In [16]:
df['seller_type'].value_counts()

seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64

In [17]:
df['seller_type'] = encoder.fit_transform(df['seller_type'])

In [18]:
df['seller_type'].value_counts()

seller_type
1    3244
0     994
2     102
Name: count, dtype: int64

### Encoding of transmission column

In [19]:
df['transmission'].value_counts()

transmission
Manual       3892
Automatic     448
Name: count, dtype: int64

In [20]:
df['transmission'] = encoder.fit_transform(df['transmission'])

In [21]:
df['transmission'].value_counts()

transmission
1    3892
0     448
Name: count, dtype: int64

### Encoding of 'owner' column 

In [22]:
df['owner'].value_counts()

owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

In [23]:
df['owner'] = encoder.fit_transform(df['owner'])

In [24]:
df['owner'].value_counts()

owner
0    2832
2    1106
4     304
1      81
3      17
Name: count, dtype: int64

In [25]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,4,1,1,0
1,Maruti Wagon R LXI Minor,2007,135000,50000,4,1,1,0
2,Hyundai Verna 1.6 SX,2012,600000,100000,1,1,1,0
3,Datsun RediGO T Option,2017,250000,46000,4,1,1,0
4,Honda Amaze VX i-DTEC,2014,450000,141000,1,1,1,2


In [26]:
sns.heatmap(df.corr(),annot=True)

ValueError: could not convert string to float: 'Maruti 800 AC'

In [None]:
## Save cleaned Sample Dataset 

df.to_csv('sample_data.csv')

## Divide Dataset into Input and output dataset 

In [27]:
x = df.drop(['selling_price','name'] , axis = 1)
y = df['selling_price']

In [28]:
x.shape ,y.shape

((4340, 6), (4340,))

## Spliting Data into into Traing and Test Data 

In [29]:
from sklearn.model_selection import train_test_split 

x_train ,x_test ,y_train ,y_test = train_test_split(x,y,test_size = 0.20 ,random_state = 40)

In [30]:
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((3472, 6), (868, 6), (3472,), (868,))

## Model Training 

## 1. LinearRegression model 

In [31]:
from sklearn.linear_model import LinearRegression

regression = LinearRegression()

In [32]:
## trainig the train data 

regression.fit(x_train , y_train)

##### Predication of test Data

In [33]:
y_pred_test = regression.predict(x_test)

In [34]:
y_pred_test

array([ 6.03940554e+05, -1.92270550e+04,  4.92593460e+05,  3.69533117e+05,
        4.01054005e+05,  1.60870673e+05,  7.70028130e+05,  6.92894726e+05,
        5.23373916e+04,  5.33068905e+05,  1.25535917e+05,  5.43153959e+05,
       -1.34215826e+05,  2.95753023e+05,  2.34966429e+05,  6.28769552e+05,
        2.12964734e+05,  3.26883632e+05, -4.98670315e+04,  4.18167589e+05,
       -6.95524204e+04,  1.02874294e+06,  1.69036225e+05,  8.09745717e+05,
        5.98272190e+05,  6.44815890e+05,  5.20414953e+05,  4.48877510e+05,
        2.43737383e+03,  7.20172016e+05,  5.53033969e+05,  6.46976442e+04,
        4.63426173e+05,  7.86721025e+05,  5.83975489e+05,  4.88978462e+05,
        1.64528013e+06,  3.23830014e+05,  1.24327978e+06,  4.09549672e+05,
        4.48845847e+05,  4.04430993e+05, -1.70059946e+04,  4.62681976e+05,
        1.21364154e+05,  2.46936403e+05,  3.20837521e+05,  5.09286356e+05,
        4.68673767e+05,  3.40206286e+05,  7.10000116e+05,  4.32310354e+05,
        3.98798824e+05,  

#####  Model Evaluation 

In [35]:
from sklearn.metrics import r2_score

score=r2_score(y_test,y_pred_test)
print(score)

0.4706444934865691


### 2.Lasso Regression 

In [36]:
from sklearn.linear_model import Lasso

In [37]:
# loading the linear regression model
lass_reg_model = Lasso()

In [38]:
lass_reg_model.fit(x_train,y_train)

In [39]:
# prediction on Training data
training_data_prediction = lass_reg_model.predict(x_train)

In [40]:
# R squared Error
error_score = metrics.r2_score(y_train, training_data_prediction)
print("R squared Error : ", error_score)

R squared Error :  0.443769449578686


## 3. RandomForestRegressor model 

In [41]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()

In [42]:
rf_reg.fit(x_train , y_train)

In [43]:
y_pred = rf_reg.predict(x_test)

In [44]:
rf_reg.score(x_train , y_train)

0.919818600284815

In [45]:
rf_reg.score(x_test , y_test)

0.6775457007645378

## save  model and load Model 

In [49]:
import pickle

pickle.dump(rf_reg,open('random_regressor.pkl','wb'))

In [50]:
model =pickle.load(open('random_regressor.pkl','rb'))

In [51]:
model.predict(x_test)

array([ 623324.48412698,  137544.98      ,  418103.44295871,
        245820.        ,  649041.66666667,  241719.96428571,
        942221.66000577,  659727.46031746,  176277.17325758,
        564748.15472222,  176534.28571429,  361271.42857143,
        157199.74833333,  728450.        ,  433249.9       ,
        466503.17113442,  232827.02380952,  280239.94224098,
         75500.43650794,  437847.83702409,   98722.5       ,
        329789.91      ,  300049.84      ,  692505.83333333,
        430612.80952381,  765500.        ,  453256.34275938,
        285639.16666667,   60180.        ,  705243.90873016,
        504117.49      ,  196871.50380952,  493066.65833333,
       1139475.02344877,  474160.        ,  391481.16246499,
       1260900.        ,  260384.75777778,  700278.69047619,
        265190.17524143,  572519.16416667,  261557.77777778,
        129900.        ,  310295.20779221,  259641.7965368 ,
        176384.94      ,  238587.91666667,  564763.86256133,
        219566.64666667,