In [1]:
import pandas as pd
import numpy as np

In [2]:
############## Data Preprocessing ######################
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
############## Model Development #####################

from sklearn.linear_model import LinearRegression,Lasso,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor

In [4]:
############## Accuracy check ##################
from sklearn.metrics import r2_score


In [5]:
############# Warnings Ignore ##################
import warnings
warnings.filterwarnings('ignore')

In [6]:
data=pd.read_csv('SecondCar.csv')
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.00,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.00,108556,100000
2,Tata Indica Vista Aqua TDI BSIII,2011,70000,Diesel,Individual,Manual,First Owner,11.00,120678,120000
3,Maruti Wagon R LXI Minor,2010,80000,Petrol,Individual,Manual,Second Owner,9.00,122917,100000
4,Chevrolet Beat LT,2010,80000,Petrol,Individual,Manual,Second Owner,10.00,144902,130000
...,...,...,...,...,...,...,...,...,...,...
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.00,9407487,4950000
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9.71,9598350,4800000
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.00,9857238,4950000
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14.09,14235729,8900000


In [7]:
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000


# Data preprocessing

   1. Data Duplicate ---Remove
   2. Missing Values >75% ---->> Remove column
   3. Missing Values treatment
   4. Unique value based removal --->> when unique value ==1 or unique value == len(data)	
   5. Feature Enigineering --->> variety reducion based on exisitind data
   6. label encoding -->>> Alphabetical order converted to number
   6.5 Correlation-->>
   

# Step 1 Duplicate Data

In [8]:
print('Before duplicate removal--->>',len(data))

Before duplicate removal--->> 4340


In [9]:
data= data.drop_duplicates(keep='first')

In [10]:
print('After duplicates removal----->>',len(data))

After duplicates removal----->> 4340


# Step 2 Missing Value>75%


In [11]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                3
seller_type         2
transmission        2
owner               3
Rating              4
ExShowroom Price    0
selling_price       0
dtype: int64

In [12]:
missing_value_df=pd.DataFrame(data.isna().sum(),columns=['missing_value_count'])
missing_value_df

Unnamed: 0,missing_value_count
name,0
year,0
km_driven,0
fuel,3
seller_type,2
transmission,2
owner,3
Rating,4
ExShowroom Price,0
selling_price,0


In [13]:
missing_value_df['percentage']=100*missing_value_df['missing_value_count']/len(data)
missing_value_df

Unnamed: 0,missing_value_count,percentage
name,0,0.0
year,0,0.0
km_driven,0,0.0
fuel,3,0.069124
seller_type,2,0.046083
transmission,2,0.046083
owner,3,0.069124
Rating,4,0.092166
ExShowroom Price,0,0.0
selling_price,0,0.0


# filter data based on condition

In [14]:
missing_value_df[missing_value_df['percentage']>0.05] 

Unnamed: 0,missing_value_count,percentage
fuel,3,0.069124
owner,3,0.069124
Rating,4,0.092166


# store list column to be deleted

In [15]:
columns_to_be_deleted=list(missing_value_df[missing_value_df['percentage']>0.07].index)
columns_to_be_deleted

['Rating']

# Delete Column from original data

In [16]:
data.head(2)
data.drop(columns=columns_to_be_deleted,inplace=True)
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000


# Step 3 Missing Value Treatment

In [17]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                3
seller_type         2
transmission        2
owner               3
ExShowroom Price    0
selling_price       0
dtype: int64

In [18]:
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000
2,Tata Indica Vista Aqua TDI BSIII,2011,70000,Diesel,Individual,Manual,First Owner,120678,120000
3,Maruti Wagon R LXI Minor,2010,80000,Petrol,Individual,Manual,Second Owner,122917,100000
4,Chevrolet Beat LT,2010,80000,Petrol,Individual,Manual,Second Owner,144902,130000
...,...,...,...,...,...,...,...,...,...
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9407487,4950000
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9598350,4800000
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9857238,4950000
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14235729,8900000


In [19]:
data.pivot_table(index='fuel',values=('name'),aggfunc='count')

Unnamed: 0_level_0,name
fuel,Unnamed: 1_level_1
CNG,40
Diesel,2152
Electric,1
LPG,23
Petrol,2121


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4340 entries, 0 to 4339
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              4340 non-null   object
 1   year              4340 non-null   int64 
 2   km_driven         4340 non-null   int64 
 3   fuel              4337 non-null   object
 4   seller_type       4338 non-null   object
 5   transmission      4338 non-null   object
 6   owner             4337 non-null   object
 7   ExShowroom Price  4340 non-null   int64 
 8   selling_price     4340 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 339.1+ KB


# Filling the missing values in columns object=mode,int=median

In [21]:
for col in data.columns:
    if data [col].dtype == 'object':
   #     print(col,  '--->>>', data[col].dtype,'---->>>>', data[col].mode()[0])
         data[col].fillna(data[col].mode()[0],inplace=True)
    if data [col].dtype != 'object':
     #   print('*'*40,col,  '--->>>', data[col].dtype,'---->>>>', data[col].median())
         data[col].fillna(data[col].median(),inplace=True)

In [22]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                0
seller_type         0
transmission        0
owner               0
ExShowroom Price    0
selling_price       0
dtype: int64

# Calculate mean, median,mode

In [23]:
data['fuel'].mode()[0]

'Diesel'

In [24]:
data['km_driven'].median()

60000.0

In [25]:
data['km_driven'].mean()

66215.77741935484

# Add_dummy column

In [26]:
data['column']='Ashish'
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,column
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000,Ashish
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000,Ashish


In [27]:
data['column'].unique()

array(['Ashish'], dtype=object)

In [28]:
for col in data.columns:
    if data[col].nunique()==1:
        del data[col]
    elif (data[col].nunique()==len(data)) & (data[col].dtype=='object'):
        del data[col]

# Featuring Engineering

In [29]:
for col in data.columns:
    print(col,'---->>>',data[col].nunique(),'----->>>>',data[col].dtype)

name ---->>> 1491 ----->>>> object
year ---->>> 27 ----->>>> int64
km_driven ---->>> 770 ----->>>> int64
fuel ---->>> 5 ----->>>> object
seller_type ---->>> 3 ----->>>> object
transmission ---->>> 2 ----->>>> object
owner ---->>> 5 ----->>>> object
ExShowroom Price ---->>> 4331 ----->>>> int64
selling_price ---->>> 445 ----->>>> int64


# object column that is name is reduced to company name Need to do Manually

In [30]:
data['name']

0                          Tata Indica Vista Aqua 1.4 TDI
1                                       Tata Nano Lx BSIV
2                        Tata Indica Vista Aqua TDI BSIII
3                                Maruti Wagon R LXI Minor
4                                       Chevrolet Beat LT
                              ...                        
4335                              BMW X5 xDrive 30d xLine
4336                        BMW 5 Series 520d Luxury Line
4337                              BMW X5 xDrive 30d xLine
4338             Audi RS7 2015-2019 Sportback Performance
4339    Mercedes-Benz S-Class S 350d Connoisseurs Edition
Name: name, Length: 4340, dtype: object

In [31]:
data['name'].str.split(" ").str[0]

0                Tata
1                Tata
2                Tata
3              Maruti
4           Chevrolet
            ...      
4335              BMW
4336              BMW
4337              BMW
4338             Audi
4339    Mercedes-Benz
Name: name, Length: 4340, dtype: object

In [32]:
data['company name']=data['name'].str.split(" ").str[0]
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000,Tata
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000,Tata


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4340 entries, 0 to 4339
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              4340 non-null   object
 1   year              4340 non-null   int64 
 2   km_driven         4340 non-null   int64 
 3   fuel              4340 non-null   object
 4   seller_type       4340 non-null   object
 5   transmission      4340 non-null   object
 6   owner             4340 non-null   object
 7   ExShowroom Price  4340 non-null   int64 
 8   selling_price     4340 non-null   int64 
 9   company name      4340 non-null   object
dtypes: int64(4), object(6)
memory usage: 373.0+ KB


In [34]:
data['company name'].nunique()

29

In [35]:
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000,Tata
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000,Tata
2,Tata Indica Vista Aqua TDI BSIII,2011,70000,Diesel,Individual,Manual,First Owner,120678,120000,Tata
3,Maruti Wagon R LXI Minor,2010,80000,Petrol,Individual,Manual,Second Owner,122917,100000,Maruti
4,Chevrolet Beat LT,2010,80000,Petrol,Individual,Manual,Second Owner,144902,130000,Chevrolet
...,...,...,...,...,...,...,...,...,...,...
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9407487,4950000,BMW
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9598350,4800000,BMW
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9857238,4950000,BMW
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14235729,8900000,Audi


# Numerical Column -->>> Automatic Feature Engineering

In [36]:
round(data.describe(),2)

Unnamed: 0,year,km_driven,ExShowroom Price,selling_price
count,4340.0,4340.0,4340.0,4340.0
mean,2013.09,66215.78,845380.89,504127.31
std,4.22,46644.1,884840.54,578548.74
min,1992.0,1.0,106001.0,20000.0
25%,2011.0,35000.0,445389.5,208749.75
50%,2014.0,60000.0,596055.0,350000.0
75%,2016.0,90000.0,946243.0,600000.0
max,2020.0,806599.0,15538153.0,8900000.0


In [37]:
#range_labels=['new','medium','high','extreme']
#range_limits=[0,35000,60000,90000,1000000]
#data['km_bins'] = pd.cut(data['km_driven'],labels=range_labels,bins=range_limits)
#data.head()

In [38]:
for col in data.columns:
    if data[col].nunique()/len(data)>0.05:
        if data[col].dtype == 'object':
            print('Please perform manual feature Engineering for--->>>',col)
        else:
                print('Auto feature engineering for ---->>>>', col)
                new_col_name=col + '_bins'
                data[new_col_name] = pd.qcut(data[col],
                                             4,
                                             labels=['b1','b2','b3','b4'],
                                             duplicates='raise')
                

Please perform manual feature Engineering for--->>> name
Auto feature engineering for ---->>>> km_driven
Auto feature engineering for ---->>>> ExShowroom Price
Auto feature engineering for ---->>>> selling_price


In [39]:
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,106001,100000,Tata,b4,b1,b1
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,108556,100000,Tata,b2,b1,b1
2,Tata Indica Vista Aqua TDI BSIII,2011,70000,Diesel,Individual,Manual,First Owner,120678,120000,Tata,b3,b1,b1
3,Maruti Wagon R LXI Minor,2010,80000,Petrol,Individual,Manual,Second Owner,122917,100000,Maruti,b3,b1,b1
4,Chevrolet Beat LT,2010,80000,Petrol,Individual,Manual,Second Owner,144902,130000,Chevrolet,b3,b1,b1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9407487,4950000,BMW,b1,b4,b4
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9598350,4800000,BMW,b1,b4,b4
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,9857238,4950000,BMW,b1,b4,b4
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14235729,8900000,Audi,b1,b4,b4


# label Encoding

In [40]:
data['fuel'].unique()

array(['Diesel', 'Petrol', 'CNG', 'LPG', 'Electric'], dtype=object)

In [41]:
LN = LabelEncoder()

In [42]:
data['fuel']=LN.fit_transform(data['fuel'])

In [43]:
data['fuel'].unique()

array([1, 4, 0, 3, 2])

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4340 entries, 0 to 4339
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   name                   4340 non-null   object  
 1   year                   4340 non-null   int64   
 2   km_driven              4340 non-null   int64   
 3   fuel                   4340 non-null   int32   
 4   seller_type            4340 non-null   object  
 5   transmission           4340 non-null   object  
 6   owner                  4340 non-null   object  
 7   ExShowroom Price       4340 non-null   int64   
 8   selling_price          4340 non-null   int64   
 9   company name           4340 non-null   object  
 10  km_driven_bins         4340 non-null   category
 11  ExShowroom Price_bins  4340 non-null   category
 12  selling_price_bins     4340 non-null   category
dtypes: category(3), int32(1), int64(4), object(5)
memory usage: 369.3+ KB


In [45]:
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,1,Individual,Manual,Second Owner,106001,100000,Tata,b4,b1,b1
1,Tata Nano Lx BSIV,2012,50000,4,Individual,Manual,Second Owner,108556,100000,Tata,b2,b1,b1
2,Tata Indica Vista Aqua TDI BSIII,2011,70000,1,Individual,Manual,First Owner,120678,120000,Tata,b3,b1,b1
3,Maruti Wagon R LXI Minor,2010,80000,4,Individual,Manual,Second Owner,122917,100000,Maruti,b3,b1,b1
4,Chevrolet Beat LT,2010,80000,4,Individual,Manual,Second Owner,144902,130000,Chevrolet,b3,b1,b1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,BMW X5 xDrive 30d xLine,2019,30000,1,Dealer,Automatic,First Owner,9407487,4950000,BMW,b1,b4,b4
4336,BMW 5 Series 520d Luxury Line,2019,12999,1,Dealer,Automatic,First Owner,9598350,4800000,BMW,b1,b4,b4
4337,BMW X5 xDrive 30d xLine,2019,30000,1,Dealer,Automatic,First Owner,9857238,4950000,BMW,b1,b4,b4
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,4,Dealer,Automatic,First Owner,14235729,8900000,Audi,b1,b4,b4


In [46]:
for col in data.columns:
    if (data[col].dtype=='object') | (data[col].dtype=='category'):
        data[col]=LN.fit_transform(data[col])

In [47]:
data

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
0,1228,2010,120000,1,1,1,2,106001,100000,25,3,0,0
1,1283,2012,50000,4,1,1,2,108556,100000,25,1,0,0
2,1229,2011,70000,1,1,1,0,120678,120000,25,2,0,0
3,1041,2010,80000,4,1,1,2,122917,100000,18,2,0,0
4,56,2010,80000,4,1,1,2,144902,130000,3,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,39,2019,30000,1,0,0,0,9407487,4950000,2,0,3,3
4336,30,2019,12999,1,0,0,0,9598350,4800000,2,0,3,3
4337,39,2019,30000,1,0,0,0,9857238,4950000,2,0,3,3
4338,25,2016,13000,4,0,0,0,14235729,8900000,1,0,3,3


# Correlation

In [48]:
corr=data.corr()
corr

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
name,1.0,-0.051902,0.126203,-0.08385,0.140802,0.087785,0.032819,-0.068402,-0.077598,0.974513,0.116306,-0.02856,-0.021108
year,-0.051902,1.0,-0.419688,-0.120528,-0.098352,-0.1438,-0.414705,0.304871,0.413922,-0.039724,-0.480378,0.411361,0.661129
km_driven,0.126203,-0.419688,1.0,-0.285634,0.113689,0.120226,0.297115,-0.165105,-0.192289,0.131461,0.840924,-0.179051,-0.2507
fuel,-0.08385,-0.120528,-0.285634,1.0,0.038387,0.040445,-0.010301,-0.234674,-0.269779,-0.109394,-0.297167,-0.294192,-0.339001
seller_type,0.140802,-0.098352,0.113689,0.038387,1.0,0.174925,0.165681,-0.132745,-0.151554,0.144218,0.126851,-0.109656,-0.147082
transmission,0.087785,-0.1438,0.120226,0.040445,0.174925,1.0,0.078893,-0.516677,-0.530205,0.110699,0.134227,-0.308199,-0.309334
owner,0.032819,-0.414705,0.297115,-0.010301,0.165681,0.078893,1.0,-0.167726,-0.20784,0.036452,0.342417,-0.23544,-0.321452
ExShowroom Price,-0.068402,0.304871,-0.165105,-0.234674,-0.132745,-0.516677,-0.167726,1.0,0.960629,-0.087559,-0.199464,0.579682,0.546762
selling_price,-0.077598,0.413922,-0.192289,-0.269779,-0.151554,-0.530205,-0.20784,0.960629,1.0,-0.096858,-0.229658,0.573748,0.625049
company name,0.974513,-0.039724,0.131461,-0.109394,0.144218,0.110699,0.036452,-0.087559,-0.096858,1.0,0.119469,-0.031219,-0.025116


In [49]:
corr=100*data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,ExShowroom Price,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
name,100.0,-5.190221,12.620297,-8.385037,14.080245,8.778524,3.281915,-6.840192,-7.759752,97.451337,11.630633,-2.85603,-2.11076
year,-5.190221,100.0,-41.968815,-12.052816,-9.835151,-14.380043,-41.470517,30.487065,41.392168,-3.972402,-48.037822,41.136106,66.112946
km_driven,12.620297,-41.968815,100.0,-28.563429,11.368942,12.022623,29.711504,-16.510516,-19.228863,13.146088,84.092423,-17.905059,-25.070004
fuel,-8.385037,-12.052816,-28.563429,100.0,3.838666,4.044484,-1.030093,-23.467424,-26.977883,-10.939373,-29.716702,-29.419206,-33.900059
seller_type,14.080245,-9.835151,11.368942,3.838666,100.0,17.492495,16.568072,-13.274456,-15.155423,14.421752,12.685058,-10.965633,-14.708225
transmission,8.778524,-14.380043,12.022623,4.044484,17.492495,100.0,7.889262,-51.667679,-53.020514,11.069899,13.422679,-30.819882,-30.933432
owner,3.281915,-41.470517,29.711504,-1.030093,16.568072,7.889262,100.0,-16.772565,-20.784034,3.645168,34.241653,-23.543955,-32.145235
ExShowroom Price,-6.840192,30.487065,-16.510516,-23.467424,-13.274456,-51.667679,-16.772565,100.0,96.062867,-8.755945,-19.946381,57.968207,54.676162
selling_price,-7.759752,41.392168,-19.228863,-26.977883,-15.155423,-53.020514,-20.784034,96.062867,100.0,-9.685807,-22.965846,57.374834,62.504903
company name,97.451337,-3.972402,13.146088,-10.939373,14.421752,11.069899,3.645168,-8.755945,-9.685807,100.0,11.946879,-3.121876,-2.5116


In [50]:
data.drop(columns=['name','ExShowroom Price'],inplace=True)

In [51]:
data

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
0,2010,120000,1,1,1,2,100000,25,3,0,0
1,2012,50000,4,1,1,2,100000,25,1,0,0
2,2011,70000,1,1,1,0,120000,25,2,0,0
3,2010,80000,4,1,1,2,100000,18,2,0,0
4,2010,80000,4,1,1,2,130000,3,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4335,2019,30000,1,0,0,0,4950000,2,0,3,3
4336,2019,12999,1,0,0,0,4800000,2,0,3,3
4337,2019,30000,1,0,0,0,4950000,2,0,3,3
4338,2016,13000,4,0,0,0,8900000,1,0,3,3


In [52]:
corr = 100*data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
year,100.0,-41.968815,-12.052816,-9.835151,-14.380043,-41.470517,41.392168,-3.972402,-48.037822,41.136106,66.112946
km_driven,-41.968815,100.0,-28.563429,11.368942,12.022623,29.711504,-19.228863,13.146088,84.092423,-17.905059,-25.070004
fuel,-12.052816,-28.563429,100.0,3.838666,4.044484,-1.030093,-26.977883,-10.939373,-29.716702,-29.419206,-33.900059
seller_type,-9.835151,11.368942,3.838666,100.0,17.492495,16.568072,-15.155423,14.421752,12.685058,-10.965633,-14.708225
transmission,-14.380043,12.022623,4.044484,17.492495,100.0,7.889262,-53.020514,11.069899,13.422679,-30.819882,-30.933432
owner,-41.470517,29.711504,-1.030093,16.568072,7.889262,100.0,-20.784034,3.645168,34.241653,-23.543955,-32.145235
selling_price,41.392168,-19.228863,-26.977883,-15.155423,-53.020514,-20.784034,100.0,-9.685807,-22.965846,57.374834,62.504903
company name,-3.972402,13.146088,-10.939373,14.421752,11.069899,3.645168,-9.685807,100.0,11.946879,-3.121876,-2.5116
km_driven_bins,-48.037822,84.092423,-29.716702,12.685058,13.422679,34.241653,-22.965846,11.946879,100.0,-23.448379,-30.692995
ExShowroom Price_bins,41.136106,-17.905059,-29.419206,-10.965633,-30.819882,-23.543955,57.374834,-3.121876,-23.448379,100.0,81.97437


In [53]:
data

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,selling_price,company name,km_driven_bins,ExShowroom Price_bins,selling_price_bins
0,2010,120000,1,1,1,2,100000,25,3,0,0
1,2012,50000,4,1,1,2,100000,25,1,0,0
2,2011,70000,1,1,1,0,120000,25,2,0,0
3,2010,80000,4,1,1,2,100000,18,2,0,0
4,2010,80000,4,1,1,2,130000,3,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4335,2019,30000,1,0,0,0,4950000,2,0,3,3
4336,2019,12999,1,0,0,0,4800000,2,0,3,3
4337,2019,30000,1,0,0,0,4950000,2,0,3,3
4338,2016,13000,4,0,0,0,8900000,1,0,3,3


# Train Test Split

In [54]:
x=data.drop(columns='selling_price')
y=data['selling_price']

In [55]:
x_train, x_test, y_train , y_test=train_test_split(x,y,test_size=0.3,random_state=1234)

In [56]:
len(x_train)

3038

In [57]:
len(x_test)

1302

# Model Development

# 1.Model name
2.Training
3.exam- Predicted
4.Accuracy  Check

# Model declaration

In [58]:
model=LinearRegression()

# Model training (fitting)

In [59]:
model.fit(x_train,y_train)

LinearRegression()

# Prediction

In [60]:
y_pred=model.predict(x_test)


In [61]:
y_pred

array([1426136.43580567,  284351.26493189,  281621.90095788, ...,
       1417525.59263429,  792218.69459036,  657137.11611018])

# Accuracy

In [62]:
accuracy=100*r2_score(y_test,y_pred)
accuracy

56.31081018763313

# Running all the models

In [63]:
models=[LinearRegression(),Lasso(),ElasticNet(),KNeighborsRegressor(),DecisionTreeRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor()]

In [64]:
acc_dic={}


In [65]:
for model in models:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    accuracy=100*r2_score(y_test,y_pred)
    acc_dic[model]=round(accuracy,2)

In [66]:
acc_dic

{LinearRegression(): 56.31,
 Lasso(): 56.31,
 ElasticNet(): 46.19,
 KNeighborsRegressor(): 55.48,
 DecisionTreeRegressor(): 57.17,
 RandomForestRegressor(): 84.83,
 AdaBoostRegressor(): 37.82,
 GradientBoostingRegressor(): 82.3}