## Importing dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

## Load the data
### https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho?select=car+data.csv

In [3]:
df = pd.read_csv('/Users/car data.csv')
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [4]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [6]:
print(df['Fuel_Type'].value_counts())
print(df['Seller_Type'].value_counts())
print(df['Transmission'].value_counts())

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64
Dealer        195
Individual    106
Name: Seller_Type, dtype: int64
Manual       261
Automatic     40
Name: Transmission, dtype: int64


## Label Binarization

### Create a new df and load it with the converted categorical features

In [7]:
df_new = df.replace({'Fuel_Type': {'CNG':0,'Diesel':1,'Petrol':2},
           'Seller_Type': {'Dealer':0,'Individual':1},
           'Transmission': {'Manual':0,'Automatic':1}})

In [8]:
df_new

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,2,0,0,0
1,sx4,2013,4.75,9.54,43000,1,0,0,0
2,ciaz,2017,7.25,9.85,6900,2,0,0,0
3,wagon r,2011,2.85,4.15,5200,2,0,0,0
4,swift,2014,4.60,6.87,42450,1,0,0,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,1,0,0,0
297,brio,2015,4.00,5.90,60000,2,0,0,0
298,city,2009,3.35,11.00,87934,2,0,0,0
299,city,2017,11.50,12.50,9000,1,0,0,0


### Create the split

In [9]:
X = df_new.drop(['Car_Name','Selling_Price'],axis=1)
y = df_new['Selling_Price']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [11]:
lin_model = LinearRegression()

In [12]:
lin_model.fit(X_train,y_train)

LinearRegression()

In [13]:
lin_model.score(X_test,y_test)

0.8650320546210946

In [14]:
y_pred = lin_model.predict(X_test)
y_true = y_test
y_pred

array([10.37761177, 10.53101062,  4.49552252,  5.57933769,  2.60458157,
        8.73793093,  7.40734539,  1.52366668, 15.89175758,  4.76608883,
        6.33273576,  6.52389404,  3.51872442,  3.13961514, -0.68495874,
        4.28790652,  9.2609021 , -0.35079867,  9.34633447,  1.18010217,
        7.61043836, 18.01754482, 19.45949769,  0.23805849,  2.1307204 ,
        4.86363474,  5.05812902,  6.94556259,  5.31420261,  5.63285646,
        4.42111231,  8.06086042,  5.81277404, -0.57139186,  0.7121926 ,
        6.61505627,  7.52001221,  5.48635327,  1.1358701 ,  1.16924982,
        2.09654099,  5.24356235, 14.26314322, -8.64585382,  1.41363311,
        4.05846921, -0.96482233,  8.07192811,  4.39205469,  4.88916506,
        5.24618767, 16.1732445 ,  0.73964183,  1.66886485,  1.61428682,
       -1.47570513,  2.81845772,  0.74828173, -1.83207075,  1.62857371,
        0.68698358])

In [15]:
metrics.r2_score(y_true,y_pred)

0.8650320546210946

## Now let's check the Lasso model

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [17]:
lasso_model = Lasso()

In [18]:
lasso_model.fit(X_train,y_train)

Lasso()

In [19]:
lasso_model.score(X_test,y_test)

0.8497457570738539

In [20]:
y_pred = lasso_model.predict(X_test)
y_pred

array([ 9.84567382,  1.41978996,  4.33000433,  3.17455558,  8.92053425,
        4.27617308,  3.7331185 ,  5.65469631,  0.9067815 ,  5.24312003,
        6.45060323,  4.39580233,  1.28891382,  8.24160548,  2.00848206,
        2.30994514,  2.5492099 ,  2.34681358,  7.97733531,  4.18658601,
        2.31777374,  7.86684049,  2.01868131,  8.55068741,  1.53941124,
        6.48280905,  2.1138961 , -1.18968392,  4.00416509,  2.18458499,
        2.4093372 ,  3.56273297,  5.58698712,  8.26139501, -0.83259354,
        5.27147158,  6.48649531,  5.72043366,  6.39669211,  4.79182476,
       15.51840214,  2.59837644,  1.67424249,  0.45725311,  5.16817508,
        6.85817855,  1.61450208,  5.29605485, 13.86828069,  2.87644681,
        6.44593415,  0.08044801,  9.97507828,  1.79190966,  2.64946731,
        0.0470775 ,  1.38245113, 10.10098365,  0.44332429, -1.30063353,
        8.95788022])

In [21]:
metrics.r2_score(y_test,y_pred)

0.8497457570738539

Linear Model are good for features that are directly correlated