In [67]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Give us flexibility to view data by allowing us to see more of the data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [68]:
df = pd.read_csv('used_cars_data.csv')

In [69]:
df

Unnamed: 0,S.No.,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7248,7248,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0,,
7249,7249,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,17.21 kmpl,1197 CC,103.6 bhp,5.0,,
7250,7250,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,
7251,7251,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,17.2 kmpl,1197 CC,103.6 bhp,5.0,,


In [70]:
df.dtypes

S.No.                  int64
Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
Price                float64
dtype: object

In [71]:
df['New_Price'].unique

<bound method Series.unique of 0             NaN
1             NaN
2       8.61 Lakh
3             NaN
4             NaN
          ...    
7248          NaN
7249          NaN
7250          NaN
7251          NaN
7252          NaN
Name: New_Price, Length: 7253, dtype: object>

In [72]:
df['New_Price'].sample(50)

7203           NaN
1315           NaN
4342           NaN
4013           NaN
4753           NaN
1094     9.46 Lakh
5000           NaN
5549           NaN
4022    22.66 Lakh
3460           NaN
94             NaN
3152           NaN
7116           NaN
4707           NaN
6965           NaN
2235           NaN
5518    39.27 Lakh
5202           NaN
5106           NaN
4592           NaN
3543           NaN
6232           NaN
2635           NaN
1804           NaN
3521           NaN
6159           NaN
6744           NaN
3644           NaN
4082           NaN
3802           NaN
6564           NaN
7163           NaN
1897           NaN
5177           NaN
2927           NaN
1244           NaN
5102     5.12 Lakh
4536           NaN
5301           NaN
128            NaN
4445           NaN
600            NaN
7231     8.79 Lakh
1254           NaN
167            NaN
5748           NaN
6583           NaN
3159           NaN
117      9.51 Lakh
3932           NaN
Name: New_Price, dtype: object

In [73]:
df['Price'].sample(50)

4029    14.70
388      2.50
6700      NaN
4540     2.89
386      1.95
3722     3.89
3182     4.50
6482      NaN
1525     7.70
2134    31.00
3171     3.90
1098     2.80
452      2.25
7018      NaN
4224     1.75
630      1.45
5431     8.25
2191     6.50
4793     6.50
4856    32.78
4290     1.00
3361     9.50
1470     0.90
1399    10.50
3910     8.70
2192     4.75
5084     6.24
468      2.22
1062     2.20
267     13.75
2187    17.75
2369     1.50
6705      NaN
3691     4.70
6280      NaN
3515    15.50
1939     8.50
2246     5.50
880      5.00
788      8.50
775     17.90
4517    10.20
3031     5.80
1620     4.00
2780     1.60
5784     3.05
5538    17.75
5941     3.40
2907    75.00
99       2.65
Name: Price, dtype: float64

# Data Analysis

In [74]:
# Drop the S.No. column. It's useless.
df.drop(['S.No.'], axis=1, inplace=True)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7248,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0,,
7249,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,17.21 kmpl,1197 CC,103.6 bhp,5.0,,
7250,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,
7251,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,17.2 kmpl,1197 CC,103.6 bhp,5.0,,


In [75]:
values = pd.unique(df['Transmission'])
print(values)

['Manual' 'Automatic']


In [76]:
values = pd.unique(df['Fuel_Type'])
print(values)

['CNG' 'Diesel' 'Petrol' 'LPG' 'Electric']


In [77]:
values = pd.unique(df['Owner_Type'])
print(values)

['First' 'Second' 'Fourth & Above' 'Third']


In [78]:
# Let's get the mileage units to see if there are data entry problems we should be worried about
def getUnits(x):
    if ( isinstance(x, str) ):
        return x.split(' ')[1]
    else:
        return np.nan
    
df['Mileage_Units'] = df['Mileage'].apply(getUnits)
values = pd.unique(df['Mileage_Units'])
print(values)

['km/kg' 'kmpl' nan]


In [79]:
# Let's get the Engine units to see if there are data entry problems we should be worried about
def getUnits(x):
    if ( isinstance(x, str) ):
        return x.split(' ')[1]
    else:
        return np.nan
    
df['Engine_Units'] = df['Engine'].apply(getUnits)
values = pd.unique(df['Engine_Units'])
print(values)

['CC' nan]


In [80]:
# Let's get the Power units to see if there are data entry problems we should be worried about
def getUnits(x):
    if ( isinstance(x, str) ):
        return x.split(' ')[1]
    else:
        return np.nan
    
df['Power_Units'] = df['Power'].apply(getUnits)
values = pd.unique(df['Power_Units'])
print(values)

['bhp' nan]


# Column Cleanup and Feature Engineering

In [81]:
# Let's cleanup the mileage column
def getNumericValue(x):
    if ( isinstance(x, str) ):
        return float(x.split(' ')[0])
    else:
        return np.nan
    
df['Mileage_num'] = df['Mileage'].apply(getNumericValue)
df['Mileage_num'].sample(50)

3055    15.60
238     18.60
5154    23.10
5391    15.10
4560    18.20
2933    12.99
2203    18.70
1375    16.80
2175    25.80
3785    21.38
5246    25.00
2874    16.47
5902    26.20
163     15.10
482     12.80
3087    22.69
5864    19.60
6300    17.68
4534    17.00
3944    24.07
420     18.90
7134    17.70
217     19.81
3826    18.50
848     16.47
6713    17.92
5552    14.69
4073    16.80
423     15.26
348     21.76
1340    18.40
6       23.08
4619    23.08
3245    20.40
120     17.00
1542    16.55
4942    18.25
731     24.00
3870    17.60
1983    23.10
6699    19.87
5908    14.42
4298    16.00
4598    25.20
7203    18.60
354     18.00
452     18.50
2245    21.50
6894    21.10
6590    19.40
Name: Mileage_num, dtype: float64

In [82]:
# Let's cleanup the Engine column
def getNumericValue(x):
    if ( isinstance(x, str) ):
        return int(x.split(' ')[0])
    else:
        return np.nan
    
df['Engine_num'] = df['Engine'].apply(getNumericValue)
df['Engine_num'].sample(50)

3589    1405.0
564     1498.0
3541    1197.0
188     1582.0
6169    1598.0
1752    2179.0
3245    1197.0
3062    2967.0
6976    1796.0
389     1999.0
713     1197.0
6710    1198.0
1905    1373.0
6219    1248.0
3738    1197.0
2250    1582.0
4277    2143.0
5226    2179.0
2964    1497.0
4935    2982.0
3437    1341.0
3233    2143.0
1417    1197.0
2664    1493.0
5348    1995.0
4679    1497.0
5770    2179.0
853     1968.0
3262    1248.0
5444     936.0
6239    1582.0
4749    1197.0
6427    1496.0
2319    2987.0
1621    1199.0
5915    1399.0
7096    1198.0
6956    2494.0
5161    2494.0
1563     796.0
702     1197.0
1351    1373.0
6562    1591.0
2020    1196.0
1061    1997.0
2209    1995.0
4398    1396.0
4464    1197.0
884     1197.0
4036    1248.0
Name: Engine_num, dtype: float64

In [83]:
# Let's cleanup the Power column
def getNumericValue(x):
    if ( isinstance(x, str) ):
        val = x.split(' ')[0]
        if ( val == 'null' ) :
            return np.nan
        else:
            return float(val)
    else:
        return np.nan
    
df['Power_num'] = df['Power'].apply(getNumericValue)
df['Power_num'].sample(50)

663     102.00
812      88.80
2383    138.10
487     126.32
4648    187.70
1567     67.10
5183    117.30
6033    102.50
7051     88.50
35       64.00
1303    190.00
3497     71.01
4032     70.00
2013     81.86
5042    174.33
590      83.80
6330     81.83
2767    254.79
1861    169.00
34      103.50
1294       NaN
2862    103.20
1701    171.00
3982     74.00
4130    121.30
3312    140.00
4879     85.80
5703    121.30
47      163.70
5496     82.00
1790     90.00
5888     67.10
6810     83.80
7037     89.84
3410    187.74
3224     83.10
6996     82.85
4023    126.32
1609    138.03
889      67.10
6397     83.10
435     258.00
4698    184.00
2981     79.40
1300    102.00
1302    174.33
4712       NaN
6059     74.00
3196    197.00
5711     70.00
Name: Power_num, dtype: float64

In [84]:
df.dtypes

Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
Price                float64
Mileage_Units         object
Engine_Units          object
Power_Units           object
Mileage_num          float64
Engine_num           float64
Power_num            float64
dtype: object

# Drop All Unwanted Columns now that feature engineering is done

In [66]:
df.drop(['Mileage', 'Mileage_Units', 'Engine', 'Engine_Units', 'Power', 'Power_Units'], axis=1, inplace=True)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,New_Price,Price,Mileage_num,Engine_num,Power_num
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,5.0,,1.75,26.60,998.0,58.16
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,5.0,,12.50,19.67,1582.0,126.20
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,5.0,8.61 Lakh,4.50,18.20,1199.0,88.70
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,7.0,,6.00,20.77,1248.0,88.76
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,,17.74,15.20,1968.0,140.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7248,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,5.0,,,20.54,1598.0,103.60
7249,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,5.0,,,17.21,1197.0,103.60
7250,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,5.0,,,23.08,1461.0,63.10
7251,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,5.0,,,17.20,1197.0,103.60


In [37]:
df.dtypes

Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Engine                object
Power                 object
Seats                float64
New_Price             object
Price                float64
Mileage_num           object
dtype: object