In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Default size (width, height in inches) for figures created after this cell.
sns.set(rc={"figure.figsize": (11.7, 8.27)})

# Data Exploration

In [18]:
# Load the raw listings. The file appears to be Latin-1 / Windows-1252 encoded:
# when decoded as cp850, German umlauts turn into box-drawing characters
# (e.g. "3TÜRER" is displayed as "3T▄RER"). latin-1 maps every byte, so the
# read cannot fail on an undefined code point.
cars_data = pd.read_csv("./Dataset/autos.csv", encoding="latin-1")
# Remove the 'nrOfPictures' column before any analysis.
cars_data = cars_data.drop(columns=["nrOfPictures"])

In [19]:
# Work on an independent (deep) copy so the loaded frame `cars_data` is not modified.
cars = cars_data.copy(deep=True)

In [20]:
# Preview the first rows to get a feel for the columns and their values.
cars.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3T▄RER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,60437,2016-04-06 10:17:21


In [22]:
# Show column dtypes and non-null counts to spot columns with missing values.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 19 columns):
dateCrawled            371528 non-null object
name                   371528 non-null object
seller                 371528 non-null object
offerType              371528 non-null object
price                  371528 non-null int64
abtest                 371528 non-null object
vehicleType            333659 non-null object
yearOfRegistration     371528 non-null int64
gearbox                351319 non-null object
powerPS                371528 non-null int64
model                  351044 non-null object
kilometer              371528 non-null int64
monthOfRegistration    371528 non-null int64
fuelType               338142 non-null object
brand                  371528 non-null object
notRepairedDamage      299468 non-null object
dateCreated            371528 non-null object
postalCode             371528 non-null int64
lastSeen               371528 non-null object
dtypes: int64(6), obj

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
cars.describe()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration,postalCode
count,371528.0,371528.0,371528.0,371528.0,371528.0,371528.0
mean,17295.14,2004.577997,115.549477,125618.688228,5.734445,50820.66764
std,3587954.0,92.866598,192.139578,40112.337051,3.712412,25799.08247
min,0.0,1000.0,0.0,5000.0,0.0,1067.0
25%,1150.0,1999.0,70.0,125000.0,3.0,30459.0
50%,2950.0,2003.0,105.0,150000.0,6.0,49610.0
75%,7200.0,2008.0,150.0,150000.0,9.0,71546.0
max,2147484000.0,9999.0,20000.0,150000.0,12.0,99998.0


In [30]:
def _fixed_three_decimals(value):
    """Render a float with exactly three decimal places (no scientific notation)."""
    return '%.3f' % value


# Register the formatter globally so later float output avoids scientific notation.
pd.set_option('display.float_format', _fixed_three_decimals)
# If some columns are hidden in the output, raise the column display limit (500 here):
# pd.set_option('display.max_columns', 500)
cars.describe()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration,postalCode
count,371528.0,371528.0,371528.0,371528.0,371528.0,371528.0
mean,17295.142,2004.578,115.549,125618.688,5.734,50820.668
std,3587953.744,92.867,192.14,40112.337,3.712,25799.082
min,0.0,1000.0,0.0,5000.0,0.0,1067.0
25%,1150.0,1999.0,70.0,125000.0,3.0,30459.0
50%,2950.0,2003.0,105.0,150000.0,6.0,49610.0
75%,7200.0,2008.0,150.0,150000.0,9.0,71546.0
max,2147483647.0,9999.0,20000.0,150000.0,12.0,99998.0


Inferences from the above data:
- The mean price is about 17,300, whereas the median (2nd quartile) is about 2,950. This huge gap shows that the distribution of price is heavily skewed.
- `yearOfRegistration` holds only the year, but its minimum and maximum values are 1000 and 9999 respectively, which do not make sense.
- `powerPS` has a mean of about 115 and a median (2nd quartile) of 105, which are pretty close to each other.
- The average distance driven by a vehicle is about 125,600 km, which is close to the value of the first quartile (125,000 km).
- The minimum `monthOfRegistration` is 0, which does not make sense (valid months are 1–12).
- Also, `postalCode` is a categorical variable, not a continuous one.

In [32]:
# dropping unwanted data
unwanted = ['name', 'dateCrawled', 'dateCreated','postalCode', 'lastSeen']
cars = cars.drop(unwanted, axis = 1)
cars.shape

(371528, 14)

In [39]:
# Remove duplicate rows, keeping the first occurrence of each.
# Reassigning the result (instead of inplace=True) keeps the step explicit and chainable.
cars = cars.drop_duplicates(keep='first')
cars.shape
# 18,222 duplicate rows were dropped (371,528 -> 353,306).

(353306, 14)

In [42]:
# Summarise the categorical (object-dtype) columns: count, unique, top and freq.
cars.describe(include=[object])

Unnamed: 0,seller,offerType,abtest,vehicleType,gearbox,model,fuelType,brand,notRepairedDamage
count,353306,353306,353306,316867,334744,333598,321084,353306,284342
unique,2,2,2,8,2,251,7,40,2
top,privat,Angebot,test,limousine,manuell,golf,benzin,volkswagen,nein
freq,353303,353294,182841,90589,261430,28236,213704,74987,249066


In [45]:
# Print the frequency of every category in each object-dtype column.
# select_dtypes lists those columns directly, without computing a full
# describe() table just to read its column names.
categorical_columns = cars.select_dtypes(include='O').columns
for column in categorical_columns:
    print(cars[column].value_counts())
    print()

privat        353303
gewerblich         3
Name: seller, dtype: int64

Angebot    353294
Gesuch         12
Name: offerType, dtype: int64

test       182841
control    170465
Name: abtest, dtype: int64

limousine     90589
kleinwagen    76096
kombi         63930
bus           28881
cabrio        21910
coupe         18278
suv           13898
andere         3285
Name: vehicleType, dtype: int64

manuell      261430
automatik     73314
Name: gearbox, dtype: int64

golf               28236
andere             25618
3er                19440
polo               12384
corsa              11876
                   ...  
kalina                 8
rangerover             6
serie_3                4
serie_1                2
discovery_sport        1
Name: model, Length: 251, dtype: int64

benzin     213704
diesel     101096
lpg          5153
cng           549
hybrid        273
andere        206
elektro       103
Name: fuelType, dtype: int64

volkswagen        74987
opel              38146
bmw               

In [47]:
# Pearson correlation between the numeric columns. They are selected explicitly
# because DataFrame.corr() no longer skips non-numeric columns by default in
# pandas >= 2.0 and raises on the string columns of this frame.
correlation = cars.select_dtypes(include='number').corr()
correlation
# No pair of columns shows a linear correlation: every off-diagonal value is close to 0.
# NOTE(review): this is computed on uncleaned data; extreme values such as the
# maximum price of 2147483647 can mask real relationships, so re-check after cleaning.

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration
price,1.0,-0.0,0.007,-0.001,-0.003
yearOfRegistration,-0.0,1.0,0.0,-0.055,-0.012
powerPS,0.007,0.0,1.0,-0.01,0.043
kilometer,-0.001,-0.055,-0.01,1.0,0.002
monthOfRegistration,-0.003,-0.012,0.043,0.002,1.0
