# Introduction

We are using a selected subset of the used cars dataset provided by data society. We are interested in cleaning and analyzing this subset of data to generalize the cars that are listed on Ebay.

data can be downloaded here: https://data.world/data-society/used-cars-data

In [1]:
import pandas as pd
import numpy as np

In [80]:
autos = pd.read_csv('autos.csv', encoding='Latin-1')

In [82]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

We see that some of the columns contain up to 20% null values

In [83]:
autos.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [84]:
autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

## Data Cleaning

In [85]:
from functools import reduce
# convert columns into snakecase
def to_snakecase(string):
    return reduce(lambda x, y: x + ('_' if y.isupper() else '') + y, string).lower() 
    
def rpl(string):
    string = string.replace('yearOfRegistration', 'registration_year')
    string = string.replace('monthOfRegistration', 'registration_month')
    string = string.replace('notRepairedDamage', 'unrepaired_damage')
    string = string.replace('dateCreated', 'ad_created')
    string = to_snakecase(string)
    return string

In [86]:
new_columns = []
for c in autos.columns:
    new_columns.append(rpl(c))

In [87]:
autos.columns = new_columns

In [88]:
autos.head()

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,registration_year,gearbox,power_p_s,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [89]:
autos.describe(include='all')

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,registration_year,gearbox,power_p_s,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-29 23:42:13,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


In [90]:
autos['price'].value_counts()

$0          1421
$500         781
$1,500       734
$2,500       643
$1,200       639
$1,000       639
$600         531
$800         498
$3,500       498
$2,000       460
$999         434
$750         433
$900         420
$650         419
$850         410
$700         395
$4,500       394
$300         384
$2,200       382
$950         379
$1,100       376
$1,300       371
$3,000       365
$550         356
$1,800       355
$5,500       340
$1,250       335
$350         335
$1,600       327
$1,999       322
            ... 
$10,080        1
$11,790        1
$3,129         1
$3,080         1
$23,890        1
$3,795         1
$37,400        1
$3,580         1
$16,190        1
$29,600        1
$64,999        1
$115,000       1
$220,000       1
$20,980        1
$4,222         1
$23,650        1
$72,500        1
$6,444         1
$1,856         1
$18,470        1
$75,997        1
$31,450        1
$8,730         1
$6,940         1
$34,890        1
$4,655         1
$785           1
$4,780        

In [91]:
autos['price']=autos['price'].str.replace('$','')
autos['price']=autos['price'].str.replace(',','').astype(int)

In [92]:
autos['odometer'].value_counts()

150,000km    32424
125,000km     5170
100,000km     2169
90,000km      1757
80,000km      1436
70,000km      1230
60,000km      1164
50,000km      1027
5,000km        967
40,000km       819
30,000km       789
20,000km       784
10,000km       264
Name: odometer, dtype: int64

In [93]:
autos['odometer']=autos['odometer'].str.replace('km','')
autos['odometer']=autos['odometer'].str.replace(',','').astype(int)

In [94]:
autos.rename({'odometer': 'odometer_km'}, axis=1, inplace=True)

In [95]:
autos.columns

Index(['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'registration_year', 'gearbox', 'power_p_s', 'model',
       'odometer_km', 'registration_month', 'fuel_type', 'brand',
       'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
       'last_seen'],
      dtype='object')

In [96]:
autos['price'].unique().shape

(2357,)

In [97]:
autos['price'].describe()

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64

In [98]:
autos['price'].value_counts().sort_index(ascending=False)

99999999       1
27322222       1
12345678       3
11111111       2
10000000       1
3890000        1
1300000        1
1234566        1
999999         2
999990         1
350000         1
345000         1
299000         1
295000         1
265000         1
259000         1
250000         1
220000         1
198000         1
197000         1
194000         1
190000         1
180000         1
175000         1
169999         1
169000         1
163991         1
163500         1
155000         1
151990         1
            ... 
66             1
65             5
60             9
59             1
55             2
50            49
49             4
47             1
45             4
40             6
35             1
30             7
29             1
25             5
20             4
18             1
17             3
15             2
14             1
13             2
12             3
11             2
10             7
9              1
8              1
5              2
3              1
2             

We see that there are some outliers such as 99999999 or 11111111

In [99]:
autos=autos[(autos['price'].between(1,350000))]

In [100]:
autos['odometer_km'].describe()

count     48565.000000
mean     125770.101925
std       39788.636804
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64

In [101]:
autos['odometer_km'].value_counts().sort_index(ascending=False)

150000    31414
125000     5057
100000     2115
90000      1734
80000      1415
70000      1217
60000      1155
50000      1012
40000       815
30000       780
20000       762
10000       253
5000        836
Name: odometer_km, dtype: int64

In [102]:
autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index

<bound method Series.sort_index of 2016-04-03    0.038608
2016-03-20    0.037887
2016-03-21    0.037373
2016-03-12    0.036920
2016-03-14    0.036549
2016-04-04    0.036487
2016-03-07    0.036014
2016-04-02    0.035478
2016-03-28    0.034860
2016-03-19    0.034778
2016-03-15    0.034284
2016-03-29    0.034099
2016-03-30    0.033687
2016-04-01    0.033687
2016-03-08    0.033296
2016-03-09    0.033090
2016-03-22    0.032987
2016-03-11    0.032575
2016-03-23    0.032225
2016-03-26    0.032204
2016-03-10    0.032184
2016-03-31    0.031834
2016-03-17    0.031628
2016-03-25    0.031607
2016-03-27    0.031092
2016-03-16    0.029610
2016-03-24    0.029342
2016-03-05    0.025327
2016-03-13    0.015670
2016-03-06    0.014043
2016-04-05    0.013096
2016-03-18    0.012911
2016-04-06    0.003171
2016-04-07    0.001400
Name: date_crawled, dtype: float64>

In [103]:
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index

<bound method Series.sort_index of 2016-04-03    0.038855
2016-03-20    0.037949
2016-03-21    0.037579
2016-04-04    0.036858
2016-03-12    0.036755
2016-03-14    0.035190
2016-04-02    0.035149
2016-03-28    0.034984
2016-03-07    0.034737
2016-03-29    0.034037
2016-03-15    0.034016
2016-03-19    0.033687
2016-04-01    0.033687
2016-03-30    0.033501
2016-03-08    0.033316
2016-03-09    0.033151
2016-03-11    0.032904
2016-03-22    0.032801
2016-03-26    0.032266
2016-03-23    0.032060
2016-03-10    0.031895
2016-03-31    0.031875
2016-03-25    0.031751
2016-03-17    0.031278
2016-03-27    0.030989
2016-03-16    0.030125
2016-03-24    0.029280
2016-03-05    0.022897
2016-03-13    0.017008
2016-03-06    0.015320
                ...   
2016-02-24    0.000041
2016-02-20    0.000041
2016-02-26    0.000041
2016-02-05    0.000041
2016-02-02    0.000041
2016-02-18    0.000041
2016-01-10    0.000041
2016-02-14    0.000041
2016-02-12    0.000041
2015-11-10    0.000021
2015-08-10    0.000021

In [104]:
autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index

<bound method Series.sort_index of 2016-04-06    0.221806
2016-04-07    0.131947
2016-04-05    0.124761
2016-03-17    0.028086
2016-04-03    0.025203
2016-04-02    0.024915
2016-03-30    0.024771
2016-04-04    0.024483
2016-03-12    0.023783
2016-03-31    0.023783
2016-04-01    0.022794
2016-03-29    0.022341
2016-03-22    0.021373
2016-03-28    0.020859
2016-03-20    0.020653
2016-03-21    0.020632
2016-03-24    0.019767
2016-03-25    0.019211
2016-03-23    0.018532
2016-03-26    0.016802
2016-03-16    0.016452
2016-03-15    0.015876
2016-03-19    0.015834
2016-03-27    0.015649
2016-03-14    0.012602
2016-03-11    0.012375
2016-03-10    0.010666
2016-03-09    0.009595
2016-03-13    0.008895
2016-03-08    0.007413
2016-03-18    0.007351
2016-03-07    0.005395
2016-03-06    0.004324
2016-03-05    0.001071
Name: last_seen, dtype: float64>

The last day when the listing was seen, so either the car was sold, or the seller decided not to sell it anymore.

In [105]:
autos['registration_year'].describe()

count    48565.000000
mean      2004.755421
std         88.643887
min       1000.000000
25%       1999.000000
50%       2004.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

In [106]:
autos['registration_year'].value_counts().sort_index(ascending=False)

9999       3
9000       1
8888       1
6200       1
5911       1
5000       4
4800       1
4500       1
4100       1
2800       1
2019       2
2018     470
2017    1392
2016    1220
2015     392
2014     663
2013     803
2012    1310
2011    1623
2010    1589
2009    2085
2008    2215
2007    2277
2006    2670
2005    2936
2004    2703
2003    2699
2002    2486
2001    2636
2000    3156
        ... 
1964      12
1963       8
1962       4
1961       6
1960      23
1959       6
1958       4
1957       2
1956       4
1955       2
1954       2
1953       1
1952       1
1951       2
1950       3
1948       1
1943       1
1941       2
1939       1
1938       1
1937       4
1934       2
1931       1
1929       1
1927       1
1910       5
1800       2
1111       1
1001       1
1000       1
Name: registration_year, Length: 95, dtype: int64

We see that there are some outliers, so we want to limit our cars to have registered from 1910 to 2016, even though it's hard to define a proper start year.

In [107]:
autos=autos[(autos['registration_year'].between(1910,2016))]

In [108]:
autos['registration_year'].value_counts(normalize=True)

2000    0.067608
2005    0.062895
1999    0.062060
2004    0.057904
2003    0.057818
2006    0.057197
2001    0.056468
2002    0.053255
1998    0.050620
2007    0.048778
2008    0.047450
2009    0.044665
1997    0.041794
2011    0.034768
2010    0.034040
1996    0.029412
2012    0.028063
1995    0.026285
2016    0.026135
2013    0.017202
2014    0.014203
1994    0.013474
1993    0.009104
2015    0.008397
1992    0.007926
1990    0.007433
1991    0.007262
1989    0.003727
1988    0.002892
1985    0.002035
          ...   
1966    0.000471
1976    0.000450
1969    0.000407
1975    0.000386
1965    0.000364
1964    0.000257
1963    0.000171
1959    0.000129
1961    0.000129
1910    0.000107
1956    0.000086
1958    0.000086
1937    0.000086
1962    0.000086
1950    0.000064
1954    0.000043
1941    0.000043
1951    0.000043
1934    0.000043
1957    0.000043
1955    0.000043
1953    0.000021
1943    0.000021
1929    0.000021
1939    0.000021
1938    0.000021
1948    0.000021
1927    0.0000

In [114]:
brand_mean_price = {}

brands = autos['brand'].unique()

for b in brands:
    b_brand = autos[autos['brand']==b]
    mean_b_price = b_brand['price'].mean()
    brand_mean_price[b] = mean_b_price
    

In [117]:
brand_mean_price

{'alfa_romeo': 4087.690322580645,
 'audi': 9336.687453600594,
 'bmw': 8332.820517811953,
 'chevrolet': 6684.139097744361,
 'chrysler': 3465.743902439024,
 'citroen': 3779.1391437308866,
 'dacia': 5915.528455284553,
 'daewoo': 1049.0,
 'daihatsu': 1636.1965811965813,
 'fiat': 2813.748538011696,
 'ford': 3749.4695065890287,
 'honda': 4107.857923497268,
 'hyundai': 5365.254273504273,
 'jaguar': 11635.493150684932,
 'jeep': 11650.5,
 'kia': 5982.330303030303,
 'lada': 2688.296296296296,
 'lancia': 3376.22,
 'land_rover': 19108.091836734693,
 'mazda': 4112.596614950635,
 'mercedes_benz': 8628.450366422385,
 'mini': 10613.459657701711,
 'mitsubishi': 3394.5729166666665,
 'nissan': 4743.40252454418,
 'opel': 2975.2419354838707,
 'peugeot': 3094.0172290021537,
 'porsche': 45643.93706293706,
 'renault': 2474.8646069968195,
 'rover': 1602.2903225806451,
 'saab': 3211.6493506493507,
 'seat': 4397.230949589683,
 'skoda': 6368.0,
 'smart': 3580.2239031770046,
 'sonstige_autos': 12338.550218340612,


We can clearly see that some luxury car brands have a higher mean price such as audi or bmw.

In [119]:
brand_mean_mileage = {}

for b in brands:
    b_brand = autos[autos['brand']==b]
    mean_b_mileage = b_brand['odometer_km'].mean()
    brand_mean_mileage[b] = mean_b_mileage
    

In [121]:
brand_mean_mileage

{'alfa_romeo': 131338.70967741936,
 'audi': 129157.38678544914,
 'bmw': 132572.51313996495,
 'chevrolet': 99191.72932330827,
 'chrysler': 132378.0487804878,
 'citroen': 119694.18960244648,
 'dacia': 84268.29268292683,
 'daewoo': 121642.85714285714,
 'daihatsu': 116410.2564102564,
 'fiat': 117121.9715956558,
 'ford': 124266.01287159056,
 'honda': 122172.13114754099,
 'hyundai': 106442.30769230769,
 'jaguar': 124178.08219178082,
 'jeep': 127122.64150943396,
 'kia': 112530.30303030302,
 'lada': 83518.51851851853,
 'lancia': 121900.0,
 'land_rover': 118010.20408163265,
 'mazda': 124464.03385049365,
 'mercedes_benz': 130788.36331334666,
 'mini': 88105.13447432763,
 'mitsubishi': 126575.52083333333,
 'nissan': 118330.99579242637,
 'opel': 129310.0358422939,
 'peugeot': 127153.62526920316,
 'porsche': 96853.14685314686,
 'renault': 128071.33121308497,
 'rover': 137661.29032258064,
 'saab': 144415.58441558442,
 'seat': 121131.30128956624,
 'skoda': 110848.5639686684,
 'smart': 99326.7776096822

We see that the miles do not vary as much as the price. Almost all are within 90k to 130k.