# Analyzing used car listings on eBay Kleinanzeigen

The dataset to be analyzed was originally scraped and uploaded to Kaggle and contains 50,000 data points.

In [1]:
# importing pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# reading autos.csv
autos=pd.read_csv("autos.csv", encoding="Latin-1")

In [3]:
# exploring data set
autos.info()
autos.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


The autos.csv dataset contains 20 columns, most of which are strings. Some string type columns consist of 5-20% null values. From the first 5 rows, we can see that some columns that are strings should be integers intsead, such as odometer.

We will now explore the column names and clean where necessary.

In [4]:
# exploring column names
autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [5]:
# cleaning column names and reverting to snakecase
autos.columns=['date_crawled','name','seller','offer_type','price','ab_test','vehicle_type','registration_year','gearbox','power_ps','model','odometer','registration_month','fuel_type','brand','unrepaired_damage','ad_created','num_photos','postal_code','last_seen']
autos.head()

Unnamed: 0,date_crawled,name,seller,offer_type,price,ab_test,vehicle_type,registration_year,gearbox,power_ps,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_photos,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


Four columns were edited for clarity using underscores to split up words where necessary, while the rest of the column names were changed to snakecase.

Now, I will determine if any actual columns need to be cleaned.

In [6]:
# explording data columns
autos.describe(include="all")

Unnamed: 0,date_crawled,name,seller,offer_type,price,ab_test,vehicle_type,registration_year,gearbox,power_ps,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_photos,postal_code,last_seen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-19 17:36:18,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


In [7]:
autos.loc[:,"seller"].value_counts()

privat        49999
gewerblich        1
Name: seller, dtype: int64

In [8]:
autos.loc[:,"offer_type"].value_counts()

Angebot    49999
Gesuch         1
Name: offer_type, dtype: int64

In [9]:
autos.loc[:,"num_photos"].value_counts()

0    50000
Name: num_photos, dtype: int64

Both the "seller" and "offer_type" columns have basically only a single unique character (likely corresponding to each other). Additionally, there seems to be some issue with the "num_photos" column, which has no value other than 0. We can reasonably drop all three of these columns if we need to.

Next, we need to clean the price and odometer columns since they should be numeric value but are currently being stored as text.

In [10]:
# cleaning "price" column and changing dtype from string to int
autos.loc[:,"price"]=autos.loc[:,"price"].str.replace("$","").str.replace(",","").astype(int)

In [11]:
# cleaning "price" column and changing dtype from string to int
autos.loc[:,"odometer"]=autos.loc[:,"odometer"].str.replace(",","").str.replace("km","").astype(int)
autos.rename({"odometer":"odometer_km"}, axis=1, inplace=True)

In [12]:
# checking that column changes worked
autos.head()

Unnamed: 0,date_crawled,name,seller,offer_type,price,ab_test,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_photos,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,8990,test,limousine,2009,manuell,102,golf,70000,7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,4350,control,kleinwagen,2007,automatik,71,fortwo,70000,6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,1350,test,kombi,2003,manuell,0,focus,150000,7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [13]:
print(autos.loc[:,"odometer_km"].unique().shape,"\n")
print(autos.loc[:,"odometer_km"].describe(),"\n")
print(autos.loc[:,"odometer_km"].value_counts())

(13,) 

count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64 

150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
70000      1230
60000      1164
50000      1027
5000        967
40000       819
30000       789
20000       784
10000       264
Name: odometer_km, dtype: int64


In [14]:
print(autos.loc[:,"price"].unique().shape,"\n")
print(autos.loc[:,"price"].describe(),"\n")
print(autos.loc[:,"price"].value_counts().sort_index().head(25),"\n",autos.loc[:,"price"].value_counts().sort_index().tail(25))

(2357,) 

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64 

0     1421
1      156
2        3
3        1
5        2
8        1
9        1
10       7
11       2
12       3
13       2
14       1
15       2
17       3
18       1
20       4
25       5
29       1
30       7
35       1
40       6
45       4
47       1
49       4
50      49
Name: price, dtype: int64 
 169999      1
175000      1
180000      1
190000      1
194000      1
197000      1
198000      1
220000      1
250000      1
259000      1
265000      1
295000      1
299000      1
345000      1
350000      1
999990      1
999999      2
1234566     1
1300000     1
3890000     1
10000000    1
11111111    2
12345678    3
27322222    1
99999999    1
Name: price, dtype: int64


Odometer values seem reasonable, but we can see that there are some vehicles that are being listed at unrealistic prices. Since these are second-hand vehicles on an auction site, lower prices could be possible, even vehicles being given away for free, but the prices at the top-end are totally unfeasible.

Since prices seem to spike after $350,000, we will remove all listings that are above that price.

In [15]:
# removing listing with unreasonably high prices

autos=autos[autos.loc[:,"price"]<=350000]

Next, we will examine the date columns which are currently being stored as strings, which makes it impossible to perform arithmetic operations on them.

In [16]:
# exploring "date_crawled"
print(autos.loc[:,"date_crawled"].str[:10].value_counts(normalize=True, dropna=False).sort_index(),"\n")
print(autos.loc[:,"ad_created"].str[:10].value_counts(normalize=True, dropna=False).sort_index(),"\n")
print(autos.loc[:,"last_seen"].str[:10].value_counts(normalize=True, dropna=False).sort_index())

2016-03-05    0.025387
2016-03-06    0.013944
2016-03-07    0.035970
2016-03-08    0.033269
2016-03-09    0.033209
2016-03-10    0.032129
2016-03-11    0.032489
2016-03-12    0.036770
2016-03-13    0.015564
2016-03-14    0.036630
2016-03-15    0.033990
2016-03-16    0.029508
2016-03-17    0.031509
2016-03-18    0.013064
2016-03-19    0.034910
2016-03-20    0.037831
2016-03-21    0.037490
2016-03-22    0.032909
2016-03-23    0.032389
2016-03-24    0.029108
2016-03-25    0.031749
2016-03-26    0.032489
2016-03-27    0.031049
2016-03-28    0.034850
2016-03-29    0.034150
2016-03-30    0.033629
2016-03-31    0.031909
2016-04-01    0.033809
2016-04-02    0.035410
2016-04-03    0.038691
2016-04-04    0.036490
2016-04-05    0.013104
2016-04-06    0.003181
2016-04-07    0.001420
Name: date_crawled, dtype: float64 

2015-06-11    0.000020
2015-08-10    0.000020
2015-09-09    0.000020
2015-11-10    0.000020
2015-12-05    0.000020
                ...   
2016-04-03    0.038931
2016-04-04    0.0368

We can seee that the dates are spread quite evenly over a period of one month (~March 2016). No abnormalities spotted.

In [17]:
# exploring "registration_year" column
autos.loc[:,"registration_year"].describe()

count    49986.000000
mean      2005.075721
std        105.727161
min       1000.000000
25%       1999.000000
50%       2003.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

There are some abnormal values here with the minimum registration year being 1000 and the maximum year being 9999. I will examine the actual years that are very early to determine if the issue.

Practically, since cars were only invented in the late 19th century, 1900 is a decent starting point for a realistic registration_year value. Since the latest the ads were seen is 2016, that would be the appropriate ending point.

In [18]:
# calculating portion of 'incorrect' values
1-(autos.loc[:,"registration_year"].between(1900,2016).sum()/50000)

0.03968000000000005

As we can see, only about 4% of the rows are abnormal and so we can easily remove them.

In [19]:
# removing the incorrect values using a Boolean mask
autos=autos[autos.loc[:,"registration_year"].between(1900,2016)]
print(autos.loc[:,"registration_year"].value_counts(normalize=True).head(10),"\n")
print(autos.loc[:,"registration_year"].describe())

2000    0.069852
2005    0.062792
1999    0.062438
2004    0.057002
2003    0.056794
2006    0.056377
2001    0.056273
2002    0.052753
1998    0.051087
2007    0.047984
Name: registration_year, dtype: float64 

count    48016.000000
mean      2002.806002
std          7.306212
min       1910.000000
25%       1999.000000
50%       2003.000000
75%       2008.000000
max       2016.000000
Name: registration_year, dtype: float64


As we can see, the cleaned data only has 48,028 rows, with the earliest registration year now being 1910.

Next, we will explore the brand column and see if we can derive any interesting insights from the column.

In [20]:
# exploring brand column
autos.loc[:,"brand"].value_counts(normalize=True)

volkswagen        0.212117
bmw               0.110026
opel              0.108172
mercedes_benz     0.095364
audi              0.086409
ford              0.069768
renault           0.047359
peugeot           0.029532
fiat              0.025866
seat              0.018181
skoda             0.016036
mazda             0.015141
nissan            0.015099
smart             0.013912
citroen           0.013912
toyota            0.012475
sonstige_autos    0.010892
hyundai           0.009851
volvo             0.009247
mini              0.008643
mitsubishi        0.008143
honda             0.007852
kia               0.007102
alfa_romeo        0.006623
porsche           0.006102
suzuki            0.005915
chevrolet         0.005706
chrysler          0.003665
dacia             0.002562
daihatsu          0.002562
jeep              0.002249
subaru            0.002187
land_rover        0.002041
saab              0.001604
jaguar            0.001583
trabant           0.001562
daewoo            0.001500
r

We can see that Volkswagen is easily the most popular brand, with double the listings of the next most popular brand (Opel). To preserve the integrity of the analysis, we will only take into account brands with over 1% of total listings. However, we will exclude sonstige_autos since "sonstige" is German for "other", so we will use brands with over 1.1% of total listings.

In [28]:
# creating list of brands with over 1% of total listings
brand_popularity=autos.loc[:,"brand"].value_counts(normalize=True)
popular_brands=brand_popularity[brand_popularity>0.011].index
print(popular_brands)

Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford', 'renault',
       'peugeot', 'fiat', 'seat', 'skoda', 'mazda', 'nissan', 'smart',
       'citroen', 'toyota'],
      dtype='object')


In [29]:
# looping over selected brands
mean_price={}

for a in popular_brands:
    brand_mean_price=(autos[autos.loc[:,"brand"]==a]).loc[:,"price"].mean()
    mean_price[a]=int(brand_mean_price)
    
mean_price

{'volkswagen': 5231,
 'bmw': 8102,
 'opel': 2876,
 'mercedes_benz': 8485,
 'audi': 9093,
 'ford': 3652,
 'renault': 2395,
 'peugeot': 3039,
 'fiat': 2711,
 'seat': 4296,
 'skoda': 6334,
 'mazda': 4010,
 'nissan': 4664,
 'smart': 3542,
 'citroen': 3699,
 'toyota': 5115}

From our analysis of mean prices for each brand, we can see that BMW, Mercedes-Benz, and Audi are the most expensive brands, while cheaper brands include Opel, Renault and Fiat. Volkswagen looks to be solid mid-range brand.

Finally, we will explore the relationship between average price and average mileage for each brand to determine if they are related.

In [31]:
# converting mean_price dictionary to a df
mean_price_series=pd.Series(mean_price)
print(mean_price_series)

volkswagen       5231
bmw              8102
opel             2876
mercedes_benz    8485
audi             9093
ford             3652
renault          2395
peugeot          3039
fiat             2711
seat             4296
skoda            6334
mazda            4010
nissan           4664
smart            3542
citroen          3699
toyota           5115
dtype: int64


In [46]:
# calculating mean mileage for each popular brand and converting to a df
mean_mileage={}

for a in popular_brands:
    brand_mean_mileage=(autos[autos.loc[:,"brand"]==a]).loc[:,"odometer_km"].mean()
    mean_mileage[a]=int(brand_mean_mileage)
    
mean_mileage_series=pd.Series(mean_mileage)
print(mean_mileage_series)

volkswagen       128724
bmw              132431
opel             129223
mercedes_benz    130856
audi             129287
ford             124068
renault          128183
peugeot          127136
fiat             116553
seat             121563
skoda            110954
mazda            124745
nissan           118572
smart             99595
citroen          119580
toyota           115709
dtype: int64


In [44]:
# creating dataframe with mean_price and mean_mileage
brand_price_vs_mileage=pd.DataFrame(mean_price_series, columns=["mean_price"])
brand_price_vs_mileage["mean_mileage"]=mean_mileage_series

brand_price_vs_mileage

Unnamed: 0,mean_price,mean_mileage
volkswagen,5231,128724
bmw,8102,132431
opel,2876,129223
mercedes_benz,8485,130856
audi,9093,129287
ford,3652,124068
renault,2395,128183
peugeot,3039,127136
fiat,2711,116553
seat,4296,121563


From our analysis above, we can see that there is little to no relationship between the price of a vehicle and the mileage across popular brands. Cheaper cars like Opel, Ford, and Renault get used just as much as expensive cars such as Mercedes-Benz, BMW, or Audi.