## Data Cleaning Project: eBay Kleinanzeigen

**The dataset contains 50,000 data points, taken from Kaggle, that were scraped from a German eBay website.**

In [1]:
import pandas as pd
import numpy as np

** Reading dataset with pandas, UTF-8 not working, used Latin-1 instead **

In [2]:
# Load the scraped listings; decoding as UTF-8 did not work for this file,
# so Latin-1 is used instead.
autos = pd.read_csv(
    "autos.csv",
    encoding="Latin-1",
)

In [3]:
# Preview the first five rows of the raw DataFrame.
autos.head(n=5)

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
# Print each column's dtype and non-null count to spot missing values.
autos.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

** 1) There are 20 columns: 15 object (strings) and 5 integer columns **

**2) 5 columns have null values; "notRepairedDamage" has the most, with 9,829 nulls (50,000 − 40,171 non-null).**

In [5]:
# Show the original (camelCase) column labels before renaming them.
autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

**Switching column names from camelCase to snake_case**

* The change was done to make it easier to locate the column names, and make it more readable 

In [6]:
# Rename the camelCase columns to snake_case (a few are also reworded).
# An explicit old -> new mapping is used instead of assigning a positional list:
# it does not depend on the column order of the CSV, leaves unlisted columns
# untouched, and is a no-op when the cell is re-run.
autos = autos.rename(
    columns={
        "dateCrawled": "date_crawled",
        "offerType": "offer_type",
        "vehicleType": "vehicle_type",
        "yearOfRegistration": "registration_year",
        "powerPS": "power_ps",
        "monthOfRegistration": "registration_month",
        "fuelType": "fuel_type",
        "notRepairedDamage": "unrepaired_damage",
        "dateCreated": "ad_created",
        "nrOfPictures": "nr_of_pictures",
        "postalCode": "postal_code",
        "lastSeen": "last_seen",
    }
)

In [7]:
# Summary statistics for every column, numeric and non-numeric alike.
autos.describe(include="all")

Unnamed: 0,date_crawled,name,seller,offer_type,price,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-30 17:37:35,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


In [8]:
# Count how often each distinct value occurs in nr_of_pictures.
autos.loc[:, "nr_of_pictures"].value_counts()

0    50000
Name: nr_of_pictures, dtype: int64

** 3 columns need to be dropped: **
* name and seller have same values 
* nr_of_pictures carries no information: all 50,000 rows have the value 0

** Dropping name, seller, and nr_of_pictures columns **

In [9]:
# Drop the columns judged uninformative for this analysis.
# The result is assigned back instead of using inplace=True, and columns that
# are already gone are ignored, so re-running the cell does not raise KeyError.
# NOTE(review): describe() shows `seller` and `offer_type` as the near-constant
# columns (49,999 identical values), while `name` has 38,754 distinct values —
# confirm that `name` (rather than `offer_type`) is the intended drop.
autos = autos.drop(columns=["name", "seller", "nr_of_pictures"], errors="ignore")

** Converting price and odometer columns, from object(string) to int **


In [10]:
# Strip the currency symbol and the thousands separators, then cast to integer.
# regex=False makes "$" a literal character: read as a regular expression it is
# the end-of-string anchor, and the default of `regex` differs between pandas
# versions, so the literal behaviour is requested explicitly.
autos["price"] = (
    autos["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [11]:
# Remove the thousands separators and the "km" unit suffix, then cast to integer.
odometer_text = autos["odometer"].str.replace(",", "").str.replace("km", "")
autos["odometer"] = odometer_text.astype(int)

** Renaming "odometer" column to "odometer_km" **

In [12]:
# Record the unit in the column name now that the values are plain integers.
# The result is assigned back rather than mutating with inplace=True, and
# `columns=` states the axis explicitly.
autos = autos.rename(columns={"odometer": "odometer_km"})

# Analyzing the *odometer_km* and *price* columns for abnormal values

* Odometer_km: 
    * There are 13 unique values
    * more than 50% of odometer_km are 150000

In [13]:
# Report how many distinct odometer readings exist, then their summary statistics.
odometer_km = autos["odometer_km"]
print("Unique values: ", odometer_km.unique().shape)
print(odometer_km.describe())

Unique values:  (13,)
count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64


In [14]:
# Frequency of each odometer reading, most common first.
odometer_counts = autos["odometer_km"].value_counts()
odometer_counts.sort_values(ascending=False)

150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
70000      1230
60000      1164
50000      1027
5000        967
40000       819
30000       789
20000       784
10000       264
Name: odometer_km, dtype: int64

* Price:
    * There are 2357 unique values
    * 1421 cars have price of 0
    

In [15]:
# Report how many distinct prices exist, then their summary statistics.
price = autos["price"]
print("Unique values: ", price.unique().shape)
print(price.describe())

Unique values:  (2357,)
count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64


In [16]:
# The ten most frequent prices.
price_frequency = autos["price"].value_counts()
price_frequency.sort_values(ascending=False).head(10)

0       1421
500      781
1500     734
2500     643
1000     639
1200     639
600      531
800      498
3500     498
2000     460
Name: price, dtype: int64

In [17]:
# Inspect both ends of the price range: the 20 highest and the 10 lowest
# distinct prices, each with the number of listings at that price.
price_counts = autos["price"].value_counts()
print(price_counts.sort_index(ascending=False).head(20))
print(price_counts.sort_index(ascending=True).head(10))

99999999    1
27322222    1
12345678    3
11111111    2
10000000    1
3890000     1
1300000     1
1234566     1
999999      2
999990      1
350000      1
345000      1
299000      1
295000      1
265000      1
259000      1
250000      1
220000      1
198000      1
197000      1
Name: price, dtype: int64
0     1421
1      156
2        3
3        1
5        2
8        1
9        1
10       7
11       2
12       3
Name: price, dtype: int64


* Cars priced at 0 will be deleted, and also cars priced at above 350000 will be deleted 

In [18]:
# Keep only listings priced from 1 to 350,000 (both bounds inclusive).
# The upper bound is the stated cutoff itself: the previous value of 351000
# would also have kept prices in the range 350,001-351,000.
autos = autos[autos["price"].between(1, 350000)]

In [19]:
# Re-check the price distribution after removing the out-of-range listings.
autos["price"].describe()

count     48565.000000
mean       5888.935591
std        9059.854754
min           1.000000
25%        1200.000000
50%        3000.000000
75%        7490.000000
max      350000.000000
Name: price, dtype: float64

## Date columns

In [20]:
# Preview the first five rows of the three timestamp columns (stored as strings).
date_columns = ["date_crawled", "ad_created", "last_seen"]
autos[date_columns].head(5)

Unnamed: 0,date_crawled,ad_created,last_seen
0,2016-03-26 17:47:46,2016-03-26 00:00:00,2016-04-06 06:45:54
1,2016-04-04 13:38:56,2016-04-04 00:00:00,2016-04-06 14:45:08
2,2016-03-26 18:57:24,2016-03-26 00:00:00,2016-04-06 20:15:37
3,2016-03-12 16:58:10,2016-03-12 00:00:00,2016-03-15 03:16:28
4,2016-04-01 14:38:50,2016-04-01 00:00:00,2016-04-01 14:38:50


** Sort by Date added by Crawler **
* There are 34 different dates, starting at 3/5/16 - 4/7/16
* The last two days have the least entries

In [21]:
# Share of listings per crawl date: keep only the "YYYY-MM-DD" prefix of each
# timestamp, compute relative frequencies (missing values included), and order
# the result chronologically.
date_crawled = (
    autos["date_crawled"]
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
print("Sort - Cont by Crawler date")
print("Amount of dates: ", date_crawled.count())
print(date_crawled)

Sort - Cont by Crawler date
Amount of dates:  34
2016-03-05    0.025327
2016-03-06    0.014043
2016-03-07    0.036014
2016-03-08    0.033296
2016-03-09    0.033090
2016-03-10    0.032184
2016-03-11    0.032575
2016-03-12    0.036920
2016-03-13    0.015670
2016-03-14    0.036549
2016-03-15    0.034284
2016-03-16    0.029610
2016-03-17    0.031628
2016-03-18    0.012911
2016-03-19    0.034778
2016-03-20    0.037887
2016-03-21    0.037373
2016-03-22    0.032987
2016-03-23    0.032225
2016-03-24    0.029342
2016-03-25    0.031607
2016-03-26    0.032204
2016-03-27    0.031092
2016-03-28    0.034860
2016-03-29    0.034099
2016-03-30    0.033687
2016-03-31    0.031834
2016-04-01    0.033687
2016-04-02    0.035478
2016-04-03    0.038608
2016-04-04    0.036487
2016-04-05    0.013096
2016-04-06    0.003171
2016-04-07    0.001400
Name: date_crawled, dtype: float64


** Sort by last seen added by Crawler **
* 34 different entries starting 3/5/2016 daily until 4/7/16

In [22]:
# Share of listings per last-seen date: keep only the "YYYY-MM-DD" prefix of
# each timestamp, compute relative frequencies (missing values included), and
# order the result chronologically.
last_seen = (
    autos["last_seen"]
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
print("Sort - Count by Crawler last seen")
print("Amount of dates: ", last_seen.count())
print(last_seen)

Sort - Count by Crawler last seen
Amount of dates:  34
2016-03-05    0.001071
2016-03-06    0.004324
2016-03-07    0.005395
2016-03-08    0.007413
2016-03-09    0.009595
2016-03-10    0.010666
2016-03-11    0.012375
2016-03-12    0.023783
2016-03-13    0.008895
2016-03-14    0.012602
2016-03-15    0.015876
2016-03-16    0.016452
2016-03-17    0.028086
2016-03-18    0.007351
2016-03-19    0.015834
2016-03-20    0.020653
2016-03-21    0.020632
2016-03-22    0.021373
2016-03-23    0.018532
2016-03-24    0.019767
2016-03-25    0.019211
2016-03-26    0.016802
2016-03-27    0.015649
2016-03-28    0.020859
2016-03-29    0.022341
2016-03-30    0.024771
2016-03-31    0.023783
2016-04-01    0.022794
2016-04-02    0.024915
2016-04-03    0.025203
2016-04-04    0.024483
2016-04-05    0.124761
2016-04-06    0.221806
2016-04-07    0.131947
Name: last_seen, dtype: float64


** Sort by Date created by Website **
* There are 76 different dates created by the Website
* Most entries are minimal for dates before 3/16
* Last two days 4/6/16 - 4/7/16 are also low on entries

In [23]:
# Share of listings per ad-creation date: keep only the "YYYY-MM-DD" prefix of
# each timestamp, compute relative frequencies (missing values included), and
# order the result chronologically.
ad_created = (
    autos["ad_created"]
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
print("Sort - Count by Website date")
print("Amount of dates: ", ad_created.count())
print(ad_created)

Sort - Count by Website date
Amount of dates:  76
2015-06-11    0.000021
2015-08-10    0.000021
2015-09-09    0.000021
2015-11-10    0.000021
2015-12-05    0.000021
2015-12-30    0.000021
2016-01-03    0.000021
2016-01-07    0.000021
2016-01-10    0.000041
2016-01-13    0.000021
2016-01-14    0.000021
2016-01-16    0.000021
2016-01-22    0.000021
2016-01-27    0.000062
2016-01-29    0.000021
2016-02-01    0.000021
2016-02-02    0.000041
2016-02-05    0.000041
2016-02-07    0.000021
2016-02-08    0.000021
2016-02-09    0.000021
2016-02-11    0.000021
2016-02-12    0.000041
2016-02-14    0.000041
2016-02-16    0.000021
2016-02-17    0.000021
2016-02-18    0.000041
2016-02-19    0.000062
2016-02-20    0.000041
2016-02-21    0.000062
                ...   
2016-03-09    0.033151
2016-03-10    0.031895
2016-03-11    0.032904
2016-03-12    0.036755
2016-03-13    0.017008
2016-03-14    0.035190
2016-03-15    0.034016
2016-03-16    0.030125
2016-03-17    0.031278
2016-03-18    0.013590
2016-03

* The previous 3 tables show that the crawled dates each account for a large number of ads. The website's ad-creation dates reach back well before 3/16, but those early dates probably contain only a single ad each. In comparison, the crawled data doesn't contain such single-ad dates.

* Furthermore, date_crawled shows a larger share of ads in the early days than last_seen does; conversely, last_seen shows a much larger share of ads in the last three days than date_crawled does.

In [24]:
# Summary statistics of the registration year, used to spot impossible values.
autos["registration_year"].describe()

count    48565.000000
mean      2004.755421
std         88.643887
min       1000.000000
25%       1999.000000
50%       2004.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

** The registration year column accounts for dates for the car's registration. The average registration year is 2004. **
* The minimum and maximum values are far off, however: the minimum (1000) is far too low and the maximum (9999) is far too high.
* Rows with such years need to be deleted
* A registration year later than the last_seen dates (2016) is inaccurate
* Only years ranging from 1900 to 2016 will be kept

In [31]:
# Keep only listings whose registration year lies in 1900-2016 (inclusive);
# everything outside that range is removed.
valid_year = autos["registration_year"].between(1900, 2016)
autos = autos[valid_year]

In [53]:
# Relative frequency of the ten most common registration years.
(
    autos["registration_year"]
    .value_counts(normalize=True)
    .head(10)
)

2000    0.067608
2005    0.062895
1999    0.062060
2004    0.057904
2003    0.057818
2006    0.057197
2001    0.056468
2002    0.053255
1998    0.050620
2007    0.048778
Name: registration_year, dtype: float64

* Most car registrations start appearing from 1994 onwards

## Brand Columns 

In [51]:
# Number of distinct brands, then each brand's share of the remaining listings
# (value_counts orders the shares from largest to smallest).
brands = autos["brand"]
print("Amount of unique values: ", brands.unique().size)
brand_counts = brands.value_counts(normalize=True)
brand_counts.head(12)

Amount of unique values:  40


volkswagen       0.211264
bmw              0.110045
opel             0.107581
mercedes_benz    0.096463
audi             0.086566
ford             0.069900
renault          0.047150
peugeot          0.029841
fiat             0.025642
seat             0.018273
skoda            0.016409
nissan           0.015274
Name: brand, dtype: float64

**Only 7 brands account for most of the ad data, with Volkswagen alone accounting for about 21%.**
* Only brands accounting for more than 4% of the ads will be used

In [60]:
# Names of the brands whose share of listings is strictly greater than 4%.
is_common = brand_counts > 0.04
brand_over_4 = brand_counts.loc[is_common].index
brand_over_4

Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford',
       'renault'],
      dtype='object')

** The following dictionary contains the name of the most popular brands as keys, and their respective mean price as value **
* Audi, BMW, and Mercedes-Benz are on average the most expensive autos
* Renault, Opel, and Ford are on average the least expensive autos

In [79]:
# Mean listing price per selected brand, truncated to whole units.
# A single groupby replaces the per-brand Python loop (one filter pass instead
# of one per brand, and no leftover loop variables in the notebook namespace).
# reindex() keeps the brands in the order of `brand_over_4`.
aggregate_price = (
    autos[autos["brand"].isin(brand_over_4)]
    .groupby("brand")["price"]
    .mean()
    .astype(int)
    .reindex(brand_over_4)
    .to_dict()
)

In [80]:
# Display the brand -> mean price mapping.
aggregate_price

{'audi': 9336,
 'bmw': 8332,
 'ford': 3749,
 'mercedes_benz': 8628,
 'opel': 2975,
 'renault': 2474,
 'volkswagen': 5402}

** The following dictionary accounts for odometer_km of the 7 most popular car brands **

In [87]:
# Mean odometer reading (km) per selected brand, truncated to whole kilometres.
# A single groupby replaces the per-brand Python loop (one filter pass instead
# of one per brand, and no leftover loop variables in the notebook namespace).
# reindex() keeps the brands in the order of `brand_over_4`.
aggregate_mileage = (
    autos[autos["brand"].isin(brand_over_4)]
    .groupby("brand")["odometer_km"]
    .mean()
    .astype(int)
    .reindex(brand_over_4)
    .to_dict()
)

In [88]:
# Display the brand -> mean mileage mapping.
aggregate_mileage

{'audi': 129157,
 'bmw': 132572,
 'ford': 124266,
 'mercedes_benz': 130788,
 'opel': 129310,
 'renault': 128071,
 'volkswagen': 128707}

** Converting both mean price and mean odometer to Series **

In [97]:
# Turn each brand -> mean mapping into a Series ordered from highest to lowest.
price_series = pd.Series(aggregate_price)
price_series = price_series.sort_values(ascending=False)

mileage_series = pd.Series(aggregate_mileage)
mileage_series = mileage_series.sort_values(ascending=False)

** Converting Series to DataFrame ** 

In [100]:
# Wrap each Series in a one-column DataFrame (brand as index) and print both,
# separated by blank lines.
price_df = price_series.to_frame(name="mean_price")
mileage_df = mileage_series.to_frame(name="mean_mileage")
print(price_df)
print("\n")
print(mileage_df)

               mean_price
audi                 9336
mercedes_benz        8628
bmw                  8332
volkswagen           5402
ford                 3749
opel                 2975
renault              2474


               mean_mileage
bmw                  132572
mercedes_benz        130788
opel                 129310
audi                 129157
volkswagen           128707
renault              128071
ford                 124266


** Creating a single table with both mean price and mean mileage ** 

In [103]:
# Add the mean price next to the mean mileage; the values are aligned on the
# brand index, so the row order of mileage_df is kept.
mileage_df = mileage_df.assign(mean_price=price_series)
mileage_df

Unnamed: 0,mean_mileage,mean_price
bmw,132572,8332
mercedes_benz,130788,8628
opel,129310,2975
audi,129157,9336
volkswagen,128707,5402
renault,128071,2474
ford,124266,3749


* The table above shows that the two brands with the highest mean mileage (BMW and Mercedes-Benz) are also among those with the highest mean price