## This notebook contains:
### 01. Import libraries
### 02. Import and explore rent data
### 03. Deal with missing and irrelevant data
### 04. Investigate and correct anomalies
   #### 04a. Total rent anomalies
   #### 04b. Base rent anomalies
   #### 04c. Year constructed anomalies
   #### 04d. Living space (size) anomalies
   #### 04e. Number of rooms anomalies
   #### 04f. Floor anomalies
   #### 04g. PLZ anomalies
   #### 04h. Update and copy descriptive stats
### 05. Address mixed-type data
### 06. Check for duplicates
### 07. Create new price per unit variable
### 08. Final checks and export
   

# 01. Import libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

# 02. Import and explore rent data

In [2]:
# create shortcut for data imports
path = r'../rawData'


In [3]:
# import data 
df = pd.read_csv(os.path.join(path, 'immo_data.csv'), index_col = False)

In [4]:
df.head()

Unnamed: 0,regio1,serviceCharge,heatingType,telekomTvOffer,telekomHybridUploadSpeed,newlyConst,balcony,picturecount,pricetrend,telekomUploadSpeed,...,regio2,regio3,description,facilities,heatingCosts,energyEfficiencyClass,lastRefurbish,electricityBasePrice,electricityKwhPrice,date
0,Nordrhein_Westfalen,245.0,central_heating,ONE_YEAR_FREE,,False,False,6,4.62,10.0,...,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,Die Wohnung ist mit Laminat ausgelegt. Das Bad...,,,,,,May19
1,Rheinland_Pfalz,134.0,self_contained_central_heating,ONE_YEAR_FREE,,False,True,8,3.47,10.0,...,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,,,,2019.0,,,May19
2,Sachsen,255.0,floor_heating,ONE_YEAR_FREE,10.0,True,True,8,2.72,2.4,...,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,"* 9 m² Balkon\n* Bad mit bodengleicher Dusche,...",,,,,,Oct19
3,Sachsen,58.15,district_heating,ONE_YEAR_FREE,,False,True,9,1.53,40.0,...,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,,87.23,,,,,May19
4,Bremen,138.0,self_contained_central_heating,,,False,True,19,2.46,,...,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,Diese Wohnung wurde neu saniert und ist wie fo...,,,,,,Feb20


In [5]:
df.shape

(268850, 49)

In [6]:
# on how many distinct dates were data collected?
df['date'].describe()

count     268850
unique         4
top        Feb20
freq       79276
Name: date, dtype: object

In [7]:
# are all states represented in the data?
df['regio1'].unique()

array(['Nordrhein_Westfalen', 'Rheinland_Pfalz', 'Sachsen', 'Bremen',
       'Schleswig_Holstein', 'Baden_Württemberg', 'Thüringen', 'Hessen',
       'Niedersachsen', 'Bayern', 'Hamburg', 'Sachsen_Anhalt',
       'Mecklenburg_Vorpommern', 'Berlin', 'Brandenburg', 'Saarland'],
      dtype=object)

In [8]:
pd.options.display.max_columns = 40

# 03. Deal with missing and irrelevant data

In [9]:
# are there any entirely blank rows in the df?
df.isnull().all(axis=1).sum()

0

In [10]:
# count NaN in each row
df.isna().sum()

regio1                           0
serviceCharge                 6909
heatingType                  44856
telekomTvOffer               32619
telekomHybridUploadSpeed    223830
newlyConst                       0
balcony                          0
picturecount                     0
pricetrend                    1832
telekomUploadSpeed           33358
totalRent                    40517
yearConstructed              57045
scoutId                          0
noParkSpaces                175798
firingTypes                  56964
hasKitchen                       0
geo_bln                          0
cellar                           0
yearConstructedRange         57045
baseRent                         0
houseNumber                  71018
livingSpace                      0
geo_krs                          0
condition                    68489
interiorQual                112665
petsAllowed                 114573
street                           0
streetPlain                  71013
lift                

I'll delete all columns with over 33% missing data, columns that contain the same information as other columns, and columns that aren't relevant to my analysis.

In [11]:
df_trim = df.drop(['serviceCharge', 'telekomTvOffer', 'telekomHybridUploadSpeed', 'telekomUploadSpeed', 'noParkSpaces', 'interiorQual', 'geo_bln', 'geo_krs', 'petsAllowed', 'thermalChar', 'numberOfFloors', 'heatingCosts', 'energyEfficiencyClass', 'houseNumber', 'streetPlain', 'lastRefurbish', 'electricityBasePrice', 'electricityKwhPrice', 'facilities'], axis=1)


In [12]:
# check work
df_trim.head()

Unnamed: 0,regio1,heatingType,newlyConst,balcony,picturecount,pricetrend,totalRent,yearConstructed,scoutId,firingTypes,hasKitchen,cellar,yearConstructedRange,baseRent,livingSpace,condition,street,lift,baseRentRange,typeOfFlat,geo_plz,noRooms,floor,noRoomsRange,garden,livingSpaceRange,regio2,regio3,description,date
0,Nordrhein_Westfalen,central_heating,False,False,6,4.62,840.0,1965.0,96107057,oil,False,True,2.0,595.0,86.0,well_kept,Sch&uuml;ruferstra&szlig;e,False,4,ground_floor,44269,4.0,1.0,4,True,4,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,May19
1,Rheinland_Pfalz,self_contained_central_heating,False,True,8,3.47,,1871.0,111378734,gas,False,False,1.0,800.0,89.0,refurbished,no_information,False,5,ground_floor,67459,3.0,,3,False,4,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,May19
2,Sachsen,floor_heating,True,True,8,2.72,1300.0,2019.0,113147523,,False,True,9.0,965.0,83.8,first_time_use,Turnerweg,True,6,apartment,1097,3.0,3.0,3,False,4,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,Oct19
3,Sachsen,district_heating,False,True,9,1.53,,1964.0,108890903,district_heating,False,False,2.0,343.0,58.15,,Gl&uuml;ck-Auf-Stra&szlig;e,False,2,other,9599,3.0,3.0,3,False,2,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,May19
4,Bremen,self_contained_central_heating,False,True,19,2.46,903.0,1950.0,114751222,gas,False,False,1.0,765.0,84.97,refurbished,Hermann-Henrich-Meier-Allee,False,5,apartment,28213,3.0,1.0,3,False,4,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,Feb20


# 04. Investigate and correct anomalies

In [13]:
# rearrange columns to prioritize most important info
df_trim = df_trim[['regio1', 'baseRent', 'totalRent', 'pricetrend', 'livingSpace', 'geo_plz', 'picturecount', 'newlyConst', 'yearConstructed', 'scoutId', 'firingTypes', 'heatingType', 'hasKitchen', 'cellar', 'condition', 'street', 'lift', 'typeOfFlat', 'noRooms', 'floor', 'balcony', 'garden',
'baseRentRange', 'noRoomsRange', 'livingSpaceRange', 'yearConstructedRange', 'regio2', 'regio3', 'description', 'date']]


In [14]:
df_trim.head()

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
0,Nordrhein_Westfalen,595.0,840.0,4.62,86.0,44269,6,False,1965.0,96107057,oil,central_heating,False,True,well_kept,Sch&uuml;ruferstra&szlig;e,False,ground_floor,4.0,1.0,False,True,4,4,4,2.0,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,May19
1,Rheinland_Pfalz,800.0,,3.47,89.0,67459,8,False,1871.0,111378734,gas,self_contained_central_heating,False,False,refurbished,no_information,False,ground_floor,3.0,,True,False,5,3,4,1.0,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,May19
2,Sachsen,965.0,1300.0,2.72,83.8,1097,8,True,2019.0,113147523,,floor_heating,False,True,first_time_use,Turnerweg,True,apartment,3.0,3.0,True,False,6,3,4,9.0,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,Oct19
3,Sachsen,343.0,,1.53,58.15,9599,9,False,1964.0,108890903,district_heating,district_heating,False,False,,Gl&uuml;ck-Auf-Stra&szlig;e,False,other,3.0,3.0,True,False,2,3,2,2.0,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,May19
4,Bremen,765.0,903.0,2.46,84.97,28213,19,False,1950.0,114751222,gas,self_contained_central_heating,False,False,refurbished,Hermann-Henrich-Meier-Allee,False,apartment,3.0,1.0,True,False,5,3,4,1.0,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,Feb20


In [15]:
# view descriptive statistics
df_trim.describe()

Unnamed: 0,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,yearConstructed,scoutId,noRooms,floor,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange
count,268850.0,228333.0,267018.0,268850.0,268850.0,268850.0,211805.0,268850.0,268850.0,217541.0,268850.0,268850.0,268850.0,211805.0
mean,694.1294,901.3315,3.389001,74.355548,37283.022235,9.791958,1966.40059,106969700.0,2.641261,2.122405,3.765256,2.571542,3.07079,3.714544
std,19536.02,33238.33,1.964874,254.759208,27798.037296,6.408399,46.992207,12500930.0,2.63344,3.634934,2.214357,0.937594,1.407127,2.738134
min,0.0,0.0,-12.33,0.0,852.0,0.0,1000.0,28871740.0,1.0,-1.0,1.0,1.0,1.0,1.0
25%,338.0,469.8,2.0,54.0,9128.0,6.0,1950.0,106691000.0,2.0,1.0,2.0,2.0,2.0,1.0
50%,490.0,650.0,3.39,67.32,38667.0,9.0,1973.0,111158400.0,3.0,2.0,3.0,3.0,3.0,3.0
75%,799.0,985.0,4.57,87.0,57072.0,13.0,1996.0,113768800.0,3.0,3.0,5.0,3.0,4.0,5.0
max,9999999.0,15751540.0,14.92,111111.0,99998.0,121.0,2090.0,115711700.0,999.99,999.0,9.0,5.0,7.0,9.0


Descriptive statistics show a number of issues, namely suspiciously low and high values in baseRent, totalRent, livingSpace, yearsConstructed, noRooms (high numbers only) and floor (high numbers only).
The basic procedure for each check involves locating very low or high values, copying them to an Excel spreadsheet, and using other column values (particularly the description) to see if the value appears plausible.
I'll investigate high values in totalRent first. 

## 04a. Total rent anomalies

In [16]:
df_trim[df_trim['totalRent']> 10000.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
8406,Baden_Württemberg,1234567.0,1234567.0,3.9,76.0,68789,13,False,,107360251,,,True,False,,Kastanienweg,False,apartment,4.0,,True,False,9,4,3,,Rhein_Neckar_Kreis,Sankt_Leon_Rot,Komplett eingerichtete Wohnung in 68789 St. Le...,Sep18
14123,Hessen,12000.0,13400.0,5.13,600.0,60322,2,False,1900.0,98567656,gas,self_contained_central_heating,False,False,negotiable,Auf der K&ouml;rnerwiese,True,maisonette,15.0,4.0,True,False,9,5,7,1.0,Frankfurt_am_Main,Westend_Nord,"Über die hellen, mit Granit verkleideten Trepp...",Oct19
16832,Sachsen_Anhalt,460.29,63204.0,1.52,68.7,39108,17,False,1998.0,115270570,district_heating,district_heating,False,True,well_kept,G._Hauptmann_Str.,True,apartment,2.0,2.0,True,False,3,2,3,5.0,Magdeburg,Stadtfeld_Ost,Modernes und gepflegtes Mehrfamilienhaus in s...,Feb20
23490,Schleswig_Holstein,820.0,108000.0,3.9,86.61,22927,10,False,1979.0,114307164,oil,oil_heating,False,False,well_kept,Ahrensfelder Weg,False,apartment,2.5,,False,False,6,2,4,3.0,Stormarn_Kreis,Großhansdorf,Die angebotene 2½-Zimmer-Wohnung befindet sich...,Feb20
38143,Hamburg,10500.0,11900.0,,207.07,20457,20,True,2018.0,113610764,,district_heating,False,True,first_time_use,no_information,True,apartment,5.0,23.0,True,False,9,5,7,9.0,Hamburg,Hamb._Altstadt,"Ultimate luxury living! Ein Angebot, wie es se...",Oct19
50706,Niedersachsen,184.0,19519.0,3.61,16.0,30169,17,False,1851.0,49280854,district_heating,central_heating,True,True,fully_renovated,Oeltzenstra&szlig;e,False,,1.0,3.0,True,True,1,1,1,1.0,Hannover,Calenberger_Neustadt,Wir suchen einen engagierten Studenten (männli...,Sep18
57237,Bayern,13000.0,15000.0,4.89,269.0,80538,12,False,2016.0,112740899,,,True,False,,no_information,True,,6.0,,True,False,9,5,7,9.0,München,Lehel,Die high-end-Wohnung befindet sich in einem te...,Oct19
61094,Berlin,12000.0,15500.0,7.69,364.0,14195,19,False,2008.0,108319408,gas,gas_heating,True,True,,no_information,True,penthouse,4.0,3.0,True,True,9,4,7,7.0,Berlin,Schmargendorf_Wilmersdorf,Die Villa „Grunewaldherz“ ist ein modernes Geb...,May19
64109,Sachsen,9000.0,11250.0,3.8,387.0,4103,6,False,,96517534,natural_gas_light,central_heating,True,False,fully_renovated,no_information,False,,15.0,,False,False,9,5,7,,Leipzig,Lausen_Grünau,Unsere Firmenwohnungen sind nicht für Privatpe...,Feb20
71162,Bayern,10700.0,12900.0,7.43,277.0,80333,5,False,2008.0,105945172,,central_heating,True,True,mint_condition,no_information,True,apartment,4.0,4.0,True,False,9,4,7,7.0,München,Maxvorstadt,"Exclusiver ""City-Lifestyle"" \n\nExklusive Luxu...",Sep18


In [17]:
# create subset of entries with rent over 10,000
highRent = df_trim[df_trim['totalRent'] > 10000.0]

In [18]:
# copy to clipboard so I can view in Excel
highRent.to_clipboard()

I located a set of incorrect values that I'll need to impute with the median value for the specified region.

In [19]:
df_trim.loc[df_trim['regio1'] == 'Baden_Württemberg', 'totalRent'].median()

1020.0

In [20]:
df_trim.loc[df_trim['regio1'] == 'Sachsen_Anhalt', 'totalRent'].median()

464.0

In [21]:
df_trim.loc[df_trim['regio1'] == 'Schleswig_Holstein', 'totalRent'].median()

755.0

In [22]:
df_trim.loc[df_trim['regio1'] == 'Niedersachsen', 'totalRent'].median()

692.0

In [23]:
df_trim.loc[df_trim['regio1'] == 'Sachsen', 'totalRent'].median()

480.0

In [24]:
df_trim.loc[df_trim['regio1'] == 'Nordrhein_Westfalen', 'totalRent'].median()

660.0

In [25]:
df_trim.loc[df_trim['regio1'] == 'Berlin', 'totalRent'].median()

1124.0

In [26]:
df_trim.loc[df_trim['regio1'] == 'Rheinland_Pfalz', 'totalRent'].median()

780.0

In [27]:
# replace the incorrect values
df_trim = df_trim.replace({'totalRent': {1234567: 1020, 63204: 464, 108000: 755, 19519: 692, 11250: 480, 485350: 660, 51570: 1124, 1150900: 1020, 12900: 660, 64651: 660, 13500: 1124, 15751535: 780, 37600: 464, 1000000: 464}})

In [28]:
# view descriptive stats again to make sure the excessively high values are gone
df_trim['totalRent'].describe()

count    228333.000000
mean        813.664872
std         562.786596
min           0.000000
25%         469.800000
50%         650.000000
75%         984.450000
max       26500.000000
Name: totalRent, dtype: float64

In [29]:
# investigate low rent values
df_trim[df_trim['totalRent']< 150.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
1119,Sachsen,210.0,0.0,-0.65,49.80,9430,10,False,1973.0,94258021,gas,central_heating,False,True,,no_information,False,apartment,2.0,1.0,False,True,1,2,2,3.0,Erzgebirgskreis,Drebach,Diese gemütliche 2-Raumwohnung befindet sich i...,Oct19
1883,Nordrhein_Westfalen,295.0,0.0,4.28,40.00,45130,6,False,1964.0,111407942,electricity,night_storage_heater,False,True,well_kept,Brigittastr.,False,apartment,1.0,2.0,True,False,1,1,1,2.0,Essen,Rüttenscheid,2. Obergeschoss in einem gepflegten Mehrfamili...,May19
4209,Sachsen,245.0,0.0,0.00,45.00,9212,15,False,1930.0,105929235,gas,central_heating,False,True,,no_information,False,apartment,2.0,2.0,False,True,1,2,2,1.0,Zwickau_Kreis,Limbach_Oberfrohna,Im 2.Obergeschoss des schönen Mehrfamilienhaus...,May19
5035,Sachsen,240.0,0.0,0.97,50.00,8280,13,False,1960.0,113014387,gas,central_heating,False,True,well_kept,no_information,False,apartment,2.0,1.0,False,True,1,2,2,2.0,Erzgebirgskreis,Aue,Diese geräumige 2-Raumwohnung befindet sich im...,Feb20
5739,Sachsen,220.5,0.0,-0.65,63.11,9430,9,False,1989.0,94257790,electricity,self_contained_central_heating,False,True,,no_information,False,apartment,3.0,3.0,True,True,1,3,3,4.0,Erzgebirgskreis,Drebach,,Feb20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263216,Sachsen,250.0,3.0,0.00,46.00,8359,5,False,1950.0,110620002,gas,,False,True,,no_information,False,ground_floor,2.0,,False,False,1,2,2,1.0,Erzgebirgskreis,Breitenbrunn/Erzgebirge,Vermietet wird hier eine schöne und ruhig gele...,May19
266209,Nordrhein_Westfalen,385.0,0.0,2.99,62.00,45145,4,False,1950.0,110666549,gas,self_contained_central_heating,False,True,well_kept,Berlinerstr.,False,other,2.0,2.0,True,False,2,2,3,1.0,Essen,Frohnhausen,Gepflegtes Mehrfamilienhaus.\n\n- Wohnzimmer\n...,May19
267867,Nordrhein_Westfalen,400.0,0.0,1.98,48.56,42853,8,False,,113704814,gas,self_contained_central_heating,True,True,well_kept,no_information,False,apartment,2.0,4.0,False,False,2,2,2,,Remscheid,Innenstadt,"Die Wohnung ist ein echter Volltreffer, denn s...",Oct19
268067,Baden_Württemberg,1620.0,0.0,,124.00,69115,5,False,2014.0,106732585,,district_heating,False,False,mint_condition,no_information,False,apartment,4.0,3.0,False,False,8,4,6,8.0,Heidelberg,Bahnstadt,Diese geräumige und charmante 4- Zimmer-Wohnun...,Sep18


In [30]:
# for values of zero, insert value from the base rent column
total_rent = df_trim['totalRent']
total_rent.replace(to_replace = 0,  method='ffill', inplace=True)

In [31]:
# check that no zeroes remain
df_trim[df_trim['totalRent'] == 0.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


Follow the same procedure for baseRent, replacing zeroes with totalRent values.

In [32]:
df_trim[df_trim['baseRent'] == 0.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
201,Sachsen,0.0,,0.00,45.34,8058,7,False,1900.0,108372054,gas,central_heating,False,False,fully_renovated,Franz-Mehring-Stra&szlig;e,False,apartment,2.0,1.0,False,False,1,2,2,1.0,Zwickau,Nordvorstadt,Sie sind ein fleißiger und handwerklich begabt...,May19
1377,Saarland,0.0,700.0,2.46,30.00,66740,7,False,1956.0,108643336,district_heating,,False,False,fully_renovated,no_information,False,apartment,1.0,,False,False,1,1,1,2.0,Saarlouis_Kreis,Saarlouis,Dieses Appartement besteht aus einem Wohn- Sch...,Feb20
2045,Nordrhein_Westfalen,0.0,731.5,3.30,77.00,59174,1,False,1902.0,115454960,gas,central_heating,False,False,well_kept,Bahnhofstr.,False,,2.5,,False,False,1,2,3,1.0,Unna_Kreis,Kamen,Das charmante Fachwerkhaus verfügt im Erdgesch...,Feb20
6911,Bayern,0.0,370.0,6.58,42.00,84166,7,False,,113267651,oil,central_heating,False,False,,no_information,False,ground_floor,1.0,,False,False,1,1,2,,Landshut_Kreis,Adlkofen,Dieses Teilmöblierte Appartement liegt in ruhi...,Oct19
11130,Rheinland_Pfalz,0.0,,3.86,20.00,67663,1,False,,112047388,,,False,False,,no_information,False,apartment,1.0,,False,False,1,1,1,,Kaiserslautern,Innenstadt,Wir finden für Sie 1 Zimmer - Apartments im Un...,Oct19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254055,Baden_Württemberg,0.0,470.0,4.29,0.00,74889,7,False,,107357543,,,True,False,first_time_use_after_refurbishment,no_information,False,ground_floor,2.0,0.0,False,False,1,2,1,,Rhein_Neckar_Kreis,Sinsheim,"*WICHTIGE INFO*:\nDie Raumhöhe liegt unter 2,3...",Sep18
261094,Schleswig_Holstein,0.0,,,120.00,24855,7,False,,63955810,gas,central_heating,False,False,well_kept,M&uuml;hlenstr.,False,ground_floor,5.0,,False,False,1,5,5,,Schleswig_Flensburg_Kreis,Bollingstedt,Wohnung langfristig zu vermieten.,Oct19
264041,Saarland,0.0,450.0,2.86,30.00,66740,8,False,,107318389,,district_heating,False,False,,no_information,False,ground_floor,1.0,,False,False,1,1,1,,Saarlouis_Kreis,Saarlouis,"Ein Wohn- Schlafraum, eine EBK mit Kühlschrank...",Sep18
265287,Saarland,0.0,,1.72,55.00,66333,0,False,,105621115,,floor_heating,False,False,,no_information,False,roof_storey,1.0,,False,False,1,1,2,,Stadtverband_Saarbrücken_Kreis,Völklingen,PREIS AUF ANFRAGE\n\nSchöne 2ZKB Wohnung in Vö...,Sep18


In [33]:
base_rent = df_trim['baseRent']
base_rent.replace(to_replace = 0,  method='bfill', inplace=True)

In [34]:
# check that no zeroes remain in base rent
df_trim[df_trim['baseRent'] == 0.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


In [35]:
# view remaining low rents
df_trim[df_trim['totalRent'] < 100.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
6601,Sachsen_Anhalt,35.0,35.0,1.64,0.0,6217,1,False,,100870699,,,False,False,well_kept,no_information,False,other,1.0,,False,False,1,1,1,,Saalekreis,Merseburg,,May19
9332,Sachsen,250.0,75.0,0.00,50.0,9131,5,False,1915.0,109939615,,,False,True,,no_information,False,roof_storey,2.0,,False,False,1,2,2,1.0,Chemnitz,Ebersdorf,Der Zuschnitt der Wohnung ist großzügig und ze...,Oct19
11922,Sachsen,25.0,25.0,0.19,12.5,9113,6,False,1920.0,97569416,,central_heating,False,False,mint_condition,Bergstra&szlig;e,False,,1.0,,False,False,1,1,1,1.0,Chemnitz,Schloßchemnitz,Das Objekt ist ein Kulturdenkmal gem. § 2\nSäc...,Oct19
18360,Bayern,50.0,50.0,4.70,20.0,84453,0,True,2019.0,108750293,district_heating,district_heating,False,False,,Waidbruckstra&szlig;e,False,other,1.0,,False,False,1,1,1,9.0,Mühldorf_am_Inn_Kreis,Mühldorf_am_Inn,Große Tiefgarage in einem Neubau.\n10 Stellplä...,May19
19381,Baden_Württemberg,50.0,50.0,4.39,3.0,75177,1,False,1954.0,108001105,oil,,False,True,,Bl&uuml;cherstr.,False,other,1.0,-1.0,False,False,1,1,1,2.0,Pforzheim,Nordstadt,"Miete mtl. 50,-€\nKaution : 150,-€\n\nPreis is...",May19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241782,Niedersachsen,1111.0,1.0,3.70,101.0,21423,11,True,2019.0,111139920,gas,central_heating,True,False,first_time_use,Brahmsallee,True,apartment,3.0,1.0,True,False,7,3,5,9.0,Harburg_Kreis,Winsen_Luhe,,May19
243082,Sachsen,459.0,1.0,1.89,70.6,9111,1,False,,107330418,,,False,True,first_time_use_after_refurbishment,Elisenstra&szlig;e,False,apartment,3.0,2.0,True,False,3,3,3,,Chemnitz,Zentrum,Das Stadthaus im Gewand der Gründerzeit in der...,May19
249754,Nordrhein_Westfalen,960.0,1.3,3.92,91.0,53819,5,True,2019.0,112892262,,floor_heating,False,False,first_time_use,no_information,False,other,2.0,1.0,False,False,6,2,4,9.0,Rhein_Sieg_Kreis,Neunkirchen_Seelscheid,Die Untergeschosswohnung befindet sich in eine...,Oct19
261708,Sachsen,99.0,97.0,5.36,20.0,4315,22,False,1899.0,93855770,,central_heating,False,False,first_time_use,no_information,False,apartment,1.0,3.0,False,False,1,1,1,1.0,Leipzig,Volkmarsdorf,"Hallo, \n\nhier könnt ihr entweder Zimmer so g...",Sep18


There are still quite a few rows with rents that seem implausibly low. Because it's too many for me to check individually but represents a small percentage of the data, I opted to drop these rows.

In [36]:
# drop rows where total rent is less than 100
df_trim = df_trim.drop(df_trim[df_trim.totalRent < 100.0].index)

In [37]:
# check that drop worked
df_trim[df_trim['totalRent'] < 100.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


In [38]:
df_trim['totalRent'].describe()

count    228223.000000
mean        814.815558
std         562.467754
min         100.000000
25%         470.000000
50%         650.000000
75%         985.000000
max       26500.000000
Name: totalRent, dtype: float64

The mean rent has decreased by about 100 Euro.
Now I'll follow the same procedure for checking out high and low baseRent values.

## 04b. Base rent anomalies

In [39]:
highBaseRent = df_trim[df_trim['baseRent'] > 10000.0]

In [40]:
# copy to clipboard so I can view in Excel
highBaseRent.to_clipboard()

In [41]:
df_trim.loc[df_trim['regio1'] == 'Saarland', 'baseRent'].median()

530.0

In [42]:
df_trim.loc[df_trim['regio1'] == 'Baden_Württemberg', 'baseRent'].median()

850.0

In [43]:
df_trim.loc[df_trim['regio1'] == 'Bayern', 'baseRent'].median()

850.0

In [44]:
df_trim.loc[df_trim['regio1'] == 'Nordrhein_Westfalen', 'baseRent'].median()

490.0

In [45]:
df_trim.loc[df_trim['regio1'] == 'Sachsen_Anhalt', 'baseRent'].median()

325.0

In [46]:
# replace incorrect baseRent values
df_trim = df_trim.replace({'baseRent': {120000: 1200, 20100: 2010, 39200: 392, 30990: 310, 120000: 400, 9999999: 530, 1234567: 850, 10440: 490, 1000000: 325}})

In [47]:
df_trim['baseRent'].describe()

count    268772.000000
mean        647.661063
std         505.609452
min           1.000000
25%         338.000000
50%         490.000000
75%         799.000000
max       20000.000000
Name: baseRent, dtype: float64

I spotted a few construction years that seemed too far in the future for an advertised unit, so let's check out everything after 2022.

## 04c. Year constructed anomalies

In [48]:
df_trim[df_trim['yearConstructed'] > 2022.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
40265,Berlin,470.0,590.0,6.29,31.0,12487,9,True,2029.0,114977577,,district_heating,True,True,first_time_use,Gro&szlig;-Berliner Damm,True,apartment,1.0,2.0,False,False,3,1,1,9.0,Berlin,Johannisthal_Treptow,Das Objekt ist ein Neubau und befindet sich im...,Feb20
54595,Hessen,2945.0,3385.0,4.48,156.13,60594,14,True,2026.0,113807419,,floor_heating,True,False,mint_condition,Gerberm&uuml;hlstra&szlig;e,True,apartment,4.0,6.0,False,False,9,4,7,9.0,Frankfurt_am_Main,Sachsenhausen_Nord,Die hier angebotene exklusive 4-Zimmer-Wohnung...,Oct19
87221,Baden_Württemberg,1175.0,1475.0,3.96,130.64,76437,10,True,2090.0,110659704,district_heating,district_heating,False,True,mint_condition,no_information,True,ground_floor,4.0,,True,False,7,4,6,9.0,Rastatt_Kreis,Rastatt,In einem hochwertigen und sehr modernen Gebäud...,May19
182121,Hessen,1620.0,1910.0,4.48,102.44,60594,17,True,2026.0,113807326,,floor_heating,True,False,mint_condition,Gerberm&uuml;hlstra&szlig;e,True,apartment,3.0,4.0,False,False,8,3,5,9.0,Frankfurt_am_Main,Sachsenhausen_Nord,Die hier angebotene 3-Zimmer-Wohnung befindet ...,Oct19
240346,Hessen,1435.0,1690.0,4.48,91.07,60594,10,True,2026.0,113807369,,floor_heating,True,False,mint_condition,Gerberm&uuml;hlstra&szlig;e,True,apartment,3.0,4.0,False,False,7,3,4,9.0,Frankfurt_am_Main,Sachsenhausen_Nord,Die hier angebotene 3-Zimmer-Wohnung befindet ...,Oct19


In [49]:
# replace with correct years inferred from scraping date and description.
df_trim = df_trim.replace ({'yearConstructed':{2029.0: 2020, 2026.0: 2019, 2090: 2019}})

In [50]:
df_trim[df_trim['yearConstructed'] > 2022.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


Following the same procedure for early years, I noticed a string of buildings constructed in 1111. This appears to be a placeholder, so I'll replace it with the column mean.

In [51]:
df_trim[df_trim['yearConstructed'] == 1111.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
2152,Nordrhein_Westfalen,415.00,580.00,3.45,59.14,47229,7,False,1111.0,115538262,,central_heating,False,True,,Henschelstr.,False,apartment,3.0,1.0,False,False,3,3,2,1.0,Duisburg,Friemersheim,- Modernisiertes Badezimmer\n - Bad mit Fenste...,Feb20
2541,Sachsen,290.00,392.50,3.00,49.50,4249,7,False,1111.0,115267331,,central_heating,False,True,,Giordano-Bruno-Stra&szlig;e,False,roof_storey,1.0,,True,False,1,1,2,1.0,Leipzig,Großzschocher,Gerne können Sie einen Besichtigungstermin mit...,Feb20
7604,Sachsen,310.12,378.12,3.87,50.84,4277,7,False,1111.0,115555925,,self_contained_central_heating,False,True,,Burgst&auml;dter Str.,False,apartment,2.0,2.0,True,False,2,2,2,1.0,Leipzig,Connewitz,Gerne können Sie einen Besichtigungstermin mit...,Feb20
8817,Nordrhein_Westfalen,555.00,725.00,4.66,66.92,44803,4,False,1111.0,115627128,,central_heating,False,True,,Andreas-Hofer-Str.,False,apartment,2.0,1.0,True,False,4,2,3,1.0,Bochum,Altenbochum,,Feb20
11923,Sachsen,433.00,568.00,1.72,58.58,1157,8,False,1111.0,114929482,,central_heating,False,True,,Grillparzerstr.,False,apartment,2.0,1.0,True,False,3,2,2,1.0,Dresden,Cotta,Einen individuellen Besichtigungstermin verein...,Feb20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255950,Hamburg,681.23,873.23,4.45,61.93,21029,9,False,1111.0,115378043,,central_heating,False,True,,Vierlandenstr.,False,apartment,2.5,1.0,False,False,5,2,3,1.0,Hamburg,Bergedorf,+ Genießen Sie Ihre Bewegungsfreiheit und Komf...,Feb20
258549,Schleswig_Holstein,210.00,313.00,3.27,24.88,24116,4,False,1111.0,115704742,,central_heating,False,True,,Westring,True,ground_floor,1.0,,False,False,1,1,1,1.0,Kiel,Schreventeich,Schöne und günstige 1-Zimmer-Studentenwohnung ...,Feb20
261781,Sachsen,283.01,355.01,4.33,41.68,4317,7,False,1111.0,115687129,,central_heating,False,True,,Reichpietschstr.,False,ground_floor,1.0,,False,False,1,1,2,1.0,Leipzig,Reudnitz_Thonberg,Gerne können Sie einen Besichtigungstermin mit...,Feb20
265442,Berlin,365.87,522.87,6.82,51.97,13403,8,False,1111.0,115590265,,central_heating,False,True,,Waldowstr.,False,apartment,2.0,2.0,True,False,2,2,2,1.0,Berlin,Reinickendorf_Reinickendorf,Eine helle Wohnung mit Balkon und gefliestem W...,Feb20


In [52]:
year_col = df_trim['yearConstructed']
year_col.replace(to_replace = 1111.0, value = year_col.mean(), inplace=True)

In [53]:
# check that no unusual years remain
df_trim[df_trim['yearConstructed'] == 1111.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


In [54]:
# check out the minimum year. It's an old church tower, so 1000 seems plausible.
df_trim[df_trim['yearConstructed'] == 1000.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
124633,Sachsen_Anhalt,500.0,580.0,2.13,78.0,6420,16,False,1000.0,68781039,electricity,night_storage_heater,True,False,mint_condition,Dorfstr.,False,apartment,3.0,,False,False,3,3,3,1.0,Salzlandkreis,Könnern,Die Wohnung befindet sich in einem ehemaligen ...,Sep18


In [55]:
df_trim['yearConstructed'].describe()

count    211765.000000
mean       1966.751823
std          43.675181
min        1000.000000
25%        1950.000000
50%        1973.000000
75%        1996.000000
max        2022.000000
Name: yearConstructed, dtype: float64

## 04d. Living space (size) anomalies

Next, I'll check out suspicious values in livingSpace. 600 square meters is quite large for an apartment, so let's look at anything bigger than that.

In [56]:
df_trim[df_trim['livingSpace'] > 599]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
12420,Bremen,520.0,720.0,2.74,649.0,28357,10,False,,115371332,,district_heating,False,True,first_time_use_after_refurbishment,no_information,False,apartment,3.0,2.0,True,False,4,3,7,,Bremen,Lehesterdeich,Die angebotene Wohnung befindet sich im 2.Obe...,Feb20
14123,Hessen,12000.0,13400.0,5.13,600.0,60322,2,False,1900.0,98567656,gas,self_contained_central_heating,False,False,negotiable,Auf der K&ouml;rnerwiese,True,maisonette,15.0,4.0,True,False,9,5,7,1.0,Frankfurt_am_Main,Westend_Nord,"Über die hellen, mit Granit verkleideten Trepp...",Oct19
50683,Sachsen,825.0,,0.72,1717.74,9112,11,False,1911.0,114935600,,,False,True,first_time_use_after_refurbishment,Reichsstra&szlig;e,True,apartment,5.0,,True,False,1,5,7,1.0,Chemnitz,Kaßberg,Diese großzügig geschnittene Villa überzeugt d...,Feb20
51540,Brandenburg,504.0,650.0,4.33,8684.0,16225,8,False,1996.0,109860941,gas,central_heating,True,True,modernized,Danckelmannstra&szlig;e,False,,3.0,4.0,False,True,4,3,7,5.0,Barnim_Kreis,Eberswalde,Das Wohnhaus wurde im Jahre 1996 neu errichtet...,Oct19
92451,Bayern,800.0,,3.58,2257.88,90518,30,True,2019.0,110383735,gas,,False,False,,no_information,False,,75.5,,False,False,5,5,7,9.0,Nürnberger_Land_Kreis,Altdorf_bei_Nürnberg,"*****Bitte beachten Sie, der angegebene Preis ...",May19
97475,Bayern,340.0,454.69,5.96,600.0,94315,7,False,2016.0,90422924,gas,central_heating,True,False,fully_renovated,Enggasse,False,,23.0,,False,False,2,5,7,9.0,Straubing,Kernstadt,Kernsanierte Häuser mit insgesamt 12 Wohnungen...,Oct19
151184,Bayern,1200.0,1400.0,6.09,66100.0,81737,23,False,1987.0,109350573,oil,central_heating,False,True,well_kept,Neubiberger Stra&szlig;e,False,roof_storey,2.0,2.0,False,True,7,2,7,4.0,München,Perlach,+ kleines gepflegtes Mehrfamilienhaus\n\n+ 2-Z...,May19
170816,Hessen,19329.0,20588.0,5.21,601.85,60322,11,False,,114421851,gas,gas_heating,False,False,first_time_use_after_refurbishment,Hochstra&szlig;e,True,penthouse,6.0,7.0,True,False,9,5,7,,Frankfurt_am_Main,Innenstadt,"Ausgestattet mit hochwertigem Eichen-Parkett, ...",Feb20
172399,Sachsen,200.0,260.0,1.24,4340.0,8529,5,False,,111349875,gas,central_heating,False,True,well_kept,no_information,False,apartment,2.0,1.0,False,False,1,2,7,,Plauen,Reusa_mit_Sorga,Ihre neue Wohnung befindet sich in einem ruhig...,May19
175397,Sachsen_Anhalt,679.0,369.0,4.61,111111.0,39624,0,False,,113363197,,,False,False,,G&uuml;ssefelder Dorfstra&szlig;e,False,apartment,3.0,,False,False,5,3,7,,Altmarkkreis_Salzwedel,Güssefeld,,Oct19


In [57]:
df_trim.loc[df_trim['noRoomsRange'] == 5, 'livingSpace'].median()

144.0

In [58]:
df_trim.loc[df_trim['noRoomsRange'] == 3, 'livingSpace'].median()

77.12

In [59]:
df_trim.loc[df_trim['noRoomsRange'] == 2, 'livingSpace'].median()

58.01

In [60]:
df_trim.loc[df_trim['noRoomsRange'] == 1, 'livingSpace'].median()

35.0

To deal with the suspicious values, I'll replace 8 with the mean for the given noRoomsRange, correct one value that included the correct value in the description, and correct 9 values where livingSpace was swapped with noRooms.

In [61]:
df_trim = df_trim.replace({'livingSpace': {2257.88: 144.0, 1000.00: 144.0, 8684.00: 77.12, 111111.00: 77.12, 2782.00: 77.12, 7008.00: 77.12, 10259.00: 77.12, 14000.00: 77.12, 4340.00: 58.01, 4947.00: 58.01, 2420.00: 35.0, 66100.0: 166.0, 144: 52, 600: 60, 2: 45, 1: 75, 446: 45, 3: 99}})

Now, let's see the rows where livingSpace is listed as zero. That seems wrong.

In [62]:
df_trim[df_trim['livingSpace'] == 0.0]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
3444,Sachsen_Anhalt,620.00,890.0,1.98,0.0,6847,10,False,1985.0,113671451,,,False,False,,Ellerbreite,False,,5.0,5.0,True,False,5,5,1,4.0,Dessau_Roßlau,Zoberberg,,Oct19
11127,Niedersachsen,775.00,865.0,3.75,0.0,30459,7,False,,106780303,gas,self_contained_central_heating,False,True,well_kept,Barsingh&auml;userstr.,False,other,2.0,,True,True,5,2,1,,Hannover,Oberricklingen,Diese moderne 2-Zimmer-Design-Wohnung befindet...,Sep18
15014,Hessen,110.00,110.0,6.04,0.0,60326,1,False,,111383274,,,False,False,,Helmut Walcha Str,False,,1.0,,False,False,1,1,1,,Frankfurt_am_Main,Gallusviertel,"Es handelt sich um einen TG Stellplatz, der si...",May19
17278,Sachsen,160.00,,0.00,0.0,9599,5,False,1987.0,103331711,district_heating,district_heating,True,True,,Paul-M&uuml;ller-Str,False,apartment,1.0,5.0,False,False,1,1,1,4.0,Mittelsachsen_Kreis,Freiberg,,May19
18433,Sachsen,357.74,,0.00,0.0,9130,1,False,1920.0,104445035,gas,central_heating,False,False,,Stra&szlig;e der Nationen,False,,2.0,,False,False,1,2,1,1.0,Chemnitz,Schloßchemnitz,Dieses schöne Haus aus der Gründerzeit bestich...,May19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247111,Baden_Württemberg,450.00,490.0,4.07,0.0,76137,2,False,,113593869,gas,central_heating,False,False,mint_condition,no_information,False,ground_floor,1.0,,False,False,3,1,1,,Karlsruhe,Südweststadt,Helle 1 Zimmer-Wohnung in der Karlsruher Südwe...,Oct19
248599,Mecklenburg_Vorpommern,325.00,465.0,1.13,0.0,17192,18,False,1888.0,109978237,gas,central_heating,False,False,well_kept,Teterower Str.,False,apartment,2.0,1.0,False,False,2,2,1,1.0,Müritz_Kreis,Waren_Müritz,Solides und ruhiges Mehrfamilienhaus in Randla...,May19
254055,Baden_Württemberg,1018.15,470.0,4.29,0.0,74889,7,False,,107357543,,,True,False,first_time_use_after_refurbishment,no_information,False,ground_floor,2.0,0.0,False,False,1,2,1,,Rhein_Neckar_Kreis,Sinsheim,"*WICHTIGE INFO*:\nDie Raumhöhe liegt unter 2,3...",Sep18
266160,Baden_Württemberg,450.00,480.0,4.24,0.0,76137,7,False,1900.0,113744252,gas,central_heating,False,False,,no_information,False,ground_floor,1.0,1.0,False,False,3,1,1,1.0,Karlsruhe,Südstadt,Das schöne 1-Zimmer-Apartment verfügt über ein...,Oct19


In [63]:
# replace zeroes in livingSpace with NaN
df_trim['livingSpace'].replace(0, np.nan, inplace=True)

In [64]:
df_trim.describe()

Unnamed: 0,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,yearConstructed,scoutId,noRooms,floor,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange
count,268772.0,228223.0,266940.0,268701.0,268772.0,268772.0,211765.0,268772.0,268772.0,217493.0,268772.0,268772.0,268772.0,211765.0
mean,647.661063,814.815558,3.389086,73.461159,37281.874834,9.792668,1966.751823,106970100.0,2.641347,2.122583,3.76566,2.571663,3.070971,3.714509
std,505.609452,562.467754,1.964909,32.15867,27797.004173,6.408047,43.675181,12500130.0,2.633616,3.635269,2.214247,0.937473,1.407025,2.738046
min,1.0,100.0,-12.33,5.0,852.0,0.0,1000.0,28871740.0,1.0,-1.0,1.0,1.0,1.0,1.0
25%,338.0,470.0,2.0,54.0,9128.0,6.0,1950.0,106691200.0,2.0,1.0,2.0,2.0,2.0,1.0
50%,490.0,650.0,3.39,67.28,38667.0,9.0,1973.0,111158400.0,3.0,2.0,3.0,3.0,3.0,3.0
75%,799.0,985.0,4.57,86.91,57072.0,13.0,1996.0,113768700.0,3.0,3.0,5.0,3.0,4.0,5.0
max,20000.0,26500.0,14.92,1717.74,99998.0,121.0,2022.0,115711700.0,999.99,999.0,9.0,5.0,7.0,9.0


In [65]:
# count NaN in each row to make sure the substitution took effect
df_trim.isna().sum()

regio1                      0
baseRent                    0
totalRent               40549
pricetrend               1832
livingSpace                71
geo_plz                     0
picturecount                0
newlyConst                  0
yearConstructed         57007
scoutId                     0
firingTypes             56927
heatingType             44827
hasKitchen                  0
cellar                      0
condition               68451
street                      0
lift                        0
typeOfFlat              36598
noRooms                     0
floor                   51279
balcony                     0
garden                      0
baseRentRange               0
noRoomsRange                0
livingSpaceRange            0
yearConstructedRange    57007
regio2                      0
regio3                      0
description             19738
date                        0
dtype: int64

## 04e. Number of rooms anomalies

Moving on to noRooms. I'll examine anything with more than 13 rooms.

In [66]:
# create subset of entries with over 13 rooms
manyRooms = df_trim[df_trim['noRooms'] > 13.0]

In [67]:
# copy to clipboard so I can view in Excel
manyRooms.to_clipboard()

In [68]:
# view noRooms stats before corrections
df_trim['noRooms'].describe()

count    268772.000000
mean          2.641347
std           2.633616
min           1.000000
25%           2.000000
50%           3.000000
75%           3.000000
max         999.990000
Name: noRooms, dtype: float64

In [69]:
df_trim.loc[df_trim['livingSpaceRange'] == 2, 'noRooms'].median()

2.0

In [70]:
df_trim.loc[df_trim['livingSpaceRange'] == 3, 'noRooms'].median()

3.0

In [71]:
df_trim.loc[df_trim['livingSpaceRange'] == 1, 'noRooms'].median()

1.0

I decided to correct 30 noRooms values with the value stated in the description or the median value for the given livingSpaceRange.

In [72]:
df_trim = df_trim.replace({'noRooms': {79: 6, 120: 3, 23:3, 16: 2, 100: 3, 32:3, 560: 2, 160: 3, 30: 3, 230: 2, 65: 1, 21: 2, 999.99: 3, 80: 3, 75.5: 2, 50: 2, 25: 2, 99.5: 3, 200: 2, 140: 1, 99:2, 305: 3,22: 2, 45: 3}})

In [73]:
df_trim['noRooms'].describe()

count    268772.000000
mean          2.627874
std           0.982793
min           1.000000
25%           2.000000
50%           3.000000
75%           3.000000
max          18.000000
Name: noRooms, dtype: float64

## 04f. Floor anomalies

Last but not least, I'll deal with the suspiciously high units. Anything above the 20th floor seems like a good start.

In [74]:
# create subset of entries with over 20 floors
manyFloors = df_trim[df_trim['floor'] > 20.0]

In [75]:
df_trim['floor'].describe()

count    217493.000000
mean          2.122583
std           3.635269
min          -1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max         999.000000
Name: floor, dtype: float64

In [76]:
# copy to clipboard so I can view in Excel, again using the description column to make a judgment
manyFloors.to_clipboard()

In [77]:
# calculate median for each region with incorrect floor numbers
df_trim.loc[df_trim['regio1'] == 'Sachsen', 'floor'].median()

2.0

In [78]:
df_trim.loc[df_trim['regio1'] == 'Hessen', 'floor'].median()

2.0

In [79]:
df_trim.loc[df_trim['regio1'] == 'Bayern', 'floor'].median()

2.0

In [80]:
df_trim.loc[df_trim['regio1'] == 'Brandenburg', 'floor'].median()

2.0

In [81]:
df_trim.loc[df_trim['regio1'] == 'Thüringen', 'floor'].median()

2.0

In [82]:
df_trim.loc[df_trim['regio1'] == 'Berlin', 'floor'].median()

2.0

In [83]:
# replace incorrect floor values
df_trim = df_trim.replace({'floor': {45: 4, 80: 2, 83: 4, 98: 2, 99: 2, 104: 0, 105: 2, 124: 2, 133: 2, 134: 2, 135: 2, 136: 2, 137: 2, 138: 2, 139: 2, 390: 0, 645: 2, 650: 2, 999: 2}})

In [84]:
df_trim['floor'].describe()

count    217493.000000
mean          2.098114
std           1.660336
min          -1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          41.000000
Name: floor, dtype: float64

In [85]:
df_trim['floor'].value_counts()

 1.0     64116
 2.0     56952
 3.0     37872
 0.0     24597
 4.0     19953
 5.0      7987
 6.0      2493
 7.0      1031
 8.0       578
 9.0       427
 10.0      363
-1.0       311
 11.0      236
 12.0      144
 13.0      133
 14.0      110
 15.0       61
 16.0       34
 17.0       28
 19.0       15
 21.0       12
 18.0       11
 20.0       10
 22.0        4
 24.0        3
 23.0        2
 26.0        2
 29.0        2
 32.0        1
 25.0        1
 37.0        1
 31.0        1
 41.0        1
 36.0        1
Name: floor, dtype: int64

## 04g. PLZ anomalies

In [86]:
# view entries where PLZ has fewer than five digits
df_trim[df_trim['geo_plz'] < 10000]

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date
2,Sachsen,965.0,1300.0,2.72,83.80,1097,8,True,2019.0,113147523,,floor_heating,False,True,first_time_use,Turnerweg,True,apartment,3.0,3.0,True,False,6,3,4,9.0,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,Oct19
3,Sachsen,343.0,,1.53,58.15,9599,9,False,1964.0,108890903,district_heating,district_heating,False,False,,Gl&uuml;ck-Auf-Stra&szlig;e,False,other,3.0,3.0,True,False,2,3,2,2.0,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,May19
6,Sachsen,310.0,380.0,1.01,62.00,9599,9,False,,114391930,,self_contained_central_heating,False,True,fully_renovated,Am Bahnhof,False,,2.0,1.0,False,True,2,2,3,,Mittelsachsen_Kreis,Freiberg,Am Bahnhof 14 in Freiberg\nHeizkosten und Warm...,Feb20
10,Sachsen,219.0,307.0,0.21,40.20,9130,11,False,1930.0,112923517,gas,,False,True,,Hofer Stra&szlig;e,False,apartment,2.0,3.0,True,False,1,2,1,1.0,Chemnitz,Sonnenberg,Gemütliche 2-Raum Wohnung in Chemnitz. komple...,Oct19
11,Sachsen,400.0,555.0,0.98,80.00,9669,9,False,1892.0,109842225,gas,central_heating,False,True,well_kept,no_information,False,,3.0,1.0,False,True,2,3,3,1.0,Mittelsachsen_Kreis,Frankenberg/Sachsen,Gern möchten wir Ihnen diese 3-Zimmer-Wohnung ...,May19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268833,Sachsen,300.0,,2.10,102.00,2708,11,False,1873.0,114637314,,,False,False,refurbished,no_information,False,apartment,4.0,,False,False,1,4,5,1.0,Görlitz_Kreis,Löbau,"Vermietet werden 3 Wohneinheiten im 1.OG, 2.OG...",Feb20
268834,Sachsen,440.0,610.0,1.92,70.49,8396,3,False,,99131525,gas,central_heating,False,True,well_kept,no_information,False,,3.0,0.0,True,False,3,3,3,,Zwickau_Kreis,Waldenburg,Die Wohnung befindet sich in einem gewachsenen...,Sep18
268836,Sachsen_Anhalt,520.0,820.0,0.00,115.00,6268,17,False,,112441878,gas,central_heating,True,False,well_kept,Lederberg,False,maisonette,4.0,,False,False,4,4,5,,Saalekreis,Querfurt,Der Eingang der 4-Raum-Maisonetten-Wohnung bef...,Oct19
268842,Sachsen,300.0,440.0,0.54,59.89,8058,7,False,,111857041,,,True,False,,M&uuml;hlpfortstra&szlig;e,False,maisonette,2.0,3.0,False,False,1,2,2,,Zwickau,Nordvorstadt,,Oct19


Note: The results indicate the four-digit PLZs are largely from the Sachsen region, where postal codes have leading zeroes. But when I went to map this dataset later, I realized there were 10 zipcodes that were missing a digit in error. Correcting those now...

In [87]:
df_trim = df_trim.replace({'geo_plz': {2122: 21224, 3017: 30177, 3118: 38118, 3821: 18311, 4414: 44141, 4428: 44289, 4730: 52159, 5931: 49716, 6910: 69120, 8382: 88079}})

## 04h. Update and copy descriptive stats

In [88]:
stats = df_trim.describe()

In [89]:
stats.to_clipboard()

# 05. Address mixed-type data

In [90]:
# display data types
df_trim.dtypes

regio1                   object
baseRent                float64
totalRent               float64
pricetrend              float64
livingSpace             float64
geo_plz                   int64
picturecount              int64
newlyConst                 bool
yearConstructed         float64
scoutId                   int64
firingTypes              object
heatingType              object
hasKitchen                 bool
cellar                     bool
condition                object
street                   object
lift                       bool
typeOfFlat               object
noRooms                 float64
floor                   float64
balcony                    bool
garden                     bool
baseRentRange             int64
noRoomsRange              int64
livingSpaceRange          int64
yearConstructedRange    float64
regio2                   object
regio3                   object
description              object
date                     object
dtype: object

In [91]:
for col in df_trim.columns.tolist():
  weird = (df_trim[[col]].applymap(type) != df_trim[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_trim[weird]) > 0:
    print (col)

firingTypes
heatingType
condition
typeOfFlat
description


We have mixed data types in the five columns listed above. I also need to change scoutId to a string because this number is a unique identifier that doesn't warrant analysis, and I'm changing date to a string for now.

In [92]:
# identify the appropriate data type for the column, then run this command to change the type of any non-conforming values

df_trim['firingTypes'] = df_trim['firingTypes'].astype('str')
df_trim['heatingType'] = df_trim['heatingType'].astype('str')
df_trim['condition'] = df_trim['condition'].astype('str')
df_trim['typeOfFlat'] = df_trim['typeOfFlat'].astype('str')
df_trim['description'] = df_trim['description'].astype('str')
df_trim['date'] = df_trim['date'].astype('str')
df_trim['scoutId'] = df_trim['scoutId'].astype('str')

In [93]:
# check again for mixed-type columns
for col in df_trim.columns.tolist():
  weird = (df_trim[[col]].applymap(type) != df_trim[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_trim[weird]) > 0:
    print (col)

Now I'll change the date column to datetime format to make sure I can use it for time-based analysis later. First I check the possible values for this column, then replace them with YYYYMMDD values, then convert to datetime format.

In [94]:
df['date'].unique()

array(['May19', 'Oct19', 'Feb20', 'Sep18'], dtype=object)

In [95]:
df_trim = df_trim.replace({'date': {'May19': 20190510, 'Sep18': 20180922, 'Feb20': 20200201, 'Oct19': 20191008}})


In [96]:
df_trim['date'] = pd.to_datetime(df_trim['date'], format= '%Y%m%d')

Finally, I'll check the data types again to make sure everything looks in order.

In [97]:
df_trim.dtypes

regio1                          object
baseRent                       float64
totalRent                      float64
pricetrend                     float64
livingSpace                    float64
geo_plz                          int64
picturecount                     int64
newlyConst                        bool
yearConstructed                float64
scoutId                         object
firingTypes                     object
heatingType                     object
hasKitchen                        bool
cellar                            bool
condition                       object
street                          object
lift                              bool
typeOfFlat                      object
noRooms                        float64
floor                          float64
balcony                           bool
garden                            bool
baseRentRange                    int64
noRoomsRange                     int64
livingSpaceRange                 int64
yearConstructedRange     

# 06. Check for duplicates

In [98]:
df_dups = df_trim[df_trim.duplicated()]

In [99]:
df_dups
# empty = no duplicates

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date


# 07. Create new price/unit variable

In [100]:
df_trim['price/unit'] = df_trim['totalRent']/df['livingSpace']

# 08. Final checks and export

In [101]:
# check the header one last time
df_trim.head()

Unnamed: 0,regio1,baseRent,totalRent,pricetrend,livingSpace,geo_plz,picturecount,newlyConst,yearConstructed,scoutId,firingTypes,heatingType,hasKitchen,cellar,condition,street,lift,typeOfFlat,noRooms,floor,balcony,garden,baseRentRange,noRoomsRange,livingSpaceRange,yearConstructedRange,regio2,regio3,description,date,price/unit
0,Nordrhein_Westfalen,595.0,840.0,4.62,86.0,44269,6,False,1965.0,96107057,oil,central_heating,False,True,well_kept,Sch&uuml;ruferstra&szlig;e,False,ground_floor,4.0,1.0,False,True,4,4,4,2.0,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,2019-05-10,9.767442
1,Rheinland_Pfalz,800.0,,3.47,89.0,67459,8,False,1871.0,111378734,gas,self_contained_central_heating,False,False,refurbished,no_information,False,ground_floor,3.0,,True,False,5,3,4,1.0,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,2019-05-10,
2,Sachsen,965.0,1300.0,2.72,83.8,1097,8,True,2019.0,113147523,,floor_heating,False,True,first_time_use,Turnerweg,True,apartment,3.0,3.0,True,False,6,3,4,9.0,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,2019-10-08,15.513126
3,Sachsen,343.0,,1.53,58.15,9599,9,False,1964.0,108890903,district_heating,district_heating,False,False,,Gl&uuml;ck-Auf-Stra&szlig;e,False,other,3.0,3.0,True,False,2,3,2,2.0,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,2019-05-10,
4,Bremen,765.0,903.0,2.46,84.97,28213,19,False,1950.0,114751222,gas,self_contained_central_heating,False,False,refurbished,Hermann-Henrich-Meier-Allee,False,apartment,3.0,1.0,True,False,5,3,4,1.0,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,2020-02-01,10.62728


In [102]:
# check shape
df_trim.shape

(268772, 31)

In [103]:
# export to csv
# be sure to specify utf8 encoding to preserve special characters
df_trim.to_csv(os.path.join(r'..\cleaned data','rent_cleaned.csv'), encoding = 'utf8', index=False)