# Cleaning the dataset

### This notebook cleans the CSV of property listings scraped from Daft.ie

In [1]:
import pandas as pd
import re

In [2]:
from datetime import date

# Today's date components select which scrape file is loaded below.
today = date.today()
year, month, day = today.year, today.month, today.day

# NOTE(review): month and day are interpolated without zero padding, so the
# stamp can be ambiguous (e.g. "2021111"); the format is kept as-is because
# it has to match the file name the scraper wrote.
df = pd.read_csv(f"./scrape_output/daft_listings_{year}{month}{day}.csv")

In [3]:
# Listings advertised as "Price on Application " carry no usable price: remove them.
price_missing = df["price"] == "Price on Application "
df.drop(index=df[price_missing].index, inplace=True)

In [4]:
# Strip the " Bed" label and store the bedroom count as an integer.
df["bedrooms"] = df["bedrooms"].str.replace(" Bed", "").astype(int)

In [5]:
# Strip the " Bath" label and store the bathroom count as an integer.
df["bathrooms"] = df["bathrooms"].str.replace(" Bath", "").astype(int)

In [6]:
# Convert the price strings to integers.
#
# The "AMV: " (advised minimum value) prefix has to be removed while the column
# still holds strings: the .str accessor is not available on an integer column,
# and astype(int) cannot parse a value that still carries the prefix.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default; the currency symbol and thousands separator are
# plain literals, so they use regex=False.
df["price"] = (
    df["price"]
    .str.replace(r"^AMV:\s", "", regex=True)
    .str.replace("€", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [7]:
# The district is whatever follows the last comma of the address
# (any leading space after the comma is kept at this stage).
address_parts = df["address"].str.split(",")
df["district"] = address_parts.str[-1]

df["district"].value_counts()

 Co. Dublin        387
 Dublin 18         196
 Dublin 15         189
 Dublin 6          177
 Dublin 4          166
 Dublin 14         165
 Dublin 9          159
 Dublin 12         149
 Dublin 7          144
 Dublin 3          130
 Dublin 8          127
 Dublin 24         114
 Dublin 16         102
 Dublin 13         102
 Dublin 5          101
 Dublin 11          93
 Dublin 22          73
 Dublin 1           59
 Dublin 2           53
 Dublin 6W          43
 Dublin 20          30
 Dublin 10          24
 Dublin 17           9
 Northern Cross      6
 Belmayne            5
 Co. Meath           1
Name: district, dtype: int64

In [8]:
# Remove the square-metre unit label from the floor area strings.
df = df.assign(floor_area=df["floor_area"].str.replace("m²", ""))

In [9]:
# Show listings whose floor area still contains "ac" (rows with a missing floor area are excluded).
# Assumption from inspecting the df / csv: the one match looks mislabelled - the
# value appears to be in square metres despite the unit shown.
has_ac_label = df["floor_area"].str.contains("ac", na=False)
df[has_ac_label]

Unnamed: 0.1,Unnamed: 0,price,address,bedrooms,bathrooms,floor_area,district
2615,2615,495000,"96 Beech Grove Cottages, Bray Road, Loughlinst...",4,2,748.73 ac,Co. Dublin


In [10]:
# Drop every listing whose floor area is still labelled "ac".
# Filtering on the label instead of a hard-coded row index keeps this cell
# working for any day's scrape: a fixed index raises KeyError when that row is
# absent, and would leave other "ac" rows behind to break the float conversion.
# Rows with a missing floor area are kept (na=False).
df = df[~df["floor_area"].str.contains("ac", na=False)]

In [11]:
# With the unit label gone, store the floor area as a float.
df = df.astype({"floor_area": float})

In [12]:
# NOTE(review): row 2491 is removed by index label with no reason recorded here -
# presumably a bad record spotted by hand in one particular scrape; confirm it
# still applies to the file being loaded.
df = df.drop(index=2491)

In [13]:
# The neighbourhood is the second-to-last comma-separated part of the address.
df["neighbourhood"] = df["address"].str.split(",").str.get(-2)

In [16]:
# print(subset.neighbourhood.value_counts())

# Addresses filed under "Co. Dublin" share one pattern: they end in "TOWN, CO. DUBLIN".
# The lists below hold the most common Co. Dublin towns (those with more than one entry),
# grouped by the district each town should be assigned to.

# Towns that belong to Glenageary (A96)
glenageary = [
    "Glenageary",
    "Dun Laoghaire",
    "Dalkey",
    "Killiney",
    "Sandycove",
    "Ballybrack",
    "Glasthule",
    "Sallynoggin",
]

# Towns that belong to Blackrock (A94)
blackrock = [
    "Blackrock",
    "Booterstown",
    "Stillorgan",
    "Monkstown",
    "Mount Merrion",
]

# Towns that belong to numbered Dublin districts
d24 = ["Citywest"]
d18 = ["Rathmichael", "Loughlinstown", "Deans Grange"]
d13 = ["Howth", "Belmayne"]

# Goal: when the neighbourhood column holds one of these towns, assign the matching district value.

In [17]:
def make_district(text):
    """Map a neighbourhood name to the district it should be filed under.

    The name is compared, with surrounding whitespace removed, against the
    module-level town lists ``glenageary``, ``blackrock``, ``d24``, ``d18``
    and ``d13`` (checked in that order).

    Parameters
    ----------
    text : str
        Neighbourhood name as extracted from the address. Non-string values
        (for example the NaN produced when an address has fewer than two
        comma-separated parts) are accepted and reported as not found instead
        of raising ``AttributeError``.

    Returns
    -------
    str
        ``"Glenageary"``, ``"Blackrock"``, ``"Dublin 24"``, ``"Dublin 18"`` or
        ``"Dublin 13"`` for a known town; otherwise ``"Not found "`` followed
        by the original, unstripped input.
    """
    # This function is applied to the whole neighbourhood column, so a single
    # missing value must not abort the run.
    if not isinstance(text, str):
        return "Not found " + str(text)

    town = text.strip()
    # Looked up at call time so the lists may be defined in another cell.
    town_groups = (
        (glenageary, "Glenageary"),
        (blackrock, "Blackrock"),
        (d24, "Dublin 24"),
        (d18, "Dublin 18"),
        (d13, "Dublin 13"),
    )
    for towns, district in town_groups:
        if town in towns:
            return district
    return "Not found " + text

In [18]:
import numpy as np

# Rows filed under the generic "Co. Dublin" get their district from the
# neighbourhood via make_district; every other row keeps its current district.
# regex=False matches "Co. Dublin" literally (the "." would otherwise match any
# character), and na=False makes a missing district count as "not Co. Dublin"
# instead of producing a NaN condition that np.where treats as true.
df["district"] = np.where(
    df["district"].str.contains("Co. Dublin", regex=False, na=False),
    df["neighbourhood"].apply(make_district),
    df["district"],
)

In [29]:
# Listings whose district came out as " Belmayne" are filed under Dublin 13.
df = df.assign(district=df["district"].str.replace(" Belmayne", "Dublin 13"))

In [30]:
# Listings whose district came out as " Northern Cross" are filed under Dublin 17.
df = df.assign(district=df["district"].str.replace(" Northern Cross", "Dublin 17"))

In [37]:
# Remove the single leading whitespace character left over from splitting the
# address on commas. regex=True is required: pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently turn this into a no-op
# and leave " Dublin 18" and "Dublin 18" as separate districts.
df["district"] = df["district"].str.replace(r"^\s", "", regex=True)

In [38]:
# Listings per district after the clean-up.
df["district"].value_counts()

Dublin 18                         204
Dublin 15                         189
Glenageary                        180
Blackrock                         179
Dublin 6                          177
Dublin 4                          166
Dublin 14                         165
Dublin 9                          159
Dublin 12                         148
Dublin 7                          144
Dublin 3                          130
Dublin 8                          127
Dublin 24                         119
Dublin 13                         112
Dublin 16                         102
Dublin 5                          101
Dublin 11                          93
Dublin 22                          73
Dublin 1                           59
Dublin 2                           53
Dublin 6W                          43
Dublin 20                          30
Dublin 10                          24
Dublin 17                          15
Not found  Cherrywood               1
Co. Meath                           1
Not found  I

In [45]:
# Remove listings whose district could not be mapped ("Not found ...") and
# those located in Co. Meath, one pattern at a time.
for unwanted in ("Not found", "Co. Meath"):
    unwanted_rows = df[df["district"].str.contains(unwanted)].index
    df.drop(unwanted_rows, inplace=True)

In [81]:
# Price divided by floor area, rounded to zero decimal places.
df["price_sqm"] = (df["price"] / df["floor_area"]).round()

In [88]:
# Drop the "Unnamed: 0" column (presumably a leftover index column from the
# scraper's CSV - verify against the scrape output).
df = df.drop("Unnamed: 0", axis="columns")

In [91]:
# Summary statistics of the price_sqm column, as a sanity check.
df["price_sqm"].describe()

count     2792.000000
mean      5344.926218
std       1714.366435
min        447.000000
25%       4078.250000
50%       5122.500000
75%       6267.000000
max      17143.000000
Name: price_sqm, dtype: float64

In [89]:
# Write the cleaned listings to the analysis folder, stamped with the same
# date components used to pick the input file.
output_path = f"./analysis_output/clean_output_{year}{month}{day}.csv"
df.to_csv(output_path)