In [1]:
#Scraping data from Daft.ie and putting together a dataset for House rent predictions:

#This program contains 2 parts:
#Part I: Web-scraping from daft.ie.
#Part II: Combining the data scraped to form a single dataset.

In [None]:
#Part I: Web-scraping of data from daft.ie.

#Step 1: Importing the libraries needed to scrape and process the data from daft.ie
import requests
import bs4
import numpy as np
import pandas as pd
import re
from requests import get
from bs4 import BeautifulSoup
from functools import reduce

In [2]:
#Step 2: Request headers. The 'User-Agent' string identifies the client making the request (here it
#presents itself as a desktop Chrome browser), rather than describing the web page being fetched.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
headers = {'User-Agent': USER_AGENT}

In [3]:
#Step 3: Building the result offsets to scrape from daft.ie: 0, 20, 40, ... up to and including 3200.
#Each offset is later appended to the listing URL to request one page of results.
PAGE_SIZE = 20        #step between consecutive offsets
LAST_OFFSET = 3200    #largest offset requested
divs = np.arange(0, LAST_OFFSET + 1, PAGE_SIZE).astype(int)
print(len(divs))

161

In [4]:
#Step 4: The scraping process! Extracting the addresses, rents, and accommodation type, and storing them all in lists:
Initial_page = "https://www.daft.ie/property-for-rent/dublin?pageSize=20&from=" # listing URL; the result offset is appended for each page

#Initialising all the necessary lists. Each one is a nested list: it starts with a single empty
#list and receives one find_all() result per query per page; Step 5 flattens them.
#They are written as [[]] explicitly because the earlier `[[]*len(divs)]` evaluated to exactly
#this value (an empty list multiplied by any number is still an empty list).
rates = [[]]  #nested list for rents
adds = [[]]  #nested list for addresses
houses = [[]]  #nested list for housing types
beds = [[]]  #nested list for number of bedrooms
baths = [[]]  #nested list for number of bathrooms
mix = [[]]  #nested list for the first 4 pages' data of beds + baths + housing_type

#a for loop running across all the pages:
for i in divs:
    url = Initial_page + str(i) # construct the url by appending the offset
    #`headers` has to be passed by keyword: the second positional argument of requests.get() is
    #`params`, so passing the dict positionally added a query parameter to the URL and never
    #sent the User-Agent header. The timeout stops a stalled connection from hanging the loop.
    r = requests.get(url, headers=headers, timeout=30)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    #Scraping the rent data into 'rates' (two different markups are queried for each page)
    rates.append(soup.find_all('p', {'class': 'SubUnit__Title-sc-10x486s-5 keXaVZ', 'data-testid': 'sub-title'}))
    rates.append(soup.find_all('span', {'class': 'TitleBlock__StyledSpan-sc-1avkvav-4 gDBFnc'}))

    #Scraping the addresses into 'adds' (two class variants are queried for each page)
    adds.append(soup.find_all('p', {'class': 'TitleBlock__Address-sc-1avkvav-7 eARcqq', 'data-testid': 'address'}))
    adds.append(soup.find_all('p', {'class': 'TitleBlock__Address-sc-1avkvav-7 knPImU', 'data-testid': 'address'}))

    #Scraping the house specifications:
    #1. Bedroom specs into beds
    beds.append(soup.find_all('p', {'class': 'TitleBlock__CardInfoItem-sc-1avkvav-8 jBZmlN', 'data-testid': 'beds'}))

    #2. Bathroom specs into baths
    baths.append(soup.find_all('p', {'class': 'TitleBlock__CardInfoItem-sc-1avkvav-8 jBZmlN', 'data-testid': 'baths'}))

    #3. Accommodation type into houses
    houses.append(soup.find_all('p', {'class': 'TitleBlock__CardInfoItem-sc-1avkvav-8 bcaKbv', 'data-testid': 'property-type'}))

    #4. For the entries from pages 1-4, beds, baths and house_types are all combined:
    mix.append(soup.find_all('p', {'class': 'SubUnit__CardInfoItem-sc-10x486s-7 ftNycI'}))

#The initial pages of the web-site (1-4, to be exact) used different markup for their content, which complicates the extraction. To overcome this, Python as well as Excel were used to filter and obtain the best data.

In [5]:
#Step 5: Converting each nested list to a regular list:
def _flatten(nested):
    """Return one flat list holding the items of every inner list, in their original order."""
    return [item for inner in nested for item in inner]

rates2 = _flatten(rates)
adds2 = _flatten(adds)
beds2 = _flatten(beds)
baths2 = _flatten(baths)
houses2 = _flatten(houses)
mix2 = _flatten(mix)

#to observe the number of elements in each list:
#print(len(rates2), len(adds2), len(beds2), len(baths2), len(houses2), len(mix2))

#the list of rents has an unusually high number of values; to see why:
#rates2[0:300]
#the title of the accommodation has been extracted as well, so those entries need to be filtered out to keep only the rent values in the list.

#Note: the 'print()' and 'rates2' commands were run to check some necessary details.

3298 3139 2864 2863 3056 318


In [6]:
#Step 6: separating the entries that carry a rent value from the irregular ones, in two passes.
#Each pass splits its input in a single scan, instead of first collecting the rejects and then
#re-testing every element against that reject list by equality (a quadratic scan that repeated
#a decision already made).
#NOTE(review): the checks run on str(entry); if the entries are bs4 tags this is presumably the
#full markup (attributes included) rather than only the visible text - confirm that is intended.

#Pass 1: keep the entries containing the symbol '€'; everything else goes to 'extras'.
extras = []   #entries without a '€' (no rent information)
rates3 = []   #entries that mention a rent
for entry in rates2:
    if '€' in str(entry):
        rates3.append(entry)
    else:
        extras.append(entry)
print("Length of filtered rent dataset", len(rates3))

#Pass 2: drop the entries whose string contains the term 'from' or the term 'up'.
extras2 = []  #entries rejected in this pass
rates4 = []   #final list of rent entries
for entry in rates3:
    entry_string = str(entry)
    if 'from' in entry_string or 'up' in entry_string:
        extras2.append(entry)
    else:
        rates4.append(entry)
print("Length of new filtered rent dataset", len(rates4))

Length of filtered rent dataset 3217
Length of new filtered rent dataset 3215


In [7]:
#Step 7: pulling the text out of every scraped element, one list per variable:
types = [tag.text for tag in houses2]        #accommodation types
bedrooms = [tag.text for tag in beds2]       #bedroom specs
bathrooms = [tag.text for tag in baths2]     #bathroom specs
rents = [tag.text for tag in rates4]         #filtered rent entries
addresses = [tag.text for tag in adds2]      #addresses
mixed = [tag.text for tag in mix2]           #combined beds + baths + housing type entries

#the mixed list contains empty elements; keep only the non-empty ones:
mixed2 = [entry for entry in mixed if entry]

In [9]:
#Step 8: The lists have different lengths, so each one is exported as its own single-column dataframe.
#The 6 different dataframes will be combined into one dataset using Excel.
#The last dataframe "Mixed-data" is more complex, and contains data of bedrooms, bathrooms and accommodation type for the first few pages in daft.ie.

#One row per export: (output file, column name, values)
export_plan = [
    ('House-rent.csv', 'House-Rent', rents),
    ('Bedrooms.csv', 'Bedrooms', bedrooms),
    ('Bathrooms.csv', 'Bathrooms', bathrooms),
    ('Housing-type.csv', 'Housing-type', types),
    ('Address.csv', 'Address', addresses),
    ('Mixed-data.csv', 'Mixed-data', mixed2),
]

#Build all the dataframes first ...
df1, df2, df3, df4, df5, df6 = [
    pd.DataFrame({column_name: values}) for _, column_name, values in export_plan
]

#... then write each one to its CSV file, in the same order.
for frame, (csv_name, _, _) in zip((df1, df2, df3, df4, df5, df6), export_plan):
    frame.to_csv(csv_name)

In [None]:
#Part II: Combining the data scraped to form a single dataset.

#Step 9: Import the dataset. Our dataset will be stored in a variable called HRdata:
hr_data_file = "house-rent_data.csv"   #the combined dataset
HRdata = pd.read_csv(hr_data_file)
HRdata.head()

#Variables expected in the dataset:
#1. House-rent: € per month.
#2. Bedrooms: No. of bedrooms.
#3. Bathrooms: No. of bathrooms.
#4. Housing-type: Type of housing --> Apartment/House/Studio.
#5. Address: The physical location of the accommodation.

In [None]:
#Step 10: The 'Address' variable is complex and is difficult to use directly. Hence it would be best to get the postal code of each accommodation, which would provide more insight into the regression procedure later on.

y = []    #empty list to store the postal codes (one entry is appended per processed address)

#Dublin is split into 22 different postal codes, plus some additional regions which do not follow the mainstream postal code terminology. Hence, the list of localities is compiled manually (it took about 90 minutes to come up with the list and cross-check for accuracy).
#Lists containing the localities grouped under each individual postal code. The names are matched
#as case-sensitive substrings of the address by the loop below, which is why some spelling
#variants appear twice (e.g. "Navan Road"/"Navan road", "Old Bawn"/"Oldbawn").
#A few localities appear in more than one list ("Glasnevin" in dub9 and dub11, "Ashtown" in dub7
#and dub15, "Dundrum" in dub14 and dub16, "Balgriffin" in dub13 and dub17); the loop below uses
#the first list that matches.
#dub1 ... dub24: localities of the numbered Dublin postal districts.
dub1 = ["IFSC", "Abbey Street", "Amiens Street", "Capel Street", "Dorset Street", "Henry Street", "Mary Street", "Mountjoy Square", "Marlborough Street", "North Wall", "O'Connell Street", "Parnell Square", "Talbot Street"]
dub2 = ["Merrion Square", "Trinity College", "Temple Bar", "Grafton Street", "Stephen's Green", "Dame Street", "Leeson Street", "Grand Canal Dock", "City Quay", "Leinster House", "Mansion House", "Aungier Street", "Wexford Street", "Camden Street", "Baggot Street", "College Green", "Fitzwilliam Square", "Harcourt Street", "Kildare Street", "Lord Edward Street", "Mount Street", "Nassau Street", "Pearse Street", "Georges Street", "Hanover Quay"]
dub3 = ["Ballybough", "North Strand", "Clonliffe", "Cloniffe", "Clontarf", "Dollymount", "East Wall", "East Point", "Fairview", "Killester", "Marino"]
dub4 = ["Ballsbridge", "Belfield", "Donnybrook", "Irishtown", "Merrion", "Pembroke", "Ringsend", "Sandymount", "RDS", "Lansdowne Road"]
dub5 = ["Artane", "Harmonstown", "Kilbarrack", "Raheny"]
dub6 = ["Milltown", "Ranelagh", "Rathmines", "Dartry", "Rathgar"]
dub6w = ["Harold's Cross", "Templeogue", "Kimmage", "Terenure"]
dub7 = ["Arbour Hill", "Ashtown", "Broadstone", "Cabra", "North Circular Road", "Grangegorman", "Oxmantown", "Phibsboro", "Smithfield", "Stoneybatter", "Four Courts", "Phibsborough", "Navan Road", "Navan road", "Ashington"]
dub8 = ["Dolphin's Barn", "Inchicore", "Islandbridge", "Kilmainham", "Merchants Quay", "Portobello", "South Circular Road", "Phoenix Park", "Liberties", "Christchurch", "St. Patrick's Cathedral", "Coombe", "Island Bridge", "Rialto"]
dub9 = ["Kilmore", "Shangan", "Coultry", "Beaumont", "Donnycarney", "Drumcondra", "Elm Mount", "Griffith Avenue", "Glasnevin", "St Mobhi", "Botanic Gardens", "Santry", "Whitehall"]
dub10 = ["Ballyfermot", "Sarsfield Road", "Cherry Orchard"]
dub11 = ["Ballymun", "Sillogue", "Balcurris", "Balbutcher", "Poppintree", "Sandyhill", "Wadelai", "Dubber", "Finglas", "Ballygal", "Cappagh", "Glasnevin", "Cremore", "Addison", "Violet Hill", "Finglas Road", "Old Finglas Road", "Glasnevin Cemetery", "Kilshane", "The Ward", "Coolquay", "Jamestown"]
dub12 = ["Bluebell", "Crumlin", "Drimnagh", "Greenhills", "Perrystown", "Walkinstown"]
dub13 = ["Clarehall", "Baldoyle", "Bayside", "Donaghmede", "Clongriffin", "Sutton", "Howth", "Ayrfield", "Balgriffin", "Portmarnock"]
dub14 = ["Churchtown", "Clonskeagh", "Dundrum", "Goatstown", "Windy Arbour", "Roebuck"]
dub15 = ["Hollystown", "Ashtown", "Blanchardstown", "Castleknock", "Coolmine", "Clonsilla", "Corduff", "Mulhuddart", "Tyrrelstown", "Clonee", "Ongar", "Carpenterstown"]
dub16 = ["Ballinteer", "Ballyboden", "Dundrum", "Kilmashogue", "Knocklyon", "Rathfarnham", "Rockbrook"]
dub17 = ["Balgriffin", "Coolock", "Belcamp", "Darndale", "Priorswood", "Riverside", "Clonshaugh"]
dub18 = ["Rathmichael", "Cherrywood", "Cabinteely", "Carrickmines", "Foxrock", "Kilternan", "Sandyford", "Shankill", "Ticknock", "Ballyedmonduff", "Stepaside", "Leopardstown"]
dub20 = ["Chapelizod", "Palmerstown"]
dub22 = ["Park West", "Clondalkin", "Rowlagh", "Quarryvale", "Liffey Valley", "Neilstown", "Bawnogue", "Kingswood"]
dub24 = ["Firhouse", "Jobstown", "Old Bawn", "Oldbawn", "Kilnamanagh", "Tallaght", "Saggart", "Citywest", "Aylesbury", "Rathcoole"]
#ext1 ... ext7: the additional regions that do not follow the numbered postal code terminology.
ext1 = ["Blackrock", "Deansgrange", "Monkstown", "Stillorgan", "Booterstown", "Kilmacud"]
ext2 = ["Dun Laoghaire", "Dalkey", "Glasthule", "Glenageary", "Sandycove", "Killiney"]
ext3 = ["Swords", "Kilsallaghan", "Lusk"]
ext4 = ["Malahide", "Donabate"]
ext5 = ["Lucan", "Adamstown"]
ext6 = ["Rush"]
ext7 = ["Balbriggan"]
#Assigning the proper postal code to each address, using the locality lists created above.

#Each label paired with the locality list that maps to it, in lookup order. The order matters:
#an address receives the label of the FIRST list containing one of its localities, and a few
#localities appear in more than one list. 'Dublin 17' is paired with dub17 here; the earlier
#if/elif chain tested dub2 a second time for that label, so dub17 was never consulted.
postal_code_lookup = [
    ('Dublin 1', dub1),
    ('Dublin 2', dub2),
    ('Dublin 3', dub3),
    ('Dublin 4', dub4),
    ('Dublin 5', dub5),
    ('Dublin 6', dub6),
    ('Dublin 6W', dub6w),
    ('Dublin 7', dub7),
    ('Dublin 8', dub8),
    ('Dublin 9', dub9),
    ('Dublin 10', dub10),
    ('Dublin 11', dub11),
    ('Dublin 12', dub12),
    ('Dublin 13', dub13),
    ('Dublin 14', dub14),
    ('Dublin 15', dub15),
    ('Dublin 16', dub16),
    ('Dublin 17', dub17),
    ('Dublin 18', dub18),
    ('Dublin 20', dub20),
    ('Dublin 22', dub22),
    ('Dublin 24', dub24),
    ('A94', ext1),
    ('A96', ext2),
    ('K67', ext3),
    ('K36', ext4),
    ('K78', ext5),
    ('K56', ext6),
    ('K32', ext7),
]


def find_postal_code(address):
    """Return the postal-code label for one address.

    Parameters:
        address: the address text. Anything that is not a string (for example a
            missing value read from the CSV) is labelled 'NA'.

    Returns:
        The 'Dublin <n>' code written in the address itself when there is one;
        otherwise the label of the first entry in postal_code_lookup whose
        locality list has a name contained in the address; otherwise 'NA'.
    """
    if not isinstance(address, str):
        return 'NA'

    #Some addresses mention the postal code directly. The optional trailing 'W' keeps
    #'Dublin 6W' from being shortened to 'Dublin 6'. The matched text itself is returned,
    #not the string form of a list of matches.
    explicit = re.search(r'Dublin [0-9]+(?:W\b)?', address)
    if explicit:
        return explicit.group(0)

    #For the addresses where the postal code is not mentioned: look the locality up.
    for label, localities in postal_code_lookup:
        if any(locality in address for locality in localities):
            return label
    return 'NA'


#The addresses are read from the dataset that receives the result, so that there is exactly one
#postal code per row of HRdata. (`adds` is the nested list of raw scraped results from Part I,
#not a list of address strings.)
#NOTE(review): assumes the combined CSV names its address column 'Address' - confirm.
y.extend(find_postal_code(address) for address in HRdata['Address'])

In [None]:
#Step 11: Assigning the Postal-code list to the HRdata dataframe.
#`y` is the list of postal-code labels built in Step 10; it is stored as a new 'Postal-code'
#column, so it needs one entry per row of HRdata.
HRdata['Postal-code'] = y

In [None]:
#Step 12: Exporting the dataset to the working directory.
#index = False leaves the dataframe's row index out of the written file.
HRdata.to_csv("HRdata.csv", index = False)