**Import dependencies**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from bs4 import BeautifulSoup #requires pip install
import requests
import re
from re import sub
from decimal import Decimal 

**Functions**

In [47]:
# Convert price string into a numerical value
def to_num(price):
    value = Decimal(sub(r'[^\d.]', '', price))
    return float(value)

# Skip ads if price is not yet defined
def is_dropped(money):
    for i in range(len(money)):
        if(money[i] != '£' and money[i] != ',' and (not money[i].isdigit())):
            return True
    return False


**Updated version**

In [106]:
url = 'https://www.zoopla.co.uk/for-sale/property/london/?page_size=25&q=london&radius=0&results_sort=newest_listings&pn='

map = {}
id = 0

#set max_pages to 2 for test purposes
max_pages = 2 

    #time.sleep(10)
    #start = time.time()
    
for p in range(max_pages):
    cur_url = url + str(p + 1)

    print("Scraping page: %d" % (p + 1))
    #print(cur_url)
    html_text = requests.get(cur_url).text
    soup = BeautifulSoup(html_text, 'lxml')

    ads = soup.find_all('div', class_ = 'css-wfndrn-StyledContent e2uk8e18')
    page_nav = soup.find_all('a', class_ = 'css-slm4qd-StyledPaginationLink eaoxhri5')

    if(len(page_nav) == 0): 
        print("max page number: %d" % (p))
        end = time.time()
        print(end - start)
        break

    for k in range(len(ads)):
        ad = ads[k]

        #find link and ID ('identifier' in the link acts as a unique id for the ad)
        link = ad.find('a', class_ = 'e2uk8e4 css-gl9725-StyledLink-Link-FullCardLink e33dvwd0')
        
        #find section for address
        address = ad.find('p', class_ = 'css-wfe1rf-Text eczcs4p0').text
        
        #find price information
        price = ad.find('p', class_ = 'css-18tfumg-Text eczcs4p0').text
        
        # if the price is valid or not, if not we do not consider this ad
        if(is_dropped(price)): continue   
            
        #find public transport information
        subway_section = ad.find('div', class_ = 'css-braguw-TransportWrapper e2uk8e28')
        subway_information = subway_section.find_all('p', class_ = 'css-wfe1rf-Text eczcs4p0')
        
        #skip ads that only contain information of train station
        outlier = subway_section.find('span', class_ = 'e1uy4ban0 css-10ibqwe-StyledIcon-Icon e15462ye0')
        if(outlier['data-testid'] == 'national_rail_station'): continue
            
        #find section for bedroom, bathroom and living room information (room numbers)
        feature_section = ad.find('div', class_ = 'css-58bgfg-WrapperFeatures e2uk8e15')
        
        #find all information available for room numbers
        category = feature_section.find_all('div', class_ = 'ejjz7ko0 css-l6ka86-Wrapper-IconAndText e3e3fzo1')

        #assign id
        ad_id = link['href'] #returns url snippet with identifier from the url
        ad_id= ad_id.split("?")[0] #split by '?' ans '/' and apply index to retain only identifier number
        ad_id= ad_id.split("/")[3]

        if(ad_id in map): continue
        map[ad_id] = {}
        
        #assign link
        link = 'https://www.zoopla.co.uk/' + link['href']
        map[ad_id]["link"] = link  
        
        #assign address
        map[ad_id]["address"] = address 
        
        #assign bedroom nr
        try:
            map[ad_id]["room_nr"] = category[0].text
        except IndexError:
        #insert None value if index is not found
            map[ad_id]["room_nr"] = 'None'
            #print("Feature not listed")
            
        #assign bathroom nr                
        try:
            map[ad_id]["bath_nr"] = category[1].text
        except IndexError:
        #insert None value if index is not found
            map[ad_id]["bath_nr"] = 'None'
            #print("Feature not listed")

        #assign living room nr   
        try:
            map[ad_id]["living_nr"] = category[2].text
        except IndexError:
        #insert None value if index is not found
            map[ad_id]["living_nr"] = 'None'
            #print("Feature not listed")
                      
        #assign price
        map[ad_id]["price"] = to_num(price)
        
        #assign subway station and distance to it
        s = subway_information[0].text
        x = s.split(' miles ')
        if len(x) == 1: continue
        map[ad_id]["distance"] = float(x[0])
        map[ad_id]["subway_station"] = x[1]

print("Scraping task finished")  
%time

Scraping page: 1
Scraping page: 2
Scraping task finished
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.39 µs


In [107]:
#transform dict to list
result = []
cur_row = 0
for cur_id in map.keys():
    link = map[cur_id]["link"]
    cur_price = map[cur_id]["price"]
    cur_bedroom = map[cur_id]["room_nr"]
    cur_bathroom = map[cur_id]["bath_nr"]
    cur_living = map[cur_id]["living_nr"]
    cur_address = map[cur_id]["address"]
    cur_distance = map[cur_id]["distance"]
    cur_subway_station = map[cur_id]["subway_station"]
    result.append([])
    result[cur_row].append(str(cur_id))
    result[cur_row].append(str(link))
    result[cur_row].append(str(cur_price))
    result[cur_row].append(str(cur_bedroom))
    result[cur_row].append(str(cur_bathroom))
    result[cur_row].append(str(cur_living))
    result[cur_row].append(str(cur_address))
    result[cur_row].append(str(cur_distance))
    result[cur_row].append(str(cur_subway_station))
    cur_row += 1

In [108]:
#transform to dataframe
df = pd.DataFrame(result, columns = ["ad_id", "link", "price", "bedrooms", "bathrooms", "living_rooms", "address", "distance", "subway_station"])
df

Unnamed: 0,ad_id,link,price,bedrooms,bathrooms,living_rooms,address,distance,subway_station
0,57988306,https://www.zoopla.co.uk//for-sale/details/579...,330000.0,2,2.0,1.0,"Laurel Court, 2 Chadwick Gardens, Uxbridge, Mi...",0.5,Uxbridge
1,57988266,https://www.zoopla.co.uk//for-sale/details/579...,700000.0,3,1.0,1.0,"Notting Hill, London W11",0.3,Westbourne Park
2,57988262,https://www.zoopla.co.uk//for-sale/details/579...,535000.0,4,1.0,2.0,"Dorrington Close, Barking, Essex IG11",0.4,Upney
3,57988263,https://www.zoopla.co.uk//for-sale/details/579...,300000.0,2,1.0,1.0,"Suffolk Road, Ilford, Essex IG3",0.3,Newbury Park
4,57988257,https://www.zoopla.co.uk//for-sale/details/579...,570000.0,3,1.0,1.0,"Harrow, Middlesex HA3",1.0,Harrow & Wealdstone
5,57988152,https://www.zoopla.co.uk//for-sale/details/579...,650000.0,1,1.0,1.0,"Carisbrooke Court, Weymouth Street, London W1G",0.3,Regent's Park
6,57988078,https://www.zoopla.co.uk//for-sale/details/579...,353500.0,4,1.0,2.0,"Dorrington Close, Barking, Essex IG11",0.4,Upney
7,57988108,https://www.zoopla.co.uk//for-sale/details/579...,18000.0,1,1.0,,"Burford Close, Ilford, Essex IG6",0.4,Barkingside
8,57988076,https://www.zoopla.co.uk//for-sale/details/579...,198500.0,2,1.0,1.0,"Suffolk Road, Ilford, Essex IG3",0.3,Newbury Park
9,57988087,https://www.zoopla.co.uk//for-sale/details/579...,299995.0,1,1.0,1.0,"Musgrove Road, London SE14",0.2,New Cross Gate
