In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import lxml
import pandas as pd
import re
import numpy as np


class ZooplaScraper:
    results = []

    def fetch(self, url):
        print('HTTP GET request to URL: %s' % url, end='')
        res = requests.get(url)
        print(' | Status code: %s' % res.status_code)
        
        return res
    
    def parse(self, html):
        content = BeautifulSoup(html, 'lxml')
        cards = content.findAll('div', {'class': 'listing-results-wrapper'})
    
        for card in cards:
            
            try:
                bedrooms = card.find('span', {"class": "num-icon num-beds"}).text
            except:
                bedrooms = 'NA'

            try:
                bathrooms = card.find('span', {"class": "num-icon num-baths"}).text
            except:
                bathrooms = 'NA'

            try:
                reception = card.find('span', {"class": "num-icon num-reception"}).text
            except:
                reception = 'NA'
            
            try:
                area = card.find('span', {"class": "num-icon num-sqft"}).text
            except:
                area = 'NA'               
                
            self.results.append({
                'link': "https://www.zoopla.co.uk"+str(card.find('a', {'class': 'listing-results-price text-price'})["href"]),
                'price': card.find('a', {'class': 'listing-results-price'}).text.strip().split(' ')[0].strip(),
                'bedrooms': bedrooms,
                'bathrooms': bathrooms,
                'receptions': reception,
                'area': area,
                'address': card.find('a', {'class': 'listing-results-address'}).text,
                'date': card.find('p', {'class': 'listing-results-marketed'}).text.split('Listed on')[1].split('by')[0].strip(),
                'agent postcode': card.find('p', {'class': 'top-half listing-results-marketed'}).text.split()[-1],
                'description': card.find('p').text.strip(),
                'title': card.find('a', {'style': 'text-decoration:underline;'}).text,
                'number of images': card.find('div', {'class': 'btn-photo-movie'}).text.strip("\n")
#                 'image': card.find('a', {'class': 'photo-hover'}).find('img')['data-src']
            })
        
    
    def to_csv(self):
        with open('zoopla.csv', 'w',newline='',encoding='utf-8-sig') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                writer.writerow(row)
            print('Stored results to "zoopla.csv"')
    
    def run(self):
        for page in range(100):
            url = 'https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn='
            url += str(page+1)
            res = self.fetch(url)
            self.parse(res.text)
            time.sleep(2)

        self.to_csv()
        

if __name__ == '__main__':
    scraperZoop = ZooplaScraper()
    scraperZoop.run()


HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=1 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=2 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=3 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=4 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=5 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&

HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=95 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=96 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=97 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=98 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radius=0&page_size=100&pn=99 | Status code: 200
HTTP GET request to URL: https://www.zoopla.co.uk/for-sale/property/london/?identifier=london&q=London&search_source=for-sale&radi

In [2]:
# Add to the class above in the future

df = pd.read_csv("zoopla.csv")

fes = str()
ft = str()
for prop in range(len(df["link"])):
    res = requests.get(df.iloc[prop][0])
    content = BeautifulSoup(res.text, 'lxml')
    try:
        ft = content.find('section', {'class': 'dp-features'}).findAll('li', {'class': 'dp-features-list__item'})
        features = []
        for i in ft:
    #       f = str(re.sub('[^a-zA-Z]+ ', '',i.text).strip("\n"))
            f = i.text.strip()
            features.append(f)
            fes = ','.join(features)
    except:
            fes = str()
    
    df.loc[prop, "features"] = fes
    df.loc[prop, "number of features"] = len(ft)
    df.loc[prop, "agent"] = content.find('h4', {'class': 'ui-agent__name'}).text
#     Hate doing this, but this is the easiest solution without any good function
    if (len(fes) != 0):
        df.loc[prop, "number of letters"] = int(len(fes) - len(ft) - 1)
    else: 
        df.loc[prop, "number of letters"] = 0
    df.loc[prop,"description"] = content.find('div', {'class': 'dp-description__text'}).text

In [4]:
df_original = df

In [5]:
df["postcode"] = df["address"].str.split().str[-1]
postcodes = list(pd.read_csv("postcodes.csv")["0"])
df = df[df['postcode'].isin(postcodes)]
df['price'] = df['price'].str.replace(',', '').str.replace('£', '')
df = df[df["price"] != "POA"]
df["price"] = df["price"].astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df['price'].str.replace(',', '').str.replace('£', '')


In [None]:
# ADD NUMBER OF IMAGES CLEANER

In [None]:
print(len(df))
print(df.isnull().sum())

In [6]:
df.to_csv("zoopla.csv", index = False)

In [7]:
df_original.to_csv("zoopla_original.csv", index = False)