### What is the average price of properties that are currently on the market on Rightmove?

The goal of this project is to scrape the current property listings from the Rightmove website.

The scraper needs an input CSV file, 'location_data.csv', with a header row followed by one row per location to scrape, for example:

location,location_id,no_of_pages,mode
cambridge,274,19,for-sale

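1. location: the name of the target location (also used as the prefix of the output file name)
2. location_id: the Rightmove-specific identifier of the location
3. no_of_pages: the number of search-result pages to scrape
4. mode: the listing type (e.g. for-sale), used to build the search URL and the name of the output file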

The resulting data file, 'cambridge-for-sale.csv', contains four pieces of information about each property:

1. title
2. address
3. price
4. seller's agent

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Scrape targets: RightmoveScraper.run() reads the 'location', 'location_id',
# 'mode' and 'no_of_pages' columns from each row of this table.
data = pd.read_csv(filepath_or_buffer='location_data.csv')
df = pd.DataFrame(data=data)

class RightmoveScraper:
    """Scrape Rightmove search-result pages and save the listings to CSV.

    For every location row, ``run`` fetches the requested number of result
    pages, extracts title, address, price and seller from each page with
    ``parse`` and writes the collected rows to ``<location>-<mode>.csv``.
    """

    # Columns of the output CSV; used for the header when nothing was scraped.
    FIELDNAMES = ('title', 'address', 'price', 'seller')

    # The ``index`` query parameter advances by this amount for each page
    # (presumably the number of listings Rightmove shows per results page).
    PAGE_SIZE = 24

    # Kept for backward compatibility; not read by any method of this class.
    where = ''

    def __init__(self):
        # Per-instance list: a class-level list would be shared (and mutated)
        # by every scraper instance.
        self.results = []

    def fetch(self, url, timeout=30):
        """Download ``url`` and return the ``requests`` response.

        Progress (the URL and the HTTP status code) is printed on one line.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error, so a stalled connection cannot hang
                the whole scrape.
        """
        print('Fetching: %s' % url, end='')

        response = requests.get(url, timeout=timeout)
        print(' | Status code: %s' % response.status_code)

        return response

    @staticmethod
    def _seller_name(text):
        """Return the text that follows the first 'by' in ``text``, stripped.

        Splitting only once keeps agent names that themselves contain "by"
        (e.g. "Kirby Estates") intact.
        """
        return text.split('by', 1)[-1].strip()

    @classmethod
    def _build_url(cls, mode, location_id, index):
        """Return the search URL for ``mode``/``location_id`` at offset ``index``."""
        return ('https://www.rightmove.co.uk/property-' + str(mode)
                + '/find.html?locationIdentifier=REGION%5E' + str(location_id)
                + '&index=' + str(index))

    def parse(self, html):
        """Extract the listings found in ``html`` and append them to ``self.results``.

        Each appended row is a dict with the keys 'title', 'address', 'price'
        and 'seller'. The four fields are collected page-wide and paired by
        position; if their counts differ, a warning is printed and only the
        complete leading rows are kept.

        Args:
            html: Markup of one search-result page.
        """
        content = BeautifulSoup(html, 'html.parser')

        titles = [tag.text.strip()
                  for tag in content.find_all('h2', {'class': 'propertyCard-title'})]
        addresses = [tag['content']
                     for tag in content.find_all('meta', {'itemprop': 'streetAddress'})]
        prices = [tag.text.strip()
                  for tag in content.find_all('div', {'class': 'propertyCard-priceValue'})]
        sellers = [self._seller_name(tag.text)
                   for tag in content.find_all(
                       'span', {'class': 'propertyCard-branchSummary-branchName'})]

        columns = (titles, addresses, prices, sellers)
        if len({len(column) for column in columns}) > 1:
            # Positional pairing may be off when a listing lacks a field.
            print('Warning: field counts differ on this page: %s'
                  % [len(column) for column in columns])

        # zip() stops at the shortest list instead of raising IndexError.
        for title, address, price, seller in zip(*columns):
            self.results.append({
                'title': title,
                'address': address,
                'price': price,
                'seller': seller})

    def to_csv(self, where, mode):
        """Write ``self.results`` to the file ``<where>-<mode>.csv``.

        A header row is always written; with no results the file contains
        only the header built from ``FIELDNAMES``.

        Args:
            where: Location name, used as the first part of the file name.
            mode: Listing mode, used as the second part of the file name.
        """
        filename = '{}-{}.csv'.format(where, mode)
        if self.results:
            fieldnames = list(self.results[0].keys())
        else:
            fieldnames = list(self.FIELDNAMES)

        # newline='' is required by the csv module (otherwise blank rows
        # appear on Windows); utf-8 keeps non-ASCII text such as the pound
        # sign portable across platforms.
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.results)

        print('Write results to {}'.format(filename))

    def run(self, locations=None):
        """Scrape every location and write one CSV file per location.

        Args:
            locations: DataFrame with the columns 'location', 'location_id',
                'mode' and 'no_of_pages'. Defaults to the module-level ``df``.
        """
        if locations is None:
            locations = df

        for _, row in locations.iterrows():
            where = row['location']
            location_id = row['location_id']
            mode = row['mode']
            no_of_pages = int(row['no_of_pages'])

            # Start from an empty list so this location's CSV does not also
            # contain the listings of previously scraped locations.
            self.results = []

            for page in range(no_of_pages):
                url = self._build_url(mode, location_id, page * self.PAGE_SIZE)
                response = self.fetch(url)
                # To inspect what the site returned, dump response.text to a
                # file and feed the saved markup to parse() instead.
                self.parse(response.text)

            self.to_csv(where, mode)
        
        
if __name__ == '__main__':
    # Entry point: build a scraper and process every configured location.
    RightmoveScraper().run()
    


Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=0 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=24 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=48 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=72 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=96 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=120 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=144 | Status code: 200
Fetching: https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E274&index=168 | Status 