In [None]:
#Import scrapy and pandas
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

# Accumulator for the scraped dataset: one key per feature (column), each
# starting as its own empty list that receives one entry per listing.
Features = {
    column: []
    for column in (
        "Price",
        "Bedrooms",
        "Location",
        "Garages",
        "Bathrooms",
        "Erf Size",
        "Floor Size",
    )
}

#Initiate a scrapy Spider
class Prop24(scrapy.spiders.Spider):
    """Spider that collects listing features from Property24 result pages.

    For every listing on a page, exactly one value per feature is appended to
    the module-level ``Features`` dict. A feature that is absent from a
    listing is recorded as the string ``'0'``, so all feature lists keep the
    same length and can be turned into a DataFrame afterwards.
    """

    name = 'Prop24'

    # XPath matching the container element of each listing on a page.
    _LISTING_XPATH = '//span[@class="p24_content"]'

    # Feature name (key of ``Features``) -> XPath of its value, evaluated
    # relative to a single listing container.
    _FEATURE_XPATHS = {
        'Price': './span[@itemprop="offers"]/span[@class="p24_price"]/@content',
        'Bedrooms': './span[@class="p24_icons"]/span[contains(@title, "Bedrooms")]/span/text()',
        'Location': './span[@itemprop="offers"]/span[contains(@class,"p24_location")]/text()',
        'Bathrooms': './span[@class="p24_icons"]/span[contains(@title, "Bathrooms")]/span/text()',
        'Garages': './span[@class="p24_icons"]/span[contains(@title, "Parking Spaces")]/span/text()',
        'Erf Size': './span[@class="p24_icons"]/span[contains(@title, "Erf Size")]/span/text()',
        'Floor Size': './span[@class="p24_icons"]/span[contains(@title, "Floor Size")]/span/text()',
    }

    def start_requests(self):
        """Yield the initial request(s).

        Currently only houses for sale in the Western Cape are crawled.
        """
        urls = ['https://www.property24.com/houses-for-sale/western-cape/9']

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Record the features of every listing on the page, then follow pagination.

        Side effect: appends one entry per listing to each list in the
        module-level ``Features`` dict.

        Yields:
            A follow-up request (handled by this same method) for every link
            found in the pagination bar.
        """
        # Iterate over the listing selectors themselves instead of building
        # "(...)[i]" expressions: XPath positions are 1-based, so a 0-based
        # counter matches nothing for i == 0 (a bogus all-'0' row) and never
        # reaches the last listing on the page.
        for listing in response.xpath(self._LISTING_XPATH):
            for feature, feature_xpath in self._FEATURE_XPATHS.items():
                # First matching value, or '0' when the listing lacks this feature.
                value = listing.xpath(feature_xpath).extract_first(default='0')
                Features[feature].append(value)

        # Collect every link in the pagination bar. extract() always returns a
        # list (possibly empty), so no None check is needed.
        next_pages = response.xpath('//ul[contains(@class,"pagination")]/li/a/@href').extract()

        # NOTE(review): the same pagination links presumably recur on every
        # page; this relies on Scrapy's default duplicate-request filtering to
        # avoid re-crawling pages and to end the crawl - confirm.
        for next_page in next_pages:
            yield response.follow(url=next_page, callback=self.parse)
            
# Start the spider process: create a crawler process, schedule the Prop24
# spider on it and start the crawl. While it runs, Prop24.parse appends the
# scraped values to the module-level Features dict.
process = CrawlerProcess()
process.crawl(Prop24)
# NOTE(review): start() presumably blocks until the crawl has finished, and the
# underlying Twisted reactor cannot be restarted - re-running this cell likely
# requires a fresh kernel. Confirm against the Scrapy documentation.
process.start()

In [14]:
# Build a DataFrame from the scraped features, order the rows by location
# (ascending) and use the location as the index.
features_data = (
    pd.DataFrame(Features)
    .sort_values(by=['Location'])
    .set_index(['Location'])
)

# Write the DataFrame to an Excel workbook.
features_data.to_excel('Western_Cape_Housing_Sales.xls')
