In [1]:
import datetime
import os
import time
import traceback
from pathlib import Path

import googlemaps
import pandas as pd
import requests
import schedule
from bs4 import BeautifulSoup

In [None]:
# THIS FUNCTION WILL FETCH THE DATA FROM THE URL AND PASS IT TO BEAUTIFULSOUP 
def connection_zoopla(area='Edinburgh', size=25, timeout=30):
    """Download the first page of Zoopla for-sale results and parse it.

    Parameters
    ----------
    area : str, default 'Edinburgh'
        Search area. Used both as the URL path segment and as the ``q``
        query value, so another city can be scraped by passing its name.
    size : int, default 25
        Number of listings requested per page (the ``page_size`` value).
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot connect, or the server answers
        with an HTTP error status.
    """
    url = 'https://www.zoopla.co.uk/for-sale/property/%s/' % area
    # Let requests build and escape the query string instead of hand-formatting it.
    query = {
        'q': area,
        'search_source': 'home',
        'pn': 1,
        'view_type': 'list',
        'page_size': size,
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.get(url, params=query, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as if it held listings.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html5lib')

In [None]:
# This function will filter required data from BeautifulSoup and put it in a dictionary.
def get_data_zoopla(soup):
    """Extract listing details from a parsed Zoopla results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed results page, as returned by ``connection_zoopla``.

    Returns
    -------
    dict of str -> list of str
        Keys ``'Address'``, ``'Asking Price'``, ``'Date Posted'``,
        ``'Number of beds'`` and ``'Estate Agent'``; each list holds one entry
        per listing, in page order. Dates are formatted ``dd-mm-YYYY``.
        All lists are empty when the page has no results list.

    Raises
    ------
    AttributeError, IndexError, ValueError
        If a listing lacks one of the expected elements or its text does not
        have the expected layout.
    """
    listings = {
        'Address': [],
        'Asking Price': [],
        'Date Posted': [],
        'Number of beds': [],
        'Estate Agent': [],
    }

    # The <ul> that wraps every property on the page.
    table = soup.find('ul', attrs={'class': 'listing-results clearfix js-gtm-list'})
    if table is None:
        # No results list on the page: report zero listings instead of failing on None.
        return listings

    for row in table.find_all('li', attrs={'class': 'srp clearfix'}):
        # Price: drop spaces, the pound sign and thousands separators, then keep
        # the first remaining token (any text after the figure is on later lines).
        price_text = row.find('a', attrs={'class': 'listing-results-price text-price'}).text
        price = price_text.replace(' ', '').replace('£', '').replace(',', '').split()[0]

        # Beds: the first word of the listing heading.
        beds = row.find('h2', attrs={'class': 'listing-results-attr'}).text.split()[0]

        # Assumes the marketed line reads like "Listed on 15th Jan 2020 by <agent>":
        # tokens 2-4 are the date and everything from token 6 onwards is the agent.
        marketed = row.find('p', attrs={'class': 'top-half listing-results-marketed'}).text.split()

        # Keep only the digits of the day so every ordinal suffix (st/nd/rd/th) is removed.
        day = ''.join(ch for ch in marketed[2] if ch.isdigit())
        posted = datetime.datetime.strptime('-'.join([day, marketed[3], marketed[4]]), '%d-%b-%Y')

        agent = ' '.join(marketed[6:])

        address = row.find('a', attrs={'class': 'listing-results-address'}).text

        listings['Address'].append(address)
        listings['Asking Price'].append(price)
        listings['Date Posted'].append(posted.strftime('%d-%m-%Y'))
        listings['Number of beds'].append(beds)
        listings['Estate Agent'].append(agent)

    return listings

In [None]:
# This function builds a DataFrame by:
    # 1- Adding the columns from the dictionary created previously.
    # 2- Adding two additional columns: 'Address_input', which is used later to search for the sold price, and 'Sold Price', which is populated later.
    # 3- Adding two fields (LAT, LON) derived from the address.
    # 4- Removing incomplete addresses that don't have a house number and cannot be tracked later (the code relies on every complete address starting with a number, e.g. 5/3 Hartington... or 66 Hillside...).
    
def create_dataframe(data_dictionary, gmaps_client=None):
    """Build the listings DataFrame from the scraped columns.

    Parameters
    ----------
    data_dictionary : dict
        Column lists with the keys ``'Address'``, ``'Asking Price'``,
        ``'Date Posted'`` (``dd-mm-YYYY`` strings), ``'Number of beds'`` and
        ``'Estate Agent'``, as produced by ``get_data_zoopla``.
    gmaps_client : optional
        Object with a ``geocode(address)`` method (e.g. ``googlemaps.Client``).
        When omitted, a client is created from the ``GOOGLE_MAPS_API_KEY``
        environment variable if it is set; otherwise geocoding is skipped and
        ``LAT``/``LON`` stay empty.

    Returns
    -------
    pandas.DataFrame
        One row per address that starts with a digit 1-9, with the columns
        Address, Asking Price, Date Posted, Number of beds, LAT, LON,
        Sold Price, Address_input, Estate Agent (any extra input columns
        follow after these).
    """
    df = pd.DataFrame(data=data_dictionary)

    # The scraper emits day-first dates; spell the format out so that e.g.
    # 03-01-2020 is read as 3 January, not 1 March.
    df['Date Posted'] = pd.to_datetime(df['Date Posted'], format='%d-%m-%Y')
    # Non-numeric prices (e.g. "POA") become NaN instead of aborting the run.
    df['Asking Price'] = pd.to_numeric(df['Asking Price'], errors='coerce')

    # Escape the address for use in a URL. regex=False keeps "(" and ")"
    # literal regardless of the pandas version's default.
    encoded = df['Address']
    for raw, escaped in ((' ', '+'), ('/', '%2F'), (',', '%2C'),
                         ('(', '%28'), (')', '%29'), ('"', '%22')):
        encoded = encoded.str.replace(raw, escaped, regex=False)
    df['Address_input'] = encoded

    df['LAT'] = None
    df['LON'] = None

    # Keep only addresses that begin with a house number. Copy so the writes
    # below modify an independent frame rather than a slice of the original.
    has_house_number = df['Address'].astype(str).str.startswith(tuple('123456789'))
    df = df[has_house_number].copy()

    # The API key is read from the environment so it never lives in the notebook.
    if gmaps_client is None:
        api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
        if api_key:
            gmaps_client = googlemaps.Client(key=api_key)

    if gmaps_client is not None:
        for idx, address in df['Address'].items():
            geocode_result = gmaps_client.geocode(address)
            try:
                location = geocode_result[0]['geometry']['location']
                lat, lon = location['lat'], location['lng']
            except (IndexError, KeyError, TypeError):
                # No usable match for this address: leave LAT/LON empty.
                continue
            df.at[idx, 'LAT'] = lat
            df.at[idx, 'LON'] = lon

    df['Sold Price'] = None

    # Apply the intended column order, keeping any unexpected extra columns at the end.
    ordered = ['Address', 'Asking Price', 'Date Posted', 'Number of beds',
               'LAT', 'LON', 'Sold Price', 'Address_input', 'Estate Agent']
    extras = [name for name in df.columns if name not in ordered]
    return df[ordered + extras]

In [None]:
# This function is used to create CSV file from DataFrame.
def create_file(df, file_name='Edinburgh_Final'):
    """Merge ``df`` into the CSV file ``file_name``, creating it if needed.

    When the file already exists its rows are loaded, the new rows are
    appended, and for any address that appears more than once only the first
    (i.e. the previously stored) row is kept. The result overwrites the file.

    Parameters
    ----------
    df : pandas.DataFrame
        Newly scraped listings; must contain an ``'Address'`` column when the
        file already exists.
    file_name : str or path-like, default 'Edinburgh_Final'
        Location of the CSV file.
    """
    path = Path(file_name)

    if path.is_file():
        # NOTE(review): no parse_dates here, so dates in the reloaded rows come
        # back as plain strings and 'Date Posted' may hold mixed types after the
        # concat - confirm whether it should be normalised.
        old_data = pd.read_csv(path, encoding='utf-8')
        new_data = pd.concat([old_data, df], ignore_index=True)
        # Drop repeated listings (rows), keeping the one already on file.
        new_data = new_data.drop_duplicates(subset='Address', keep='first')
    else:
        new_data = df

    # Write with the same encoding that read_csv uses above; relying on the
    # platform default can make the file unreadable on the next run.
    new_data.to_csv(path, index=False, encoding='utf-8')

In [None]:
# This function is used to call all the above functions in order, to scrape the data and create dataframe.
def final_scraper():
    """Run the whole pipeline once: download, extract, tabulate and save.

    Returns
    -------
    pandas.DataFrame
        The frame built from the freshly scraped page (the same frame that
        was handed to ``create_file``).
    """
    soup = connection_zoopla()
    # The extraction function is named get_data_zoopla; calling `get_data`
    # raised NameError because no such function is defined.
    data_dictionary = get_data_zoopla(soup)
    df = create_dataframe(data_dictionary)
    create_file(df)
    return df

In [None]:
# Schedule the scraper to run automatically once every day.

SCRAPER_JOB_TAG = 'zoopla-daily-scrape'


def _run_scraper_job():
    """Run one scrape, reporting any failure instead of propagating it.

    ``schedule`` does not catch exceptions raised by a job, so an unhandled
    network or parsing error would end the loop below and no further scrapes
    would ever run.
    """
    try:
        final_scraper()
    except Exception:
        traceback.print_exc()


# Re-running this cell must not register a second copy of the job.
schedule.clear(SCRAPER_JOB_TAG)
schedule.every(1).day.at("23:00").do(_run_scraper_job).tag(SCRAPER_JOB_TAG)
# For a quick manual check, use this line instead of the one above:
#schedule.every(1).minutes.do(_run_scraper_job).tag(SCRAPER_JOB_TAG)

try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the kernel is the way to stop the scheduler; exit quietly.
    print('Scheduler stopped.')