In [72]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import numpy as np
import time

In [95]:
def import_district_file(file_location):
    """Load the district CSV at *file_location* into a DataFrame.

    Args:
        file_location: Path (or any source ``pandas.read_csv`` accepts)
            of the CSV file to read.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    return pd.read_csv(file_location)

def get_district_dict():
    """Build a ``{district: code}`` lookup from the London district CSV.

    The file is read through ``import_district_file`` from a fixed local
    path; its ``district`` column supplies the keys and its ``code``
    column the values.

    Returns:
        dict mapping each district to its code.
    """
    codes_table = import_district_file('C:/Users/ballinj/housing/london_district_codes.csv')
    districts = codes_table['district']
    codes = codes_table['code']
    return dict(zip(districts, codes))

def get_borough_dict():
    """Build a ``{borough: code}`` lookup from ``london_borough_list.csv``.

    The CSV is read from the current working directory; its ``borough``
    column supplies the keys and its ``code`` column the values.

    Returns:
        dict mapping each borough to its code.
    """
    boroughs_table = pd.read_csv('london_borough_list.csv')
    names = boroughs_table['borough']
    codes = boroughs_table['code']
    return dict(zip(names, codes))

def get_no_results(soup):
    """Return the result count shown in the search header as a string.

    Args:
        soup: Parsed results page; must expose a ``find`` method that
            locates the ``span`` with class ``searchHeader-resultCount``.

    Returns:
        The span's text with surrounding whitespace and any thousands
        separators removed, so the caller can pass it straight to ``int``.

    Raises:
        AttributeError: If the page contains no result-count span.
    """
    count_span = soup.find('span', attrs={'class': 'searchHeader-resultCount'})
    # Drop commas in case the count is rendered as e.g. "1,234"; int() would
    # otherwise reject it downstream.
    return count_span.text.strip().replace(',', '')

def get_soup(index, region):
    """Fetch one page of Rightmove list-view search results and parse it.

    Args:
        index: Value sent as the ``index`` query parameter (the offset of
            the first result on the requested page).
        region: Key into the module-level ``district_dict``; the mapped
            code is used to build the ``locationIdentifier`` parameter.

    Returns:
        ``BeautifulSoup`` object for the response body.

    Raises:
        KeyError: If *region* is not in ``district_dict``.
        requests.exceptions.RequestException: On connection failure or if
            the server does not answer within the timeout.
    """
    district_id = str(district_dict[region])
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    url = 'https://www.rightmove.co.uk/property-for-sale/find.html'
    params = {'minBedrooms':2,
              'propertyTypes':'detached%2Cflat%2Csemi-detached%2Cterraced',
              'keywords':'',
              'dontShow':'retirement%2CsharedOwnership',
              'channel':'BUY',
              'secondaryDisplayPropertyType':'housesandflats',
              'index': str(index),
              'retirement':'false',
              'includeSSTC':'false',
              'partBuyPartRent':'false',
              'sortType':2,
              'minPrice':200000,
              'viewType':'list',
              'maxPrice':450000,
              'radius':0.0,
              # Keep only what follows the first '%' of the stored code and
              # re-prefix it with 'OUTCODE%'.
              'locationIdentifier':'OUTCODE%' + district_id[district_id.find('%')+1:]}
    # The values above are already percent-encoded, so the query string is
    # assembled by hand and passed as a string rather than as a dict.
    params_string = "&".join("%s=%s" % (k,v) for k,v in params.items())
    # Send the browser User-Agent that was previously built but never used,
    # and bound the wait so a stalled connection cannot hang the whole run.
    r = requests.get(url, params=params_string, headers=headers, timeout=30)
    return BeautifulSoup(r.content, 'html.parser')

def get_json(index, region):
    """Fetch the Rightmove map-view page and return its embedded JSON model.

    The page is searched for the ``<script>`` element whose text contains
    ``window.jsonModel``; everything from its first ``{`` to its last ``}``
    is decoded as JSON.

    Args:
        index: Value sent as the ``index`` query parameter.
        region: Key into the module-level ``district_dict``; the mapped
            code is used to build the ``locationIdentifier`` parameter.

    Returns:
        The decoded JSON object.

    Raises:
        KeyError: If *region* is not in ``district_dict``.
        IndexError: If no script containing ``window.jsonModel`` is found.
        json.JSONDecodeError: If the extracted text is not valid JSON.
        requests.exceptions.RequestException: On connection failure or if
            the server does not answer within the timeout.
    """
    district_id = str(district_dict[region])
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    url = 'https://www.rightmove.co.uk/property-for-sale/map.html'
    params = {'minBedrooms':2,
              'propertyTypes':'detached%2Cflat%2Csemi-detached%2Cterraced',
              'keywords':'',
              'dontShow':'retirement%2CsharedOwnership',
              'channel':'BUY',
              'secondaryDisplayPropertyType':'housesandflats',
              'index': str(index),
              'retirement':'false',
              'includeSSTC':'false',
              'partBuyPartRent':'false',
              'sortType':2,
              'minPrice':200000,
              'viewType':'map',
              'maxPrice':450000,
              'radius':0.0,
              # Keep only what follows the first '%' of the stored code and
              # re-prefix it with 'OUTCODE%'.
              'locationIdentifier':'OUTCODE%' + district_id[district_id.find('%')+1:]}
    # The values above are already percent-encoded, so the query string is
    # assembled by hand and passed as a string rather than as a dict.
    params_string = "&".join("%s=%s" % (k,v) for k,v in params.items())
    # Send the browser User-Agent that was previously built but never used,
    # and bound the wait so a stalled connection cannot hang the whole run.
    r = requests.get(url, params=params_string, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Take the first script that carries the page's JSON model.
    model_script = None
    for candidate in soup.findAll('script'):
        candidate_text = str(candidate)
        if 'window.jsonModel' in candidate_text:
            model_script = candidate_text
            break
    if model_script is None:
        # Same exception type the old ``script_list[0]`` raised, but with a
        # message that says what went wrong.
        raise IndexError("no <script> containing 'window.jsonModel' found for region %r" % (region,))

    # Slice out the outermost {...} so the surrounding JavaScript is dropped.
    model_text = model_script[model_script.find('{'):model_script.rfind('}')+1]
    properties_json = json.loads(model_text)
    print('json retrieved')
    return properties_json

def _agent_name_from_branch_text(branch_text):
    """Return the text that follows the first 'by' in *branch_text*.

    If 'by' does not occur, the stripped text is returned unchanged instead
    of losing its leading characters.
    """
    _, marker, remainder = branch_text.partition('by')
    if not marker:
        return branch_text.strip()
    return remainder.strip()


def format_data_soup(soup, region):
    """Scrape every list-view results page for *region* into a DataFrame.

    The result count is read from *soup*; the pages themselves are then
    fetched with ``get_soup`` in steps of 24, pausing 2 seconds after each
    request.

    Args:
        soup: Parsed results page used only to read the total result count.
        region: District key forwarded to ``get_soup``.

    Returns:
        ``pandas.DataFrame`` with columns ``listing_id``, ``address``,
        ``property_type``, ``property_link``, ``price``,
        ``added/reduced_date``, ``agent_name``, ``agent_number``,
        ``no_pictures`` and ``featured_property``. Rows are dropped when the
        property type contains 'share' or 'Parking', when the address is
        empty, or when the featured-title text is non-empty; the index is
        reset afterwards.

    Raises:
        AttributeError: If an expected element is missing from a page.
        ValueError: If the result count is not numeric, or if the number of
            listing anchors differs from the number of listing cards.
    """
    no_results = get_no_results(soup)
    print(no_results + ' results found')
    print('obtaining soup...')

    page_size = 24
    # Tolerate a count rendered with thousands separators.
    total_results = int(str(no_results).replace(',', ''))
    # Stop at the last page that can hold results; the previous
    # ``int(no_results) + 24`` stop always requested one page too many.
    # max(..., 1) keeps the single page-0 request when the count is zero.
    page_offsets = range(0, max(total_results, 1), page_size)

    listing_ids, links, property_types, addresses, prices = [], [], [], [], []
    featured_properties, added_reduced_array = [], []
    letting_agent_name, letting_agent_number, num_pictures = [], [], []

    for index in page_offsets:
        page = get_soup(index, region)
        search_results = page.find('div', attrs={'class':'l-searchResults'})

        # Anchor ids carry a 4-character prefix ahead of the listing id.
        for anchor in search_results.findAll('a', attrs={'class':'propertyCard-anchor'}):
            listing_ids.append(anchor['id'][4:])

        for listing in search_results.findAll('div', attrs={'class':'propertyCard-wrapper'}):
            featured_properties.append(listing.find('div', attrs={'class':'propertyCard-moreInfoFeaturedTitle'}).text.strip())

            details = listing.find('div', attrs={'class':'propertyCard-details'})
            addresses.append(listing.find('address').text.strip())
            property_types.append(listing.find('h2').text.strip())
            links.append('https://www.rightmove.co.uk/' + details.find('a')['href'])

            pricing = listing.find('div', attrs={'class':'propertyCard-price'})
            prices.append(pricing.find('div', attrs={'class':'propertyCard-priceValue'}).text.strip())

            branch_summary = listing.find('div', attrs={'class':'propertyCard-branchSummary'})
            added_reduced_array.append(branch_summary.find('span', attrs={'class':'propertyCard-branchSummary-addedOrReduced'}).text.strip())
            branch_text = branch_summary.find('span', attrs={'class':'propertyCard-branchSummary-branchName'}).text.strip()
            letting_agent_name.append(_agent_name_from_branch_text(branch_text))

            contacts = listing.find('div', attrs={'class':'propertyCard-contacts'})
            letting_agent_number.append(contacts.find('a', attrs={'class':'propertyCard-contactsPhoneNumber'}).text.strip())

            meta_data = listing.find('div', attrs={'class':'propertyCard-moreInfoMeta'})
            num_pictures.append(meta_data.find('span', attrs={'class':'propertyCard-moreInfoNumber'}).text.strip())
        # Pause between page requests.
        time.sleep(2)

    listing_df = pd.DataFrame(listing_ids, columns=['listing_id'])
    listing_df['address'] = addresses
    listing_df['property_type'] = property_types
    listing_df['property_link'] = links
    listing_df['price'] = prices
    listing_df['added/reduced_date'] = added_reduced_array
    listing_df['agent_name'] = letting_agent_name
    listing_df['agent_number'] = letting_agent_number
    listing_df['no_pictures'] = num_pictures
    listing_df['featured_property'] = featured_properties

    listing_df = listing_df[~listing_df['property_type'].str.contains('share')]
    listing_df = listing_df[~listing_df['property_type'].str.contains('Parking')]
    listing_df = listing_df[listing_df['address']!=""]
    listing_df = listing_df[listing_df['featured_property']==""]
    listing_df = listing_df.reset_index(drop=True)
    print('soup formatted')
    return listing_df

def format_data_json(properties_json):
    """Map each property id to its ``[latitude, longitude]`` pair.

    Args:
        properties_json: Mapping with a ``'properties'`` entry; each item
            there has an ``'id'`` and a ``'location'`` holding
            ``'latitude'`` and ``'longitude'``.

    Returns:
        dict keyed by the id converted to ``str``, with a two-element list
        ``[latitude, longitude]`` as value. A later entry with the same id
        overwrites an earlier one.
    """
    coordinates_dict = {
        str(entry['id']): [entry['location']['latitude'], entry['location']['longitude']]
        for entry in properties_json['properties']
    }
    print('json formatted')
    return coordinates_dict

def housing_data_merge(housing_soup_data, housing_json_data):
    """Add ``latitude`` and ``longitude`` columns to the listings frame.

    Args:
        housing_soup_data: DataFrame with a ``listing_id`` column. It is
            modified in place.
        housing_json_data: dict mapping listing id (as ``str``) to a
            ``[latitude, longitude]`` pair.

    Returns:
        The same DataFrame object, now carrying the two coordinate columns.
        Listings whose id has no entry in *housing_json_data* get NaN for
        both coordinates rather than aborting the merge with a KeyError.
    """
    nan = float('nan')
    no_coordinates = (nan, nan)
    coordinate_pairs = [
        housing_json_data.get(str(listing_id), no_coordinates)
        for listing_id in housing_soup_data['listing_id']
    ]
    housing_soup_data['latitude'] = [pair[0] for pair in coordinate_pairs]
    housing_soup_data['longitude'] = [pair[1] for pair in coordinate_pairs]
    return housing_soup_data

# Driver: scrape every district in the CSV and accumulate the results.

# Module-level lookup; get_soup() and get_json() read it by name to turn a
# district into its location code, so it must exist before they are called.
district_dict = get_district_dict()

# Districts to process, taken from the 'district' column of the same CSV.
district_list = import_district_file('C:/Users/ballinj/housing/london_district_codes.csv')['district'].tolist()
# 'CR0' is left out of the run; remove() raises ValueError if it is absent.
district_list.remove('CR0')
# Running total; each district's rows are appended below.
housing_data_total = pd.DataFrame()
for district in district_list:
    print(district)
    # List view: first page supplies the result count, then all pages are scraped.
    housing_soup = get_soup(index=0, region=district)
    housing_soup_data = format_data_soup(housing_soup, region=district)
    # Map view: JSON model supplies coordinates keyed by property id.
    housing_json = get_json(index=0, region=district)
    housing_json_data = format_data_json(housing_json)
    # Attach latitude/longitude to the scraped listings and tag the district.
    housing_data_merge_df = housing_data_merge(housing_soup_data, housing_json_data)
    housing_data_merge_df['district'] = district
    # Concatenating inside the loop keeps the partial total available if the
    # run is stopped part-way through.
    housing_data_total = pd.concat([housing_data_total,housing_data_merge_df], ignore_index=True)
    # Pause between districts.
    time.sleep(5)
    print(district + ' complete')
    print('\n')

BR1
142 results found
obtaining soup...
soup formatted
json retrieved
json formatted
BR1 complete



BR2
140 results found
obtaining soup...


KeyboardInterrupt: 