In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import numpy as np
import time
import datetime
from datetime import timedelta
import glob
import os
from sqlalchemy import create_engine
from matplotlib import pyplot as plt
from math import radians, cos, sin, asin, sqrt

In [None]:
today = datetime.datetime.today().date()
yesterday = datetime.datetime.today().date()-timedelta(1)

def import_district_file(file_location):
    district_df = pd.read_csv(file_location)
    return district_df

def get_district_dict():
    district_df = import_district_file('C:/Users/ballinj/housing/london_district_codes.csv')
    district_dict = dict(zip(list(district_df['district']), list(district_df['code'])))
    return district_dict

def get_borough_dict():
    df = pd.read_csv('london_borough_list.csv')
    borough_dict = dict(zip(list(df['borough']),list(df['code'])))
    return borough_dict

def get_no_results(soup):
    no_results = soup.find('span', attrs={'class':'searchHeader-resultCount'}).text.strip()
    return no_results

def import_previous_file():
    list_of_files = glob.glob('C:/Users/ballinj/housing/data/london/rightmove/*.csv')
    latest_file = max(list_of_files, key=os.path.getctime)
    combined_df_old = pd.read_csv(latest_file, index_col=False)
    return combined_df_old

def get_individual_soup(url):
    cert = "C:/Users/ballinj/housing/ca-certificates.crt"
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    r = requests.get(url)
    c = r.content    
    soup = BeautifulSoup(c, 'html.parser')
    return(soup)

def get_soup(index, region):
    district_id = district_dict[region]
    cert = "C:/Users/ballinj/housing/ca-certificates.crt"
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    url = 'https://www.rightmove.co.uk/property-for-sale/find.html'
    params = {'minBedrooms':1,
              'propertyTypes':'detached%2Cflat%2Csemi-detached%2Cterraced',
              'keywords':'',                  
              'dontShow':'retirement%2CsharedOwnership',
              'channel':'BUY',
              'secondaryDisplayPropertyType':'housesandflats',
              'index': str(index), 
              'retirement':'false',
              'includeSSTC':'false',
              'partBuyPartRent':'false',
              'sortType':2,
              'minPrice':200000,
              'viewType':'list',
              'maxPrice':450000,
              'radius':0.0,
              'locationIdentifier':'OUTCODE%' + str(district_id)[str(district_id).find('%')+1:]}
    params_string = "&".join("%s=%s" % (k,v) for k,v in params.items())
    loaded = False
    while not loaded:
        r = requests.get(url, params=params_string)
        c = r.content    
        soup = BeautifulSoup(c, 'html.parser') 
        if soup.findAll('a', attrs={'class':'propertyCard-anchor'}) != None and soup.find('span', attrs={'class':'searchHeader-resultCount'}) != None:
            loaded = True
        else:
            print('refreshing soup')
    return soup

def get_json(index, region):
    district_id = district_dict[region]
    cert = "C:/Users/ballinj/housing/ca-certificates.crt"
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    url = 'https://www.rightmove.co.uk/property-for-sale/map.html'
    params = {'minBedrooms':1,
              'propertyTypes':'detached%2Cflat%2Csemi-detached%2Cterraced',
              'keywords':'',                  
              'dontShow':'retirement%2CsharedOwnership',
              'channel':'BUY',
              'secondaryDisplayPropertyType':'housesandflats',
              'index': str(index), 
              'retirement':'false',
              'includeSSTC':'false',
              'partBuyPartRent':'false',
              'sortType':2,
              'minPrice':200000,
              'viewType':'map',
              'maxPrice':450000,
              'radius':0.0,
              'locationIdentifier':'OUTCODE%' + str(district_id)[str(district_id).find('%')+1:]}
    params_string = "&".join("%s=%s" % (k,v) for k,v in params.items())
    loaded = False
    while not loaded:
        r = requests.get(url, params=params_string)
        c = r.content
        soup = BeautifulSoup(c, 'html.parser')
        scripts = soup.findAll('script')
        script_list = [script if 'window.jsonModel' in str(script) else '' for script in scripts]
        script_list = [script for script in script_list if script != '']
        if len(script_list) != 0:
            script = str(script_list[0])
            script = script[script.find('{'):script.rfind('}')+1]
            properties_json = json.loads(script)
            properties_json = properties_json['properties']
            loaded = True
        else:
            print('refreshing json')
    print('json retrieved')
    return properties_json

def get_CR0_json(index):
    district_id = district_dict['CR0']
    cert = "C:/Users/ballinj/housing/ca-certificates.crt"
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}
    url = 'https://www.rightmove.co.uk/property-for-sale/map.html'
    viewport_list = ['-0.24224%2C0.0873501%2C51.3716%2C51.4672','-0.240351%2C0.0892384%2C51.282%2C51.3777']
    properties_json_list = []
    for viewport in viewport_list:
        params = {'minBedrooms':1,
                  'propertyTypes':'detached%2Cflat%2Csemi-detached%2Cterraced',
                  'keywords':'',                  
                  'dontShow':'retirement%2CsharedOwnership',
                  'channel':'BUY',
                  'secondaryDisplayPropertyType':'housesandflats',
                  'index': str(0), 
                  'retirement':'false',
                  'includeSSTC':'false',
                  'partBuyPartRent':'false',
                  'sortType':2,
                  'minPrice':200000,
                  'viewType':'map',
                  'maxPrice':450000,
                  'radius':0.0,
                  'locationIdentifier':'OUTCODE%' + str(district_id)[str(district_id).find('%')+1:],
                  'viewport':viewport}
        params_string = "&".join("%s=%s" % (k,v) for k,v in params.items())
        r = requests.get(url, params=params_string)
        c = r.content
        soup = BeautifulSoup(c, 'html.parser')
        scripts = soup.findAll('script')
        script_list = [script if 'window.jsonModel' in str(script) else '' for script in scripts]
        script_list = [script for script in script_list if script != '']
        script = str(script_list[0])
        script = script[script.find('{'):script.rfind('}')+1]
        properties_json = json.loads(script)
        properties_json = properties_json['properties']
        for p in properties_json:
            if str(p['id']) not in [str(property['id']) for property in properties_json_list]:
                properties_json_list.append(p)
    return properties_json_list

def format_data_soup(soup, region):
    no_results = get_no_results(soup)
    print(no_results + ' results found')
    print('obtaining soup...')
    index_array = np.arange(0,int(no_results)+24,24).tolist()
    listing_ids, links, property_types, addresses, prices, featured_properties = [],[],[],[],[],[]
    added_reduced_array, letting_agent_name, letting_agent_number, num_pictures = [],[],[],[]
    for index in index_array:
        soup = get_soup(index, region)
        time.sleep(0.5)
        main_data = soup.find('div', attrs={'class':'main'})
        search_results = soup.find('div', attrs={'class':'l-searchResults'})
        ids = soup.findAll('a', attrs={'class':'propertyCard-anchor'})#['id']
        for id in ids:
            listing_ids.append(id['id'][4:])
        listing_data = search_results.findAll('div', attrs={'class':'propertyCard-wrapper'})
        for listing in listing_data:
            featured_properties.append(listing.find('div', attrs={'class':'propertyCard-moreInfoFeaturedTitle'}).text.strip())

            details = listing.find('div', attrs={'class':'propertyCard-details'})
            addresses.append(listing.find('address').text.strip())
            property_types.append(listing.find('h2').text.strip())
            links.append('https://www.rightmove.co.uk' + details.find('a')['href'])

            pricing = listing.find('div', attrs={'class':'propertyCard-price'})
            prices.append(pricing.find('div', attrs={'class':'propertyCard-priceValue'}).text.strip())
            added_reduced_array.append(listing.find('div', attrs={'class':'propertyCard-branchSummary'}).find('span', attrs={'class':'propertyCard-branchSummary-addedOrReduced'}).text.strip())
            estate_agent = listing.find('div', attrs={'class':'propertyCard-branchSummary'}).find('span', attrs={'class':'propertyCard-branchSummary-branchName'}).text.strip()
            estate_agent = estate_agent[estate_agent.find('by')+3:].strip()
            letting_agent_name.append(estate_agent)
            letting_agent_number.append(listing.find('div', attrs={'class':'propertyCard-contacts'}).find('a', attrs={'class':'propertyCard-contactsPhoneNumber'}).text.strip())
            meta_data = listing.find('div', attrs={'class':'propertyCard-moreInfoMeta'})
            num_pictures.append(meta_data.find('span', attrs={'class':'propertyCard-moreInfoNumber'}).text.strip())
    listing_df = pd.DataFrame(listing_ids, columns=['listing_id'])
    listing_df['address'] = addresses
    listing_df['property_type'] = property_types
    listing_df['property_link'] = links
    listing_df['price'] = prices
    listing_df['added/reduced_date'] = added_reduced_array
    listing_df['agent_name'] = letting_agent_name
    listing_df['agent_number'] = letting_agent_number
    listing_df['no_pictures'] = num_pictures
    listing_df['featured_property'] = featured_properties
    listing_df = listing_df[~listing_df['property_type'].str.contains('share')]
    listing_df = listing_df[~listing_df['property_type'].str.contains('Parking')]
    listing_df = listing_df[listing_df['address']!=""]
    listing_df = listing_df[listing_df['featured_property']==""]
    listing_df = listing_df.reset_index(drop=True)
    return listing_df

def format_missed_data(housing_soup_data, properties_json_list):
    soup_list = housing_soup_data['listing_id'].tolist()
    json_list = list(properties_json_list)
    missed_ids, missed_links = [],[]
    for item in json_list:
        if item not in soup_list:
            missed_ids.append(item)
    missed_links = ['https://www.rightmove.co.uk/property-for-sale/property-' + str(item) + '.html' for item in missed_ids]
    listing_ids, links, property_types, addresses, prices, featured_properties = [],[],[],[],[],[]
    added_reduced_array, letting_agent_name, letting_agent_number, num_pictures = [],[],[],[]

    for id,link in list(zip(missed_ids,missed_links))[:10]:
        listing_ids.append(id)
        links.append(link)
        soup = get_individual_soup(link)
        listing_details = soup.find('div', attrs={'id':'primaryContent'})
        property_types.append(listing_details.find('h1', attrs={'class':'fs-22'}).text.strip())
        addresses.append(listing_details.find('address', attrs={'itemprop':'address'}).text.strip())
        prices.append(listing_details.find('p', attrs={'id':'propertyHeaderPrice'}).text.strip())
        try:
            added_reduced = soup.find('div', attrs={'id':'firstListedDate'}).text.strip().replace(' Rightmove:','') + ' '
            added_reduced = added_reduced + datetime.datetime.strptime(soup.find('div', attrs={'id':'firstListedDateValue'}).text.strip(), '%d %B %Y').strftime('%d/%m/%Y')
            added_reduced_array.append(added_reduced)
        except AttributeError:
            added_reduced_array.append(None)
        letting_agent_name.append(soup.find('a', attrs={'id':'aboutBranchLink'}).text.strip())
        letting_agent_number.append(soup.find('div', attrs={'id':'requestdetails'}).contents[4].contents[1].text.strip())
        num_pictures.append(soup.find('span', attrs={'class':'gallery-main-status'}).text.strip()[-2:].strip())
        time.sleep(1)
    listing_df = pd.DataFrame(listing_ids, columns=['listing_id'])
    listing_df['address'] = addresses
    listing_df['property_type'] = property_types
    listing_df['property_link'] = links
    listing_df['price'] = prices
    listing_df['added/reduced_date'] = added_reduced_array
    listing_df['agent_name'] = letting_agent_name
    listing_df['agent_number'] = letting_agent_number
    listing_df['no_pictures'] = num_pictures
    listing_df['featured_property'] = ''
    listing_df = listing_df[~listing_df['property_type'].str.contains('share')]
    listing_df = listing_df[~listing_df['property_type'].str.contains('Parking')]
    listing_df = listing_df[listing_df['address']!=""]
    listing_df = listing_df.reset_index(drop=True)
    housing_soup_data = pd.concat([housing_soup_data,listing_df], ignore_index=True)
    return housing_soup_data
    

def format_data_json(properties_json):
    property_id, coordinates = [],[]
    for row in properties_json:
        property_id.append(str(row['id']))
        coordinates.append([row['location']['latitude'], row['location']['longitude']])
    coordinates_dict = dict(zip(property_id, coordinates))
    print('json formatted')
    return coordinates_dict

def housing_data_merge(housing_soup_data, housing_json_data):
    latitudes, longitudes = [],[]
    housing_soup_data = housing_soup_data[housing_soup_data['listing_id'].isin(list(housing_json_data.keys()))]
    for id in housing_soup_data['listing_id'].tolist():
        latitudes.append(housing_json_data[str(id)][0])
        longitudes.append(housing_json_data[str(id)][1])
    housing_soup_data['latitude'] = latitudes
    housing_soup_data['longitude'] = longitudes
    return housing_soup_data

def format_housing_data_total(housing_data_total):
    print(datetime.datetime.strftime(datetime.datetime.today(),'%H:%M:%S') + ' - formatting data')
    price_list = [price.replace('£','').replace(',','') for price in housing_data_total['price'].tolist()]
    housing_data_total.drop(columns=['price'])
    housing_data_total['price'] = price_list
    housing_data_total['most_recent_scrape_date'] = datetime.datetime.today().date()
    room_df = pd.DataFrame(housing_data_total['property_type'].str.split(' bedroom ',1).tolist(),
                                       columns = ['no_rooms','property_type'])
    housing_data_total['added/reduced_date'] = housing_data_total['added/reduced_date'].fillna('unknown')
    added_reduced_list = [ele.replace(' yesterday',' on ' + str(yesterday)).replace(' today',' on ' + str(today)) for ele in housing_data_total['added/reduced_date'].tolist()]
    housing_data_total = housing_data_total.drop(columns=['added/reduced_date'])
    housing_data_total['added/reduced_date'] = added_reduced_list
    reduced_df = pd.DataFrame(housing_data_total['added/reduced_date'].str.split(' on ',1).tolist(),
                                       columns = ['added/reduced','added/reduced_date'])
    reduced_df['added/reduced_date'] = reduced_df['added/reduced_date'].fillna('unknown')
    housing_data_total = housing_data_total.drop(columns=['property_type','added/reduced_date'])
    housing_data_total = pd.concat([housing_data_total,room_df], axis=1)
    housing_data_total = pd.concat([housing_data_total,reduced_df], axis=1)
    housing_data_total['property_type'] = housing_data_total['property_type'].str.replace(' for sale','')
    housing_data_total_old = import_previous_file()
    initial_scrape_date = []
    for index, row in housing_data_total.iterrows():
        if int(row['listing_id']) not in housing_data_total_old['listing_id'].tolist():
            initial_scrape_date.append(str(datetime.datetime.today().date()))
        else:
            required_date = housing_data_total_old[housing_data_total_old['listing_id']==int(row['listing_id'])]['initial_scrape_date'].tolist()[0]
            initial_scrape_date.append(required_date)
    housing_data_total['initial_scrape_date'] = initial_scrape_date
    housing_data_total = housing_data_total[['listing_id',
                                             'district',
                                             'address',
                                             'price',
                                             'no_rooms',
                                             'property_type',
                                             'property_link',
                                             'added/reduced',
                                             'added/reduced_date',
                                             'initial_scrape_date',
                                             'most_recent_scrape_date',
                                             'no_pictures',
                                             'latitude',
                                             'longitude',
                                             'agent_name',
                                             'agent_number']]
    for index, row in housing_data_total_old.iterrows():
        if int(row['listing_id']) not in [int(id) for id in housing_data_total['listing_id'].tolist()]:
            housing_data_total = housing_data_total.append(row)
    housing_data_total['added/reduced_date'] = housing_data_total['added/reduced_date'].apply(lambda x:datetime.datetime.strptime(str(x), '%d/%m/%Y') if '/' in str(x) else (datetime.datetime.strptime(str(x),'%Y-%m-%d') if '-' in str(x) else None))
    housing_data_total['initial_scrape_date'] = housing_data_total['initial_scrape_date'].apply(lambda x:datetime.datetime.strptime(str(x), '%d/%m/%Y') if '/' in str(x) else (datetime.datetime.strptime(str(x),'%Y-%m-%d') if '-' in str(x) else None))
    housing_data_total['most_recent_scrape_date'] = housing_data_total['most_recent_scrape_date'].apply(lambda x:datetime.datetime.strptime(str(x), '%d/%m/%Y') if '/' in str(x) else (datetime.datetime.strptime(str(x),'%Y-%m-%d') if '-' in str(x) else None))
    housing_data_total = housing_data_total[~housing_data_total['property_link'].str.contains('commercial-property')]
    housing_data_total = housing_data_total[housing_data_total['no_rooms'] != 'Hotel room for sale']
    housing_data_total = housing_data_total.reset_index(drop=True)
    print('data formatted')
    return housing_data_total

def haversine(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in kilometers = 6371. Use 3956 for miles
    return c * r

def add_stations(df):
    print(datetime.datetime.strftime(datetime.datetime.today(),'%H:%M:%S') + ' - adding stations')
    tube_stations = pd.read_csv('C:/Users/ballinj/housing/london_stations.csv', sep=',', index_col='FID')
    distance_list,ids = [],[]
    for id,lat, long in list(zip(df['listing_id'].tolist(),df['latitude'].tolist(),df['longitude'].tolist())):
        distance_temp, distance_from_station_temp = [],[]
        for index, row in tube_stations.iterrows():
            distance_temp.append(haversine(long, lat, row['x'], row['y']))
            distance_from_station_temp.append(row['NAME'])
        zipped = zip(distance_temp,distance_from_station_temp)
        distance_list.append(list(sorted(zipped, key=lambda x: x[0])[:3]))
        ids.append(id)
    distance_dict = dict(zip(ids,distance_list))
    station_a_dist, station_b_dist, station_c_dist = [],[],[]
    station_a, station_b, station_c = [],[],[]
    for listing_id in df['listing_id'].tolist():
        station_a_dist.append(distance_dict[listing_id][0][0])
        station_b_dist.append(distance_dict[listing_id][1][0])
        station_c_dist.append(distance_dict[listing_id][2][0])
        station_a.append(distance_dict[listing_id][0][1])
        station_b.append(distance_dict[listing_id][1][1])
        station_c.append(distance_dict[listing_id][2][1])
    df['closest_station'] = station_a
    df['closest_station_dist (miles)'] = station_a_dist
    df['second_closest_station'] = station_b
    df['second_closest_station_dist (miles)'] = station_b_dist
    df['third_closest_station'] = station_c
    df['third_closest_station_dist (miles)'] = station_c_dist
    print('stations added')
    return df

def add_pubs_score(housing_df):
    
    def format_total_normalised(row):
        if row['quantity'] == 0:
            return df['total_normalised'].min() - 0.01
        else:
            return row['total_normalised']
    
    print(datetime.datetime.strftime(datetime.datetime.today(),'%H:%M:%S') + ' - adding pubs')
    relevant_pubs = pd.read_csv('C:/Users/ballinj/housing/pubs_data.csv', index_col=0)
    relevant_pubs['rating'] = relevant_pubs['rating'].replace('unknown',0)
    relevant_pubs['rating'] = pd.to_numeric(relevant_pubs['rating'])
    relevant_pubs['rating'] = relevant_pubs['rating'].replace(0,relevant_pubs[relevant_pubs['rating']!=0]['rating'].mean())
    property_pub_proximity_ratings, property_pub_average_ratings, property_pub_nos = [],[],[]
    for lat, long in list(zip(housing_df['latitude'],housing_df['longitude'])):
        pub_distances = []
        for index, row in relevant_pubs.iterrows():
            pub_distances.append(haversine(long, lat,row['long'],row['lat']))
        pub_details = list(zip(pub_distances, relevant_pubs['rating'].tolist()))
        near_pub_details = [row for row in pub_details if row[0] < 1]
        if len(near_pub_details) > 0:
            near_pub_average_rating = sum([row[1] for row in near_pub_details])/len(near_pub_details)
            near_pub_proximity_rating = sum([row[0] for row in near_pub_details])/len(near_pub_details)
        else:
            near_pub_proximity_rating = 0
            near_pub_average_rating = 0
        property_pub_proximity_ratings.append(near_pub_proximity_rating)
        property_pub_average_ratings.append(near_pub_average_rating)
        property_pub_nos.append(len(near_pub_details))

    list_df = list(zip(housing_df['listing_id'], property_pub_proximity_ratings, property_pub_nos, property_pub_average_ratings))
    df = pd.DataFrame(list_df, columns=['listing_id','proximity','quantity','quality'])
    df['proximity_normalised'] = df['proximity']-0.3 # here we presume that 300m is the optimal distance away from a pub
    df['proximity_normalised'] = df['proximity_normalised'].abs() # here we penalised houses that are within 300m of a pub by taking the modulus
    df['proximity_normalised'] = (df['proximity_normalised'] - df['proximity_normalised'].min())/(df['proximity_normalised'] - df['proximity_normalised'].min()).max()
    df['quantity_normalised'] = df['quantity']/df['quantity'].max()
    df['quality_normalised'] = (df['quality']-df[df['quality']!=0]['quality'].min())/(df['quality']-df[df['quality']!=0]['quality'].min()).max()
    df['quality_normalised'][df['quality_normalised']<0] = 0
    df['total_normalised'] = 0.5*df['quantity_normalised'] + 2*df['quality_normalised'] - df['proximity_normalised']
    df['total_normalised'] = df.apply(lambda row: format_total_normalised(row), axis=1)
    df['total_rank'] = df['total_normalised'].rank(ascending=False)
    housing_df = housing_df.drop(columns=['listing_id.1', 'proximity',
       'proximity_normalised', 'quality', 'quality_normalised', 'quantity',
       'quantity_normalised', 'total_normalised', 'total_rank'])
    housing_df = pd.concat([housing_df,df], axis=1)
    return housing_df
print('functions loaded')

functions loaded


In [None]:
start_time = str(datetime.datetime.now().time()) 
print(start_time)
district_dict = get_district_dict()
district_list = import_district_file('C:/Users/ballinj/housing/london_district_codes.csv')['district'].tolist()
housing_data_total = pd.DataFrame()
i = 1
for district in district_list:
    print(district)
    housing_soup = get_soup(index=0, region=district)
    housing_soup_data = format_data_soup(housing_soup, region=district)
    if district != 'CR0':
        housing_json = get_json(index=0, region=district)
        housing_json_data = format_data_json(housing_json)
    else:
        housing_json = get_CR0_json(index=0)
        housing_json_data = format_data_json(housing_json)
        housing_soup_data = format_missed_data(housing_soup_data, housing_json_data)
    housing_data_merge_df = housing_data_merge(housing_soup_data, housing_json_data)
    housing_data_merge_df['district'] = district
    housing_data_total = pd.concat([housing_data_total,housing_data_merge_df], ignore_index=True)
    time.sleep(3)
    print(district + ' complete')
    print(str(i) + ' out of ' + str(len(district_list)))
    print('\n')
    i += 1
housing_data_total_formatted = format_housing_data_total(housing_data_total)
housing_data_total_stations = add_stations(housing_data_total_formatted)
housing_data_total_pubs = add_pubs_score(housing_data_total_stations)
housing_data_total_pubs.to_csv('data/london/rightmove/properties_by_district_{}.csv'.format(str(today)), index=False)
print('PROCESS COMPLETE')
end_time = str(datetime.datetime.now().time()) 
print(end_time)
time_taken = datetime.datetime.strptime(end_time, '%H:%M:%S.%f') - datetime.datetime.strptime(start_time, '%H:%M:%S.%f')
time_taken = str(time_taken)
print('time taken: ' + time_taken)

19:40:23.646927
BR1
195 results found
obtaining soup...
json retrieved
json formatted
BR1 complete
1 out of 298


BR2
166 results found
obtaining soup...
json retrieved
json formatted
BR2 complete
2 out of 298


BR3
110 results found
obtaining soup...
json retrieved
json formatted
BR3 complete
3 out of 298


BR4
21 results found
obtaining soup...
json retrieved
json formatted
BR4 complete
4 out of 298


BR5
148 results found
obtaining soup...
json retrieved
json formatted
BR5 complete
5 out of 298


BR6
132 results found
obtaining soup...
json retrieved
json formatted
BR6 complete
6 out of 298


BR7
58 results found
obtaining soup...
json retrieved
json formatted
BR7 complete
7 out of 298


BR8
72 results found
obtaining soup...
json retrieved
json formatted
BR8 complete
8 out of 298


CM13
60 results found
obtaining soup...
json retrieved
json formatted
CM13 complete
9 out of 298


CM14
202 results found
obtaining soup...
json retrieved
json formatted
CM14 complete
10 out of 298


CR0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CR0 complete
11 out of 298


CR2
220 results found
obtaining soup...
json retrieved
json formatted
CR2 complete
12 out of 298


CR3
160 results found
obtaining soup...
json retrieved
json formatted
CR3 complete
13 out of 298


CR4
220 results found
obtaining soup...
json retrieved
json formatted
CR4 complete
14 out of 298


CR5
57 results found
obtaining soup...
json retrieved
json formatted
CR5 complete
15 out of 298


CR6
29 results found
obtaining soup...
json retrieved
json formatted
CR6 complete
16 out of 298


CR7
222 results found
obtaining soup...
json retrieved
json formatted
CR7 complete
17 out of 298


CR8
124 results found
obtaining soup...
json retrieved
json formatted
CR8 complete
18 out of 298


DA1
205 results found
obtaining soup...
json retrieved
json formatted
DA1 complete
19 out of 298


DA14
97 results found
obtaining soup...
json retrieved
json formatted
DA14 complete
20 out of 298


DA15
103 results found
obtaining soup...
json retrieved
json formatted
DA15 compl

In [None]:
fig, ax = plt.subplots(figsize=(16,9))

housing_data_total_pubs.groupby('district')['total_normalised'].mean().sort_values(ascending=False).plot.bar()

plt.show()

In [1]:
housing_data_total_pubs.to_csv('data/london/rightmove/properties_by_district_{}.csv'.format(str(today)), index=False)

NameError: name 'housing_data_total_pubs' is not defined

In [None]:
housing_data_