# Get Suburbs

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the Domain liveability article and collect the text of every <h3>
# heading found on the page.
website_text = requests.get(
    'https://www.domain.com.au/liveable-sydney/sydneys-most-liveable-suburbs-2019/sydneys-569-suburbs-ranked-for-liveability-2019-903130/'
).text
soup = BeautifulSoup(website_text)
sub_list = [heading.getText() for heading in soup.find_all('h3')]

In [None]:
# Keep only headings shaped like "<rank>. <suburb>" and split each one into
# its integer rank and its cleaned suburb name.
filtered_list = []
lr_list = []
f_sub_list = []
for heading in sub_list:
    # Split on the FIRST period only, so a suburb name that itself contains a
    # period is kept whole; headings without a period are not rankings.
    rank_text, separator, name_text = heading.partition('.')
    if not separator or not rank_text.isdigit():
        continue
    filtered_list.append(heading)
    lr_list.append(int(rank_text))
    # Drop the non-breaking spaces carried over from the HTML, then trim.
    f_sub_list.append(name_text.replace('\xa0', '').strip())


sub_dict = {'liveability_ranking': lr_list, 'suburb': f_sub_list}

df = pd.DataFrame.from_dict(sub_dict)



In [None]:
# Plain Python list of the suburb names, in ranking-table order.
suburb_list = df['suburb'].tolist()

In [None]:
import requests
import json

In [None]:
# Prefix of the location-suggestion endpoint; the suburb name being looked up
# is appended directly after "prefixText=".
base_url = 'https://location-api.domain.com.au/locations/suggestLocations?pageSize=15&prefixText='
def get_ref(json_text, a):
    """Return the ``NameSlug`` of the first suggestion whose postcode starts with '2'.

    Args:
        json_text: Decoded list of suggestion dicts.
        a: Index of the first suggestion to consider.

    Returns:
        The ``NameSlug`` value of the first matching suggestion at or after
        index ``a``.

    Raises:
        IndexError: If no suggestion from index ``a`` onward has a postcode
            beginning with '2'.
    """
    for suggestion in json_text[a:]:
        postcode = suggestion.get('Postcode')
        # The postcode may be absent or null; only a string can be tested.
        if isinstance(postcode, str) and postcode.startswith('2'):
            return suggestion['NameSlug']
    raise IndexError(
        f"no suggestion with a postcode starting with '2' at or after index {a}"
    )

# Look every suburb name up in the suggestion endpoint and keep the slug that
# get_ref() selects for it, in the same order as the ranking table.
ref_list = []

for suburb_name in suburb_list:
    response_body = requests.get(base_url + suburb_name).text
    suggestions = json.loads(response_body)
    ref_list.append(get_ref(suggestions, 0))


df['sub_ref'] = ref_list
    

In [None]:
# The postcode is taken as the last four characters of the slug.
df['postcode'] = df['sub_ref'].str.slice(start=-4)

In [None]:
# Save the ranking table (rank, suburb, slug, postcode) as CSV.
df.to_csv('../data/final_data/liveability_suburbs.csv')

In [None]:
from sqlalchemy import create_engine
# Replace the liveability_ranking table with the current frame.
# NOTE(review): the database password is hard-coded in the connection URL.
name = 'liveability_ranking'
engine = create_engine('postgresql://postgres:password@this_postgres')
df.to_sql(name=name, con=engine, if_exists='replace')

# Get Suburb Profile

In [None]:
import psycopg2 as pg2
import pandas as pd

# Open one autocommitting psycopg2 connection that select() below reuses.
connection_settings = {
    'host': 'this_postgres',
    'user': 'postgres',
    'password': 'password',
    'database': 'postgres',
}
con = pg2.connect(**connection_settings)
con.autocommit = True
cur = con.cursor()


def select(sql):
    """Run ``sql`` on the module-level connection and return a DataFrame."""
    result = pd.read_sql(sql, con)
    return result

In [None]:
# Every suburb slug stored in the liveability_ranking table.
sql = '''select sub_ref from liveability_ranking'''

In [None]:
# Load the slugs and show them as a plain list.
df_ref = select(sql)
ref_list = df_ref['sub_ref'].tolist()
ref_list

## Suburb Profile Scraper

In [None]:
import pandas as pd
import os.path
import requests

def write_file(file_name, content):
    """Create or overwrite ``file_name`` with ``content``, encoded as UTF-8."""
    with open(file_name, mode='w', encoding='utf-8') as target:
        target.write(content)


def append_file(file_name, content):
    """Add ``content`` to the end of ``file_name`` (UTF-8), creating it if needed."""
    with open(file_name, mode='a', encoding='utf-8') as target:
        target.write(content)

def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text


def get_or_download(url,file_name):
    """Return the page at ``url``, using ``file_name`` as an on-disk cache.

    Args:
        url: Address to fetch when no cached copy exists.
        file_name: Path of the cached copy.

    Returns:
        The cached text when ``file_name`` exists. Otherwise the response
        body; it is written to ``file_name`` only for a successful response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if os.path.isfile(file_name):
        print (f'{url} already exists as {file_name}')
        return read_file(file_name)

    # A timeout keeps one unresponsive request from stalling the whole crawl.
    response = requests.get(url, timeout=30)
    content = response.text
    if not response.ok:
        # Never cache an error page: once on disk it would be treated as the
        # real page on every later run and the URL would never be retried.
        print (f'{url} returned HTTP {response.status_code}; not cached')
        return content

    write_file(file_name,content)
    print (f'{url} downloaded to {file_name}')
    return read_file(file_name)


In [None]:
# Fetch (or reuse the cached copy of) the profile page of every suburb slug.
base_url = 'https://www.domain.com.au/suburb-profile/'
for slug in ref_list:
    profile_url = base_url + slug
    cache_path = f'../suburb_profile/{slug}.html'
    get_or_download(profile_url, cache_path)


## Suburb Profile Parser

In [None]:
import glob
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text
    
def get_profile_feature(sect,feature_name):
    """Return the text of the first inner ``div`` of the feature tile named ``feature_name``.

    The first ``div.css-48zwbo`` element whose text contains ``feature_name``
    is used; ``None`` is returned when no tile matches.
    """
    tiles = sect.select('div.css-48zwbo')
    matching_tiles = (tile for tile in tiles if feature_name in tile.getText())
    chosen = next(matching_tiles, None)
    if chosen is None:
        return None
    return chosen.select('div')[0].getText()
        
def get_ratios(sect):
    """Collect the left/right label-value pairs of the demographics ratio rows.

    Args:
        sect: Parsed demographics section exposing ``select``.

    Returns:
        dict mapping each label text to the first integer found in its value
        text. The dict is empty when the ``div.css-16g4kqh`` container is
        missing; a side whose label span, value span, or digits are missing
        is skipped instead of raising.
    """
    containers = sect.select('div.css-16g4kqh')
    if not containers:
        return {}

    ratio_dict = {}
    for item in containers[0].select('div.css-14hea9r'):
        # Left is processed before right, matching the on-page order.
        for side in ('left', 'right'):
            label = item.select(f'span[data-testid="{side}-text"]')
            value = item.select(f'span[data-testid="{side}-value"]')
            if not label or not value:
                continue
            digits = re.findall(r'\d+', value[0].getText())
            if digits:
                ratio_dict[label[0].getText()] = int(digits[0])
    return ratio_dict
    

In [None]:
# Parse every cached suburb-profile page into two record lists:
#   tabs       - one dict per data row of the market-data table
#   tabs_ratio - one dict per suburb holding its demographic figures
tabs = []
tabs_ratio = []
for sub_file in glob.glob('../suburb_profile/*.html'):
    soup = BeautifulSoup(read_file(sub_file))
    table = soup.find("table", {"aria-describedby":"market-data-context"})
    if not table:
        # Pages without a market-data table contribute nothing.
        continue

    # Take the file name from the END of the path and accept either path
    # separator, instead of relying on a fixed directory depth.
    sub_ref = sub_file.replace('\\', '/').split('/')[-1].split('.')[0]
    # Cut at the '-nsw-' marker rather than at the first 'nsw', so a suburb
    # whose own name contains those letters is not truncated.
    suburb = sub_ref.rsplit('-nsw-', 1)[0].replace('-',' ')

    sect = soup.select('section#demographics')
    if sect:
        sect = sect[0]
        population_t = get_profile_feature(sect,'Population')
        population = int(population_t.replace(',','')) if population_t else None

        # The age text is handled as a single figure or as "<min> to <max>".
        # Both bounds are recomputed for every page so a value can never
        # carry over from the previously parsed suburb.
        average_age = get_profile_feature(sect,'Average age') or ''
        age_numbers = [re.findall(r'\d+', part) for part in average_age.split('to')[:2]]
        min_age = int(age_numbers[0][0]) if age_numbers[0] else None
        max_age = int(age_numbers[1][0]) if len(age_numbers) > 1 and age_numbers[1] else None

        ratio_dictionary = get_ratios(sect)
        ratio_dictionary['population'] = population
        ratio_dictionary['min_age'] = min_age
        ratio_dictionary['max_age'] = max_age
        ratio_dictionary['suburb'] = suburb
        ratio_dictionary['sub_ref'] = sub_ref
        tabs_ratio.append(ratio_dictionary)

    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Rows without <td> cells carry no data.
            continue

        beds_num = int(cells[0].getText())

        # Median price: first number in the text, scaled by an 'm' or 'k'
        # suffix. No number at all means no price, whatever letters appear.
        m_price_text = cells[2].getText()
        price_numbers = re.findall(r'[\d.]+', m_price_text)
        if not price_numbers:
            price = None
        else:
            if 'm' in m_price_text:
                multiplier = 1000000
            elif 'k' in m_price_text:
                multiplier = 1000
            else:
                multiplier = 1
            price = float(price_numbers[0]) * multiplier

        days_numbers = re.findall(r'\d+', cells[3].getText())
        days = int(days_numbers[0]) if days_numbers else None
        c_rate_numbers = re.findall(r'\d+', cells[4].getText())
        c_rate = int(c_rate_numbers[0]) if c_rate_numbers else None
        sold_last_12 = int(cells[5].getText())

        tabs.append({
            'bedrooms':beds_num,
            'type':cells[1].getText(),
            'median_price':price,
            'avg_days_on_market':days,
            'clearance_rate':c_rate,
            'sold_last_12_months':sold_last_12,
            'Suburb':suburb,
            'sub_ref':sub_ref
        })


df = pd.DataFrame(tabs)

df_ratios = pd.DataFrame(tabs_ratio)


In [None]:
# Save both parsed tables as CSV, then replace their Postgres tables.
df.to_csv('../data/final_data/suburbs profile.csv')
df_ratios.to_csv('../data/final_data/suburbs ratios.csv')

from sqlalchemy import create_engine,types

engine = create_engine('postgresql://postgres:password@this_postgres')

# Explicit SQL column types for the market-data table.
profile_column_types = {
    'bedrooms': types.INTEGER(),
    'median_price': types.INTEGER(),
    'avg_days_on_market': types.INTEGER(),
    'clearance_rate': types.INTEGER(),
    'sold_last_12_months': types.INTEGER(),
    'type': types.VARCHAR(),
    'Suburb': types.VARCHAR(),
    'sub_ref': types.VARCHAR(),
}
name = 'suburb_profile'
df.to_sql(name, engine, if_exists='replace', dtype=profile_column_types)

name = 'suburb_ratios'
df_ratios.to_sql(name, engine, if_exists='replace')

In [None]:
df

In [None]:
df_ratios

## All Suburbs List

In [None]:
import psycopg2 as pg2
import pandas as pd
# Open one autocommitting psycopg2 connection that select() below reuses.
connection_settings = {
    'host': 'this_postgres',
    'user': 'postgres',
    'password': 'password',
    'database': 'postgres',
}
con = pg2.connect(**connection_settings)
con.autocommit = True
cur = con.cursor()


def select(sql):
    """Run ``sql`` on the module-level connection and return a DataFrame."""
    result = pd.read_sql(sql, con)
    return result

In [None]:
# sql1 = '''select sum(sold_last_12_months) as Sold,"Suburb","sub_ref","type"
# from suburb_profile
# where type = 'House'
# group by "Suburb","sub_ref",type
# order by Sold desc
# limit 50'''


# Total sales over the last 12 months per suburb, summed over every row of
# suburb_profile for that suburb, highest total first.
sql = '''select sum(sold_last_12_months) as Sold,"Suburb","sub_ref"
from suburb_profile
group by "Suburb","sub_ref"
order by Sold desc'''

In [None]:
# df1 = select(sql1)
# Run the per-suburb totals query.
df = select(sql)

In [None]:
from sqlalchemy import create_engine,types
# Replace the chosen_suburbs_all table with the per-suburb totals.
name = 'chosen_suburbs_all'
engine = create_engine('postgresql://postgres:password@this_postgres')
df.to_sql(name=name, con=engine, if_exists='replace')
# name1 = 'chosen_suburbs_50' 
# df1.to_sql(name, engine, if_exists='replace')

In [None]:
# df1.to_csv('../data/final_data/50 chosen suburbs.csv')
# Save the per-suburb totals as CSV.
df.to_csv('../data/final_data/all chosen suburbs.csv') 

# Get Chosen Suburb Sold/On Sale data

## Chosen Suburb Sold Scraper

In [None]:
import requests
from bs4 import BeautifulSoup
import os.path

In [None]:
def write_file(file_name, content):
    """Create or overwrite ``file_name`` with ``content``, encoded as UTF-8.

    The encoding is explicit so the file can always be read back by
    ``read_file``, which decodes as UTF-8; relying on the platform default
    breaks non-ASCII text on systems whose default is not UTF-8.
    """
    with open(file_name,'w',encoding='utf-8') as f:
        f.write(content)


def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text


In [None]:
def get_next_page_url(page):
    """Return the ``href`` of the paginator button labelled 'next page'.

    ``None`` is returned when the page has no such button.
    """
    buttons = page.select('a[data-testid="paginator-navigation-button"]')
    next_buttons = (button for button in buttons if button.getText() == 'next page')
    target = next(next_buttons, None)
    return None if target is None else target.attrs['href']

In [None]:
def get_or_download(url,file_name):
    """Return the page at ``url``, using ``file_name`` as an on-disk cache.

    Args:
        url: Address to fetch when no cached copy exists.
        file_name: Path of the cached copy.

    Returns:
        The cached text when ``file_name`` exists. Otherwise the response
        body; it is written to ``file_name`` only for a successful response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if os.path.isfile(file_name):
        print (f'{url} already exists as {file_name}')
        return read_file(file_name)

    # A timeout keeps one unresponsive request from stalling the whole crawl.
    response = requests.get(url, timeout=30)
    content = response.text
    if not response.ok:
        # Never cache an error page: once on disk it would be treated as the
        # real page on every later run and the URL would never be retried.
        print (f'{url} returned HTTP {response.status_code}; not cached')
        return content

    write_file(file_name,content)
    print (f'{url} downloaded to {file_name}')
    return read_file(file_name)


In [None]:

def get_urls(base_url,page_url,get_next_page_url):
    """Follow 'next page' links from ``page_url`` and return every page URL visited.

    Args:
        base_url: Prefix joined in front of each page path.
        page_url: Path of the first page.
        get_next_page_url: Callable taking the parsed page and returning the
            next page's path, or a falsy value on the last page.

    Returns:
        set of full URLs (``base_url`` + path), starting page included.
    """
    collected = set()
    current_path = page_url
    while True:
        full_url = base_url + current_path
        listing_page = BeautifulSoup(requests.get(full_url).text)
        collected.add(full_url)
        current_path = get_next_page_url(listing_page)
        if not current_path:
            return collected



In [None]:
ref_list 

In [None]:
# Site root that the listing-page paths are appended to.
base_url = 'https://www.domain.com.au'

In [None]:
# Cache every sold-listings result page of every suburb slug.
for ref in ref_list:
    print(ref)
    start_page = f'/sold-listings/{ref}/?excludepricewithheld=1&ssubs=0&page=1'
    page_urls = get_urls(base_url, start_page, get_next_page_url)
    for page_url in page_urls:
        # Whatever follows the last '=' in the URL is used as the page number.
        page_number = page_url.rpartition('=')[2]
        get_or_download(page_url, f'../domain_sold/{ref}-{page_number}.html')

## Chosen Suburb Sold Parser

In [None]:
import glob
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime

In [None]:
def write_file(file_name, content):
    """Create or overwrite ``file_name`` with ``content``, encoded as UTF-8.

    The encoding is explicit so the file can always be read back by
    ``read_file``, which decodes as UTF-8; relying on the platform default
    breaks non-ASCII text on systems whose default is not UTF-8.
    """
    with open(file_name,'w',encoding='utf-8') as f:
        f.write(content)


def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text
    
def ifisempty(v):
    """Return the first element of ``v``, or '' when ``v`` equals the empty list."""
    return v[0] if v != [] else ''

In [None]:
def month_text_to_num(m):
    """Convert an English month name or three-letter abbreviation to 'MM'.

    Matching ignores case. Text that is not a recognised month is returned
    unchanged.
    """
    # Position in this tuple (1-based) is the month number.
    month_aliases = (
        ('jan', 'january'),
        ('feb', 'february'),
        ('mar', 'march'),
        ('apr', 'april'),
        ('may',),
        ('jun', 'june'),
        ('jul', 'july'),
        ('aug', 'august'),
        ('sep', 'september'),
        ('oct', 'october'),
        ('nov', 'november'),
        ('dec', 'december'),
    )
    lowered = m.lower()
    for number, aliases in enumerate(month_aliases, start=1):
        if lowered in aliases:
            return f'{number:02d}'
    return m

In [None]:
def _first_int(pattern, text):
    """Return group 1 of the first ``pattern`` match in ``text`` as an int, or None.

    Thousands-separator commas inside the captured text are removed first.
    """
    found = re.search(pattern, text)
    return int(found.group(1).replace(',', '')) if found else None


# Parse every cached sold-listings page into one record per listing card.
tabs = []
for file_name in glob.glob('../domain_sold/*.html'):
    print(f'running:{file_name}')
    page = BeautifulSoup(read_file(file_name))
    house_cards = page.select('div[data-testid^="listing-card-wrapper"]')
    for card in house_cards:
        price_t = card.select('p[data-testid="listing-card-price"]')[0].getText() or ''
        # First run of digits/commas in the price text; None when there is none.
        price_match = re.search(r'[\d,]+', price_t)
        price_digits = price_match.group().replace(',', '') if price_match else ''
        price = int(price_digits) if price_digits else None

        address = card.select('a.address')[0].find(itemprop="name").get("content")
        suburb = card.select('span[data-testid="address-line2"]')[0].getText() or ''
        sub_ref = suburb.replace(' ','-').lower()

        sold_date_tag = card.select('div[data-testid="listing-card-tag"]')[0].getText() or ''
        sold_type = 'Auction' if 'Sold at auction' in sold_date_tag else 'Private Treaty'

        # The last three words of the tag are read as day, month, year. A tag
        # with fewer words, or words that do not form a date, gives None.
        sold_date = None
        date_words = sold_date_tag.split()[-3:]
        if len(date_words) == 3:
            day, month, year = date_words
            try:
                sold_date = datetime.strptime(
                    f'{year}-{month_text_to_num(month)}-{day}', "%Y-%m-%d"
                ).date()
            except ValueError:
                sold_date = None

        features = card.select('div[data-testid="listing-card-features-wrapper"]')[0].getText()

        # \d+ (not a single \d) so counts of 10 or more are read in full.
        beds = _first_int(r'(\d+) Bed', features)
        baths = _first_int(r'(\d+) Bath', features)
        parks = _first_int(r'(\d+) Park', features)
        # Land size may carry thousands separators, e.g. "1,200m²".
        lands = _first_int(r'(\d[\d,]*)m²', features)

        property_type = card.select('span[class="css-693528"]')[0].getText() or ''

        # Reset per card so a card without a link cannot inherit the previous
        # card's URL and id. With several links, the last one wins.
        url = None
        listing_id = None
        for link in card.select('link[itemprop="url"]'):
            url = link.attrs['href']
            id_match = re.match(r'\d+', url.split('-')[-1])
            listing_id = int(id_match.group()) if id_match else None

        tabs.append({
            'sold_price':price,
            'sold_price_desc':price_t,
            'address':address,
            'suburb':suburb,
            'sub_ref':sub_ref,
            'sold_type':sold_type,
            'sold_date':sold_date,
            'bedrooms':beds,
            'bathrooms':baths,
            'parkings':parks,
            'landsize_m²':lands,
            'property_type':property_type,
            'url':url,
            'listing_id':listing_id

        })
             


In [None]:
cols = ['listing_id','address','suburb','sub_ref','sold_date','sold_type','property_type','sold_price','sold_price_desc',  'bedrooms', 'bathrooms', 'parkings', 'landsize_m²', 'url']
# Giving the column list to the constructor fixes the column order and also
# produces an empty, correctly shaped frame when no listing was parsed;
# indexing an empty frame by these columns afterwards would raise KeyError.
df = pd.DataFrame(tabs, columns=cols)


In [None]:
# Save the sold-listings table as CSV.
df.to_csv('../data/final_data/sold_history_all_subs.csv')

In [None]:
from sqlalchemy import create_engine,types

# Replace the sold_history_all_subs table with the parsed listings.
name = 'sold_history_all_subs'
engine = create_engine('postgresql://postgres:password@this_postgres')
df.to_sql(name=name, con=engine, if_exists='replace')

## Chosen Suburb On Sale Scraper

In [None]:
import requests
from bs4 import BeautifulSoup
import os.path

In [None]:
def write_file(file_name, content):
    """Create or overwrite ``file_name`` with ``content``, encoded as UTF-8.

    The encoding is explicit so the file can always be read back by
    ``read_file``, which decodes as UTF-8; relying on the platform default
    breaks non-ASCII text on systems whose default is not UTF-8.
    """
    with open(file_name,'w',encoding='utf-8') as f:
        f.write(content)


def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text


In [None]:
def get_next_page_url(page):
    """Return the ``href`` of the paginator button labelled 'next page'.

    ``None`` is returned when the page has no such button.
    """
    buttons = page.select('a[data-testid="paginator-navigation-button"]')
    next_buttons = (button for button in buttons if button.getText() == 'next page')
    target = next(next_buttons, None)
    return None if target is None else target.attrs['href']

In [None]:
def get_or_download(url,file_name):
    """Return the page at ``url``, using ``file_name`` as an on-disk cache.

    Args:
        url: Address to fetch when no cached copy exists.
        file_name: Path of the cached copy.

    Returns:
        The cached text when ``file_name`` exists. Otherwise the response
        body; it is written to ``file_name`` only for a successful response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if os.path.isfile(file_name):
        print (f'{url} already exists as {file_name}')
        return read_file(file_name)

    # A timeout keeps one unresponsive request from stalling the whole crawl.
    response = requests.get(url, timeout=30)
    content = response.text
    if not response.ok:
        # Never cache an error page: once on disk it would be treated as the
        # real page on every later run and the URL would never be retried.
        print (f'{url} returned HTTP {response.status_code}; not cached')
        return content

    write_file(file_name,content)
    print (f'{url} downloaded to {file_name}')
    return read_file(file_name)


In [None]:

def get_urls(base_url,page_url,get_next_page_url):
    """Follow 'next page' links from ``page_url`` and return every page URL visited.

    Args:
        base_url: Prefix joined in front of each page path.
        page_url: Path of the first page.
        get_next_page_url: Callable taking the parsed page and returning the
            next page's path, or a falsy value on the last page.

    Returns:
        set of full URLs (``base_url`` + path), starting page included.
    """
    collected = set()
    current_path = page_url
    while True:
        full_url = base_url + current_path
        listing_page = BeautifulSoup(requests.get(full_url).text)
        collected.add(full_url)
        current_path = get_next_page_url(listing_page)
        if not current_path:
            return collected



In [None]:
import psycopg2 as pg2
import pandas as pd
# Open one autocommitting psycopg2 connection that select() below reuses.
connection_settings = {
    'host': 'this_postgres',
    'user': 'postgres',
    'password': 'password',
    'database': 'postgres',
}
con = pg2.connect(**connection_settings)
con.autocommit = True
cur = con.cursor()


def select(sql):
    """Run ``sql`` on the module-level connection and return a DataFrame."""
    result = pd.read_sql(sql, con)
    return result

In [None]:
# Slugs of every suburb stored in chosen_suburbs_all.
sql = '''select sub_ref from chosen_suburbs_all'''
df = select(sql)

In [None]:
# Plain Python list of the chosen suburb slugs.
ref_list = df['sub_ref'].tolist()

In [None]:
# Site root that the listing-page paths are appended to.
base_url = 'https://www.domain.com.au'

In [None]:
# Cache every for-sale result page of every chosen suburb slug.
for ref in ref_list:
    print(ref)
    start_page = f'/sale/{ref}/?excludeunderoffer=1&ssubs=0&page=1'
    page_urls = get_urls(base_url, start_page, get_next_page_url)
    for page_url in page_urls:
        # Whatever follows the last '=' in the URL is used as the page number.
        page_number = page_url.rpartition('=')[2]
        get_or_download(page_url, f'../domain_on_sale/{ref}-{page_number}.html')

## Chosen Suburb On Sale Parser

In [None]:
import glob
from bs4 import BeautifulSoup
import re
import pandas as pd

In [None]:

def write_file(file_name, content):
    """Create or overwrite ``file_name`` with ``content``, encoded as UTF-8.

    The encoding is explicit so the file can always be read back by
    ``read_file``, which decodes as UTF-8; relying on the platform default
    breaks non-ASCII text on systems whose default is not UTF-8.
    """
    with open(file_name,'w',encoding='utf-8') as f:
        f.write(content)


def read_file(file_name):
    """Return the whole content of ``file_name`` decoded as UTF-8."""
    with open(file_name, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text
    
    
def get_num(price):
    """Join every run of digits in ``price`` and return the result as an int.

    Raises ValueError when ``price`` contains no digit at all.
    """
    digit_runs = re.findall(r'\d+', price)
    joined = ''.join(digit_runs)
    return int(joined)

In [None]:

def _first_int(pattern, text):
    """Return group 1 of the first ``pattern`` match in ``text`` as an int, or None.

    Thousands-separator commas inside the captured text are removed first.
    """
    found = re.search(pattern, text)
    return int(found.group(1).replace(',', '')) if found else None


# Parse every cached for-sale page into one record per listing card.
tabs = []

for file_name in glob.glob('../domain_on_sale/*.html'):
    print(file_name)
    page = BeautifulSoup(read_file(file_name))
    house_cards = page.select('div[data-testid^="listing-card-wrapper"]')
    for card in house_cards:
        price_t = card.select('p[data-testid="listing-card-price"]')[0].getText() or ''

        # A price text that is nothing but "$" plus digits/commas is a fixed
        # price. Otherwise the amounts of 5+ characters found in the text
        # give a minimum (one amount) or a minimum and maximum (two amounts);
        # any other count leaves both unset.
        fixed_match = re.findall(r'^\$[\d,]+$',price_t)
        fixed_price = get_num(fixed_match[0]) if fixed_match else None
        min_price = None
        max_price = None
        if fixed_price is None:
            amounts = re.findall(r'[\d,]{5,}',price_t)
            if len(amounts) in (1, 2):
                min_price = get_num(amounts[0])
            if len(amounts) == 2:
                max_price = get_num(amounts[1])

        address = card.select('h2[data-testid="address-wrapper"]')[0].getText() or ''
        suburb = card.select('span[data-testid="address-line2"]')[0].getText() or ''
        sub_ref = suburb.replace(' ','-').lower()
        features = card.select('div[data-testid="listing-card-features-wrapper"]')[0].getText()

        # \d+ (not a single \d) so counts of 10 or more are read in full.
        beds = _first_int(r'(\d+) Bed', features)
        baths = _first_int(r'(\d+) Bath', features)
        parks = _first_int(r'(\d+) Park', features)
        # Land size may carry thousands separators, e.g. "1,200m²".
        lands = _first_int(r'(\d[\d,]*)m²', features)

        # The property type is taken from the last word of the features
        # text, with any 'Parking' text removed from it.
        feature_words = features.split()
        property_type = feature_words[-1].replace('Parking','') if feature_words else ''

        # Reset per card so a card without a link cannot inherit the previous
        # card's URL and id. With several links, the last one wins.
        url = None
        listing_id = None
        for link in card.select('link[itemprop="url"]'):
            url = link.attrs['href']
            id_match = re.match(r'\d+', url.split('-')[-1])
            listing_id = int(id_match.group()) if id_match else None

        tabs.append({
            'fixed_price':fixed_price,
            'min_price':min_price,
            'max_price':max_price,
            'address':address,
            'suburb':suburb,
            'sub_ref':sub_ref,
            'bedrooms':beds,
            'bathrooms':baths,
            'parkings':parks,
            'landsize_m²':lands,
            'property_type':property_type,
            'url':url,
            'listing_id':listing_id

        })
                            


In [None]:
cols = ['listing_id','address', 'suburb', 'sub_ref','property_type', 'fixed_price','min_price','max_price', 'bedrooms', 'bathrooms', 'parkings', 'landsize_m²', 'url']
# Giving the column list to the constructor fixes the column order and also
# produces an empty, correctly shaped frame when no listing was parsed;
# indexing an empty frame by these columns afterwards would raise KeyError.
df = pd.DataFrame(tabs, columns=cols)

# Listings whose id could not be read from the URL are dropped.
df = df[df.listing_id.notnull()]



In [None]:
# Save the for-sale listings table as CSV.
df.to_csv('../data/final_data/on_sale_properties_all_subs.csv')

In [None]:
from sqlalchemy import create_engine,types

# Replace the on_sale_properties_all_subs table; fixed_price is forced to an
# integer column.
name = 'on_sale_properties_all_subs'
engine = create_engine('postgresql://postgres:password@this_postgres')
column_types = {'fixed_price': types.INTEGER()}
df.to_sql(name, engine, if_exists='replace', dtype=column_types)

# Haversine (lat & long) data

In [None]:
import requests
import json
import pandas as pd
from haversine import haversine, Unit

In [None]:
# Download the suburbs JSON and load the records under its "data" key.
response = requests.get(
    'https://raw.githubusercontent.com/michalsn/australian-suburbs/master/data/suburbs.json'
).text

data = json.loads(response).get('data')

df = pd.DataFrame.from_records(data)

In [None]:
# Keep the rows whose postcode lies in [2000, 2800).
postcode_in_range = (df.postcode >= 2000) & (df.postcode < 2800)
sydney_df = df[postcode_in_range]

In [None]:
sydney_df

## Get Distance to CBD

In [None]:
# One dict per suburb row, so extra fields can be added record by record.
syd_subs_data = sydney_df.to_dict(orient='records')

In [None]:
# Reference point for the CBD distance, stored as (longitude, latitude).
syd_cbd_coordinates = (151.2073,-33.8708)

In [None]:
# haversine() takes its points as (latitude, longitude). The CBD constant is
# stored as (longitude, latitude), so swap it once before the loop.
cbd_lat_lng = (syd_cbd_coordinates[1], syd_cbd_coordinates[0])

# Add a slug and the great-circle distance to the CBD to every suburb record.
for sub in syd_subs_data:
    sub_ref = sub['suburb'].lower().replace(' ','-') +'-'+sub['state'].lower()+'-'+ str(sub['postcode'])
    distance_to_cbd = haversine((sub['lat'],sub['lng']),cbd_lat_lng)
    sub['distance_to_cbd'] = round(distance_to_cbd,2)
    sub['sub_ref'] = sub_ref

In [None]:
# Rebuild the frame from the records, now carrying distance_to_cbd and sub_ref.
sydney_df = pd.DataFrame.from_records(syd_subs_data)

In [None]:
# Save the suburb coordinates and CBD distances as CSV.
sydney_df.to_csv('../data/final_data/sydney_suburbs_lat_lng_list.csv')

In [None]:
from sqlalchemy import create_engine,types

# Replace the syd_subs_lat_lng table with the suburb coordinates frame.
name = 'syd_subs_lat_lng'
engine = create_engine('postgresql://postgres:password@this_postgres')
sydney_df.to_sql(name=name, con=engine, if_exists='replace')

## Surrounding Suburbs

In [None]:
import pandas as pd
from haversine import haversine, Unit

In [None]:
import psycopg2 as pg2
import pandas as pd

# Open one autocommitting psycopg2 connection that select() below reuses.
connection_settings = {
    'host': 'this_postgres',
    'user': 'postgres',
    'password': 'password',
    'database': 'postgres',
}
con = pg2.connect(**connection_settings)
con.autocommit = True
cur = con.cursor()


def select(sql):
    """Run ``sql`` on the module-level connection and return a DataFrame."""
    result = pd.read_sql(sql, con)
    return result

In [None]:
# Every column of every row in syd_subs_lat_lng.
sql = '''select * from syd_subs_lat_lng'''

In [None]:
# Load the suburb coordinates table.
df = select(sql)

In [None]:
# Distinct suburb names as a list (going through a set drops duplicates).
subs = list(set(df.suburb))
subs

In [None]:
# For every suburb name, list the suburbs inside a fixed lat/lng box around
# it, together with the great-circle distance to each.
syd_subs_data = df.to_dict('records')
final_data = []
for sub in subs:
    # Exact name match: a substring test can pick a different suburb whose
    # longer name merely contains this one. The first matching row is used.
    subs_recommended = [s for s in syd_subs_data if s['suburb'] == sub][0]
    centre_lat = float(subs_recommended['lat'])
    centre_lng = float(subs_recommended['lng'])
    # haversine() takes its points as (latitude, longitude).
    coordinates = (centre_lat, centre_lng)

    for s in syd_subs_data:
        lat = float(s['lat'])
        lng = float(s['lng'])
        # Cheap bounding-box filter around the selected suburb.
        # NOTE(review): 0.09 degrees of latitude is roughly 10 km, yet 0.09 is
        # applied to longitude and 0.103 to latitude - confirm the two
        # offsets are not swapped.
        inside_box = (centre_lng - 0.09 < lng < centre_lng + 0.09
                      and centre_lat - 0.103 < lat < centre_lat + 0.103)
        if not inside_box:
            continue
        distance = round(haversine(coordinates, (lat, lng)), 2)
        final_data.append((sub, subs_recommended['sub_ref'], s['suburb'], s['sub_ref'], distance))


df = pd.DataFrame(final_data,columns=['selected_suburb','selected_suburb_ref', 'surrounding_suburbs','surrounding_suburb_ref', 'distance'])

df.to_csv('../data/final_data/surrounding_suburbs.csv')


In [None]:
from sqlalchemy import create_engine
# Replace the surrounding_suburbs table with the computed pairs.
name = 'surrounding_suburbs'
engine = create_engine('postgresql://postgres:password@this_postgres')
df.to_sql(name=name, con=engine, if_exists='replace')