In [18]:
import os
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Suburb Cluster Analysis

Load files

In [65]:
sydney_suburbs = pd.read_csv(os.path.join('sydney_suburbs.csv'), index_col=0)
brisbane_suburbs = pd.read_csv(os.path.join('brisbane_suburbs.csv'), index_col=0)
melbourne_suburbs = pd.read_csv(os.path.join('melbourne_suburbs.csv'), index_col=0)
hobart_suburbs = pd.read_csv(os.path.join('hobart_suburbs.csv'), index_col=0)
canberra_suburbs = pd.read_csv(os.path.join('canberra_suburbs.csv'), index_col=0)
adelaide_suburbs = pd.read_csv(os.path.join('adelaide_suburbs.csv'), index_col=0)
darwin_suburbs = pd.read_csv(os.path.join('darwin_suburbs.csv'), index_col=0)
perth_suburbs = pd.read_csv(os.path.join('perth_suburbs.csv'), index_col=0)

top_venues_grouped = pd.read_csv(os.path.join('top_venues_grouped.csv'), index_col=0)
top_venues_sorted = pd.read_csv(os.path.join('top_venues_sorted.csv'), index_col=0)

### Extract suburb information area, density, distance

In [77]:
suburbs_df = pd.concat([sydney_suburbs,
                         brisbane_suburbs,
                         melbourne_suburbs,
                         hobart_suburbs,
                         canberra_suburbs,
                         adelaide_suburbs,
                         darwin_suburbs,
                         perth_suburbs])

suburbs_df = suburbs_df[['Suburb', 'Postcode', 'area', 'density', 'distance']]

# Merge suburb table with top venues
suburbs_df = suburbs_df.merge(top_venues_grouped, how = 'inner', on = ['Suburb', 'Postcode'])

# Add suburb_id column
suburbs_df['Suburb_id'] = np.arange(0, suburbs_df.shape[0])

# Move suburb_id column to the front
cols = list(suburbs_df.columns)
cols = [cols[-1]] + cols[:-1]
suburbs_df = suburbs_df[cols]

# Convert Postcode column to string
suburbs_df.Postcode = suburbs_df.Postcode.astype(str)
# Remoe '.0' from Postcode column
suburbs_df.Postcode = suburbs_df.Postcode.str.extract('([0-9][0-9][0-9][0-9])',expand=False)

suburbs_df.head()

Unnamed: 0,Suburb_id,Suburb,Postcode,area,density,distance,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,0,Abbotsbury,2176,4.833,879.991724,31.895053,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Abbotsford,2046,0.994,5405.432596,7.682393,0.0,0.0,0.0,0.0,...,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Acacia Gardens,2763,0.957,3968.652038,30.950468,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Agnes Banks,2753,15.723,57.940597,53.58922,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Airds,2560,2.353,1243.518912,42.977938,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# User input

In [209]:
user_input = [
              {'Suburb':'South Yarra', 'Postcode':'3141', 'rating':5},
              {'Suburb':'St Kilda', 'Postcode':'3182', 'rating':4},
              {'Suburb':'Hawthorn', 'Postcode':'3122', 'rating':3},
              {'Suburb':'Richmond', 'Postcode':'3121', 'rating':2},
              {'Suburb':'Fitzroy', 'Postcode':'3065', 'rating':1}
             ]

user_input = [
              {'Suburb':'Point Cook', 'Postcode':'3030', 'rating':1},
              {'Suburb':'Hoppers Crossing', 'Postcode':'3029', 'rating':2},
              {'Suburb':'Werribee', 'Postcode':'3030', 'rating':3},
              {'Suburb':'Seabrook', 'Postcode':'3028', 'rating':4},
              {'Suburb':'Altona', 'Postcode':'3018', 'rating':5}
             ]

output_city = 'Sydney'

print('You have chosen the following suburbs:')
for index, inputs in enumerate(user_input):
    print('{}. {}, {}'.format(index+1, inputs['Suburb'], inputs['Postcode']))
print('')
print('And would like to find similar suburbs in {}'.format(output_city))

You have chosen the following suburbs:
1. Point Cook, 3030
2. Hoppers Crossing, 3029
3. Werribee, 3030
4. Seabrook, 3028
5. Altona, 3018

And would like to find similar suburbs in Sydney


Convert user inputs to dataframe

In [210]:
# Convert to dataframe
input_suburbs = pd.DataFrame(user_input)
# rearrange columns
input_suburbs = input_suburbs[['Suburb', 'Postcode', 'rating']]
input_suburbs

Unnamed: 0,Suburb,Postcode,rating
0,Point Cook,3030,1
1,Hoppers Crossing,3029,2
2,Werribee,3030,3
3,Seabrook,3028,4
4,Altona,3018,5


# Suburb content-based recommendation for area, distance, density

Create suburbs matrix

In [211]:
# Drop suburb and postcode column for analysis
suburbs_info = suburbs_df[['area', 'density', 'distance']]

Standardise matrix

In [248]:
# Create the Scaler object
scaler = preprocessing.MinMaxScaler()

# Fit your data on the scaler object
suburbs_info_scaled = scaler.fit_transform(suburbs_info)

suburbs_info_scaled = pd.DataFrame(suburbs_info_scaled, columns=suburbs_info.columns)

# Add suburb and postcode labels to rows
suburbs_info_scaled.insert(0, 'Suburb_id', suburbs_df.Suburb_id)
suburbs_info_scaled.insert(1, 'Suburb', suburbs_df.Suburb)
suburbs_info_scaled.insert(2, 'Postcode', suburbs_df.Postcode)

In [249]:
suburbs_info_scaled.head(10)

Unnamed: 0,Suburb_id,Suburb,Postcode,area,density,distance
0,0,Abbotsbury,2176,0.016476,0.041004,0.378833
1,1,Abbotsford,2046,0.003202,0.251873,0.085151
2,2,Acacia Gardens,2763,0.003074,0.184925,0.367376
3,3,Agnes Banks,2753,0.054129,0.0027,0.641967
4,4,Airds,2560,0.007901,0.057943,0.51326
5,5,Alexandria,2015,0.011642,0.112075,0.045652
6,6,Alfords Point,2234,0.010439,0.046928,0.250711
7,7,Allambie Heights,2100,0.021766,0.051327,0.145897
8,8,Allawah,2218,0.001777,0.456836,0.165178
9,9,Ambarvale,2560,0.00909,0.127401,0.529717


Since the distribution of the data are heavily skewed, I will transform the data to either log scale or sqrt scale to obtain a normal distribution.

In [251]:
np.log(0)

  """Entry point for launching an IPython kernel.


-inf

# Suburb content-based recommendation for top local venues

# Recommended suburbs for location, distance, area, and top local venues