In [3]:
# DATA2001 Week 9 Tutorial
# Material last updated: 26 Apr 2023
# Note: this notebook was designed with the Roboto Condensed font, which can be installed here: https://www.1001fonts.com/roboto-condensed-font.html

from IPython.display import HTML
# Inject the tutorial's custom stylesheet (fonts, heading colours, submit-button look) into the notebook page.
# The HTML object is the cell's last expression, so the notebook renders it and the <style> rules take effect.
# NOTE(review): `font-style:regular` in the h4:before rule is not a valid CSS value (valid: normal/italic/oblique);
# `normal` was probably intended — confirm before changing the string.
HTML('''
    <style> body {font-family: "Roboto Condensed Light", "Roboto Condensed";} h2 {padding: 10px 12px; background-color: #E64626; position: static; color: #ffffff; font-size: 40px;} .text_cell_render p { font-size: 15px; } .text_cell_render h1 { font-size: 30px; } h1 {padding: 10px 12px; background-color: #E64626; color: #ffffff; font-size: 40px;} .text_cell_render h3 { padding: 10px 12px; background-color: #0148A4; position: static; color: #ffffff; font-size: 20px;} h4:before{ 
    content: "@"; font-family:"Wingdings"; font-style:regular; margin-right: 4px;} .text_cell_render h4 {padding: 8px; font-family: "Roboto Condensed Light"; position: static; font-style: italic; background-color: #FFB800; color: #ffffff; font-size: 18px; text-align: center; border-radius: 5px;}input[type=submit] {background-color: #E64626; border: solid; border-color: #734036; color: white; padding: 8px 16px; text-decoration: none; margin: 4px 2px; cursor: pointer; border-radius: 20px;}</style>
''')

In [4]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

In [5]:
# Load every raw input used by the notebook. Files are read in the same order
# as before; they are only grouped by which reader handles them
# (GeoPandas for shapefiles, pandas for delimited text files).

# SA2 shapefile.
regions = gpd.read_file('SA2/SA2_2021_AUST_GDA2020.shp')

# Delimited text tables: businesses, stops and polling places.
business, stops, polling_places = map(pd.read_csv, (
    'Businesses.csv',
    'Stops.txt',
    'PollingPlaces2019.csv',
))

# Catchment shapefiles (primary, secondary, future).
primary, secondary, future = map(gpd.read_file, (
    'catchments/catchments_primary.shp',
    'catchments/catchments_secondary.shp',
    'catchments/catchments_future.shp',
))

# Remaining delimited text tables: population and income.
population, income = map(pd.read_csv, (
    'Population.csv',
    'Income.csv',
))

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `business`.
business.describe()

Unnamed: 0,sa2_code,0_to_50k_businesses,50k_to_200k_businesses,200k_to_2m_businesses,2m_to_5m_businesses,5m_to_10m_businesses,10m_or_more_businesses,total_businesses
count,12217.0,12217.0,12217.0,12217.0,12217.0,12217.0,12217.0,12217.0
mean,114958700.0,18.82287,22.797659,23.555947,2.98011,1.089711,1.282639,70.540313
std,8810935.0,51.385349,43.099939,60.411508,14.196956,6.613522,15.953875,175.595935
min,101021000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,108011200.0,3.0,3.0,3.0,0.0,0.0,0.0,8.0
50%,116011300.0,8.0,10.0,10.0,0.0,0.0,0.0,33.0
75%,122021400.0,20.0,26.0,26.0,3.0,0.0,0.0,80.0
max,199999500.0,3589.0,1680.0,3782.0,811.0,458.0,1504.0,10125.0


In [7]:
# Preview the first 10 rows of the business table.
business.head(10)

Unnamed: 0,industry_code,industry_name,sa2_code,sa2_name,0_to_50k_businesses,50k_to_200k_businesses,200k_to_2m_businesses,2m_to_5m_businesses,5m_to_10m_businesses,10m_or_more_businesses,total_businesses
0,A,"Agriculture, Forestry and Fishing",101021007,Braidwood,136,92,63,4,0,0,296
1,A,"Agriculture, Forestry and Fishing",101021008,Karabar,6,3,0,0,0,0,9
2,A,"Agriculture, Forestry and Fishing",101021009,Queanbeyan,6,4,3,0,0,3,15
3,A,"Agriculture, Forestry and Fishing",101021010,Queanbeyan - East,0,3,0,0,0,0,3
4,A,"Agriculture, Forestry and Fishing",101021012,Queanbeyan West - Jerrabomberra,7,4,5,0,0,0,16
5,A,"Agriculture, Forestry and Fishing",101021610,Googong,0,3,0,0,0,0,3
6,A,"Agriculture, Forestry and Fishing",101021611,Queanbeyan Surrounds,182,80,60,5,0,0,327
7,A,"Agriculture, Forestry and Fishing",101031013,Bombala,65,92,81,6,0,0,246
8,A,"Agriculture, Forestry and Fishing",101031014,Cooma,28,30,36,3,0,3,96
9,A,"Agriculture, Forestry and Fishing",101031015,Cooma Surrounds,91,84,73,0,0,0,251


In [8]:
# Report the (rows, columns) dimensions of the business table.
print("Shape of the data:", business.shape)

Shape of the data: (12217, 11)


In [9]:
# True if at least one cell anywhere in `business` is missing; False means the table has no nulls.
business.isnull().values.any()

False

The check returns `False`, so the business table contains no null values.

In [10]:
# Count how many rows belong to each industry code.
print(business.groupby('industry_code').size())

industry_code
A    643
B    643
C    643
D    643
E    643
F    643
G    643
H    643
I    643
J    643
K    643
L    643
M    643
N    643
O    643
P    643
Q    643
R    643
S    643
dtype: int64


As we can see, every industry category appears the same number of times in our dataset (643 rows each).

In [11]:
# Inspect the dtype pandas assigned to each column of `business`.
business.dtypes

industry_code             object
industry_name             object
sa2_code                   int64
sa2_name                  object
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object

In [12]:
# Store the text columns with pandas' dedicated 'string' dtype instead of the
# generic 'object' dtype, then display the resulting column dtypes.
business = business.astype({
    text_column: 'string'
    for text_column in ('industry_code', 'industry_name', 'sa2_name')
})
business.dtypes

industry_code             string
industry_name             string
sa2_code                   int64
sa2_name                  string
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object

In [13]:
# Read the income table into `Income` and preview its first five rows.
# NOTE(review): 'Income.csv' is also read earlier into `income`; the two names hold
# separate copies of the same file — consider keeping a single variable.
Income = pd.read_csv('Income.csv')
Income.head()

Unnamed: 0,sa2_code,sa2_name,earners,median_age,median_income,mean_income
0,101021007,Braidwood,2426,50,44246,61745
1,101021008,Karabar,5128,42,62946,67345
2,101021009,Queanbeyan,6778,39,61724,67276
3,101021010,Queanbeyan - East,3360,40,64010,71770
4,101021011,Queanbeyan Region,13502,44,74042,85303


In [21]:
# Columns of `Income` that the following cells convert to numeric (float64) values.
numeric_columns = ['median_age', 'median_income', 'mean_income', 'earners']

In [22]:
# Attempt a strict float64 cast one column at a time. A column containing a
# value that cannot be parsed as a number is reported and left unchanged,
# while the columns that do convert are replaced in a fresh copy of `Income`.
for numeric_column in numeric_columns:
    try:
        Income = Income.assign(
            **{numeric_column: Income[numeric_column].astype('float64')}
        )
    except ValueError:
        print("Could not convert column", numeric_column)

Could not convert column earners


In [23]:
# Coerce every column listed in `numeric_columns` in a single step:
# entries that cannot be parsed as numbers are replaced by NaN.
Income = Income.assign(**{
    column: pd.to_numeric(Income[column], errors='coerce')
    for column in numeric_columns
})

# Count the missing values per column after the coercion.
Income.isna().sum()

sa2_code         0
sa2_name         0
earners          6
median_age       6
median_income    6
mean_income      6
dtype: int64

In [24]:
# The preceding `pd.to_numeric(..., errors='coerce')` pass leaves every column in
# `numeric_columns` with a numeric dtype, so a float64 cast can no longer meet an
# unparseable value. A single vectorised cast therefore replaces the copy-pasted
# per-column try/except loop (whose `except ValueError` branch was unreachable)
# and guarantees a uniform float64 dtype across these columns.
Income = Income.astype({column: 'float64' for column in numeric_columns})

In [25]:
# Re-check the per-column missing-value counts after the float64 cast.
Income.isna().sum()

sa2_code         0
sa2_name         0
earners          6
median_age       6
median_income    6
mean_income      6
dtype: int64

In [16]:
# Read the population table into `Population` and preview its first five rows.
# NOTE(review): 'Population.csv' is also read earlier into `population`; the two names hold
# separate copies of the same file — consider keeping a single variable.
Population = pd.read_csv('Population.csv')
Population.head()

Unnamed: 0,sa2_code,sa2_name,0-4_people,5-9_people,10-14_people,15-19_people,20-24_people,25-29_people,30-34_people,35-39_people,...,45-49_people,50-54_people,55-59_people,60-64_people,65-69_people,70-74_people,75-79_people,80-84_people,85-and-over_people,total_people
0,102011028,Avoca Beach - Copacabana,424,522,623,552,386,222,306,416,...,572,602,570,520,464,369,226,142,70,7530
1,102011029,Box Head - MacMasters Beach,511,666,702,592,461,347,420,535,...,749,749,794,895,863,925,603,331,264,11052
2,102011030,Calga - Kulnura,200,225,258,278,274,227,214,286,...,325,436,422,397,327,264,190,100,75,4748
3,102011031,Erina - Green Point,683,804,880,838,661,502,587,757,...,859,882,901,930,917,1065,976,773,1028,14803
4,102011032,Gosford - Springfield,1164,1044,1084,1072,1499,1864,1750,1520,...,1330,1241,1377,1285,1166,949,664,476,537,21346


In [17]:
# Number of distinct SA2 names in the business table
# (dropna=False keeps a missing value, if present, counted as one distinct entry).
print(business['sa2_name'].nunique(dropna=False))

643
