In [1]:
import itertools
import json
import os

import pandas as pd
import geopandas as gpd
import ijson
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Assigning tweets to an SA2

## Loading SA2 data

In [2]:
# Location of the 2016 SA2 boundary shapefile archive
sa2_data = "../data/1270055001_sa2_2016_aust_shape.zip"

# Load every SA2 boundary, keeping only the rows whose greater capital city
# area is Greater Melbourne
sa2_df = gpd.read_file(sa2_data).loc[
    lambda regions: regions['GCC_NAME16'] == 'Greater Melbourne'
]
sa2_df

Unnamed: 0,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,STE_NAME16,AREASQKM16,geometry
682,206011105,21105,Brunswick,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,5.1425,"POLYGON ((144.94974 -37.76277, 144.95003 -37.7..."
683,206011106,21106,Brunswick East,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,2.1680,"POLYGON ((144.97340 -37.76204, 144.97335 -37.7..."
684,206011107,21107,Brunswick West,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,3.1795,"POLYGON ((144.93407 -37.75969, 144.93405 -37.7..."
685,206011108,21108,Coburg,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,6.9346,"POLYGON ((144.94847 -37.73951, 144.94878 -37.7..."
686,206011109,21109,Pascoe Vale South,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,2.9887,"POLYGON ((144.93264 -37.74226, 144.93251 -37.7..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,214021381,21381,Mount Eliza,21402,Mornington Peninsula,214,Mornington Peninsula,2GMEL,Greater Melbourne,2,Victoria,23.2077,"POLYGON ((145.07201 -38.18710, 145.07216 -38.1..."
987,214021382,21382,Mount Martha,21402,Mornington Peninsula,214,Mornington Peninsula,2GMEL,Greater Melbourne,2,Victoria,30.2094,"POLYGON ((145.02584 -38.25242, 145.02616 -38.2..."
988,214021383,21383,Point Nepean,21402,Mornington Peninsula,214,Mornington Peninsula,2GMEL,Greater Melbourne,2,Victoria,67.1875,"MULTIPOLYGON (((144.73141 -38.35025, 144.73147..."
989,214021384,21384,Rosebud - McCrae,21402,Mornington Peninsula,214,Mornington Peninsula,2GMEL,Greater Melbourne,2,Victoria,23.7957,"POLYGON ((144.87372 -38.36268, 144.87388 -38.3..."


## Determining tweet location

In [3]:
from shapely.geometry import shape, Point

# Point lookups only need each region's SA2 code and its boundary geometry
sa2_main16_df = sa2_df.loc[:, ['SA2_MAIN16', 'geometry']]


# return the SA2 main code for 2016 boundaries
def get_sa2_main16(coordinates):
    """Return the 2016 SA2 main code of the region that holds a coordinate.

    Parameters
    ----------
    coordinates : dict
        Mapping with 'longitude' and 'latitude' keys, expressed in the same
        coordinate reference system as the geometries in ``sa2_main16_df``.

    Returns
    -------
    The 'SA2_MAIN16' value of the first region (in row order) whose geometry
    contains or touches the point, or ``None`` when the point lies outside
    every region in ``sa2_main16_df``.
    """
    point = Point(coordinates['longitude'], coordinates['latitude'])
    # intersects() is true for interior and boundary points alike, so it
    # already covers every case contains() would. Calling it on the geometry
    # column tests all regions at once instead of a row-by-row apply.
    hits = sa2_main16_df['geometry'].intersects(point)
    matching_codes = sa2_main16_df.loc[hits, 'SA2_MAIN16']
    if matching_codes.empty:
        # The point is outside every known region (e.g. outside Greater
        # Melbourne); report "no region" instead of raising IndexError.
        return None
    return matching_codes.iloc[0]


In [4]:
# Take a point that lies within the first SA2 region to try the lookup on
test_point = sa2_main16_df.geometry.iloc[0].representative_point()
lon, lat = test_point.x, test_point.y

In [5]:
# Look up the SA2 code for the test point taken from the first region
get_sa2_main16({"longitude": lon, "latitude": lat})

'206011105'

In [6]:
# Check the lookup on a single region: a representative point of the row at
# `index` should map back to that same row's SA2 code
index = 50
test_point = sa2_main16_df.geometry.iloc[index].representative_point()
lon, lat = test_point.x, test_point.y
out_sa2_main16 = get_sa2_main16({"longitude": lon, "latitude": lat})
print(f"Output: {out_sa2_main16}")
print(f"Expected: {sa2_main16_df['SA2_MAIN16'].iloc[index]}")

Output: 207011155
Expected: 207011155


# Language Spoken at home 

In [7]:
# Forward slashes keep the path free of invalid backslash escapes ('\d',
# '\S', '\s') and work on every OS, matching the SA2 shapefile path above
language_sa2_data = '../data/SA2-P13_Language_Spoken_at_Home_by_Sex-Census_2016.json/sa2_p13_lang_spoken_at_home_by_sex_census_2016-6394559813018131150.json'

# Stream just the `properties` object of each feature rather than loading the
# whole document; ijson expects the file to be opened in binary mode
with open(language_sa2_data, 'rb') as f:
    lang_dicts = list(ijson.items(f, 'features.item.properties'))

# One row per feature
lang_df = pd.DataFrame(lang_dicts)
lang_df
        

Unnamed: 0,sa2_name16,sa2_main16,tot_p,lang_spoken_home_ns_p,spks_eng_on_p
0,Collingwood,206071141,8619,1073,5112
1,Prahran - Windsor,206061136,19768,1864,13354
2,Hadfield,210031440,5613,291,2741
3,Sunbury - South,210041241,26335,1224,22708
4,Sunbury,210041240,12576,539,10933
...,...,...,...,...,...
304,Flinders,214021378,5269,357,4697
305,Hastings - Somers,214021379,21797,1270,19743
306,Mornington,214021380,23344,1066,20838
307,Mount Eliza,214021381,17475,568,15546


- determine statistics
- `lang_spoken_home_ns_p`: the number of people who did not state which language they speak at home.
- Assume these non-respondents are randomly distributed, i.e. they speak a language other than English at the same rate as the people who did respond.
- The number of people who did state their language is then `num_reported = tot_p - lang_spoken_home_ns_p`.
- The proportion of people who speak a language other than English is then `1 - spks_eng_on_p / num_reported`.


In [8]:
# Derive the response count and the language proportions for each SA2
lang_df = lang_df.assign(
    # people who stated which language they speak at home
    num_reported=lambda df: df['tot_p'] - df['lang_spoken_home_ns_p'],
    # share of those respondents who speak only English
    prop_spk_english_only=lambda df: df['spks_eng_on_p'] / df['num_reported'],
    # the remainder speak some other language
    prop_spk_other_lang=lambda df: 1 - df['prop_spk_english_only'],
)

Let's look at the areas with the lowest proportion of people who speak a language other than English at home (i.e. the highest proportion of English-only speakers):

In [9]:
# Ten SA2s with the smallest share of other-language speakers
lang_df.sort_values('prop_spk_other_lang', ascending=True).iloc[:10]

Unnamed: 0,sa2_name16,sa2_main16,tot_p,lang_spoken_home_ns_p,spks_eng_on_p,num_reported,prop_spk_english_only,prop_spk_other_lang
219,Bunyip - Garfield,212011288,8467,526,7679,7941,0.967007,0.032993
305,Hastings - Somers,214021379,21797,1270,19743,20527,0.961806,0.038194
117,Panton Hill - St Andrews,209031212,5003,369,4452,4634,0.960725,0.039275
153,Riddells Creek,210021234,3882,121,3609,3761,0.959585,0.040415
122,Hurstbridge,209031210,3377,106,3134,3271,0.958117,0.041883
308,Somerville,214021385,17709,852,16150,16857,0.958059,0.041941
217,Yarra Valley,211051286,15901,955,14317,14946,0.957915,0.042085
304,Flinders,214021378,5269,357,4697,4912,0.95623,0.04377
203,Healesville - Yarra Glen,211051276,13661,812,12260,12849,0.95416,0.04584
154,Romsey,210021235,9197,642,8160,8555,0.953828,0.046172


This seems reasonable. Now let's look at the areas with the highest proportion of people who speak a language other than English at home:

In [10]:
# Ten SA2s with the largest share of other-language speakers; rows with any
# missing value are dropped after sorting so they cannot appear in the tail
lang_df.sort_values('prop_spk_other_lang', ascending=True).dropna(how='any').iloc[-10:]

Unnamed: 0,sa2_name16,sa2_main16,tot_p,lang_spoken_home_ns_p,spks_eng_on_p,num_reported,prop_spk_english_only,prop_spk_other_lang
233,Dandenong,212041311,31369,2840,7677,28529,0.269095,0.730905
251,Cairnlea,213011329,9732,377,2517,9355,0.269054,0.730946
232,Clayton South,212041310,12857,601,3276,12256,0.267298,0.732702
53,Melbourne,206041122,57734,10183,12602,47551,0.265021,0.734979
166,Campbellfield - Coolaroo,210051243,16201,1221,3863,14980,0.257877,0.742123
162,Meadow Heights,210051247,14839,938,3492,13901,0.251205,0.748795
240,Springvale South,212041318,12690,524,2885,12166,0.237136,0.762864
264,Sunshine North,213011337,11739,597,2616,11142,0.234787,0.765213
262,St Albans - South,213011335,17796,1491,3427,16305,0.210181,0.789819
239,Springvale,212041317,21958,1238,3809,20720,0.183832,0.816168


This also seems reasonable

# SEIFA



In [11]:
# Forward slashes keep the path portable and free of backslash escapes,
# matching the SA2 shapefile path used earlier in the notebook
seifa_sa2_data = '../data/ABS_-_Socio-Economic_Indexes_for_Areas__SEIFA__-_The_Index_of_Relative_Socio-economic_Advantage_and_Disadvantage__SA2__2016.json/sa2_seifa_irsad_2016-8012956430373350930.json'

# Stream just the `properties` object of each feature; ijson expects the file
# to be opened in binary mode
# NOTE(review): ijson yields decimal.Decimal for non-integer JSON numbers by
# default, so fractional columns may load as object dtype - confirm before
# doing arithmetic on them
with open(seifa_sa2_data, 'rb') as f:
    seifa_dicts = list(ijson.items(f, 'features.item.properties'))

# One row per feature; end on the frame so the cell displays it
seifa_df = pd.DataFrame(seifa_dicts)
seifa_df

Unnamed: 0,usual_res_pop,state_rank,ur_pop_wthout_sa1_level_score_pc,state,state_percentile,state_decile,max_sa1_score_in_area,sa2_name16,national_percentile,irsad_score,national_decile,national_rank,sa2_main16,min_sa1_score_in_area
0,19160,142,0.05,VIC,32,4,1040,Altona Meadows,37,966,4,799,213021342,875
1,15053,164,0.13,VIC,37,4,1075,Cranbourne West,41,976,5,891,212031304,889
2,19714,415,0.00,VIC,92,10,1162,Prahran - Windsor,90,1106,9,1965,206061136,841
3,8513,215,0.00,VIC,48,5,1181,Collingwood,51,999,6,1114,206071141,516
4,5604,95,0.07,VIC,21,3,996,Hadfield,28,947,3,604,210031440,850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,17888,435,0.00,VIC,96,10,1185,Mount Eliza,94,1121,10,2033,214021381,953
301,18548,366,0.02,VIC,81,9,1155,Mount Martha,82,1076,9,1773,214021382,992
302,17123,195,0.02,VIC,43,5,1139,Point Nepean,48,992,5,1042,214021383,885
303,20943,72,4.10,VIC,16,2,1072,Rosebud - McCrae,23,934,3,489,214021384,801


In [16]:
# Order the SA2s from the highest state rank down to the lowest
seifa_df.sort_values(['state_rank'], ascending=[False])

Unnamed: 0,usual_res_pop,state_rank,ur_pop_wthout_sa1_level_score_pc,state,state_percentile,state_decile,max_sa1_score_in_area,sa2_name16,national_percentile,irsad_score,national_decile,national_rank,sa2_main16,min_sa1_score_in_area
20,4962,454,0.10,VIC,100,10,1199,East Melbourne,99,1160,10,2151,206041119,1081
65,16326,453,0.00,VIC,100,10,1201,Glen Iris - East,99,1154,10,2142,207011150,1105
69,16319,452,0.00,VIC,100,10,1180,Surrey Hills (West) - Canterbury,98,1147,10,2127,207011155,1106
54,13050,451,0.00,VIC,100,10,1196,Toorak,97,1144,10,2119,206061138,1098
85,22695,450,0.00,VIC,99,10,1189,Brighton (Vic.),97,1144,10,2118,208011169,937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,14842,7,0.00,VIC,2,1,905,Meadow Heights,4,846,1,71,210051247,734
215,11306,6,0.03,VIC,2,1,915,Doveton,3,844,1,64,212021295,775
27,58,5,0.00,VIC,2,1,838,Braeside,3,838,1,58,208031184,838
155,16122,3,0.16,VIC,1,1,887,Campbellfield - Coolaroo,3,828,1,49,210051243,754


# Perform tweet analysis

- determine tweet SA2 code
- perform sentiment analysis
- join with language and seifa 
- draw some plots