In [1]:
import requests
import pandas as pd
import time

In [2]:
# Root of the MyHospitals API; request URLs in this notebook are built from it.
BASE_URL = 'https://myhospitalsapi.aihw.gov.au//api/v1'

# Headers sent with every request: a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# Measure category used when listing the available measures.
measure_category_code = 'MYH-ED-TIME'

In [13]:
def create_table(measure_code: str, max_rows=5):
    """Build a table of values for one measure, labelled with readable names.

    Three API resources are fetched and joined:

    1. ``/measures/{measure_code}/data-items`` - one row per data item
       (reported measure code, reporting unit code, value).
    2. ``/reported-measures/{code}`` - requested once per distinct reported
       measure code found in step 1, to obtain its name.
    3. ``/reporting-units`` - all reporting units with their type, state,
       closed/private flags and coordinates.

    Parameters
    ----------
    measure_code : str
        Measure identifier, e.g. ``'MYH0036'``.
    max_rows : int or None, default 5
        Number of leading rows to return. The default reproduces the
        original five-row preview; pass ``None`` to get the whole table.

    Returns
    -------
    pandas.DataFrame
        Columns ``reporting_unit_name``, ``reported_measure_name``, ``value``.

    Raises
    ------
    requests.HTTPError
        If any API call answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not answer within the timeout.
    """
    request_timeout = 30  # seconds; without a timeout requests can block forever

    # One Session for all calls so the underlying connection is reused across
    # the 2 + (number of distinct reported measures) requests.
    with requests.Session() as session:

        def fetch_result(path):
            """GET ``BASE_URL + path`` and return the ``result`` member of the JSON body."""
            response = session.get(f'{BASE_URL}{path}', headers=headers, timeout=request_timeout)
            # Surface HTTP errors here rather than as a KeyError further down.
            response.raise_for_status()
            return response.json()['result']

        # CREATE VALUES TABLE
        values_df = pd.DataFrame(
            data=[
                [
                    item['reported_measure_code'],
                    item['reporting_unit_summary']['reporting_unit_code'],
                    item['value'],
                ]
                for item in fetch_result(f'/measures/{measure_code}/data-items')
            ],
            columns=['reported_measure_code', 'reporting_unit_code', 'value'],
        )

        # CREATE REPORTED MEASURES TABLE (one lookup per distinct code)
        reported_measures_df = pd.DataFrame(
            data=[
                [code, fetch_result(f'/reported-measures/{code}')['reported_measure_name']]
                for code in values_df['reported_measure_code'].unique()
            ],
            columns=['reported_measure_code', 'reported_measure_name'],
        )

        reporting_units = fetch_result('/reporting-units')

    # CREATE REPORTING UNITS TABLE
    reporting_unit_list = []
    for unit in reporting_units:
        # State = reporting unit code of the first STATE_MAPPING entry; None if absent.
        state = next(
            (
                mapped['mapped_reporting_unit']['reporting_unit_code']
                for mapped in unit['mapped_reporting_units']
                if mapped['map_type']['mapped_reporting_unit_code'] == "STATE_MAPPING"
            ),
            None,
        )
        unit_type = unit['reporting_unit_type']
        reporting_unit_list.append([
            unit['reporting_unit_code'],
            unit['reporting_unit_name'],
            unit_type['reporting_unit_type_code'],
            unit_type['reporting_unit_type_name'],
            state,
            unit['closed'],
            unit['private'],
            unit['latitude'],
            unit['longitude'],
        ])
    reporting_units_df = pd.DataFrame(
        data=reporting_unit_list,
        columns=[
            'reporting_unit_code', 'reporting_unit_name',
            'reporting_unit_type_code', 'reporting_unit_type_name',
            'state', 'closed', 'private', 'latitude', 'longitude',
        ],
    )

    # JOIN VALUES, REPORTED MEASURES AND REPORTING UNITS
    df_join = pd.merge(values_df, reported_measures_df, on='reported_measure_code', how='inner')
    df_join = pd.merge(df_join, reporting_units_df, on='reporting_unit_code', how='inner')

    df_select = df_join[['reporting_unit_name', 'reported_measure_name', 'value']]
    return df_select if max_rows is None else df_select.head(max_rows)

In [17]:
# Run the whole pipeline for a single measure code.
create_table(measure_code='MYH0036')
# Uncomment to benchmark the call:
# %timeit create_table('MYH0036')

2.73 s ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
'''
CREATE MEASURE TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/measure-categories/MYH-ED-TIME/measures
'''
# timeout: never block forever on a stalled connection.
# raise_for_status: report HTTP errors directly instead of a later KeyError.
measure_http_response = requests.get(
    f'{BASE_URL}/measure-categories/{measure_category_code}/measures',
    headers=headers,
    timeout=30,
)
measure_http_response.raise_for_status()
measure_response = measure_http_response.json()

# One [code, name] row per measure in the category.
measure_list = [
    [measure['measure_code'], measure['measure_name']]
    for measure in measure_response['result']
]
measure_df = pd.DataFrame(data=measure_list, columns=['measure_code', 'measure_name'])
measure_df = measure_df.set_index('measure_code')
measure_df.head()

Unnamed: 0_level_0,measure_name
measure_code,Unnamed: 1_level_1
MYH0005,Percentage of patients who depart the emergenc...
MYH0012,Number of patients presenting to the emergency...
MYH0013,Time until most patients (90%) departed the em...
MYH0036,Median time (50%) patients departed emergency ...


In [5]:
'''
CREATE VALUES TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/measures/MYH0036/data-items
'''
measure_code = 'MYH0036'

# timeout: never block forever on a stalled connection.
# raise_for_status: report HTTP errors directly instead of a later KeyError.
value_http_response = requests.get(
    f'{BASE_URL}/measures/{measure_code}/data-items',
    headers=headers,
    timeout=30,
)
value_http_response.raise_for_status()
value_response = value_http_response.json()

# One [reported measure code, reporting unit code, value] row per data item.
value_list = [
    [
        data_item['reported_measure_code'],
        data_item['reporting_unit_summary']['reporting_unit_code'],
        data_item['value'],
    ]
    for data_item in value_response['result']
]
values_df = pd.DataFrame(data=value_list, columns=['reported_measure_code', 'reporting_unit_code', 'value'])
values_df.head()

Unnamed: 0,reported_measure_code,reporting_unit_code,value
0,MYH-RM0298,H0014,355.0
1,MYH-RM0299,H0014,170.0
2,MYH-RM0300,H0014,204.0
3,MYH-RM0298,H0014,338.0
4,MYH-RM0299,H0014,173.0


In [7]:
'''
CREATE REPORTED MEASURES TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/reported-measures/MYH-RM0025
'''
reported_measure_code_list = list(values_df['reported_measure_code'].unique())

reported_measure_list = []
# One request per distinct code; a shared Session reuses the connection between them.
with requests.Session() as session:
    for reported_measure_code in reported_measure_code_list:
        reported_measure_http_response = session.get(
            f'{BASE_URL}/reported-measures/{reported_measure_code}',
            headers=headers,
            timeout=30,  # never block forever on a stalled connection
        )
        # Report HTTP errors directly instead of a later KeyError.
        reported_measure_http_response.raise_for_status()
        reported_measure_response = reported_measure_http_response.json()
        reported_measure_list.append(
            [reported_measure_code, reported_measure_response['result']['reported_measure_name']]
        )
reported_measures_df = pd.DataFrame(data=reported_measure_list, columns=['reported_measure_code', 'reported_measure_name'])
reported_measures_df.head()

Unnamed: 0,reported_measure_code,reported_measure_name
0,MYH-RM0298,Subsequently admitted patients
1,MYH-RM0299,Not subsequently admitted patients
2,MYH-RM0300,All patients


In [8]:
'''
CREATE REPORTING UNITS TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au/api/v1/reporting-units
'''
# timeout: never block forever on a stalled connection.
# raise_for_status: report HTTP errors directly instead of a later KeyError.
reporting_unit_http_response = requests.get(f'{BASE_URL}/reporting-units', headers=headers, timeout=30)
reporting_unit_http_response.raise_for_status()
reporting_unit_response = reporting_unit_http_response.json()

reporting_unit_list = []
for reporting_unit in reporting_unit_response['result']:
    # GET STATE: reporting unit code of the first STATE_MAPPING entry, None if there is none.
    state = next(
        (
            mapped_reporting_unit['mapped_reporting_unit']['reporting_unit_code']
            for mapped_reporting_unit in reporting_unit['mapped_reporting_units']
            if mapped_reporting_unit['map_type']['mapped_reporting_unit_code'] == "STATE_MAPPING"
        ),
        None,
    )

    # GET REPORTING UNIT INFO
    reporting_unit_type = reporting_unit['reporting_unit_type']
    reporting_unit_list.append([
        reporting_unit['reporting_unit_code'],
        reporting_unit['reporting_unit_name'],
        reporting_unit_type['reporting_unit_type_code'],
        reporting_unit_type['reporting_unit_type_name'],
        state,
        reporting_unit['closed'],
        reporting_unit['private'],
        reporting_unit['latitude'],
        reporting_unit['longitude'],
    ])

reporting_units_df = pd.DataFrame(
    data=reporting_unit_list,
    columns=[
        'reporting_unit_code', 'reporting_unit_name',
        'reporting_unit_type_code', 'reporting_unit_type_name',
        'state', 'closed', 'private', 'latitude', 'longitude',
    ],
)
reporting_units_df.head()

Unnamed: 0,reporting_unit_code,reporting_unit_name,reporting_unit_type_code,reporting_unit_type_name,state,closed,private,latitude,longitude
0,H0012,State Forensic Mental Health Service,H,Hospital,WA,False,False,-31.960937,115.788431
1,H0013,Justice Health Services,H,Hospital,NSW,False,False,-33.96907,151.243206
2,H0014,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759
3,H0015,Sydney Children's Hospital,H,Hospital,NSW,False,False,-33.917179,151.238334
4,H0016,Sacred Heart Health Service,H,Hospital,NSW,False,False,-33.880525,151.219237


In [9]:
'''
JOIN VALUES TABLE, REPORTED MEASURES TABLE AND REPORTING UNITS TABLE
'''
# Attach measure names first, then reporting-unit details; both are inner joins,
# so rows without a match on either key are dropped.
df_join = (
    values_df
    .merge(reported_measures_df, how='inner', on='reported_measure_code')
    .merge(reporting_units_df, how='inner', on='reporting_unit_code')
)
df_join.head()

Unnamed: 0,reported_measure_code,reporting_unit_code,value,reported_measure_name,reporting_unit_name,reporting_unit_type_code,reporting_unit_type_name,state,closed,private,latitude,longitude
0,MYH-RM0298,H0014,355.0,Subsequently admitted patients,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759
1,MYH-RM0298,H0014,338.0,Subsequently admitted patients,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759
2,MYH-RM0298,H0014,314.0,Subsequently admitted patients,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759
3,MYH-RM0298,H0014,256.0,Subsequently admitted patients,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759
4,MYH-RM0298,H0014,263.0,Subsequently admitted patients,The Children's Hospital at Westmead,H,Hospital,NSW,False,False,-33.801554,150.991759


In [12]:
'''
SELECT COLUMNS
'''
# Keep only the human-readable labels and the measured value.
output_columns = ['reporting_unit_name', 'reported_measure_name', 'value']
df_select = df_join[output_columns]
df_select.head()

Unnamed: 0,reporting_unit_name,reported_measure_name,value
0,The Children's Hospital at Westmead,Subsequently admitted patients,355.0
1,The Children's Hospital at Westmead,Subsequently admitted patients,338.0
2,The Children's Hospital at Westmead,Subsequently admitted patients,314.0
3,The Children's Hospital at Westmead,Subsequently admitted patients,256.0
4,The Children's Hospital at Westmead,Subsequently admitted patients,263.0


In [1]:
# Print the PYTHONPATH environment variable as seen by the shell started from this notebook.
!echo $PYTHONPATH

/home/kelvin/spark/spark-3.0.3-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip:/home/kelvin/spark/spark-3.0.3-bin-hadoop3.2/python/:


In [2]:
# SPARK
import pyspark
from pyspark.sql import SparkSession

# Obtain a session on the `local[*]` master with application name 'test';
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

ModuleNotFoundError: No module named 'pyspark'

In [18]:


# Convert the pandas result into a Spark DataFrame and keep a reference to it
# (the conversion result must be assigned; Spark methods do not exist on pandas frames).
# NOTE(review): assumes `df_select` is the pandas frame meant to be exported - confirm.
spark_df = spark.createDataFrame(df_select)

# Check df schema
spark_df.printSchema()

# NOTE(review): partition count of 24 carried over unchanged; tune to the data size.
spark_df_partitioned = spark_df.repartition(24)

# NOTE(review): assumes one output directory per measure code was intended - confirm.
# mode('overwrite') keeps the cell re-runnable; the default mode fails if the path exists.
spark_df_partitioned.write.mode('overwrite').parquet(f'data/{measure_code}')

In [None]:
'''TO DO
FILTER BY
reporting_unit_code
reported_measure_code

GET YEAR

Check other measures e.g. number of presentations

Load pandas df to spark df and partition + cluster
Create a udf

'''

In [None]:
# CONVERT PANDAS DF TO SPARK DF

# PARTITION AND CLUSTER

# LOAD INTO GCS

# READ FILES FROM GCS WITH PYSPARK AND LOAD TO BIGQUERY