In [None]:
import requests
import pandas as pd
import time
import os

In [None]:
# Root of the AIHW MyHospitals REST API (v1); endpoint paths are appended to it.
# A single slash separates host and path, so request URLs carry no empty path segment.
BASE_URL = 'https://myhospitalsapi.aihw.gov.au/api/v1'
# Browser-like User-Agent passed along with the API requests.
# NOTE(review): presumably needed because the API rejects the default python-requests agent — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
# Measure category used when listing measures.
measure_category_code = 'MYH-ED-TIME'

In [None]:
# Seconds to wait on a silent connection before a request is abandoned.
_REQUEST_TIMEOUT_SECONDS = 60


def _fetch_result(path: str):
    """Return the ``result`` member of the JSON body served at ``BASE_URL + path``.

    Raises ``requests.HTTPError`` on a 4xx/5xx answer, so an API error surfaces
    as itself rather than as a later ``KeyError`` on ``'result'``.
    """
    response = requests.get(f'{BASE_URL}{path}', headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()['result']


def _state_code(mapped_reporting_units):
    """Return the reporting-unit code of the first STATE_MAPPING entry, or None if there is none."""
    return next(
        (
            mapped['mapped_reporting_unit']['reporting_unit_code']
            for mapped in mapped_reporting_units
            if mapped['map_type']['mapped_reporting_unit_code'] == "STATE_MAPPING"
        ),
        None,
    )


def create_tables(measure_category_code: str, measure_code: str):
    """Download four lookup/fact tables from the API and return them as DataFrames.

    Args:
        measure_category_code: Category whose measures are listed
            (``/measure-categories/{code}/measures``).
        measure_code: Measure whose data items are downloaded
            (``/measures/{code}/data-items``).

    Returns:
        A list ``[measures_df, values_df, reported_measures_df, reporting_units_df]``.
        Each frame carries a ``.name`` attribute ('measures', 'values',
        'reported_measures', 'reporting_units').

    Raises:
        requests.HTTPError: if any endpoint answers with an error status.
        requests.exceptions.Timeout: if an endpoint stops responding.
    """
    # MEASURES TABLE: one row per measure in the category.
    measures_df = pd.DataFrame(
        data=[
            [measure['measure_code'], measure['measure_name']]
            for measure in _fetch_result(f'/measure-categories/{measure_category_code}/measures')
        ],
        columns=['measure_code', 'measure_name'],
    )
    # `.name` is a plain Python attribute (pandas does not carry it over to derived frames).
    measures_df.name = 'measures'

    # VALUES TABLE: one row per data item of the requested measure.
    values_df = pd.DataFrame(
        data=[
            [item['reported_measure_code'], item['reporting_unit_summary']['reporting_unit_code'], item['value']]
            for item in _fetch_result(f'/measures/{measure_code}/data-items')
        ],
        columns=['reported_measure_code', 'reporting_unit_code', 'value'],
    )
    values_df.name = 'values'

    # REPORTED MEASURES TABLE: one request per distinct code seen in the values table.
    reported_measure_list = []
    for reported_measure_code in list(values_df['reported_measure_code'].unique()):
        reported_measure = _fetch_result(f'/reported-measures/{reported_measure_code}')
        reported_measure_list.append([reported_measure_code, reported_measure['reported_measure_name']])
    reported_measures_df = pd.DataFrame(
        data=reported_measure_list,
        columns=['reported_measure_code', 'reported_measure_name'],
    )
    reported_measures_df.name = 'reported_measures'

    # REPORTING UNITS TABLE: every reporting unit, with its state taken from the STATE_MAPPING entry.
    reporting_unit_list = [
        [
            unit['reporting_unit_code'],
            unit['reporting_unit_name'],
            unit['reporting_unit_type']['reporting_unit_type_code'],
            unit['reporting_unit_type']['reporting_unit_type_name'],
            _state_code(unit['mapped_reporting_units']),
            unit['closed'],
            unit['private'],
            unit['latitude'],
            unit['longitude'],
        ]
        for unit in _fetch_result('/reporting-units')
    ]
    reporting_units_df = pd.DataFrame(
        data=reporting_unit_list,
        columns=['reporting_unit_code', 'reporting_unit_name', 'reporting_unit_type_code', 'reporting_unit_type_name', 'state', 'closed', 'private', 'latitude', 'longitude'],
    )
    reporting_units_df.name = 'reporting_units'

    # The joined/selected frame built here previously was never returned, so it is no longer computed.
    return [measures_df, values_df, reported_measures_df, reporting_units_df]

In [None]:
# Smoke test: index 1 of the list returned by create_tables is the values table.
create_tables('MYH-ED-TIME', 'MYH0036')[1].head()
# %timeit create_tables('MYH-ED-TIME', 'MYH0036')

In [None]:
'''
CREATE MEASURE TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/measure-categories/MYH-ED-TIME/measures
'''
measure_http_response = requests.get(
    f'{BASE_URL}/measure-categories/{measure_category_code}/measures',
    headers=headers,
    timeout=60,  # seconds; keeps a stalled connection from hanging the kernel
)
# Surface a 4xx/5xx answer as an HTTPError instead of a KeyError on 'result' below.
measure_http_response.raise_for_status()
measure_response = measure_http_response.json()
# One [code, name] row per measure in the category.
measure_list = [[measure['measure_code'], measure['measure_name']] for measure in measure_response['result']]
measures_df = pd.DataFrame(data=measure_list, columns=['measure_code', 'measure_name'])
measures_df.head()

In [None]:
'''
CREATE VALUES TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/measures/MYH0036/data-items
'''
measure_code = 'MYH0036'
value_http_response = requests.get(
    f'{BASE_URL}/measures/{measure_code}/data-items',
    headers=headers,
    timeout=60,  # seconds; keeps a stalled connection from hanging the kernel
)
# Surface a 4xx/5xx answer as an HTTPError instead of a KeyError on 'result' below.
value_http_response.raise_for_status()
value_response = value_http_response.json()
# One [reported measure code, reporting unit code, value] row per data item.
value_list = [
    [item['reported_measure_code'], item['reporting_unit_summary']['reporting_unit_code'], item['value']]
    for item in value_response['result']
]
values_df = pd.DataFrame(data=value_list, columns=['reported_measure_code', 'reporting_unit_code', 'value'])
values_df.head()

In [None]:
'''
CREATE REPORTED MEASURES TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au//api/v1/reported-measures/MYH-RM0025
'''
# Distinct reported-measure codes present in the values table; each needs its own lookup request.
reported_measure_code_list = list(values_df['reported_measure_code'].unique())

reported_measure_list = []
for reported_measure_code in reported_measure_code_list:
    reported_measure_http_response = requests.get(
        f'{BASE_URL}/reported-measures/{reported_measure_code}',
        headers=headers,
        timeout=60,  # seconds; keeps a stalled connection from hanging the kernel
    )
    # Stop at the first failing lookup rather than hitting a KeyError on 'result'.
    reported_measure_http_response.raise_for_status()
    reported_measure_response = reported_measure_http_response.json()
    reported_measure_list.append([reported_measure_code, reported_measure_response['result']['reported_measure_name']])
reported_measures_df = pd.DataFrame(data=reported_measure_list, columns=['reported_measure_code', 'reported_measure_name'])
reported_measures_df.head()

In [None]:
'''
CREATE REPORTING UNITS TABLE
EXAMPLE URL: https://myhospitalsapi.aihw.gov.au/api/v1/reporting-units
'''
reporting_unit_http_response = requests.get(
    f'{BASE_URL}/reporting-units',
    headers=headers,
    timeout=60,  # seconds; keeps a stalled connection from hanging the kernel
)
# Surface a 4xx/5xx answer as an HTTPError instead of a KeyError on 'result' below.
reporting_unit_http_response.raise_for_status()
reporting_unit_response = reporting_unit_http_response.json()

reporting_unit_list = []
for result in reporting_unit_response['result']:
    # GET STATE: code of the first STATE_MAPPING entry, or None when the unit has none.
    state = next(
        (
            mapped_reporting_unit['mapped_reporting_unit']['reporting_unit_code']
            for mapped_reporting_unit in result['mapped_reporting_units']
            if mapped_reporting_unit['map_type']['mapped_reporting_unit_code'] == "STATE_MAPPING"
        ),
        None,
    )

    # GET REPORTING UNIT INFO
    reporting_unit_list.append([
        result['reporting_unit_code'],
        result['reporting_unit_name'],
        result['reporting_unit_type']['reporting_unit_type_code'],
        result['reporting_unit_type']['reporting_unit_type_name'],
        state,
        result['closed'],
        result['private'],
        result['latitude'],
        result['longitude'],
    ])

reporting_units_df = pd.DataFrame(
    data=reporting_unit_list,
    columns=['reporting_unit_code', 'reporting_unit_name', 'reporting_unit_type_code', 'reporting_unit_type_name', 'state', 'closed', 'private', 'latitude', 'longitude'],
)
reporting_units_df.head()

In [None]:
'''
JOIN VALUES TABLE, REPORTED MEASURES TABLE AND REPORTING UNITS TABLE
'''
# Two chained inner joins: values -> reported measures (by measure code),
# then -> reporting units (by unit code).
df_join = (
    values_df
    .merge(reported_measures_df, how='inner', on='reported_measure_code')
    .merge(reporting_units_df, how='inner', on='reporting_unit_code')
)
df_join.head()

In [None]:
'''
SELECT COLUMNS
'''
# Keep only the human-readable unit name, measure name and the reported value.
df_select = df_join.loc[:, ['reporting_unit_name', 'reported_measure_name', 'value']]
df_select.head()

In [7]:
# SPARK (CLOUD)
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Machine-specific paths: each can be overridden through an environment variable,
# and falls back to the original local path when the variable is unset.
credentials_location = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '/home/kelvin/.gc/gc-key.json')
gcs_connector_jar = os.environ.get('GCS_CONNECTOR_JAR', '/home/kelvin/spark/spark-3.3.1-bin-hadoop3/lib/gcs-connector-hadoop3-2.2.5.jar')

# Local-mode Spark with the GCS connector jar and service-account authentication enabled.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", gcs_connector_jar)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

sc = SparkContext.getOrCreate(conf=conf)

# Register the gs:// filesystem implementations and credentials on the Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

# Build the session on top of the context's configuration.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [None]:
# Load data to local disk as parquet files
directory = 'data/MYH0036'
# exist_ok=True makes the cell re-runnable without a separate existence check.
os.makedirs(directory, exist_ok=True)
df_list = create_tables('MYH-ED-TIME', 'MYH0036')
for df in df_list:
    # Each frame's .name attribute (set in create_tables) becomes the file stem.
    file_path = os.path.join(directory, f'{df.name}.parquet')
    df.to_parquet(file_path)

In [None]:
# Partition reporting_units.parquet and values.parquet
# Number of output part files written per table.
num_partitions = 24
for filename in ['reporting_units.parquet', 'values.parquet']:
    input_path = f'{directory}/{filename}'
    # Output goes to a directory named after the file stem (e.g. .../values).
    output_path = f'{directory}/{os.path.splitext(filename)[0]}'
    # Parquet files embed their schema; no "header" option (a CSV reader setting) is needed.
    df = spark.read.parquet(input_path)
    df.repartition(num_partitions).write.parquet(output_path, mode='overwrite')

In [None]:
# Sanity check: read back the single-file reporting_units table written with DataFrame.to_parquet.
pd.read_parquet(f'{directory}/reporting_units.parquet')

In [None]:
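# Upload the local data/MYH0036 directory to GCS (-m: parallel copy, -r: recursive)
# NOTE(review): if the destination prefix already exists, gsutil may nest the copy as .../MYH0036/MYH0036 — confirm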
!gsutil -m cp -r data/MYH0036 gs://de-project-bucket/data/MYH0036

In [8]:
# You can read partitioned parquet files by passing the directory to spark.read.parquet
bucket_name = "de-project-bucket"
file_path = "data/MYH0036/reporting_units.parquet"
# NOTE: this rebinds reporting_units_df (a pandas DataFrame earlier in the notebook) to a Spark DataFrame.
reporting_units_df = spark.read.parquet(f'gs://{bucket_name}/{file_path}')
reporting_units_df.show()

+-------------------+--------------------+------------------------+------------------------+-----+------+-------+----------+----------+
|reporting_unit_code| reporting_unit_name|reporting_unit_type_code|reporting_unit_type_name|state|closed|private|  latitude| longitude|
+-------------------+--------------------+------------------------+------------------------+-----+------+-------+----------+----------+
|              H0012|State Forensic Me...|                       H|                Hospital|   WA| false|  false|-31.960937|115.788431|
|              H0013|Justice Health Se...|                       H|                Hospital|  NSW| false|  false| -33.96907|151.243206|
|              H0014|The Children's Ho...|                       H|                Hospital|  NSW| false|  false|-33.801554|150.991759|
|              H0015|Sydney Children's...|                       H|                Hospital|  NSW| false|  false|-33.917179|151.238334|
|              H0016|Sacred Heart Heal...|      

In [None]:
# Destination BigQuery table for the reporting units.
output_table = 'MYH0036.reporting_units'
# Pass the variable defined above; the previous `output` name was never defined (NameError).
# NOTE(review): the connector's default write path may also need a temporaryGcsBucket option — confirm.
reporting_units_df.write.format('bigquery') \
    .option('table', output_table) \
    .save()

In [None]:
'''TO DO
FILTER BY
reporting_unit_code
reported_measure_code

Check other measures e.g. number of presentations

Load pandas df to spark df and partition + cluster

Create a udf

'''

In [None]:
# CONVERT PANDAS DF TO SPARK DF

# LOAD INTO GCS

# READ FILES FROM GCS WITH PYSPARK AND LOAD TO BIGQUERY