<a href="https://colab.research.google.com/github/liampearson/nndss/blob/main/nndss_backfill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [354]:
import pandas as pd
import numpy as np

In [355]:
urls = [
        'https://www.health.gov.au/sites/default/files/documents/2022/02/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-3-to-16-january-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/02/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-17-to-30-january-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/03/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-31-january-to-13-february-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/03/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-14-to-27-february-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/03/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-28-february-to-13-march-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/04/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-14-to-27-march-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/05/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-28-march-to-10-april-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/05/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-11-to-17-april-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/06/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-18-april-to-1-may-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/06/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-2-to-15-may-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/06/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-16-to-29-may-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/07/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-30-may-to-12-june-2022-table.xlsx',
        'https://www.health.gov.au/sites/default/files/documents/2022/07/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-13-to-26-june-2022-table.xlsx'
#        'https://www.health.gov.au/sites/default/files/documents/2022/07/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-27-june-to-10-july-2022-table.xlsx',

]

In [356]:
# empty table to hold the fortnightly snapshots
table = pd.DataFrame()

#iterate over each report/url/excel_file
for url in urls:

  df = pd.read_excel(url,
                    header=2
                    )
  #get the report start and end dates
  start_date = df['This reporting period'].iloc[0].strftime('%Y-%m-%d')
  end_date = df['This reporting period'].iloc[1].strftime('%Y-%m-%d')

  print("processing report for dates: {} --> {}...".format(start_date, end_date))

  #tidy up the header names: make all lower case
  df = df.rename(columns=str.lower)
  #replace spaces in header names with underscore
  df.columns = df. columns.str.replace(' ','_')

  #Get the index where the footnotes begin
  footnote_index = df[df['disease_group'].str.contains('Footnotes')==True].index.values[0]

  # Only keep the rows BEFORE the index
  # -1 to also remove the totals row
  df = df[df.index < footnote_index-1]

  #Forward fill the 'Disease group' labels (they're merged cells in excel)
  df['disease_group'] = df['disease_group'].fillna(method = 'ffill')

  #drop the date rows
  df.drop(index=[0,1], inplace=True)

  #convert code to int
  df['disease_code'] = df['disease_code'].astype(int)

  #convert report numbers to int


  # we just want the raw numbers
  keep_headers = ['disease_group', 'disease_name', 'disease_code', 'act', 'nsw', 'nt',
                  'qld', 'sa', 'tas', 'vic', 'wa']

  # reduce dataframe to just the columns we want
  df = df[keep_headers]

  #add start and end dates
  df['start_date'] = start_date
  df['end_date'] = end_date

  #convert the state numbers to INT
  states = ['act', 'nsw', 'nt','qld', 'sa', 'tas', 'vic', 'wa']
  
  #strip punctuation
  for state in states:
  #  if df[state].dtype==object: #then we need to clean the numbers
    df[state] = df[state].astype(str).str.replace("[^\w\s]", "", regex=True)
    # replace NN (not notifiable) with nulls
    df[state].replace('NN', np.nan, inplace=True)
    # workaround since Int64 cannot convert objects
    df[state] = pd.to_numeric(df[state], errors='coerce').astype('Int64')  

  table = table.append(df)

  print("completed report for period: {} --> {}".format(start_date, end_date))

processing report for dates: 2022-01-03 --> 2022-01-16...
completed report for period: 2022-01-03 --> 2022-01-16
processing report for dates: 2022-01-17 --> 2022-01-30...
completed report for period: 2022-01-17 --> 2022-01-30
processing report for dates: 2022-01-31 --> 2022-02-13...
completed report for period: 2022-01-31 --> 2022-02-13
processing report for dates: 2022-02-14 --> 2022-02-27...
completed report for period: 2022-02-14 --> 2022-02-27
processing report for dates: 2022-02-28 --> 2022-03-13...
completed report for period: 2022-02-28 --> 2022-03-13
processing report for dates: 2022-03-14 --> 2022-03-27...
completed report for period: 2022-03-14 --> 2022-03-27
processing report for dates: 2022-03-28 --> 2022-04-10...
completed report for period: 2022-03-28 --> 2022-04-10
processing report for dates: 2022-04-11 --> 2022-04-17...
completed report for period: 2022-04-11 --> 2022-04-17
processing report for dates: 2022-04-18 --> 2022-05-01...
completed report for period: 2022-04-1

In [357]:
table

Unnamed: 0,disease_group,disease_name,disease_code,act,nsw,nt,qld,sa,tas,vic,wa,start_date,end_date
2,Bloodborne diseases,Hepatitis B (newly acquired),39,0,0,0,20,0,0,0,0,2022-01-03,2022-01-16
3,Bloodborne diseases,Hepatitis B (unspecified),52,3,44,0,330,2,0,40,130,2022-01-03,2022-01-16
4,Bloodborne diseases,Hepatitis C (newly acquired),40,0,1,0,150,0,0,0,10,2022-01-03,2022-01-16
5,Bloodborne diseases,Hepatitis C (unspecified),53,4,46,30,350,0,0,29,270,2022-01-03,2022-01-16
6,Bloodborne diseases,Hepatitis D,50,0,0,0,10,0,1,0,0,2022-01-03,2022-01-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,Zoonoses,Q fever,27,0,3,0,40,0,0,0,10,2022-06-13,2022-06-26
70,Zoonoses,Rabies,28,0,0,0,0,0,0,0,0,2022-06-13,2022-06-26
71,Zoonoses,Tularaemia,70,0,0,0,0,0,0,0,0,2022-06-13,2022-06-26
72,Other notifiable diseases,iGAS^,82,0,,50,200,30,,,90,2022-06-13,2022-06-26


In [358]:
table.to_csv('nndss.csv', index=False)