<a href="https://colab.research.google.com/github/liampearson/nndss/blob/main/nndss_fortnightly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [116]:
import pandas as pd
import numpy as np

In [117]:
# Load the current NNDSS table (nndss.csv from the GitHub repository).
url = 'https://raw.githubusercontent.com/liampearson/nndss/main/nndss.csv'
table = pd.read_csv(filepath_or_buffer=url)
# Bare last expression: display the loaded frame.
table

Unnamed: 0,disease_group,disease_name,disease_code,act,nsw,nt,qld,sa,tas,vic,...,act_cumulative_sum,nsw_cumulative_sum,nt_cumulative_sum,qld_cumulative_sum,sa_cumulative_sum,tas_cumulative_sum,vic_cumulative_sum,wa_cumulative_sum,aus,aus_total
0,Vectorborne diseases,Flavivirus infection (unspecified),1,0.0,0.0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0
1,Vectorborne diseases,Flavivirus infection (unspecified),1,0.0,0.0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0
2,Vectorborne diseases,Flavivirus infection (unspecified),1,0.0,0.0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0
3,Vectorborne diseases,Flavivirus infection (unspecified),1,0.0,0.0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0
4,Vectorborne diseases,Flavivirus infection (unspecified),1,0.0,0.0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,Zoonoses,Monkeypox virus (MPXV) infection ++,84,0.0,1.0,0,0,0,0.0,1.0,...,0.0,1.0,0,0,0,0.0,1.0,0,2.0,2.0
1066,Zoonoses,Monkeypox virus (MPXV) infection ++,84,0.0,3.0,0,0,0,0.0,2.0,...,0.0,4.0,0,0,0,0.0,3.0,0,5.0,7.0
1067,Zoonoses,Monkeypox virus (MPXV) infection++,84,0.0,1.0,0,0,0,0.0,1.0,...,0.0,5.0,0,0,0,0.0,4.0,0,2.0,9.0
1068,Zoonoses,Monkeypox virus (MPXV) infection++,84,2.0,10.0,0,1,1,0.0,4.0,...,2.0,15.0,0,1,1,0.0,8.0,0,18.0,27.0


In [112]:
# Fortnightly report workbooks (.xlsx) to process; one URL per report.
urls = [
    (
        'https://www.health.gov.au/sites/default/files/documents/2022/08/'
        'national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-'
        '11-july-to-24-july-2022-table.xlsx'
    ),
]

In [113]:
# Process every report workbook listed in `urls` and add its rows to `table`.
# Each workbook is read with the third sheet row as the header; the cleaned
# frames are collected and concatenated onto `table` once, after the loop.

# State/territory columns that hold the notification counts.
states = ['act', 'nsw', 'nt', 'qld', 'sa', 'tas', 'vic', 'wa']

# We just want the raw numbers, not the % change etc.
keep_headers = ['disease_group', 'disease_name', 'disease_code'] + states

# Cleaned frames, one per report, in the order the URLs were processed.
new_reports = []

#iterate over each report/url/excel_file
for url in urls:
  # import the data
  df = pd.read_excel(url, header=2)

  # The first two rows of 'This reporting period' are read as the report's
  # start and end dates (they must be datetime-like for strftime to work).
  start_date = df['This reporting period'].iloc[0].strftime('%Y-%m-%d')
  end_date = df['This reporting period'].iloc[1].strftime('%Y-%m-%d')

  print("report for dates: {} --> {}; processing...".format(start_date, end_date))

  # DATA PREP ------------------
  # Tidy up the header names: lower case, spaces replaced with underscores.
  df = df.rename(columns=str.lower)
  df.columns = df.columns.str.replace(' ', '_')

  # Get the index where the footnotes begin - we don't want these.
  # na=False makes empty cells count as "not a footnote row".
  footnote_index = df[df['disease_group'].str.contains('Footnotes', na=False)].index.values[0]

  # Only keep the rows BEFORE the footnotes index; the extra -1 also removes
  # the totals row. .copy() yields an independent frame so the column
  # assignments below never write into a view of the raw sheet
  # (avoids SettingWithCopyWarning).
  df = df[df.index < footnote_index - 1].copy()

  # Forward fill the 'disease_group' labels (they're merged cells in Excel).
  # Series.ffill() replaces the deprecated fillna(method='ffill').
  df['disease_group'] = df['disease_group'].ffill()

  # Drop the two date rows.
  df = df.drop(index=[0, 1])

  # Convert disease code to int.
  df['disease_code'] = df['disease_code'].astype(int)

  # Reduce the dataframe to just the columns we want (again as a real copy).
  df = df[keep_headers].copy()

  # Add the report start and end dates to every row.
  df['start_date'] = start_date
  df['end_date'] = end_date

  # For each state: strip punctuation and convert the counts to nullable Int64.
  for state in states:
    # Removing all punctuation (commas, asterisks, ...) would turn "10.0" into
    # "100", so drop the ".0" first.
    counts = df[state].astype(str).str.replace(".0", "", regex=False)

    # Strip everything that is not a word character or whitespace.
    # Raw string: "\w" / "\s" are invalid escapes in a normal string literal.
    counts = counts.str.replace(r"[^\w\s]", "", regex=True)

    # Replace NN (not notifiable) with nulls.
    counts = counts.replace('NN', np.nan)

    # Workaround since Int64 cannot convert objects: go through to_numeric
    # first (anything non-numeric becomes NaN), then cast to nullable Int64.
    df[state] = pd.to_numeric(counts, errors='coerce').astype('Int64')

  # Queue this report; it is added to `table` after the loop.
  new_reports.append(df)

  print("report for period: {} --> {}; completed".format(start_date, end_date))

# Add the new reports to the rest of the data in a single concat.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the whole table on every iteration. ignore_index=True gives the
# combined table a fresh 0..n-1 index instead of duplicated row labels.
if new_reports:
  table = pd.concat([table] + new_reports, ignore_index=True)

report for dates: 2022-07-11 --> 2022-07-24; processing...
report for period: 2022-07-11 --> 2022-07-24; completed


## Get Cumulative Sums and AUS-wide totals

In [114]:
# State/territory columns that hold the notification counts.
states = ['act', 'nsw', 'nt','qld', 'sa', 'tas', 'vic', 'wa']

# cumsum() accumulates in row order, so put each disease's rows in
# reporting-period order BEFORE taking running totals; otherwise newly added
# rows are accumulated in whatever order they were appended.
# NOTE(review): assumes 'start_date' holds 'YYYY-MM-DD' strings (the format
# written by the report-processing cell), which sort chronologically - confirm
# for rows already present in nndss.csv.
table = table.sort_values(by=['disease_code', 'start_date'])

# Running total per disease for every state/territory.
for state in states:
  table[state+'_cumulative_sum'] = table.groupby(['disease_code'])[state].cumsum()

# Row-wise sum of the states gives the Australia-wide count for the period
# (missing values are skipped by sum), followed by its running total per disease.
table['aus'] = table[states].sum(axis=1)
table['aus_cumulative_sum'] = table.groupby(['disease_code'])['aus'].cumsum()

In [120]:
# Australia-wide count per row: add up the state/territory columns.
# (This recomputes the same two columns as the end of the previous cell.)
table['aus'] = table.loc[:, states].sum(axis='columns')
# Running Australia-wide total within each disease code.
table['aus_cumulative_sum'] = table['aus'].groupby(table['disease_code']).cumsum()

## Inspect a disease

In [121]:
# Show every row for disease code 84 (the Monkeypox virus (MPXV) rows in this table).
table.loc[table['disease_code'].eq(84)]

Unnamed: 0,disease_group,disease_name,disease_code,act,nsw,nt,qld,sa,tas,vic,...,act_cumulative_sum,nsw_cumulative_sum,nt_cumulative_sum,qld_cumulative_sum,sa_cumulative_sum,tas_cumulative_sum,vic_cumulative_sum,wa_cumulative_sum,aus,aus_cumulative_sum
1065,Zoonoses,Monkeypox virus (MPXV) infection ++,84,0.0,1.0,0,0,0,0.0,1.0,...,0.0,1.0,0,0,0,0.0,1.0,0,2.0,2.0
1066,Zoonoses,Monkeypox virus (MPXV) infection ++,84,0.0,3.0,0,0,0,0.0,2.0,...,0.0,4.0,0,0,0,0.0,3.0,0,5.0,7.0
1067,Zoonoses,Monkeypox virus (MPXV) infection++,84,0.0,1.0,0,0,0,0.0,1.0,...,0.0,5.0,0,0,0,0.0,4.0,0,2.0,9.0
1068,Zoonoses,Monkeypox virus (MPXV) infection++,84,2.0,10.0,0,1,1,0.0,4.0,...,2.0,15.0,0,1,1,0.0,8.0,0,18.0,27.0
1069,Zoonoses,Monkeypox virus (MPXV) infection++,84,0.0,8.0,0,0,0,0.0,8.0,...,2.0,23.0,0,1,1,0.0,16.0,0,16.0,43.0


### Sort table & save file

In [122]:
# Order the rows by disease, then by reporting-period start date.
table = table.sort_values(by=['disease_code', 'start_date'])

In [124]:
# Write the table to nndss.csv without the row index.
table.to_csv(path_or_buf='nndss.csv', index=False)