# Import

## Modules

In [1]:
import sys
from datetime import datetime, timedelta
from pathlib import Path

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


Set up figure and plot settings

In [2]:
# Render inline figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Apply the "arviz-darkgrid" plotting style; `az` is the ArviZ alias and must be
# imported (`import arviz as az`) in the imports cell for this line to run.
az.style.use("arviz-darkgrid")
# Use seaborn's "deep" colour palette for subsequent plots.
sns.set_palette("deep")

Set up paths so that the project scripts can be imported

In [3]:
# Parent of the current working directory, as an absolute path.
# NOTE(review): presumably the notebook is run from a sub-directory of the
# repository (so the parent is the project root) -- confirm against the layout.
PROJECT_ROOT = Path.cwd().parent.resolve()

# Make `src.*` importable. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

## Scripts

In [4]:
from src.data.make_dataset import get_cases_data, to_datetime, subset_latest_outbreak, get_daily_cases_stats

## Data

In [5]:
# Load the raw case records through the project helper `get_cases_data`
# (imported from src.data.make_dataset; its data source is not visible here).
raw_cases_data = get_cases_data()

## Audit

In [6]:
# Audit the raw frame: column names, non-null counts and dtypes.
raw_cases_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9866 entries, 0 to 9865
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   notification_date           9866 non-null   object 
 1   postcode                    9061 non-null   object 
 2   likely_source_of_infection  9866 non-null   object 
 3   lhd_2010_code               9050 non-null   object 
 4   lhd_2010_name               9050 non-null   object 
 5   lga_code19                  9050 non-null   float64
 6   lga_name19                  9050 non-null   object 
dtypes: float64(1), object(6)
memory usage: 539.7+ KB


In [7]:
# Preview the first rows of the raw frame.
raw_cases_data.head()

Unnamed: 0,notification_date,postcode,likely_source_of_infection,lhd_2010_code,lhd_2010_name,lga_code19,lga_name19
0,2020-01-25,2134,Overseas,X700,Sydney,11300.0,Burwood (A)
1,2020-01-25,2121,Overseas,X760,Northern Sydney,16260.0,Parramatta (C)
2,2020-01-25,2071,Overseas,X760,Northern Sydney,14500.0,Ku-ring-gai (A)
3,2020-01-27,2033,Overseas,X720,South Eastern Sydney,16550.0,Randwick (C)
4,2020-03-01,2077,Overseas,X760,Northern Sydney,14000.0,Hornsby (A)


# Preprocess Raw Data

In [8]:
# Transform column notification_date to datetime.
# NOTE(review): this re-binds `raw_cases_data`, so after this cell the name no
# longer refers to the untouched raw frame -- confirm that is intended.
raw_cases_data = to_datetime('notification_date', raw_cases_data)

# Subset data to the latest outbreak.
# NOTE(review): presumably '2021-06-01' is the outbreak start date and
# 'Overseas' a `likely_source_of_infection` value to filter on -- verify
# against `subset_latest_outbreak` in src.data.make_dataset.
interim_data = subset_latest_outbreak('2021-06-01', 'Overseas', raw_cases_data)

# Aggregate number of cases by day
data = get_daily_cases_stats(interim_data)

# Preview only the first rows instead of dumping the whole frame.
data.head(10)

Unnamed: 0,notification_date,Daily Number of Cases,Pct Change,Cumsum,Daily Difference,Growth Factor,Weekly Rolling Average,Weekly Average CumSum,Epidemiological Days
0,2021-06-16,3,,3,,,,,-24.0
1,2021-06-17,1,-0.666667,4,-2.0,,,,-23.0
2,2021-06-18,2,1.0,6,1.0,-0.5,,,-22.0
3,2021-06-19,1,-0.5,7,-1.0,-1.0,,,-21.0
4,2021-06-20,2,1.0,9,1.0,-1.0,,,-20.0
5,2021-06-21,5,1.5,14,3.0,3.0,,,-19.0
6,2021-06-22,17,2.4,31,12.0,4.0,4.0,4.0,-18.0
7,2021-06-23,12,-0.294118,43,-5.0,-0.416667,6.0,10.0,-17.0
8,2021-06-24,21,0.75,64,9.0,-1.8,9.0,19.0,-16.0
9,2021-06-25,28,0.333333,92,7.0,0.777778,12.0,31.0,-15.0


## Split Predictor and Target Columns

Get the initial number of cases, taken as the weekly rolling average on epidemiological day 0

In [9]:
# Select the row(s) that fall on epidemiological day 0.
mask = data['Epidemiological Days'] == 0
day_zero_rolling_average = data.loc[mask, 'Weekly Rolling Average']

# Fail with an explicit message instead of a bare IndexError when the data
# contains no day-0 row.
if day_zero_rolling_average.empty:
    raise ValueError(
        "No row with 'Epidemiological Days' == 0; "
        "cannot determine the initial number of cases."
    )

# The weekly rolling average on day 0 is used as the initial number of cases.
initial_number_of_cases = day_zero_rolling_average.iloc[0]
print(f"Initial number of cases: {initial_number_of_cases}")

Initial number of cases: 45.0


Split predictor and target columns

In [10]:
# Restrict the data to epidemiological day 0 and later, then carve out the
# candidate predictor columns (X) and the target column (y) from that subset.
mask = data['Epidemiological Days'] >= 0
rows_from_day_zero = data.loc[mask]

X = rows_from_day_zero[
    ["Epidemiological Days", 'Daily Number of Cases', 'Pct Change']
]
y = rows_from_day_zero["Weekly Rolling Average"]

# Model Training

## Data Preparation

Train-test split

In [11]:
# Fraction of the rows used for training; the remainder is held out for testing.
train_size = 0.7

# Fixed seed so the split -- and every statistic computed from the training
# rows afterwards -- is identical on each Restart & Run All.
RANDOM_STATE = 42

# NOTE(review): train_test_split shuffles rows by default, so held-out days are
# interleaved with training days -- confirm this is intended for a series
# indexed by day.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_size, random_state=RANDOM_STATE
)

Setting up initial parameters for Bayesian modelling

In [12]:
# Summary statistics of the training rows; the notebook describes these as
# initial parameters for the Bayesian model.
daily_number_of_cases_std = X_train['Daily Number of Cases'].std()
average_pct_change = X_train['Pct Change'].mean()
std_pct_change = X_train['Pct Change'].std()

print(f"Daily number of cases std: {daily_number_of_cases_std}")
print(f"Average percentage change: {average_pct_change}")
print(f"Standard deviation of percentage change: {std_pct_change}")

Daily number of cases std: 48.96283771519154
Average percenntage change: 0.10203126654564262
Standard deviation of percentage change: 0.35077546910006835


Select only the epidemiological days as the predictor and convert the datasets to `np.ndarray` objects

In [13]:
# Keep only the "Epidemiological Days" column of each predictor frame and
# unwrap predictors and targets into plain NumPy arrays.
# NOTE: this cell re-binds its own inputs, so it can only be run once per
# kernel session after the train/test split.
X_train, X_test = (
    X_train["Epidemiological Days"].values,
    X_test["Epidemiological Days"].values,
)
y_train, y_test = y_train.values, y_test.values

# Same conversion for the full (unsplit) predictor and target.
X, y = X["Epidemiological Days"].values, y.values

## Train Model