Header

In [1]:
# The star import stays first so that the explicit imports below take
# precedence over any same-named objects it re-exports.
from vaccination_forecaster_io import *
import datetime
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize as optimize
from sklearn.metrics import mean_squared_error

In [2]:
# Location of the source workbook, given relative to the working directory.
path = "../data/covid_vaccine.xlsx"
# Read the workbook into a table using the openpyxl engine.
initial_table = pd.read_excel(path, engine="openpyxl")
# clean_excel_table is not defined in this notebook; presumably it comes from
# the star import of vaccination_forecaster_io -- TODO confirm what it changes.
cleaned_table = clean_excel_table(initial_table)


In [3]:
def generate_daily_differences(panda_data_frame_column, column_name):
    '''
    Turn a column of running totals into row-to-row differences.

    The first output value is the first input value itself (there is no
    earlier row to subtract); every later value is the current row minus the
    previous row.

    Parameters
    ----------
    panda_data_frame_column : pandas.Series
        Numeric column, in row order. Its index labels are ignored, so a
        sliced or re-indexed column is handled the same as a fresh one.
    column_name : str
        Name given to the returned series.

    Returns
    -------
    pandas.Series
        The differences, indexed 0..n-1 and named ``column_name``. An empty
        input produces an empty series.
    '''
    # Work by position rather than by label: the column may carry any index.
    values = panda_data_frame_column.reset_index(drop=True)

    # Shifting down one row with a 0 fill makes the first difference equal to
    # the first value and keeps the column's dtype (no NaN is introduced).
    differences = values - values.shift(1, fill_value=0)
    differences.name = column_name
    return differences


In [4]:
def calculate_date_difference(input_date, reference_date="14-09-2021"):
    '''
    Number of days from ``reference_date`` to ``input_date``.

    Parameters
    ----------
    input_date : str
        Date written as day-month-year, e.g. "15-09-2021".
    reference_date : str, optional
        Date to count from, in the same format. Defaults to the 14th of
        September 2021.

    Returns
    -------
    int
        Day count; negative when ``input_date`` is before ``reference_date``.

    Raises
    ------
    ValueError
        Raised by ``strptime`` when either string does not match the
        day-month-year format.
    '''
    date_format = "%d-%m-%Y"
    reference = datetime.datetime.strptime(reference_date, date_format)
    target = datetime.datetime.strptime(input_date, date_format)

    return (target - reference).days

def calculate_new_input_value(difference, original_input_value=207):
    '''
    Convert a day offset into an input value for the fitted curve.

    Parameters
    ----------
    difference : int
        Offset in days, added to ``original_input_value``.
    original_input_value : int, optional
        Curve input that the offset is measured from. The default of 207 is
        tied to the length of the dataset this notebook was written against;
        pass the matching value explicitly when the dataset changes.

    Returns
    -------
    int
        ``original_input_value + difference``.
    '''
    return original_input_value + difference

def calculate_estimate_at_given_date(logistic_function,parameter_tuple,date_string):
    '''
    Evaluate a logistic curve at a calendar date.

    The date string is converted to a day offset with
    calculate_date_difference, the offset to a curve input with
    calculate_new_input_value, and the curve is then called with that input
    and the first two entries of parameter_tuple, which should be ordered
    (b, c).
    '''
    timestep = calculate_new_input_value(calculate_date_difference(date_string))

    b, c = parameter_tuple[0], parameter_tuple[1]
    return logistic_function(timestep, b, c)



In [5]:
def run_parameter_optimisation(logistic_function_vectorised, x_input, y_input, p0_array, bounds_array):
    '''
    Fit a model to data with scipy's curve_fit.

    The model, the x/y samples, the starting guess (p0_array) and the
    parameter limits (bounds_array) are forwarded to optimize.curve_fit and
    its result is returned unchanged.
    '''
    fit_options = {"bounds": bounds_array, "p0": p0_array}
    fit_result = optimize.curve_fit(
        logistic_function_vectorised, x_input, y_input, **fit_options)
    return fit_result



In [6]:
# Row-to-row change in the people_vaccinated column.
daily_first_vaccines = generate_daily_differences(
    cleaned_table["people_vaccinated"],
    column_name="daily_first_vaccines",
)
# Row-to-row change in the people_fully_vaccinated column.
daily_second_vaccines = generate_daily_differences(
    cleaned_table["people_fully_vaccinated"],
    column_name="daily_second_vaccines",
)


In [7]:
def _logistic_curve(t, b, c, a):
    '''
    Scalar logistic curve a / (1 + exp(-b * (t - c))).
    '''
    try:
        return a / (1 + math.exp(-b * (t - c)))
    except OverflowError:
        # math.exp raises instead of returning inf once its argument passes
        # roughly 709; the curve is indistinguishable from 0 at that point.
        return 0.0


def first_vaccine_logistic(t, b, c):
    '''
    Logistic curve for first vaccinations.

    t: timestep
    b: rate of vaccination
    c: shift

    The maximum population ``a`` is not a fitted parameter; it is fixed at
    4.9e6 inside the function.
    '''
    a = 4.9e6
    return _logistic_curve(t, b, c, a)


def second_vaccine_logistic(t, b, c):
    '''
    Logistic curve for second vaccinations.

    t: timestep
    b: rate of vaccination
    c: shift

    The maximum population ``a`` is not a fitted parameter; it is fixed at
    4.9e6 inside the function.
    '''
    a = 4.9e6
    return _logistic_curve(t, b, c, a)


# Array-capable wrappers around the scalar curves. Declaring the output type
# up front lets them accept empty inputs; without it np.vectorize has to call
# the function on the first element to discover the type and fails on size 0.
first_logistic_vectorised = np.vectorize(first_vaccine_logistic, otypes=[float])
second_logistic_vectorised = np.vectorize(second_vaccine_logistic, otypes=[float])


In [8]:
# Number of rows in the cleaned table.
table_len = len(cleaned_table)
# Integer positions 0..table_len-1, one per row; slices of this vector are
# the x values of the curve fit. np.arange builds it directly, without an
# intermediate Python list, and stays integer-typed even for an empty table.
index_vec = np.arange(table_len)

In [9]:
# Rows before split_index are used for fitting; every remaining row is held
# out for testing.
split_index = 199
training_dataset = cleaned_table[:split_index]
# The slice runs to the end of the table. A stop of -1 would silently leave
# the final row out of both the training and the testing set.
testing_dataset = cleaned_table[split_index:]

training_x = index_vec[:split_index]
training_y_first = np.array(training_dataset["people_vaccinated"])
training_y_second = np.array(training_dataset["people_fully_vaccinated"])

# Same open-ended slice as testing_dataset so x and y stay the same length.
testing_x = index_vec[split_index:]
testing_y_first = np.array(testing_dataset["people_vaccinated"])
testing_y_second = np.array(testing_dataset["people_fully_vaccinated"])


In [10]:
# Starting guess for the two fitted parameters, ordered (b, c) to match the
# logistic functions' signature; passed to curve_fit as p0.
p0 = np.array([0.01, 0])

# (lower, upper) limits passed to curve_fit as bounds: the scalar lower limit
# 0 applies to both parameters, the upper limits are 0.05 for b and 1000 for c.
bounds = (0, [0.05, 1000])


In [11]:
# Fit (b, c) of the first-vaccination curve on the training rows.
(b_first, c_first), cov_first = run_parameter_optimisation(
    first_logistic_vectorised, training_x, training_y_first, p0, bounds)

# Curve values at the held-out and at the training timesteps.
first_vaccine_function_testing_estimation = first_logistic_vectorised(
    testing_x, b_first, c_first)
first_vaccine_function_training_estimation = first_logistic_vectorised(
    training_x, b_first, c_first)

# Root-mean-squared error, computed as sqrt(MSE). The `squared=False` keyword
# of mean_squared_error is deprecated and absent from recent scikit-learn
# releases, so the square root is taken explicitly. Arguments are given in
# the documented (y_true, y_pred) order.
rmse_testing_first = np.sqrt(mean_squared_error(
    testing_y_first, first_vaccine_function_testing_estimation))
rmse_training_first = np.sqrt(mean_squared_error(
    training_y_first, first_vaccine_function_training_estimation))



# Value of the fitted first_vaccine_logistic curve on 15 September 2021.
calculate_estimate_at_given_date(
    logistic_function=first_vaccine_logistic,
    parameter_tuple=(b_first, c_first),
    date_string="15-09-2021",
)


2583054.265110246

In [14]:
# Fit (b, c) of the second-vaccination curve on the training rows.
(b_second, c_second), cov_second = run_parameter_optimisation(
    second_logistic_vectorised, training_x, training_y_second, p0, bounds)

# Curve values at the held-out and at the training timesteps.
second_vaccine_function_testing_estimation = second_logistic_vectorised(
    testing_x, b_second, c_second)
second_vaccine_function_training_estimation = second_logistic_vectorised(
    training_x, b_second, c_second)

# Root-mean-squared error, computed as sqrt(MSE). The `squared=False` keyword
# of mean_squared_error is deprecated and absent from recent scikit-learn
# releases, so the square root is taken explicitly. Arguments are given in
# the documented (y_true, y_pred) order.
rmse_testing_second = np.sqrt(mean_squared_error(
    testing_y_second, second_vaccine_function_testing_estimation))
rmse_training_second = np.sqrt(mean_squared_error(
    training_y_second, second_vaccine_function_training_estimation))


In [15]:
# Value of the fitted second_vaccine_logistic curve on 15 September 2021.
calculate_estimate_at_given_date(
    logistic_function=second_vaccine_logistic,
    parameter_tuple=(b_second, c_second),
    date_string="15-09-2021",
)


1541982.2477657795