In [None]:
# import standard packages
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import pandas as pd

In [None]:
# set seaborn style
sns.set_theme() # reset seaborn/matplotlib styling to the seaborn defaults
# sns.set_context('paper')
sns.set_palette('rocket')  # then switch the colour cycle to the 'rocket' palette

In [None]:
# load the new-cases-by-region CSV export into a dataframe
newcases_csv_path = 'newCasesByPublishDate_region_2023-05-18.csv'
newcases_region = pd.read_csv(newcases_csv_path)

In [None]:
# preview the first five rows of the freshly loaded data
newcases_region.head(5)

In [None]:
# preview the last five rows of the freshly loaded data
newcases_region.tail(5)

In [None]:
# get dataset info (column names, non-null counts, dtypes, memory usage)
newcases_region.info()

In [None]:
# verify the date range
# Read the first and last rows by position instead of a hard-coded index label,
# so the check keeps working if the file gains or loses rows.
# NOTE(review): assumes the file is ordered newest-first, as the labels below imply.
date_column = newcases_region['date']
print(f"Most recent data entry: {date_column.iloc[0]}")
print(f"First data entry: {date_column.iloc[-1]}")

In [None]:
# get the data types of the columns (before any conversion)
newcases_region.dtypes

In [None]:
# parse the day/month/year strings in the date column into datetimes,
# then show the dtypes again to confirm the change
parsed_dates = pd.to_datetime(newcases_region['date'], format='%d/%m/%Y')
newcases_region['date'] = parsed_dates
newcases_region.dtypes

In [None]:
# check that the date conversion worked as expected
newcases_region.head(5)

In [None]:
# count missing values in the new-cases column
new_cases = newcases_region['newCasesByPublishDate']
new_cases.isna().sum()

In [None]:
# count rows whose new-cases value is exactly zero
newcases_region['newCasesByPublishDate'].eq(0).sum()

Out of 10143 rows, 2890 have a value of zero. This is likely due to no data being reported on those days, so it makes sense to drop these rows.

In [None]:
# remove the rows whose new-case count is exactly zero, then confirm none remain
is_zero = newcases_region['newCasesByPublishDate'] == 0
newcases_region = newcases_region.loc[~is_zero].copy()
newcases_region['newCasesByPublishDate'].eq(0).sum()

In [None]:
# check how many rows are left after dropping zeros; expected 10143 - 2890 = 7253
newcases_region.info()

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the new-cases column
newcases_region['newCasesByPublishDate'].describe()

In [None]:
# multiple line plot of infections per region
p = sns.relplot(data=newcases_region, x='date', y='newCasesByPublishDate', hue='areaName', kind='line')
p.fig.set_size_inches(15, 5)
# relplot draws a figure-level legend, so reposition it with seaborn's helper
# and do so BEFORE plt.show(): a plt.legend() call made after the figure has
# been shown targets a new, empty figure instead of this plot.
sns.move_legend(p, 'upper right')
plt.show()

In [None]:
# same per-region line plot, drawn on a logarithmic y axis
p = sns.relplot(
    data=newcases_region,
    x='date',
    y='newCasesByPublishDate',
    hue='areaName',
    kind='line',
)
p.set(yscale='log')
p.fig.set_size_inches(15, 5)

In [None]:
# select rows from 1 January 2022 up to (but not including) 1 May 2022,
# the window that contains the large spike in new cases
window_start, window_end = '2022-01-01', '2022-05-01'
dates = newcases_region['date']
newcases_region_start_2022 = newcases_region.loc[(dates >= window_start) & (dates < window_end)]
newcases_region_start_2022.head(5)

In [None]:
# last five rows of the early-2022 subset, to check where the selection ends
newcases_region_start_2022.tail(5)

In [None]:
# plot the early-2022 subset on a log scale to look more closely at the large spike
p = sns.relplot(
    data=newcases_region_start_2022,
    x='date',
    y='newCasesByPublishDate',
    hue='areaName',
    kind='line',
)
p.set(yscale='log')
p.fig.set_size_inches(15, 5)

Two things are notable from inspecting these dates more closely:
1. The large spike at the end of January is nearly an order of magnitude higher than the surrounding values. This needs to be investigated as a possible outlier.
2. All regions spike on the same days. This needs to be investigated: is it linked to the Covid-19 data reporting schedule? How could that be the case if all values of 0 have been dropped? To make the overall trend of the dataset more apparent, it will be useful to apply a smoothing algorithm to the trend line.

In [None]:
# plot the full dataset with a smoothed line using scipy B-spline library
# TODO: only the import is present in this cell; the smoothed plot itself is not implemented yet.
from scipy.interpolate import make_interp_spline


# Choropleth

In [None]:
import json
import plotly.graph_objs as go
# import plotly.plotly as py
import plotly.express as px

In [None]:
# open json
f = open('eer.json')

In [None]:
# load geojson, then release the file handle that was opened in the previous cell
try:
    regions = json.load(f)
finally:
    # close even if parsing fails so the handle is not leaked
    f.close()

In [None]:
# inspect the first entry under the geojson "features" key
regions["features"][0]

In [None]:
# order the dataframe chronologically (ascending date) and preview the result
newcases_region = newcases_region.sort_values(by='date')
newcases_region.head()

In [None]:
# use areaName as the mapping identifier between the dataframe and the geojson
# note: this attempt sets animation_group only, without animation_frame
choropleth_options = dict(
    data_frame=newcases_region,
    geojson=regions,
    locations="areaName",
    featureidkey="properties.EER13NM",
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    animation_group="date",
    mapbox_style="open-street-map",
    center={"lat": 55, "lon": 0},
    zoom=4,
    width=800,
    height=600,
)
fig = px.choropleth_mapbox(**choropleth_options)
fig.show()

In [None]:
# the slider might not be showing because there are too many days;
# derive string Year ('YYYY') and Month ('MM') columns so the data can be grouped more coarsely
newcases_region = newcases_region.assign(
    Year=newcases_region['date'].dt.strftime('%Y'),
    Month=newcases_region['date'].dt.strftime('%m'),
)
newcases_region.head()

In [None]:
# check dtypes of all columns, including the newly added ones
newcases_region.dtypes

In [None]:
# second attempt: same map, grouping by the Year column via animation_group
choropleth_options = dict(
    data_frame=newcases_region,
    geojson=regions,
    locations="areaName",
    featureidkey="properties.EER13NM",
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    animation_group="Year",
    mapbox_style="open-street-map",
    center={"lat": 55, "lon": 0},
    zoom=4,
    width=800,
    height=600,
)
fig = px.choropleth_mapbox(**choropleth_options)
fig.show()

In [None]:
# combination of the separate year column and using animation_frame rather than animation_group worked
# Collapse the daily rows to one total per region and year first, so that every
# animation frame carries exactly one value per region instead of one row per day.
cases_per_year = (
    newcases_region
    .groupby(['Year', 'areaName'], as_index=False)['newCasesByPublishDate']
    .sum()
)
fig = px.choropleth_mapbox(
    data_frame=cases_per_year,
    locations="areaName",
    featureidkey="properties.EER13NM",
    geojson=regions,
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    # pin the colour scale to the overall maximum so the frames are directly comparable
    range_color=(0, cases_per_year['newCasesByPublishDate'].max()),
    mapbox_style="open-street-map",
    zoom=4,
    center = {"lat": 55, "lon": 0},
    animation_frame="Year",
    width=800,
    height=600
    )
fig.show()

In [None]:
# animating by month over the whole dataset won't work, so split the dataframe
# into one frame per calendar year (2020-2023) and plot each year separately
year_of_row = newcases_region['Year']
newcases_region_2020 = newcases_region.loc[year_of_row.eq('2020')]
newcases_region_2021 = newcases_region.loc[year_of_row.eq('2021')]
newcases_region_2022 = newcases_region.loc[year_of_row.eq('2022')]
newcases_region_2023 = newcases_region.loc[year_of_row.eq('2023')]

In [None]:
# first rows of the 2020 subset
newcases_region_2020.head()

In [None]:
# last rows of the 2020 subset
newcases_region_2020.tail()

In [None]:
# first rows of the 2023 subset
newcases_region_2023.head()

In [None]:
# last rows of the 2023 subset
newcases_region_2023.tail()

In [None]:
# add a combined 'MM/YYYY' string column to the full dataframe
# NOTE(review): this runs after the per-year frames were created, so those frames
# do not carry this column; no plotting cell visible here reads it.
newcases_region = newcases_region.assign(
    month_year=newcases_region['date'].dt.strftime('%m/%Y')
)
newcases_region.head()

In [None]:
# plot 2020 per month
# Collapse the daily rows to one total per region and month first, so that every
# animation frame carries exactly one value per region instead of one row per day.
cases_2020_monthly = (
    newcases_region_2020
    .groupby(['Month', 'areaName'], as_index=False)['newCasesByPublishDate']
    .sum()
)
fig = px.choropleth_mapbox(
    data_frame=cases_2020_monthly,
    locations="areaName",
    featureidkey="properties.EER13NM",
    geojson=regions,
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    # pin the colour scale to the year's maximum so the monthly frames are comparable
    range_color=(0, cases_2020_monthly['newCasesByPublishDate'].max()),
    mapbox_style="open-street-map",
    zoom=4,
    center = {"lat": 55, "lon": 0},
    animation_frame="Month",
    width=800,
    height=600
    )
fig.show()

In [None]:
# plot 2021 per month
# Collapse the daily rows to one total per region and month first, so that every
# animation frame carries exactly one value per region instead of one row per day.
cases_2021_monthly = (
    newcases_region_2021
    .groupby(['Month', 'areaName'], as_index=False)['newCasesByPublishDate']
    .sum()
)
fig = px.choropleth_mapbox(
    data_frame=cases_2021_monthly,
    locations="areaName",
    featureidkey="properties.EER13NM",
    geojson=regions,
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    # pin the colour scale to the year's maximum so the monthly frames are comparable
    range_color=(0, cases_2021_monthly['newCasesByPublishDate'].max()),
    mapbox_style="open-street-map",
    zoom=4,
    center = {"lat": 55, "lon": 0},
    animation_frame="Month",
    width=800,
    height=600
    )
fig.show()

In [None]:
# plot 2022 per month
# Collapse the daily rows to one total per region and month first, so that every
# animation frame carries exactly one value per region instead of one row per day.
cases_2022_monthly = (
    newcases_region_2022
    .groupby(['Month', 'areaName'], as_index=False)['newCasesByPublishDate']
    .sum()
)
fig = px.choropleth_mapbox(
    data_frame=cases_2022_monthly,
    locations="areaName",
    featureidkey="properties.EER13NM",
    geojson=regions,
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    # pin the colour scale to the year's maximum so the monthly frames are comparable
    range_color=(0, cases_2022_monthly['newCasesByPublishDate'].max()),
    mapbox_style="open-street-map",
    zoom=4,
    center = {"lat": 55, "lon": 0},
    animation_frame="Month",
    width=800,
    height=600
    )
fig.show()

In [None]:
# plot 2023 per month
# Collapse the daily rows to one total per region and month first, so that every
# animation frame carries exactly one value per region instead of one row per day.
cases_2023_monthly = (
    newcases_region_2023
    .groupby(['Month', 'areaName'], as_index=False)['newCasesByPublishDate']
    .sum()
)
fig = px.choropleth_mapbox(
    data_frame=cases_2023_monthly,
    locations="areaName",
    featureidkey="properties.EER13NM",
    geojson=regions,
    color="newCasesByPublishDate",
    hover_name="newCasesByPublishDate",
    # pin the colour scale to the year's maximum so the monthly frames are comparable
    range_color=(0, cases_2023_monthly['newCasesByPublishDate'].max()),
    mapbox_style="open-street-map",
    zoom=4,
    center = {"lat": 55, "lon": 0},
    animation_frame="Month",
    width=800,
    height=600
    )
fig.show()