## Pre-requisites

In [47]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [48]:
dir_home = os.environ['HOME']
# Location of data relative to home:
dir_data = os.path.join(dir_home, "data", "coronavirus_data_gov_uk", "2020-06-11")

In [49]:
dir_data

'/Users/jason/data/coronavirus_data_gov_uk/2020-06-11'

In [50]:
file_cases = "coronavirus-cases_latest.csv"
file_deaths = "coronavirus-deaths_latest.csv"
file_cases, file_deaths

('coronavirus-cases_latest.csv', 'coronavirus-deaths_latest.csv')

In [51]:
file_cases = os.path.join(dir_data, file_cases)
file_cases

'/Users/jason/data/coronavirus_data_gov_uk/2020-06-11/coronavirus-cases_latest.csv'

In [52]:
file_deaths = os.path.join(dir_data, file_deaths)
file_deaths

'/Users/jason/data/coronavirus_data_gov_uk/2020-06-11/coronavirus-deaths_latest.csv'

In [53]:
df_cases = pd.read_csv(file_cases)

In [54]:
df_cases

Unnamed: 0,Area name,Area code,Area type,Specimen date,Daily lab-confirmed cases,Previously reported daily cases,Change in daily cases,Cumulative lab-confirmed cases,Previously reported cumulative cases,Change in cumulative cases,Cumulative lab-confirmed cases rate
0,England,E92000001,Nation,2020-06-10,27.0,0.0,27.0,156018,155714.0,304.0,278.7
1,South West,E12000009,Region,2020-06-10,0.0,,,7858,,,140.3
2,South East,E12000008,Region,2020-06-10,3.0,,,22179,,,242.8
3,London,E12000007,Region,2020-06-10,0.0,,,27240,,,305.8
4,East of England,E12000006,Region,2020-06-10,0.0,,,14634,,,236.0
...,...,...,...,...,...,...,...,...,...,...,...
36345,Tunbridge Wells,E07000116,Lower tier local authority,2020-01-31,1.0,,,1,,,0.8
36346,England,E92000001,Nation,2020-01-30,1.0,1.0,0.0,1,1.0,0.0,0.0
36347,Yorkshire and The Humber,E12000003,Region,2020-01-30,1.0,,,1,,,0.0
36348,York,E06000014,Upper tier local authority,2020-01-30,1.0,,,1,,,0.5


## Data quality and consistency checks (not exhaustive)

There use to be only three area types, but there are now 4.

In [55]:
area_types_all = set(df_cases['Area type'])
print(area_types_all)
len(area_types_all)==4

{'Lower tier local authority', 'Upper tier local authority', 'Region', 'Nation'}


True

Check all Specimen dates are of length 10 (YYYY-MM-DD)

In [56]:
check_set = set(df_cases['Specimen date'])
x = [len(i) for i in check_set]
print(set(x))
set(x) == {10}

{10}


True

Check all Area Codes are of length 9

In [57]:
check_set = set(df_cases['Area code'])
x = [len(i) for i in check_set]
print(set(x))
set(x) == {9}

{9}


True

## Dataframe re-ordering

Have the latest dates at the bottom of the dataframe.

In [58]:
df_cases = df_cases.sort_values(by=['Specimen date', 'Area code'])

## England

In [59]:
df_cases_england = df_cases[df_cases['Area name']=="England"]

In [60]:
df_cases_england

Unnamed: 0,Area name,Area code,Area type,Specimen date,Daily lab-confirmed cases,Previously reported daily cases,Change in daily cases,Cumulative lab-confirmed cases,Previously reported cumulative cases,Change in cumulative cases,Cumulative lab-confirmed cases rate
36346,England,E92000001,Nation,2020-01-30,1.0,1.0,0.0,1,1.0,0.0,0.0
36342,England,E92000001,Nation,2020-01-31,1.0,1.0,0.0,2,2.0,0.0,0.0
36329,England,E92000001,Nation,2020-02-03,6.0,6.0,0.0,8,8.0,0.0,0.0
36325,England,E92000001,Nation,2020-02-05,1.0,1.0,0.0,9,9.0,0.0,0.0
36321,England,E92000001,Nation,2020-02-08,3.0,3.0,0.0,12,12.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1030,England,E92000001,Nation,2020-06-06,208.0,195.0,13.0,155456,155409.0,47.0,277.7
841,England,E92000001,Nation,2020-06-07,177.0,179.0,-2.0,155633,155588.0,45.0,278.0
618,England,E92000001,Nation,2020-06-08,223.0,114.0,109.0,155856,155702.0,154.0,278.4
476,England,E92000001,Nation,2020-06-09,135.0,12.0,123.0,155991,155714.0,277.0,278.7


In [61]:
fig = px.line(df_cases_england, x='Specimen date', y='Cumulative lab-confirmed cases')
fig.update_traces(mode='lines+markers')
fig.show()

In [73]:
fig2 = px.bar(df_cases_england, x="Specimen date", y="Daily lab-confirmed cases")
fig2.show()

In [74]:
pd.options.mode.chained_assignment == None 

False

In [75]:
df_cases_england['SMAV'] = df_cases_england['Daily lab-confirmed cases'].rolling(7).sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [81]:
df_cases_england

Unnamed: 0,Area name,Area code,Area type,Specimen date,Daily lab-confirmed cases,Previously reported daily cases,Change in daily cases,Cumulative lab-confirmed cases,Previously reported cumulative cases,Change in cumulative cases,Cumulative lab-confirmed cases rate,SMAV
36346,England,E92000001,Nation,2020-01-30,1.0,1.0,0.0,1,1.0,0.0,0.0,
36342,England,E92000001,Nation,2020-01-31,1.0,1.0,0.0,2,2.0,0.0,0.0,
36329,England,E92000001,Nation,2020-02-03,6.0,6.0,0.0,8,8.0,0.0,0.0,
36325,England,E92000001,Nation,2020-02-05,1.0,1.0,0.0,9,9.0,0.0,0.0,
36321,England,E92000001,Nation,2020-02-08,3.0,3.0,0.0,12,12.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
1030,England,E92000001,Nation,2020-06-06,208.0,195.0,13.0,155456,155409.0,47.0,277.7,2510.0
841,England,E92000001,Nation,2020-06-07,177.0,179.0,-2.0,155633,155588.0,45.0,278.0,2383.0
618,England,E92000001,Nation,2020-06-08,223.0,114.0,109.0,155856,155702.0,154.0,278.4,2162.0
476,England,E92000001,Nation,2020-06-09,135.0,12.0,123.0,155991,155714.0,277.0,278.7,1819.0


The best way to explore the moving average values is hover over the points in the chart below.  An outstanding task is to overlay the moving average on the bar chart above.

In [78]:
fig2 = px.line(df_cases_england, x="Specimen date", y="SMAV")
fig2.data[0].update(mode='markers+lines')
fig2.show()

## Regions

In [24]:
df_cases_regions = df_cases[df_cases['Area type']=="Region"]

In [25]:
df_cases_regions

Unnamed: 0,Area name,Area code,Area type,Specimen date,Daily lab-confirmed cases,Previously reported daily cases,Change in daily cases,Cumulative lab-confirmed cases,Previously reported cumulative cases,Change in cumulative cases,Cumulative lab-confirmed cases rate
36347,Yorkshire and The Humber,E12000003,Region,2020-01-30,1.0,,,1,,,0.0
36343,South East,E12000008,Region,2020-01-31,1.0,,,1,,,0.0
36333,Yorkshire and The Humber,E12000003,Region,2020-02-03,1.0,,,2,,,0.0
36332,East of England,E12000006,Region,2020-02-03,1.0,,,1,,,0.0
36331,South East,E12000008,Region,2020-02-03,1.0,,,2,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
5,West Midlands,E12000005,Region,2020-06-10,4.0,,,16606,,,281.4
4,East of England,E12000006,Region,2020-06-10,0.0,,,14634,,,236.0
3,London,E12000007,Region,2020-06-10,0.0,,,27240,,,305.8
2,South East,E12000008,Region,2020-06-10,3.0,,,22179,,,242.8


In [26]:
def plot_line_graph(y_axis='Daily lab-confirmed cases', log_y_setting=False):
    fig3 = px.line(df_cases_regions, x="Specimen date", y=y_axis, 
               color="Area name", 
               line_group="Area name", 
               hover_name="Area name",
               log_y=log_y_setting)
    fig3.update_traces(mode='lines+markers')
    fig3.show()
    return

In [27]:
plot_line_graph()

In [28]:
plot_line_graph('Cumulative lab-confirmed cases')

In [29]:
plot_line_graph('Cumulative lab-confirmed cases', log_y_setting=True)