In [1]:
import pandas as pd

In [2]:
# Excel workbook holding the source data; it is read below with pd.read_excel
# (default first sheet for cases/deaths, sheet_name=1 for the hospital figures).
data_path = './corona_scotland.xlsx'

# Cases and Death Data

In [3]:
# Load the first worksheet of the workbook (cases, tests and deaths).
# The three national running totals are read with the nullable 'Int32' dtype,
# and every blank cell is then forward-filled from the row above.
# NOTE(review): ffill() runs over ALL columns, not only the totals - confirm
# that no 'Health Board' / 'Number of Positive cases' cell is ever blank,
# otherwise it silently inherits the previous row's value.
df = pd.read_excel(
    data_path,
    dtype={
        'Total Tests': 'Int32',
        'Total Positive Cases': 'Int32',
        'Total Deaths': 'Int32',
    },
).ffill()

# Drop the Golden Jubilee National Hospital rows. The explicit .copy() makes
# the filtered result an independent frame, so the later in-place name
# clean-up (df.loc[...] = ...) does not raise SettingWithCopyWarning.
df = df[df['Health Board'] != 'Golden Jubilee National Hospital'].copy()

In [4]:
# Show the loaded cases/deaths frame (Golden Jubilee National Hospital rows already removed).
df

Unnamed: 0,Date,Health Board,Number of Positive cases,Total Tests,Total Positive Cases,Total Deaths
0,2020-03-06,Tayside,1,1514,11,0
1,2020-03-06,Ayrshire & Arran,1,1514,11,0
2,2020-03-06,Forth Valley,2,1514,11,0
3,2020-03-06,Greater Glasgow and Clyde,1,1514,11,0
4,2020-03-06,Grampian,3,1514,11,0
...,...,...,...,...,...,...
1216,2020-06-02,Lothian,2716,116848,15471,2375
1217,2020-06-02,Orkney,8,116848,15471,2375
1218,2020-06-02,Shetland,54,116848,15471,2375
1219,2020-06-02,Tayside,1671,116848,15471,2375


## clean data

In [5]:
# Count rows per health board to spot inconsistent spellings of the same board.
df['Health Board'].value_counts()

Greater Glasgow and Clyde        89
Fife                             89
Forth Valley                     89
Tayside                          89
Lothian                          89
Grampian                         89
Lanarkshire                      88
Shetland                         86
Borders                          84
Highland                         81
Dumfries and Galloway            79
Ayrshire and Arran               66
Eileanan Siar (Western Isles)    63
Orkney                           63
Ayrshire and Arran               20
Ayrshire & Arran                  3
Name: Health Board, dtype: int64

In [6]:
# clean up some names
df.loc[df['Health Board'] == 'Ayrshire\xa0and Arran', 'Health Board'] = 'Ayrshire and Arran' 
df.loc[df['Health Board'] == 'Ayrshire & Arran', 'Health Board'] = 'Ayrshire and Arran' 

In [7]:
# Re-count rows per board to confirm the Ayrshire and Arran variants were merged.
df['Health Board'].value_counts()

Greater Glasgow and Clyde        89
Fife                             89
Forth Valley                     89
Ayrshire and Arran               89
Tayside                          89
Lothian                          89
Grampian                         89
Lanarkshire                      88
Shetland                         86
Borders                          84
Highland                         81
Dumfries and Galloway            79
Eileanan Siar (Western Isles)    63
Orkney                           63
Name: Health Board, dtype: int64

## Table of the number of cases per region over time

In [8]:
# Keep only the columns needed for the per-board case time series.
case_columns = ['Date', 'Health Board', 'Number of Positive cases']
df_cases = df[case_columns]

In [9]:
# Reshape to wide form: one row per date, one column per health board.
# Date/board combinations with no row are filled with 0, the values are cast to
# nullable Int32, and the outer column level created by the `values` list
# (the value-column name) is dropped so columns are just the board names.
df_table = (
    df_cases
    .pivot_table(
        index=['Date'],
        columns=['Health Board'],
        values=['Number of Positive cases'],
        fill_value=0,
    )
    .astype('Int32')
    .droplevel(0, axis=1)
)

In [10]:
# National total = row-wise sum over the health-board columns.
# An existing 'Scotland (Total)' column is excluded first, so re-running this
# cell does not add the previous total into the new one.
df_table['Scotland (Total)'] = (
    df_table.drop(columns='Scotland (Total)', errors='ignore').sum(axis=1)
)

In [11]:
# Persist the per-board case table (dates x health boards, plus 'Scotland (Total)') as a pickle.
df_table.to_pickle('corona_scotland_regions_ts.pk')

## table of national cases, tests, and deaths over time

In [12]:
# Build the national table: discard the per-board columns, then keep one row
# per date by taking the first value of each remaining column within that date.
national_columns = df.drop(columns=['Health Board', 'Number of Positive cases'])
df_national = national_columns.groupby('Date').first()

In [13]:
# Inspect the dates covered by the national table.
df_national.index

DatetimeIndex(['2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09',
               '2020-03-10', '2020-03-11', '2020-03-12', '2020-03-13',
               '2020-03-14', '2020-03-15', '2020-03-16', '2020-03-17',
               '2020-03-18', '2020-03-19', '2020-03-20', '2020-03-21',
               '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25',
               '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29',
               '2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02',
               '2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06',
               '2020-04-07', '2020-04-08', '2020-04-09', '2020-04-10',
               '2020-04-11', '2020-04-12', '2020-04-13', '2020-04-14',
               '2020-04-15', '2020-04-16', '2020-04-17', '2020-04-18',
               '2020-04-19', '2020-04-20', '2020-04-21', '2020-04-22',
               '2020-04-23', '2020-04-24', '2020-04-25', '2020-04-26',
               '2020-04-27', '2020-04-28', '2020-04-29', '2020-04-30',
      

In [14]:
# Persist the per-date national totals as a pickle.
df_national.to_pickle('corona_scotland_total_stats.pk')

## Scotland Population Data

In [15]:
# Population per health board, keyed by the same board names used in the case data.
# NOTE(review): the source and reference year of these figures are not recorded
# here - TODO add provenance.
region_pops = {
    'Ayrshire and Arran': 369670,
    'Borders': 115270, 
    'Dumfries and Galloway': 148790,
    'Fife': 371910,
    'Forth Valley': 306070,
    'Grampian': 584550,
    'Greater Glasgow and Clyde': 1196335,
    'Highland': 321800,
    'Lanarkshire': 563185,
    'Lothian': 897770,
    'Orkney': 22190,
    'Shetland': 22990,
    'Tayside': 416080,
    'Eileanan Siar (Western Isles)': 26500,
    # 'Scotland (Total)' (5438000) is deliberately left out of this dict; it is
    # appended to df_pop separately once the frame has been built.
}

In [16]:
# Turn the population dict into a one-column frame ('Pop') indexed by region
# name, with the regions in alphabetical order.
pop_records = pd.DataFrame(list(region_pops.items()), columns=['Region', 'Pop'])
df_pop = pop_records.set_index('Region').sort_index()

In [17]:
# Add the national population as an extra row labelled 'Scotland (Total)'.
# This happens after sort_index(), so the row is appended at the end rather
# than placed in alphabetical position.
# NOTE(review): source of the 5438000 figure is not recorded here - TODO confirm.
df_pop.loc['Scotland (Total)'] = 5438000

In [18]:
# Show the population table, now including the appended national row.
df_pop

Unnamed: 0_level_0,Pop
Region,Unnamed: 1_level_1
Ayrshire and Arran,369670
Borders,115270
Dumfries and Galloway,148790
Eileanan Siar (Western Isles),26500
Fife,371910
Forth Valley,306070
Grampian,584550
Greater Glasgow and Clyde,1196335
Highland,321800
Lanarkshire,563185


In [19]:
# Sanity check: sum the health-board populations, excluding the national row
# by label instead of relying on it being at position 14.
# The boards add up to 5,363,110, which is 74,890 below the national figure of
# 5,438,000. The gap is unexplained - TODO check the source/year of both sets
# of figures.
df_pop.drop(index='Scotland (Total)', errors='ignore').sum()

Pop    5363110
dtype: int64

In [20]:
# Persist the population table (health boards plus the 'Scotland (Total)' row) as a pickle.
df_pop.to_pickle('scotland_population.pk')

# Hospital Data

In [21]:
from datetime import datetime

In [22]:
# Load the second worksheet (sheet_name=1): hospital and ICU patient counts
# per health board and date. Only the four listed columns are read.
# '*' in the two count columns is treated as a missing value.
# NOTE(review): '*' presumably marks suppressed small counts; they end up as 0
# below, which may understate the true figures - confirm this is intended.
df = pd.read_excel(data_path,
                   usecols=['date', 'health_board',
                            'num_hospital', 'num_icu'],
                   dtype={
                       'date': datetime,
                       'health_board': 'str',
                       'num_hospital': 'Int32',
                       'num_icu': 'Int32'},
                   na_values={'num_hospital': '*',
                              'num_icu': '*'},
                   sheet_name=1,
                   )

# Forward-fill blank dates from the row above. The result is assigned back
# rather than calling ffill(inplace=True) on the column selection: that chained
# in-place form is deprecated and has no effect on `df` under Copy-on-Write.
df['date'] = df['date'].ffill()

# Replace every remaining missing value with 0.
df = df.fillna(0)

In [23]:
# Show the hospital/ICU frame after filling blank dates and missing counts.
df

Unnamed: 0,date,health_board,num_hospital,num_icu
0,2020-04-08,Ayrshire and Arran,91,16
1,2020-04-08,Borders,56,9
2,2020-04-08,Dumfries and Galloway,44,6
3,2020-04-08,Fife,127,13
4,2020-04-08,Forth Valley,107,9
...,...,...,...,...
834,2020-06-02,Orkney,0,0
835,2020-06-02,Shetland,0,0
836,2020-06-02,Tayside,27,0
837,2020-06-02,Eileanan Siar (Western Isles),0,0


In [24]:
# Split the frame into two long tables that share the date/board key columns:
# one for hospital patients and one for ICU patients.
key_columns = ['date', 'health_board']
df_hos = df[key_columns + ['num_hospital']]
df_icu = df[key_columns + ['num_icu']]

In [25]:
# Wide hospital table: one row per date, one column per health board.
# Missing date/board combinations become 0, values are cast to int32, and the
# outer column level created by the `values` list is dropped.
df_table = (
    df_hos
    .pivot_table(
        index=['date'],
        columns=['health_board'],
        values=['num_hospital'],
        fill_value=0,
    )
    .astype('int32')
    .droplevel(0, axis=1)
)

# National total = row-wise sum over all board columns.
national_total = df_table.sum(axis=1)
df_table['Scotland (Total)'] = national_total

In [26]:
# Spot-check the most recent rows of the hospital table.
df_table.tail()

health_board,Ayrshire and Arran,Borders,Dumfries and Galloway,Eileanan Siar (Western Isles),Fife,Forth Valley,Golden Jubilee National Hospital,Grampian,Greater Glasgow and Clyde,Highland,Lanarkshire,Lothian,Orkney,Shetland,Tayside,Scotland (Total)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-05-29,46,11,18,0,94,51,0,87,354,45,143,343,0,0,20,1212
2020-05-30,39,10,0,0,91,46,0,85,343,38,137,283,0,0,20,1092
2020-05-31,38,17,14,0,79,49,0,75,338,29,135,276,0,0,22,1072
2020-06-01,41,18,12,0,83,53,0,75,338,32,125,239,0,0,28,1044
2020-06-02,48,13,23,0,84,62,0,78,332,62,134,302,0,0,27,1165


In [27]:
# Persist the per-board hospital table (including 'Scotland (Total)') as a pickle.
df_table.to_pickle('corona_scotland_regions_hospital_ts.pk')

In [28]:
# Wide ICU table: one row per date, one column per health board.
# Missing date/board combinations become 0, values are cast to int32, and the
# outer column level created by the `values` list is dropped.
df_table = (
    df_icu
    .pivot_table(
        index=['date'],
        columns=['health_board'],
        values=['num_icu'],
        fill_value=0,
    )
    .astype('int32')
    .droplevel(0, axis=1)
)

# National total = row-wise sum over all board columns.
national_total = df_table.sum(axis=1)
df_table['Scotland (Total)'] = national_total

In [29]:
# Spot-check the most recent rows of the ICU table.
df_table.tail()

health_board,Ayrshire and Arran,Borders,Dumfries and Galloway,Eileanan Siar (Western Isles),Fife,Forth Valley,Golden Jubilee National Hospital,Grampian,Greater Glasgow and Clyde,Highland,Lanarkshire,Lothian,Orkney,Shetland,Tayside,Scotland (Total)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-05-29,0,0,0,0,0,0,0,5,9,0,8,9,0,0,0,31
2020-05-30,0,0,0,0,0,0,0,0,9,0,0,9,0,0,0,18
2020-05-31,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,7
2020-06-01,0,0,0,0,0,0,0,0,7,0,0,5,0,0,0,12
2020-06-02,0,0,0,0,5,0,0,0,7,0,7,5,0,0,0,24


In [30]:
# Persist the per-board ICU table (including 'Scotland (Total)') as a pickle.
df_table.to_pickle('corona_scotland_regions_icu_ts.pk')