# 1.
### Downloading and cleaning up data

In [1]:
import pandas as pd

#### download Covid Case data from York Region website

In [2]:
# York Region's COVID-19 case data is served as a CSV; pandas reads it directly from the URL.
data_url = 'https://ww4.yorkmaps.ca/COVID19/Data/YR_CaseData.csv'
york_covid = pd.read_csv(filepath_or_buffer=data_url)

In [8]:
# Show the dtype pandas assigned to each column of the frame.
york_covid.dtypes

Case Count                  int64
Gender                     object
Age Decade                 object
Municipality               object
Community                  object
Date Reported              object
Estimated Date of Onset    object
Acquisition                object
Status                     object
dtype: object

### convert 'Date Reported' and 'Estimated Date of Onset' to datetime64 datatype

In [3]:
# Both date columns are parsed the same way, so handle them in one loop
# and write each parsed column back over its original.
for date_col in ['Date Reported', 'Estimated Date of Onset']:
    york_covid[date_col] = pd.to_datetime(york_covid[date_col])

In [4]:
# Display the column dtypes again to confirm the two date columns are now datetime64.
york_covid.dtypes

Case Count                          int64
Gender                             object
Age Decade                         object
Municipality                       object
Community                          object
Date Reported              datetime64[ns]
Estimated Date of Onset    datetime64[ns]
Acquisition                        object
Status                             object
dtype: object

### remove spaces from column headers and rename 'EstimatedDateofOnset'

In [5]:
# Drop every space from each column header (e.g. 'Case Count' -> 'CaseCount').
york_covid.columns = [header.replace(' ', '') for header in york_covid.columns]

In [6]:
# Preview the frame with 'EstimatedDateofOnset' shown as 'EstimatedOnsetDate'.
# NOTE: rename() returns a new DataFrame and the result is not assigned back, so
# york_covid itself keeps the 'EstimatedDateofOnset' column name, which the later
# category-conversion cell still refers to.
york_covid.rename({'EstimatedDateofOnset': 'EstimatedOnsetDate'}, axis='columns')

Unnamed: 0,CaseCount,Gender,AgeDecade,Municipality,Community,DateReported,EstimatedOnsetDate,Acquisition,Status
0,24413,Female,30s,Richmond Hill,Richmond Hill,2020-02-29,2020-02-27,Travel,Resolved
1,24414,Female,60s,Richmond Hill,Richmond Hill,2020-03-02,2020-02-29,Travel,Resolved
2,24415,Female,70s,Richmond Hill,Richmond Hill,2020-03-03,2020-02-29,Travel,Resolved
3,24416,Male,50s,Vaughan,Maple,2020-03-04,2020-02-27,Travel,Resolved
4,24417,Male,40s,Richmond Hill,Richmond Hill,2020-03-04,2020-02-28,Travel,Resolved
...,...,...,...,...,...,...,...,...,...
24340,48753,Male,Under 20,Richmond Hill,Richmond Hill,2021-01-23,2021-01-21,Close Contact,Self-Isolating
24341,48754,Male,20s,Georgina,Georgina,2021-01-23,2021-01-21,Close Contact,Self-Isolating
24342,48755,Female,Under 20,Markham,Markham,2021-01-23,2021-01-22,Local Transmission,Self-Isolating
24343,48756,Male,30s,Vaughan,Maple,2021-01-23,2021-01-21,Under Investigation,Self-Isolating


### reduce memory usage

In [7]:
# Summarise columns, non-null counts and dtypes; memory_usage='deep' asks pandas to
# measure the real size of object (string) columns rather than estimate it.
york_covid.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24345 entries, 0 to 24344
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   CaseCount             24345 non-null  int64         
 1   Gender                24345 non-null  object        
 2   AgeDecade             24345 non-null  object        
 3   Municipality          24345 non-null  object        
 4   Community             24345 non-null  object        
 5   DateReported          24345 non-null  datetime64[ns]
 6   EstimatedDateofOnset  24345 non-null  datetime64[ns]
 7   Acquisition           24345 non-null  object        
 8   Status                24345 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(6)
memory usage: 9.7 MB


#### reduce memory usage by converting every column except 'CaseCount' to the category datatype

In [8]:
# Every column except 'CaseCount' is stored as a pandas categorical.
# The columns are converted one at a time, in the same order as before.
for column in [
    'Gender',
    'AgeDecade',
    'Municipality',
    'Community',
    'DateReported',
    'EstimatedDateofOnset',
    'Acquisition',
    'Status',
]:
    york_covid[column] = york_covid[column].astype('category')

In [9]:
# Re-run the deep memory report to see the effect of the category conversion.
york_covid.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24345 entries, 0 to 24344
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   CaseCount             24345 non-null  int64   
 1   Gender                24345 non-null  category
 2   AgeDecade             24345 non-null  category
 3   Municipality          24345 non-null  category
 4   Community             24345 non-null  category
 5   DateReported          24345 non-null  category
 6   EstimatedDateofOnset  24345 non-null  category
 7   Acquisition           24345 non-null  category
 8   Status                24345 non-null  category
dtypes: category(8), int64(1)
memory usage: 458.4 KB


In [10]:
# Preview the first rows (head() defaults to five) to check the data still reads correctly.
york_covid.head()

Unnamed: 0,CaseCount,Gender,AgeDecade,Municipality,Community,DateReported,EstimatedDateofOnset,Acquisition,Status
0,24413,Female,30s,Richmond Hill,Richmond Hill,2020-02-29,2020-02-27,Travel,Resolved
1,24414,Female,60s,Richmond Hill,Richmond Hill,2020-03-02,2020-02-29,Travel,Resolved
2,24415,Female,70s,Richmond Hill,Richmond Hill,2020-03-03,2020-02-29,Travel,Resolved
3,24416,Male,50s,Vaughan,Maple,2020-03-04,2020-02-27,Travel,Resolved
4,24417,Male,40s,Richmond Hill,Richmond Hill,2020-03-04,2020-02-28,Travel,Resolved


### export dataframe to excel file

In [11]:
# Export the cleaned frame to an Excel workbook in the current working directory.
# The file is written as .xlsx rather than legacy .xls: writing .xls relies on the
# xlwt engine, which pandas deprecated in 1.2 and removed in 2.0, so the old
# 'yrCovidCases.xls' target raises on current pandas. The row index is still
# written as the first column (to_excel's default).
york_covid.to_excel('yrCovidCases.xlsx')