# Cleaning Covid cases for Ottawa and Vancouver

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

### Cleaning data for Ottawa

In [2]:
#Read data from the Ottawa Data file
dfOttawa = pd.read_csv("../raw_data/COVID-19_Cases_and_Deaths_in_Ottawa.csv")
dfOttawa

Unnamed: 0,Date,Cumulative_Cases_by_Episode_Date,Cumulative_Resolved_Cases_by_Episode_Date,Total_Active_Cases_by_Date,Cumulative_Deaths_by_Date_of_Death,Daily_Cases_by_Reported_Date,7-day_Average_of_Newly_Reported_cases_by_Reported_Date,Daily_Cases_by_Episode_Date,Daily_Cases_Linked_to_a_Community_Outbreak_by_Episode_Date,Daily_Cases_Linked_to_a_School_or_Childcare_Outbreak_by_Episode_Date,...,"Cumulative_Rate_for_90_Years_and_Over_per_100,000_pop_by_Episode_Date","Cumulative_Rate_for_Males_per_100,000_pop_by_Episode_Date","Cumulative_Rate_for_Females_per_100,000_pop_by_Episode_Date",Source_of_Infection_is_a_Close_Contact_by_Episode_Date,Source_of_Infection_is_an_Outbreak_by_Episode_Date,Source_of_Infection_is_Travel_by_Episode_Date,Number_of_Cases_with_Missing_Information_for_Source_of_Infection_by_Episode_Date,Number_of_Cases_with_No_Known_Epidemiological_Link_by_Episode_Date,%_No_Known_Epidemiological_Link_by_Episode_Date_-_14_Day_Average,OBJECTID
0,2020/02/10,1,,1,,,0.0,1.0,0,0,...,,,0.2,0,0,1,0,0,0.0,0
1,2020/02/11,1,,1,,,0.0,,0,0,...,,,0.2,0,0,0,0,0,0.0,1
2,2020/02/12,1,,1,,,0.0,,0,0,...,,,0.2,0,0,0,0,0,0.0,2
3,2020/02/13,1,,1,,,0.0,,0,0,...,,,0.2,0,0,0,0,0,0.0,3
4,2020/02/14,1,,1,,,0.0,,0,0,...,,,0.2,0,0,0,0,0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,2021/06/12,27475,26486.0,404,585.0,31.0,27.0,14.0,0,0,...,5877.0,2599.5,2604.0,6,1,0,1,6,27.8,488
489,2021/06/13,27487,26517.0,385,585.0,18.0,26.4,12.0,0,1,...,5877.0,2600.3,2605.5,7,1,0,3,1,27.1,489
490,2021/06/14,27499,26571.0,342,586.0,22.0,27.1,12.0,1,0,...,5877.0,2601.7,2606.4,4,2,1,4,1,26.5,490
491,2021/06/15,27513,26611.0,316,586.0,22.0,26.1,14.0,0,0,...,5877.0,2603.8,2606.9,2,0,0,11,1,24.8,491


In [3]:
#Clean up the original dataset to contain only required columns and add/split date column for Ottawa data
OttawaAnalysisDf = dfOttawa[['Date', 'Total_Active_Cases_by_Date', 'Daily_Cases_by_Reported_Date']].copy()
OttawaAnalysisDf[["Year", "Month", "Day"]] = OttawaAnalysisDf["Date"].str.split("/", expand = True)
OttawaAnalysisDf.loc[:,'Daily_Cases_by_Reported_Date'] =  OttawaAnalysisDf['Daily_Cases_by_Reported_Date'].fillna(0)
OttawaAnalysisDf['Date'] = pd.to_datetime(OttawaAnalysisDf['Date'])
OttawaAnalysisDf['Weekday'] = OttawaAnalysisDf['Date'].dt.day_name()
OttawaAnalysisDf

Unnamed: 0,Date,Total_Active_Cases_by_Date,Daily_Cases_by_Reported_Date,Year,Month,Day,Weekday
0,2020-02-10,1,0.0,2020,02,10,Monday
1,2020-02-11,1,0.0,2020,02,11,Tuesday
2,2020-02-12,1,0.0,2020,02,12,Wednesday
3,2020-02-13,1,0.0,2020,02,13,Thursday
4,2020-02-14,1,0.0,2020,02,14,Friday
...,...,...,...,...,...,...,...
488,2021-06-12,404,31.0,2021,06,12,Saturday
489,2021-06-13,385,18.0,2021,06,13,Sunday
490,2021-06-14,342,22.0,2021,06,14,Monday
491,2021-06-15,316,22.0,2021,06,15,Tuesday


In [4]:
#Check for null values
OttawaAnalysisDf.isnull().sum(axis = 0)

Date                            0
Total_Active_Cases_by_Date      0
Daily_Cases_by_Reported_Date    0
Year                            0
Month                           0
Day                             0
Weekday                         0
dtype: int64

### Cleaning data for Vancouver

In [5]:
#Read data from british Columbia
dfBC = pd.read_csv("../raw_data/BCCDC_COVID19_Regional_Summary_Data.csv")
dfBC

Unnamed: 0,Date,Province,HA,HSDA,Cases_Reported,Cases_Reported_Smoothed
0,2020-01-29,BC,All,All,0,0.00
1,2020-01-29,BC,Fraser,All,0,0.00
2,2020-01-29,BC,Fraser,Fraser East,0,0.00
3,2020-01-29,BC,Fraser,Fraser North,0,0.00
4,2020-01-29,BC,Fraser,Fraser South,0,0.00
...,...,...,...,...,...,...
12645,2021-06-17,BC,Vancouver Island,All,4,4.14
12646,2021-06-17,BC,Vancouver Island,Central Vancouver Island,1,0.57
12647,2021-06-17,BC,Vancouver Island,North Vancouver Island,0,0.43
12648,2021-06-17,BC,Vancouver Island,South Vancouver Island,3,3.14


In [6]:
#Check the column values
dfBC['HSDA'].unique()

array(['All', 'Fraser East', 'Fraser North', 'Fraser South', 'Unknown',
       'East Kootenay', 'Kootenay Boundary', 'Okanagan',
       'Thompson Cariboo Shuswap', 'Northeast', 'Northern Interior',
       'Northwest', 'North Shore/Coast Garibaldi', 'Richmond',
       'Vancouver', 'Central Vancouver Island', 'North Vancouver Island',
       'South Vancouver Island', 'Out of Canada'], dtype=object)

In [7]:
#Filter the rows only relevant to Vancouver
VancouverAnalysisDF = dfBC.loc[dfBC['HSDA'].str.contains('Vancouver'), :].copy()

#split the dates
VancouverAnalysisDF[["Year", "Month", "Day"]] = VancouverAnalysisDF["Date"].str.split("-", expand = True)
VancouverAnalysisDF

Unnamed: 0,Date,Province,HA,HSDA,Cases_Reported,Cases_Reported_Smoothed,Year,Month,Day
19,2020-01-29,BC,Vancouver Coastal,Vancouver,0,0.00,2020,01,29
21,2020-01-29,BC,Vancouver Island,Central Vancouver Island,0,0.00,2020,01,29
22,2020-01-29,BC,Vancouver Island,North Vancouver Island,0,0.00,2020,01,29
23,2020-01-29,BC,Vancouver Island,South Vancouver Island,0,0.00,2020,01,29
44,2020-01-30,BC,Vancouver Coastal,Vancouver,0,0.00,2020,01,30
...,...,...,...,...,...,...,...,...,...
12623,2021-06-16,BC,Vancouver Island,South Vancouver Island,2,3.71,2021,06,16
12644,2021-06-17,BC,Vancouver Coastal,Vancouver,10,13.00,2021,06,17
12646,2021-06-17,BC,Vancouver Island,Central Vancouver Island,1,0.57,2021,06,17
12647,2021-06-17,BC,Vancouver Island,North Vancouver Island,0,0.43,2021,06,17


In [8]:
#get the day of the week column
VancouverAnalysisDF['Date'] = pd.to_datetime(VancouverAnalysisDF['Date'])
VancouverAnalysisDF["Day of Week"] = VancouverAnalysisDF['Date'].dt.day_name()
VancouverAnalysisDF

Unnamed: 0,Date,Province,HA,HSDA,Cases_Reported,Cases_Reported_Smoothed,Year,Month,Day,Day of Week
19,2020-01-29,BC,Vancouver Coastal,Vancouver,0,0.00,2020,01,29,Wednesday
21,2020-01-29,BC,Vancouver Island,Central Vancouver Island,0,0.00,2020,01,29,Wednesday
22,2020-01-29,BC,Vancouver Island,North Vancouver Island,0,0.00,2020,01,29,Wednesday
23,2020-01-29,BC,Vancouver Island,South Vancouver Island,0,0.00,2020,01,29,Wednesday
44,2020-01-30,BC,Vancouver Coastal,Vancouver,0,0.00,2020,01,30,Thursday
...,...,...,...,...,...,...,...,...,...,...
12623,2021-06-16,BC,Vancouver Island,South Vancouver Island,2,3.71,2021,06,16,Wednesday
12644,2021-06-17,BC,Vancouver Coastal,Vancouver,10,13.00,2021,06,17,Thursday
12646,2021-06-17,BC,Vancouver Island,Central Vancouver Island,1,0.57,2021,06,17,Thursday
12647,2021-06-17,BC,Vancouver Island,North Vancouver Island,0,0.43,2021,06,17,Thursday


In [9]:
VancouverAnalysisDF.dtypes

Date                       datetime64[ns]
Province                           object
HA                                 object
HSDA                               object
Cases_Reported                      int64
Cases_Reported_Smoothed           float64
Year                               object
Month                              object
Day                                object
Day of Week                        object
dtype: object

In [10]:
#Check for null values
VancouverAnalysisDF.isnull().sum(axis = 0)

Date                       0
Province                   0
HA                         0
HSDA                       0
Cases_Reported             0
Cases_Reported_Smoothed    0
Year                       0
Month                      0
Day                        0
Day of Week                0
dtype: int64

In [12]:
#export clean data to csv
OttawaAnalysisDf.to_csv("../data_sets/Ottawa_CovidCases.csv", index=False)
VancouverAnalysisDF.to_csv("../data_sets/Vancouver_CoivdCases.csv", index=False)