In [2]:
import os
from pathlib import Path

import pandas as pd

In [3]:
from io import StringIO
import requests

# Toronto Open Data is stored in a CKAN instance. Its APIs are documented here:
# https://docs.ckan.org/en/latest/api/

# All API requests are made against this base URL:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Per-request timeout (seconds) so a stalled server cannot hang the kernel forever.
REQUEST_TIMEOUT_SECONDS = 60

# Datasets are called "packages". Each package can contain many "resources".
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = {"id": "covid-19-cases-in-toronto"}
response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()  # surface HTTP errors here instead of failing later on a bad payload
package = response.json()

# CKAN reports API-level failures through the "success" flag of the JSON body.
if not package.get("success", False):
    raise RuntimeError(f"CKAN package_show failed for {params['id']!r}: {package.get('error')}")

# Only datastore_active resources can be dumped as CSV.
datastore_resources = [
    resource
    for resource in package["result"]["resources"]
    if resource["datastore_active"]
]
if not datastore_resources:
    raise RuntimeError(f"Package {params['id']!r} has no datastore_active resources to download")

# The previous loop downloaded every active resource but overwrote `df` each time,
# so only the last one was ever kept. Fetch just that one to get the same result
# without the redundant downloads.
resource = datastore_resources[-1]

# To get all records in CSV format:
url = base_url + "/datastore/dump/" + resource["id"]
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
resource_dump_data = response.text
df = pd.read_csv(StringIO(resource_dump_data))

In [4]:
# Preview the first five downloaded records.
df.head(n=5)

Unnamed: 0,_id,Assigned_ID,Outbreak Associated,Age Group,Neighbourhood Name,FSA,Source of Infection,Classification,Episode Date,Reported Date,Client Gender,Outcome,Ever Hospitalized,Ever in ICU,Ever Intubated
0,1,1,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,2020-01-22,2020-01-23,FEMALE,RESOLVED,No,No,No
1,2,2,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,2020-01-21,2020-01-23,MALE,RESOLVED,Yes,No,No
2,3,3,Sporadic,20 to 29 Years,Parkwoods-Donalda,M3A,Travel,CONFIRMED,2020-02-05,2020-02-21,FEMALE,RESOLVED,No,No,No
3,4,4,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,2020-02-16,2020-02-25,FEMALE,RESOLVED,No,No,No
4,5,5,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,2020-02-20,2020-02-26,MALE,RESOLVED,No,No,No


In [7]:
# Persist the downloaded frame as a snappy-compressed Parquet file (pyarrow backend).
df.to_parquet(
    path='covid19.parquet',
    compression='snappy',
    engine='pyarrow',
)

In [8]:
# Read the Parquet file written above back into a new frame.
df_p = pd.read_parquet(
    path='covid19.parquet',
    engine='pyarrow',
)

In [12]:
# Print the column names, non-null counts and dtypes of the frame read back from Parquet.
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395231 entries, 0 to 395230
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   _id                  395231 non-null  int64 
 1   Assigned_ID          395231 non-null  int64 
 2   Outbreak Associated  395231 non-null  object
 3   Age Group            394809 non-null  object
 4   Neighbourhood Name   382427 non-null  object
 5   FSA                  387639 non-null  object
 6   Source of Infection  395231 non-null  object
 7   Classification       395231 non-null  object
 8   Episode Date         395231 non-null  object
 9   Reported Date        395231 non-null  object
 10  Client Gender        395231 non-null  object
 11  Outcome              395231 non-null  object
 12  Ever Hospitalized    395231 non-null  object
 13  Ever in ICU          395231 non-null  object
 14  Ever Intubated       395231 non-null  object
dtypes: int64(2), object(13)
memory usa

In [18]:
# Load the dated Parquet file of Toronto COVID-19 cases.
# The directory defaults to the location used when this notebook was written, but can be
# overridden with the COVID19_DATA_DIR environment variable so the notebook is not tied
# to one machine's home directory.
DATA_DIR = Path(os.environ.get('COVID19_DATA_DIR', '/Users/pats/Desktop/covid-19-toronto/data'))
df_test = pd.read_parquet(DATA_DIR / '2023-04-04-covid19-toronto.parquet', engine='pyarrow')
df_test.head()

Unnamed: 0,_id,assigned_id,outbreak_associated,age_group,neighbourhood_name,fsa,source_of_infection,classification,episode_date,reported_date,client_gender,outcome,ever_hospitalized,ever_in_icu,ever_intubated
0,1,1,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,2020-01-22,2020-01-23,FEMALE,RESOLVED,No,No,No
1,2,2,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,2020-01-21,2020-01-23,MALE,RESOLVED,Yes,No,No
2,3,3,Sporadic,20 to 29 Years,Parkwoods-Donalda,M3A,Travel,CONFIRMED,2020-02-05,2020-02-21,FEMALE,RESOLVED,No,No,No
3,4,4,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,2020-02-16,2020-02-25,FEMALE,RESOLVED,No,No,No
4,5,5,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,2020-02-20,2020-02-26,MALE,RESOLVED,No,No,No


In [19]:
# Count cases per age group, keeping missing values as their own row (dropna=False).
# The earlier bare value_counts() call was dropped: its result was never displayed
# because only the last expression of a cell is rendered.
df_test['age_group'].value_counts(dropna=False)

age_group
20 to 29 Years    76749
30 to 39 Years    72323
40 to 49 Years    56054
19 and younger    54390
50 to 59 Years    51411
60 to 69 Years    33543
70 to 79 Years    20857
80 to 89 Years    18533
90 and older      10949
None                422
Name: count, dtype: int64

In [20]:
# Count cases per neighbourhood; dropna=False keeps rows with no neighbourhood
# as their own bucket instead of silently excluding them.
df_test.loc[:, 'neighbourhood_name'].value_counts(dropna=False)

neighbourhood_name
None                                 12804
Waterfront Communities-The Island     8869
Woburn                                8433
Downsview-Roding-CFB                  7424
West Humber-Clairville                7097
                                     ...  
Kingsway South                        1017
Woodbine-Lumsden                       985
Blake-Jones                            967
Runnymede-Bloor West Village           966
Lambton Baby Point                     811
Name: count, Length: 141, dtype: int64

In [21]:
# Count cases per `fsa` value, with missing values counted as their own bucket.
# NOTE(review): the saved output lists several codes that occur exactly once
# (e.g. M4I, M0H) — possibly data-entry errors; verify before aggregating on this column.
df_test.loc[:, 'fsa'].value_counts(dropna=False)

fsa
M9V     11544
M1B     10563
M3N      8152
M9W      7863
None     7592
        ...  
M4I         1
M5U         1
M0H         1
M9O         1
M8J         1
Name: count, Length: 193, dtype: int64