#### create BigQuery Dataset "datamart"

In [1]:
# Create the BigQuery dataset "datamart" with location US using the bq command-line tool.
!bq --location=US mk --dataset datamart

Dataset 'lunar-analyzer-302702:datamart' successfully created.


### Transform the COVID-19 Dataset

#### create preliminary covid table

In [2]:
%%bigquery
-- Build the preliminary covid table by stacking the six seasonal staging tables.
-- The staging tables do not share one schema: from summer 2020 onward the coordinates
-- are named lat / long_ and last_update is cast to STRING; summer 2020 additionally
-- spells the rate column incidence_rate. Everything is aligned to the first branch's
-- column names inside the CTE.
-- id is created as a NULL placeholder here and is filled in by later UPDATE statements.
CREATE OR REPLACE TABLE datamart.covid AS
WITH all_seasons AS (
    SELECT
        fips, admin2, province_state, country_region,
        last_update,
        latitude, longitude,
        confirmed, deaths, recovered, active, combined_key,
        incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_winter2020

    UNION ALL

    SELECT
        fips, admin2, province_state, country_region,
        last_update,
        latitude, longitude,
        confirmed, deaths, recovered, active, combined_key,
        incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_spring2020

    UNION ALL

    SELECT
        fips, admin2, province_state, country_region,
        CAST(last_update AS STRING) AS last_update,
        lat AS latitude, long_ AS longitude,
        confirmed, deaths, recovered, active, combined_key,
        incidence_rate AS incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_summer2020

    UNION ALL

    SELECT
        fips, admin2, province_state, country_region,
        CAST(last_update AS STRING) AS last_update,
        lat AS latitude, long_ AS longitude,
        confirmed, deaths, recovered, active, combined_key,
        incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_fall2020

    UNION ALL

    SELECT
        fips, admin2, province_state, country_region,
        CAST(last_update AS STRING) AS last_update,
        lat AS latitude, long_ AS longitude,
        confirmed, deaths, recovered, active, combined_key,
        incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_winter2021

    UNION ALL

    SELECT
        fips, admin2, province_state, country_region,
        CAST(last_update AS STRING) AS last_update,
        lat AS latitude, long_ AS longitude,
        confirmed, deaths, recovered, active, combined_key,
        incident_rate,
        case_fatality_ratio
    FROM jhu_daily_reports_staging.covid_spring2021
)
SELECT
    NULL AS id,
    fips,
    admin2,
    province_state,
    country_region AS Country,
    last_update,
    latitude,
    longitude,
    confirmed,
    deaths,
    recovered,
    active,
    combined_key,
    incident_rate,
    case_fatality_ratio
FROM all_seasons

#### compute fingerprint of country for preliminary covid table

In [3]:
%%bigquery
-- Rows that carry no province_state are keyed by the fingerprint of the country alone.
UPDATE datamart.covid AS c
SET id = FARM_FINGERPRINT(c.country)
WHERE c.province_state IS NULL

#### compute fingerprint of province_state + country for preliminary covid table

In [4]:
%%bigquery
-- Rows that have a province_state are keyed by the fingerprint of
-- province_state immediately followed by country (no separator between them).
-- NOTE(review): admin2 is not part of the key, so rows that differ only in admin2
-- receive the same id -- confirm this is intended.
UPDATE datamart.covid AS c
SET id = FARM_FINGERPRINT(c.province_state || c.country)
WHERE c.province_state IS NOT NULL

#### check that there is no null id field in the preliminary covid table

In [5]:
%%bigquery
-- Sanity check: after both UPDATE passes no row should be left without an id.
SELECT
    COUNTIF(id IS NULL) AS null_id_count
FROM datamart.covid

Unnamed: 0,null_id_count
0,0


#### sample preliminary covid table

In [6]:
%%bigquery
-- Peek at the first five rows of the preliminary table, ordered by province_state then country.
SELECT
    id,
    fips,
    admin2,
    province_state,
    Country,
    last_update,
    latitude,
    longitude,
    confirmed,
    deaths,
    recovered,
    active,
    combined_key,
    incident_rate,
    case_fatality_ratio
FROM datamart.covid
ORDER BY
    province_state,
    country
LIMIT 5

Unnamed: 0,id,fips,admin2,province_state,Country,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key,incident_rate,case_fatality_ratio
0,8576431891811451300,,,,Azerbaijan,2020-02-28 15:03:26,,,1,0,0,,,,
1,8778414404485170876,,,,Afghanistan,2020-07-11 04:34:26+00,33.93911,67.709953,34184,973,20882,12341.0,Afghanistan,87.838318,2.839679
2,8778414404485170876,,,,Afghanistan,5/31/20 2:32,33.93911,67.709953,14528,250,1303,12973.0,Afghanistan,37.312147,1.714286
3,8778414404485170876,,,,Afghanistan,2021-04-09 04:21:13+00,33.93911,67.709953,56943,2516,51956,2471.0,Afghanistan,146.276462,4.418454
4,8778414404485170876,,,,Afghanistan,2020-11-23 05:25:26+00,33.93911,67.709953,44706,1690,35934,7085.0,Afghanistan,114.841781,3.773543


#### create locations_raw table

In [8]:
%%bigquery
-- Extract the location attributes from the covid table, one row per distinct
-- combination of the eight columns below.
CREATE OR REPLACE TABLE datamart.Locations_raw AS
SELECT DISTINCT
    id,
    fips,
    admin2,
    province_state,
    country,
    latitude,
    longitude,
    combined_key
FROM datamart.covid

#### show rows that have city,state in province_state column of the locations_raw table

In [10]:
%%bigquery
-- List a few locations whose province_state contains a comma (e.g. "City, ST"),
-- together with the 1-based position of the first comma.
SELECT
    id,
    fips,
    admin2,
    province_state,
    strpos(province_state, ',') AS index,
    country,
    latitude,
    longitude,
    combined_key
FROM datamart.Locations_raw
WHERE strpos(province_state, ',') > 0
LIMIT 10

Unnamed: 0,id,fips,admin2,province_state,index,country,latitude,longitude,combined_key
0,8573415270917882368,,,"Los Angeles, CA",12,US,34.0522,-118.2437,
1,8573415270917882368,,,"Los Angeles, CA",12,US,,,
2,-2512886450838328062,,,"Lackland, TX",9,US,,,
3,-1661025486387849213,,,"Wake County, NC",12,US,35.8032,-78.5661,
4,1889376381529737478,,,"Norfolk County, MA",15,US,42.1767,-71.1449,
5,8276317383438951174,,,"Kershaw County, SC",15,US,34.3672,-80.5883,
6,-384057258140346105,,,"Orange County, CA",14,US,33.7879,-117.8531,
7,-5615829672343180281,,,"Shasta County, CA",14,US,40.7909,-121.8474,
8,2595497680336258825,,,"Pinal County, AZ",13,US,32.8162,-111.2845,
9,37861086087321354,,,"Norwell County, MA",15,US,42.1615,-70.7928,


#### standardize city, state in the locations_raw table into table locations_city

In [12]:
%%bigquery
-- Split "City, State" values found in province_state into separate City and State columns.
-- Only rows whose province_state contains a comma are selected, so OFFSET(1) always exists.
-- Both parts are trimmed: splitting "Los Angeles, CA" on ',' yields " CA" with a leading
-- space, which would otherwise be stored in State.
CREATE OR REPLACE TABLE datamart.Locations_city AS
SELECT
    id,
    fips,
    admin2,
    TRIM(SPLIT(province_state, ',')[OFFSET(0)]) AS City,
    TRIM(SPLIT(province_state, ',')[OFFSET(1)]) AS State,
    country,
    latitude,
    longitude,
    combined_key
FROM datamart.Locations_raw
WHERE strpos(province_state, ',') > 0

#### create locations table (if error, try CAST(null AS string) AS City)

In [14]:
%%bigquery
-- Final Locations table: the split "City, State" rows from Locations_city plus every
-- other Locations_raw row. The latter are rows whose province_state either has no comma
-- or is NULL; they get a NULL City and keep province_state as State.
CREATE OR REPLACE TABLE datamart.Locations AS
SELECT
    id, fips, admin2,
    city,
    state,
    country, latitude, longitude, combined_key
FROM datamart.Locations_city

UNION ALL

SELECT
    id, fips, admin2,
    NULL AS City,
    province_state AS State,
    country, latitude, longitude, combined_key
FROM datamart.Locations_raw
WHERE province_state IS NULL
   OR strpos(province_state, ',') = 0

#### ensure no rows were lost

In [15]:
%%bigquery
-- Row counts of Locations_raw and Locations side by side; the two should match.
SELECT
    (SELECT COUNT(*) FROM datamart.Locations_raw) AS locations_raw_count,
    (SELECT COUNT(*) FROM datamart.Locations) AS locations_count

Unnamed: 0,locations_raw_count,locations_count
0,5637,5637


#### create Cases_raw table from preliminary covid table

In [20]:
%%bigquery
-- Extract the per-report case measures from the covid table; the covid id becomes
-- location_id. last_update is carried over unchanged.
CREATE OR REPLACE TABLE datamart.Cases_raw AS
SELECT
    c.id AS location_id,
    c.last_update,
    c.confirmed,
    c.deaths,
    c.recovered,
    c.active,
    c.incident_rate,
    c.case_fatality_ratio
FROM datamart.covid AS c

#### sample distinct dates in cases_raw table

In [26]:
%%bigquery
-- Ten arbitrary distinct last_update values (no ordering requested).
SELECT last_update
FROM datamart.Cases_raw
GROUP BY last_update
LIMIT 10

Unnamed: 0,last_update
0,2020-07-03 04:33:54+00
1,2020-08-12 04:27:29+00
2,2020-10-05 04:23:33+00
3,2021-03-17 05:25:07+00
4,2020-12-06 05:26:18+00
5,2020-12-09 05:28:01+00
6,2020-12-18 05:28:18+00
7,2021-01-14 05:22:32+00
8,2020-11-08 05:24:56+00
9,2021-04-11 04:20:56+00


In [27]:
%%bigquery
-- The ten smallest distinct last_update values.
-- last_update is still text at this point, so the ordering is lexical, not chronological.
SELECT last_update
FROM datamart.Cases_raw
GROUP BY last_update
ORDER BY last_update
LIMIT 10

Unnamed: 0,last_update
0,1/22/20 17:00
1,1/23/20 17:00
2,1/24/20 17:00
3,1/25/20 17:00
4,1/26/20 16:00
5,1/27/20 23:59
6,1/28/20 23:00
7,1/29/20 19:30
8,1/30/20 16:00
9,1/31/20 10:37


#### create Cases_hyphen_date_fixed

In [61]:
%%bigquery
-- Sample last_update values in the hyphenated form without a '+' offset:
-- they contain '-' and a space but no '+'.
SELECT last_update
FROM datamart.Cases_raw
WHERE strpos(last_update, '+') = 0
  AND strpos(last_update, '-') > 0
  AND strpos(last_update, ' ') > 0
LIMIT 15

Unnamed: 0,last_update
0,2020-02-09 23:33:02
1,2020-03-21 20:43:02
2,2020-03-03 20:03:06
3,2020-03-04 12:33:03
4,2020-03-02 20:23:16
5,2020-03-16 20:13:11
6,2020-03-19 20:43:02
7,2020-03-15 18:20:18
8,2020-03-07 17:33:03
9,2020-02-14 23:33:02


In [62]:
%%bigquery
-- Convert last_update to DATETIME for rows in the hyphenated form without a '+' offset
-- (contains '-' and a space, no '+'); a direct CAST is used for these strings.
CREATE OR REPLACE TABLE datamart.Cases_hyphen_date_fixed AS
SELECT
    location_id,
    CAST(last_update AS DATETIME) AS last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.Cases_raw
WHERE strpos(last_update, '+') = 0
  AND strpos(last_update, '-') > 0
  AND strpos(last_update, ' ') > 0

#### create Cases_plus_date_fixed

In [67]:
%%bigquery
-- Sample last_update values in the hyphenated form that also carry a '+' offset suffix.
SELECT last_update
FROM datamart.Cases_raw
WHERE strpos(last_update, '+') > 0
  AND strpos(last_update, '-') > 0
  AND strpos(last_update, ' ') > 0
LIMIT 15

Unnamed: 0,last_update
0,2020-07-03 04:33:54+00
1,2020-08-12 04:27:29+00
2,2020-10-05 04:23:33+00
3,2021-03-17 05:25:07+00
4,2021-03-17 05:25:07+00
5,2020-12-06 05:26:18+00
6,2020-12-09 05:28:01+00
7,2020-12-18 05:28:18+00
8,2021-01-14 05:22:32+00
9,2020-11-08 05:24:56+00


In [64]:
%%bigquery
-- Convert last_update to DATETIME for rows whose text carries a '+' offset.
-- The string is first cast to TIMESTAMP and the result is then cast to DATETIME.
CREATE OR REPLACE TABLE datamart.Cases_plus_date_fixed AS
SELECT
    location_id,
    CAST(CAST(last_update AS TIMESTAMP) AS DATETIME) AS last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.Cases_raw
WHERE strpos(last_update, '+') > 0
  AND strpos(last_update, '-') > 0
  AND strpos(last_update, ' ') > 0

#### create Cases_slash_date

In [71]:
%%bigquery
-- Sample distinct last_update values written with '/' separators.
SELECT last_update
FROM datamart.Cases_raw
WHERE strpos(last_update, '/') > 0
GROUP BY last_update
LIMIT 15

Unnamed: 0,last_update
0,5/31/20 2:32
1,5/30/20 2:32
2,6/1/20 2:32
3,5/11/20 2:32
4,5/17/20 2:32
5,5/29/20 2:32
6,5/21/20 2:32
7,5/23/20 2:32
8,5/8/20 2:32
9,4/28/20 2:30


In [75]:
%%bigquery
-- Set aside, unchanged, every Cases_raw row whose last_update contains a '/'.
CREATE OR REPLACE TABLE datamart.Cases_slash_date AS
SELECT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.Cases_raw
WHERE strpos(last_update, '/') > 0

#### check Cases_slash_date for their year format

In [78]:
%%bigquery
-- For a few distinct slash-style dates, show how many characters the year part has.
-- The date is the text before the first space; the year is its third '/'-separated piece.
WITH distinct_updates AS (
    SELECT DISTINCT last_update
    FROM datamart.Cases_slash_date
)
SELECT
    last_update,
    length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) AS year_num_digits
FROM distinct_updates
LIMIT 10

Unnamed: 0,last_update,year_num_digits
0,2/1/20 19:53,2
1,6/1/20 2:32,2
2,5/29/20 2:32,2
3,5/30/20 2:32,2
4,5/31/20 2:32,2
5,5/15/20 2:33,2
6,5/16/20 2:32,2
7,5/17/20 2:32,2
8,5/18/20 2:32,2
9,5/19/20 2:32,2


#### create Cases_slash_date_fixed

In [79]:
%%bigquery
-- Parse slash-style last_update strings into DATETIME.
-- The parse format depends on the width of the year part: %y for two characters,
-- %Y for four. Rows whose year part has any other width match neither branch and
-- are not carried over.
CREATE OR REPLACE TABLE datamart.Cases_slash_date_fixed AS
WITH slash_rows AS (
    SELECT
        location_id,
        last_update,
        confirmed,
        deaths,
        recovered,
        active,
        incident_rate,
        case_fatality_ratio,
        -- year = third '/'-separated piece of the text before the first space
        length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) AS year_digits
    FROM datamart.Cases_slash_date
)
SELECT
    location_id,
    PARSE_DATETIME('%m/%d/%y %H:%M', last_update) AS Last_Update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM slash_rows
WHERE year_digits = 2

UNION ALL

SELECT
    location_id,
    PARSE_DATETIME('%m/%d/%Y %H:%M', last_update) AS Last_Update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM slash_rows
WHERE year_digits = 4

#### sample Cases_slash_date_fixed for fixed dates

In [80]:
%%bigquery
-- The ten earliest distinct parsed timestamps in Cases_slash_date_fixed.
SELECT DISTINCT last_update
FROM datamart.Cases_slash_date_fixed
ORDER BY last_update
LIMIT 10

Unnamed: 0,last_update
0,2020-01-22 17:00:00
1,2020-01-23 17:00:00
2,2020-01-24 17:00:00
3,2020-01-25 17:00:00
4,2020-01-26 16:00:00
5,2020-01-27 23:59:00
6,2020-01-28 23:00:00
7,2020-01-29 19:30:00
8,2020-01-30 16:00:00
9,2020-01-31 08:15:00


#### create Cases table

In [84]:
%%bigquery
-- Assemble the Cases table from the three date-normalised tables.
--
-- The filters that feed those tables are not exhaustive: a Cases_raw row whose
-- last_update is NULL, or that contains no '/' and is not of the "'-' plus space" form,
-- lands in none of them and would silently vanish (the row-count check that follows this
-- step compares Cases against Cases_raw). The fourth branch keeps such rows, converting
-- last_update with SAFE_CAST so that an unparseable value becomes NULL instead of
-- failing the statement.
-- NOTE(review): slash-style rows whose year part is neither 2 nor 4 characters wide are
-- dropped earlier, when Cases_slash_date_fixed is built, and are not recovered here.
CREATE OR REPLACE TABLE datamart.Cases AS
SELECT
    location_id, last_update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM datamart.Cases_hyphen_date_fixed

UNION ALL

SELECT
    location_id, last_update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM datamart.Cases_plus_date_fixed

UNION ALL

SELECT
    location_id, last_update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM datamart.Cases_slash_date_fixed

UNION ALL

-- Catch-all: rows matched by none of the three format filters above.
SELECT
    location_id,
    SAFE_CAST(last_update AS DATETIME) AS last_update,
    confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM datamart.Cases_raw
WHERE last_update IS NULL
   OR (
        strpos(last_update, '/') = 0
        AND NOT (strpos(last_update, '-') > 0 AND strpos(last_update, ' ') > 0)
   )

#### ensure no rows were lost

In [85]:
%%bigquery
-- Row counts of Cases_raw and Cases side by side; any difference means rows were
-- lost (or duplicated) while normalising the dates.
SELECT
    (SELECT COUNT(*) FROM datamart.Cases_raw) AS cases_raw_count,
    (SELECT COUNT(*) FROM datamart.Cases) AS cases_count

Unnamed: 0,cases_raw_count,cases_count
0,1657069,1657068


#### remove duplicate cases

In [86]:
%%bigquery
-- First de-duplication pass: rebuild Cases in place, collapsing rows that are
-- identical in every column.
CREATE OR REPLACE TABLE datamart.Cases AS
SELECT DISTINCT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.Cases

#### get cases count after first attempt of removing duplicates

In [87]:
%%bigquery
-- Number of rows left in Cases after the DISTINCT pass.
SELECT
    COUNT(1) AS distinct_cases_count
FROM datamart.Cases

Unnamed: 0,distinct_cases_count
0,1406889


#### check if there are still duplicate cases

In [88]:
%%bigquery
-- Find (location_id, last_update) pairs that still occur more than once,
-- showing the ten pairs with the most rows.
SELECT
    location_id,
    last_update,
    COUNT(*) AS duplicate_cases
FROM datamart.Cases
GROUP BY
    location_id,
    last_update
HAVING COUNT(*) > 1
ORDER BY duplicate_cases DESC
LIMIT 10

Unnamed: 0,location_id,last_update,duplicate_cases
0,-7927029950822897516,2020-12-12 05:26:19,255
1,-7927029950822897516,2021-01-11 05:21:50,255
2,-7927029950822897516,2021-02-09 05:23:30,255
3,-7927029950822897516,2021-03-06 04:23:41,255
4,-7927029950822897516,2021-01-22 05:22:10,255
5,-7927029950822897516,2021-02-06 05:23:55,255
6,-7927029950822897516,2020-12-20 05:27:32,255
7,-7927029950822897516,2021-03-03 05:23:28,255
8,-7927029950822897516,2020-10-20 04:24:22,255
9,-7927029950822897516,2020-12-14 05:26:27,255


#### remove duplicates based on confirmed, deaths, and recovered cases

In [102]:
%%bigquery
CREATE OR REPLACE TABLE datamart.Cases AS
-- Second de-duplication pass: keep exactly one row per (location_id, last_update),
-- preferring the highest confirmed, then deaths, then recovered.
-- ROW_NUMBER() is used instead of RANK(): RANK() gives every tied row the value 1, so
-- rows tying on confirmed/deaths/recovered would all survive and the pair would stay
-- duplicated. The remaining measures are added as tie-breakers so the surviving row is
-- chosen consistently.
-- NOTE(review): the id is fingerprinted from province_state + country only, so rows that
-- differ in admin2 share a location_id; this step keeps just the largest of them per
-- timestamp rather than aggregating -- confirm that is the intended result.
SELECT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY location_id, last_update
            ORDER BY
                confirmed DESC,
                deaths DESC,
                recovered DESC,
                active DESC,
                incident_rate DESC,
                case_fatality_ratio DESC
        ) AS row_num
    FROM datamart.Cases
)
WHERE row_num = 1

In [103]:
%%bigquery
-- Final uniqueness check: the number of distinct (location_id, last_update) pairs
-- should equal the total number of rows in Cases.
WITH case_keys AS (
    SELECT DISTINCT
        location_id,
        last_update
    FROM datamart.Cases
)
SELECT
    COUNT(*) AS distinct_cases,
    (SELECT COUNT(*) FROM datamart.Cases) AS total_cases
FROM case_keys

Unnamed: 0,distinct_cases,total_cases
0,266258,266258


### Transform the ACS Dataset