#### create BigQuery Dataset "datamart"

In [1]:
# Create the BigQuery dataset "datamart" in the US location via the bq command-line tool.
!bq --location=US mk --dataset datamart

Dataset 'lunar-analyzer-302702:datamart' successfully created.


### Transform the COVID-19 Dataset

#### create preliminary covid table

In [42]:
%%bigquery
-- Stack the six seasonal JHU staging tables into one table with a common column layout.
-- id is only a NULL placeholder at this point. From summer 2020 onward the staging
-- tables expose lat / long_ and have last_update cast to STRING; summer 2020 also
-- spells the rate column incidence_rate.
CREATE OR REPLACE TABLE datamart.covid AS
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    last_update, latitude, longitude,
    confirmed, deaths, recovered, active,
    combined_key, incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_winter2020
UNION ALL
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    last_update, latitude, longitude,
    confirmed, deaths, recovered, active,
    combined_key, incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_spring2020
UNION ALL
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    CAST(last_update AS STRING) AS last_update, lat AS latitude, long_ AS longitude,
    confirmed, deaths, recovered, active,
    combined_key, incidence_rate AS incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_summer2020
UNION ALL
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    CAST(last_update AS STRING) AS last_update, lat AS latitude, long_ AS longitude,
    confirmed, deaths, recovered, active,
    combined_key, incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_fall2020
UNION ALL
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    CAST(last_update AS STRING) AS last_update, lat AS latitude, long_ AS longitude,
    confirmed, deaths, recovered, active,
    combined_key, incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_winter2021
UNION ALL
SELECT
    NULL AS id, fips, admin2, province_state, country_region AS Country,
    CAST(last_update AS STRING) AS last_update, lat AS latitude, long_ AS longitude,
    confirmed, deaths, recovered, active,
    combined_key, incident_rate, case_fatality_ratio
FROM jhu_daily_reports_staging.covid_spring2021

#### compute fingerprint ids for preliminary covid table

In [43]:
%%bigquery
-- Rows without a province/state: fingerprint the country name alone.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(country)
WHERE province_state IS NULL

In [44]:
%%bigquery
-- Non-US rows that carry a province/state: fingerprint province_state + country.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(CONCAT(province_state, country))
WHERE province_state IS NOT NULL
    AND country != 'US'

In [46]:
%%bigquery
-- US rows with admin2 but no fips: fingerprint admin2 + province_state + country.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(CONCAT(admin2, province_state, country))
WHERE province_state IS NOT NULL
    AND country = 'US'
    AND admin2 IS NOT NULL
    AND fips IS NULL

In [48]:
%%bigquery
-- US rows with both admin2 and fips: fingerprint fips + admin2 + province_state + country.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(CONCAT(CAST(fips AS STRING), admin2, province_state, country))
WHERE province_state IS NOT NULL
    AND country = 'US'
    AND admin2 IS NOT NULL
    AND fips IS NOT NULL

In [50]:
%%bigquery
-- US rows with fips but no admin2: fingerprint fips + province_state + country.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(CONCAT(CAST(fips AS STRING), province_state, country))
WHERE province_state IS NOT NULL
    AND country = 'US'
    AND admin2 IS NULL
    AND fips IS NOT NULL

In [52]:
%%bigquery
-- US rows with neither admin2 nor fips: fingerprint province_state + country.
UPDATE datamart.covid
SET id = FARM_FINGERPRINT(CONCAT(province_state, country))
WHERE province_state IS NOT NULL
    AND country = 'US'
    AND admin2 IS NULL
    AND fips IS NULL

#### check that there is no null id field in the preliminary covid table

In [53]:
%%bigquery
-- Count the rows whose id is still NULL.
SELECT COUNT(*) AS null_id_count
FROM datamart.covid
WHERE id IS NULL

Unnamed: 0,null_id_count
0,0


#### sample preliminary covid table

In [54]:
%%bigquery
-- Show five rows, ordered by province/state and then country.
SELECT *
FROM datamart.covid
ORDER BY
    province_state,
    country
LIMIT 5

Unnamed: 0,id,fips,admin2,province_state,Country,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key,incident_rate,case_fatality_ratio
0,8576431891811451300,,,,Azerbaijan,2020-02-28 15:03:26,,,1,0,0,,,,
1,8778414404485170876,,,,Afghanistan,2020-10-13 04:24:02+00,33.93911,67.709953,39870,1481,33118,5273.0,Afghanistan,102.418955,3.709556
2,8778414404485170876,,,,Afghanistan,2020-12-24 05:23:04+00,33.93911,67.709953,50433,2117,39692,8624.0,Afghanistan,129.553427,4.197648
3,8778414404485170876,,,,Afghanistan,2020-09-05 04:28:19+00,33.93911,67.709953,38304,1410,29713,7182.0,Afghanistan,98.396179,3.678467
4,8778414404485170876,,,,Afghanistan,2021-03-29 04:28:35+00,33.93911,67.709953,56294,2470,50013,3811.0,Afghanistan,144.609296,4.387679


### Transforms for table locations

#### create locations_raw table

In [55]:
%%bigquery
-- Keep one row per distinct combination of the location columns of datamart.covid.
CREATE OR REPLACE TABLE datamart.Locations_raw AS
SELECT DISTINCT
    id,
    fips,
    admin2,
    province_state,
    country,
    latitude,
    longitude,
    combined_key
FROM datamart.covid

#### show rows that have city,state in province_state column of the locations_raw table

In [56]:
%%bigquery
-- List up to ten rows whose province_state contains a comma, together with the
-- 1-based position of that comma.
SELECT
    id,
    fips,
    admin2,
    province_state,
    STRPOS(province_state, ',') AS index,
    country,
    latitude,
    longitude,
    combined_key
FROM datamart.Locations_raw
WHERE STRPOS(province_state, ',') > 0
LIMIT 10

Unnamed: 0,id,fips,admin2,province_state,index,country,latitude,longitude,combined_key
0,8145114016361068042,,,"Tempe, AZ",6,US,33.4255,-111.94,
1,8145114016361068042,,,"Tempe, AZ",6,US,,,
2,8556287708025666455,,,"Boston, MA",7,US,,,
3,8556287708025666455,,,"Boston, MA",7,US,42.3601,-71.0589,
4,-1673719292839792409,,,"Orange, CA",7,US,,,
5,-1673719292839792409,,,"Orange, CA",7,US,33.7879,-117.8531,
6,7702389166849437576,,,"Travis, CA",7,US,,,
7,2904588917003491558,,,"Ashland, NE",8,US,,,
8,8162844378884388227,,,"Chicago, IL",8,US,41.8781,-87.6298,
9,8162844378884388227,,,"Chicago, IL",8,US,,,


#### standardize city, state in the locations_raw table into table locations_city

In [57]:
%%bigquery
CREATE OR REPLACE TABLE datamart.Locations_city AS
SELECT id, fips, admin2, split(province_state, ',')[offset(0)] AS city, split(province_state, ',')[offset(1)] AS state, country, latitude, longitude, combined_key
FROM datamart.Locations_raw
WHERE strpos(province_state, ',') > 0

#### create locations table (if error, try `CAST(null AS STRING) AS City`)

In [58]:
%%bigquery
-- Build datamart.locations from three disjoint slices:
--   1. rows already split into city / state (Locations_city),
--   2. Locations_raw rows whose province_state has no comma,
--   3. Locations_raw rows whose province_state is NULL.
-- Slices 2 and 3 have no city, so city is NULL and province_state becomes state.
CREATE OR REPLACE TABLE datamart.locations AS
(
    SELECT
        id, fips, admin2, city, state,
        country, latitude, longitude, combined_key
    FROM datamart.Locations_city

    UNION ALL

    SELECT
        id, fips, admin2, NULL AS City, province_state AS State,
        country, latitude, longitude, combined_key
    FROM datamart.Locations_raw
    WHERE STRPOS(province_state, ',') = 0

    UNION ALL

    SELECT
        id, fips, admin2, NULL AS City, province_state AS State,
        country, latitude, longitude, combined_key
    FROM datamart.Locations_raw
    WHERE province_state IS NULL
)

#### Ensure no rows were lost

In [59]:
%%bigquery
-- Row counts of Locations_raw and locations side by side; equal counts mean the
-- split/union step neither dropped nor added rows.
SELECT
    COUNT(*) AS locations_raw_count,
    (SELECT COUNT(*) FROM datamart.locations) AS locations_count
FROM datamart.Locations_raw

Unnamed: 0,locations_raw_count,locations_count
0,5637,5637


In [72]:
%%bigquery
-- Collapse datamart.locations to a single row per (id, country).
-- Rows are ordered so that the largest latitude / longitude / combined_key comes first
-- (the DESC ordering is unchanged from before). ROW_NUMBER() is used instead of RANK():
-- RANK() assigns rank 1 to every tied row, so ties would survive as duplicates, whereas
-- ROW_NUMBER() always yields exactly one row numbered 1 per partition. The remaining
-- columns are appended as tie-breakers so the surviving row is chosen deterministically.
CREATE OR REPLACE TABLE datamart.locations AS
SELECT
    id,
    fips,
    admin2,
    city,
    state,
    country,
    latitude,
    longitude,
    combined_key
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY id, country
            ORDER BY
                latitude DESC,
                longitude DESC,
                combined_key DESC,
                fips DESC,
                admin2 DESC,
                state DESC,
                city DESC
        ) AS row_num
    FROM datamart.locations
)
WHERE row_num = 1

#### PK check: if same number then no duplicates

In [73]:
%%bigquery
-- Compare the number of distinct ids with the total row count of datamart.locations.
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.locations) AS locations_count
FROM (
    SELECT DISTINCT id
    FROM datamart.locations
)

Unnamed: 0,distinct_PK,locations_count
0,4413,4413


### Transforms for table cases

#### create Cases_raw table from preliminary covid table

In [78]:
%%bigquery
-- Project the case-measure columns of datamart.covid; id is exposed as location_id.
CREATE OR REPLACE TABLE datamart.Cases_raw AS
SELECT
    id AS location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.covid

#### sample distinct dates in cases_raw table

In [79]:
%%bigquery
-- Ten distinct last_update values, in no particular order.
SELECT DISTINCT
    last_update
FROM datamart.Cases_raw
LIMIT 10

Unnamed: 0,last_update
0,2020-12-23 05:22:03+00
1,2021-02-22 05:24:21+00
2,2020-10-16 04:24:01+00
3,2021-03-03 05:23:28+00
4,2020-11-14 05:25:37+00
5,2020-08-14 04:51:19+00
6,2020-09-25 04:23:00+00
7,2020-07-06 04:33:57+00
8,2020-10-28 04:24:39+00
9,2021-01-30 05:22:49+00


In [80]:
%%bigquery
-- The first ten distinct last_update values when sorted by last_update.
SELECT DISTINCT
    last_update
FROM datamart.Cases_raw
ORDER BY last_update
LIMIT 10

Unnamed: 0,last_update
0,1/22/20 17:00
1,1/23/20 17:00
2,1/24/20 17:00
3,1/25/20 17:00
4,1/26/20 16:00
5,1/27/20 23:59
6,1/28/20 23:00
7,1/29/20 19:30
8,1/30/20 16:00
9,1/31/20 10:37


#### create Cases_hyphen_date_fixed

In [81]:
%%bigquery
-- Preview last_update strings that contain '-' and a space but no '+'.
SELECT last_update
FROM datamart.Cases_raw
WHERE STRPOS(last_update, '-') > 0
    AND STRPOS(last_update, ' ') > 0
    AND STRPOS(last_update, '+') = 0
LIMIT 15

Unnamed: 0,last_update
0,2020-03-15 18:20:18
1,2020-03-18 13:13:15
2,2020-03-20 20:13:15
3,2020-03-11 20:00:00
4,2020-02-07 23:43:02
5,2020-03-14 11:33:06
6,2020-03-14 20:13:16
7,2020-02-29 12:13:10
8,2020-02-06 23:23:02
9,2020-03-16 19:13:13


In [82]:
%%bigquery
-- Rows whose last_update contains '-' and a space but no '+': cast the string
-- straight to DATETIME.
CREATE OR REPLACE TABLE datamart.Cases_hyphen_date_fixed AS
SELECT
    location_id,
    CAST(last_update AS DATETIME) AS last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.Cases_raw
WHERE STRPOS(last_update, '-') > 0
    AND STRPOS(last_update, ' ') > 0
    AND STRPOS(last_update, '+') = 0

#### create Cases_plus_date_fixed

In [83]:
%%bigquery
-- Preview last_update strings that contain '-', a space and a '+'.
SELECT last_update
FROM datamart.Cases_raw
WHERE STRPOS(last_update, '-') > 0
    AND STRPOS(last_update, ' ') > 0
    AND STRPOS(last_update, '+') > 0
LIMIT 15

Unnamed: 0,last_update
0,2020-12-23 05:22:03+00
1,2021-02-22 05:24:21+00
2,2020-10-16 04:24:01+00
3,2021-03-03 05:23:28+00
4,2020-11-14 05:25:37+00
5,2020-08-14 04:51:19+00
6,2020-09-25 04:23:00+00
7,2020-07-06 04:33:57+00
8,2020-10-28 04:24:39+00
9,2021-01-30 05:22:49+00


In [84]:
%%bigquery
CREATE OR REPLACE TABLE datamart.Cases_plus_date_fixed AS
SELECT location_id,CAST(CAST(last_update AS TIMESTAMP) AS DATETIME) AS last_update, confirmed, deaths, recovered, active, incident_rate, case_fatality_ratio
FROM datamart.Cases_raw
WHERE strpos(last_update, '-') > 0 AND strpos(last_update, ' ') > 0 AND strpos(last_update, '+') > 0

#### create Cases_slash_date

In [85]:
%%bigquery
-- Preview distinct last_update strings that contain a '/'.
SELECT DISTINCT
    last_update
FROM datamart.Cases_raw
WHERE STRPOS(last_update, '/') > 0
LIMIT 15

Unnamed: 0,last_update
0,5/30/20 2:32
1,6/1/20 2:32
2,5/31/20 2:32
3,5/12/20 3:32
4,5/19/20 2:32
5,5/4/20 2:32
6,4/17/20 23:30
7,5/28/20 2:32
8,5/20/20 2:32
9,5/26/20 2:32


In [86]:
%%bigquery
-- Copy every Cases_raw row whose last_update contains a '/' (date still a string).
CREATE OR REPLACE TABLE datamart.Cases_slash_date AS
SELECT *
FROM datamart.Cases_raw
WHERE STRPOS(last_update, '/') > 0

#### check Cases_slash_date for their year format

In [87]:
%%bigquery
-- For ten distinct slash-style dates, report how many characters the year part has:
-- take the text before the first space, then the third '/'-separated piece.
SELECT
    last_update,
    LENGTH(SPLIT(SPLIT(last_update, ' ')[OFFSET(0)], '/')[OFFSET(2)]) AS year_num_digits
FROM (
    SELECT DISTINCT last_update
    FROM datamart.Cases_slash_date
)
LIMIT 10

Unnamed: 0,last_update,year_num_digits
0,1/22/20 17:00,2
1,1/23/20 17:00,2
2,1/24/20 17:00,2
3,5/1/20 2:32,2
4,5/2/20 2:32,2
5,5/3/20 2:32,2
6,5/4/20 2:32,2
7,5/5/20 2:32,2
8,5/6/20 2:32,2
9,5/7/20 2:32,2


#### create Cases_slash_date_fixed

In [88]:
%%bigquery
-- Parse slash-style dates into DATETIME. The year part decides the format string:
-- two characters -> %y, four characters -> %Y.
CREATE OR REPLACE TABLE datamart.Cases_slash_date_fixed AS
(
    SELECT
        location_id,
        PARSE_DATETIME('%m/%d/%y %H:%M', last_update) AS Last_Update,
        confirmed, deaths, recovered, active,
        incident_rate, case_fatality_ratio
    FROM datamart.Cases_slash_date
    WHERE LENGTH(SPLIT(SPLIT(last_update, ' ')[OFFSET(0)], '/')[OFFSET(2)]) = 2

    UNION ALL

    SELECT
        location_id,
        PARSE_DATETIME('%m/%d/%Y %H:%M', last_update) AS Last_Update,
        confirmed, deaths, recovered, active,
        incident_rate, case_fatality_ratio
    FROM datamart.Cases_slash_date
    WHERE LENGTH(SPLIT(SPLIT(last_update, ' ')[OFFSET(0)], '/')[OFFSET(2)]) = 4
)

#### sample Cases_slash_date_fixed for fixed dates

In [89]:
%%bigquery
-- The ten earliest distinct parsed dates in Cases_slash_date_fixed.
SELECT *
FROM (
    SELECT DISTINCT last_update
    FROM datamart.Cases_slash_date_fixed
)
ORDER BY last_update
LIMIT 10

Unnamed: 0,last_update
0,2020-01-22 17:00:00
1,2020-01-23 17:00:00
2,2020-01-24 17:00:00
3,2020-01-25 17:00:00
4,2020-01-26 16:00:00
5,2020-01-27 23:59:00
6,2020-01-28 23:00:00
7,2020-01-29 19:30:00
8,2020-01-30 16:00:00
9,2020-01-31 08:15:00


#### create cases table

In [90]:
%%bigquery
-- Recombine the three date-normalized tables into datamart.cases.
CREATE OR REPLACE TABLE datamart.cases AS
(
    SELECT *
    FROM datamart.Cases_hyphen_date_fixed

    UNION ALL

    SELECT *
    FROM datamart.Cases_plus_date_fixed

    UNION ALL

    SELECT *
    FROM datamart.Cases_slash_date_fixed
)

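#### ensure no rows were lost (the two counts should match; in the output below they differ by one, so at least one Cases_raw row was not carried over by the three date-format tables)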

In [91]:
%%bigquery
-- Row counts of Cases_raw and cases side by side.
SELECT
    COUNT(*) AS cases_raw_count,
    (SELECT COUNT(*) FROM datamart.cases) AS cases_count
FROM datamart.Cases_raw

Unnamed: 0,cases_raw_count,cases_count
0,1657069,1657068


#### remove duplicate cases

In [92]:
%%bigquery
-- Rewrite datamart.cases keeping only one copy of fully identical rows.
CREATE OR REPLACE TABLE datamart.cases AS
SELECT DISTINCT
    *
FROM datamart.cases

#### get cases count after first attempt of removing duplicates

In [93]:
%%bigquery
-- Row count of datamart.cases after the DISTINCT rewrite.
SELECT COUNT(*) AS distinct_cases_count
FROM datamart.cases

Unnamed: 0,distinct_cases_count
0,1476173


#### check if there are still duplicate cases

In [94]:
%%bigquery
-- Find (location_id, last_update) pairs that still occur more than once and show the
-- ten with the most rows.
SELECT
    location_id,
    last_update,
    COUNT(*) AS duplicate_cases
FROM datamart.cases
GROUP BY
    location_id,
    last_update
HAVING COUNT(*) > 1
ORDER BY duplicate_cases DESC
LIMIT 10

Unnamed: 0,location_id,last_update,duplicate_cases
0,3183797238637395852,2021-04-02 15:13:53,146
1,-1181716864917925688,2021-04-02 15:13:53,111
2,2231787937947570706,2020-12-21 13:27:30,92
3,8778414404485170876,2020-03-17 11:53:10,3
4,8778414404485170876,2020-02-24 23:33:02,3
5,-391570144277816991,2020-03-14 22:33:04,2
6,8009782315755301025,2020-08-04 02:27:56,2
7,7612477503619687113,2020-02-13 17:53:03,2
8,7612477503619687113,2020-02-17 07:33:02,2
9,7430223413705102985,2020-03-14 20:13:16,2


#### remove duplicates based on confirmed, deaths, and recovered cases

In [95]:
%%bigquery
-- Collapse datamart.cases to a single row per (location_id, last_update), preferring
-- the row with the highest confirmed, then deaths, then recovered counts.
-- ROW_NUMBER() replaces RANK(): RANK() gives rank 1 to every row tied on the three
-- ordering columns, so such ties would remain as duplicates of the key. The remaining
-- measure columns are appended as tie-breakers so the kept row is chosen deterministically.
CREATE OR REPLACE TABLE datamart.cases AS
SELECT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY location_id, last_update
            ORDER BY
                confirmed DESC,
                deaths DESC,
                recovered DESC,
                active DESC,
                incident_rate DESC,
                case_fatality_ratio DESC
        ) AS row_num
    FROM datamart.cases
)
WHERE row_num = 1

#### PK check: if same number then no duplicates

In [96]:
%%bigquery
-- Compare the number of distinct (location_id, last_update) pairs with the total
-- row count of datamart.cases.
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.cases) AS cases_count
FROM (
    SELECT DISTINCT
        location_id,
        last_update
    FROM datamart.cases
)

Unnamed: 0,distinct_PK,cases_count
0,1475767,1475767


#### FK check

In [97]:
%%bigquery
-- Count cases rows whose location_id finds no matching id in datamart.locations.
SELECT COUNT(*) AS foreign_key_violations
FROM datamart.cases AS c
LEFT JOIN datamart.locations AS l
    ON c.location_id = l.id
WHERE l.id IS NULL

Unnamed: 0,foreign_key_violations
0,0


### Transform the ACS Dataset

### Transforms for table demographics

#### create demo_temp

In [5]:
%%bigquery
-- Stage the ACS demographics rows: tag each row with country 'US', add NULL
-- placeholders for category / subcategory1 / subcategory2, trim the label, cast the
-- estimate to NUMERIC, and strip every character other than digits and '.' from the
-- margin-of-error and percent text columns.
CREATE OR REPLACE TABLE datamart.demo_temp AS
SELECT
    'US' AS country,
    NULL AS category,
    NULL AS subcategory1,
    NULL AS subcategory2,
    TRIM(_Label_) AS label,
    CAST(United_States__Estimate AS NUMERIC) AS estimate,
    REGEXP_REPLACE(United_States__Margin_of_Error, '[^0-9.]', '') AS error_margin,
    REGEXP_REPLACE(United_States__Percent, '[^0-9.]', '') AS percent,
    REGEXP_REPLACE(United_States__Percent_Margin_of_Error, '[^0-9.]', '') AS error_margin_percent
FROM us_census_bureau_staging.acs_demographics

#### update blank strings to nulls

In [6]:
%%bigquery
-- Replace empty-string error_margin values with NULL.
UPDATE datamart.demo_temp
SET error_margin = NULL
WHERE error_margin = ''

In [7]:
%%bigquery
-- Replace empty-string error_margin_percent values with NULL.
UPDATE datamart.demo_temp
SET error_margin_percent = NULL
WHERE error_margin_percent = ''

In [8]:
%%bigquery
-- Replace empty-string percent values with NULL.
UPDATE datamart.demo_temp
SET percent = NULL
WHERE percent = ''

#### create table demographics

In [34]:
%%bigquery
-- Typed copy of demo_temp: the category placeholders become STRING and the cleaned
-- margin / percent text columns become NUMERIC.
CREATE OR REPLACE TABLE datamart.demographics AS
SELECT
    country,
    CAST(category AS STRING) AS category,
    CAST(subcategory1 AS STRING) AS subcategory1,
    CAST(subcategory2 AS STRING) AS subcategory2,
    label,
    estimate,
    CAST(error_margin AS NUMERIC) AS error_margin,
    CAST(percent AS NUMERIC) AS percent,
    CAST(error_margin_percent AS NUMERIC) AS error_margin_percent
FROM datamart.demo_temp

#### remove non-data rows

In [35]:
%%bigquery
-- Remove rows that carry no data at all, i.e. all four measure columns are NULL.
-- The last predicate previously repeated error_margin, so error_margin_percent was
-- never checked and a row holding only that value would have been deleted.
DELETE FROM datamart.demographics
WHERE estimate IS NULL
    AND error_margin IS NULL
    AND percent IS NULL
    AND error_margin_percent IS NULL

#### update records with corresponding category and subcategory

In [36]:
%%bigquery
-- Tag the listed labels with category 'general'.
UPDATE datamart.demographics
SET category = 'general'
WHERE label IN (
    'Male', 'Female', 'Sex ratio (males per 100 females)', 'Total population',
    'Under 5 years', '5 to 9 years', '10 to 14 years', '15 to 19 years',
    '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years',
    '55 to 59 years', '60 to 64 years', '65 to 74 years', '75 to 84 years',
    '85 years and over', 'Median age (years)', 'Under 18 years',
    '16 years and over', '18 years and over', '21 years and over',
    '62 years and over', '65 years and over',
    'Total housing units', 'Citizen, 18 and over population'
)

In [37]:
%%bigquery
-- subcategory1 = 'sex' for the three rows picked out by their exact values.
UPDATE datamart.demographics
SET subcategory1 = 'sex'
WHERE (label = 'Male' AND estimate = 159886919 AND error_margin = 5817 AND percent = 49.2)
    OR (label = 'Female' AND estimate = 164810876 AND error_margin = 5818 AND percent = 50.8)
    OR (label = 'Sex ratio (males per 100 females)' AND estimate = 97 AND error_margin = 0.1)

In [38]:
%%bigquery
-- subcategory1 = 'age' for the age-bracket labels and for the Male / Female /
-- sex-ratio rows identified by the listed estimate values.
UPDATE datamart.demographics
SET subcategory1 = 'age'
WHERE label IN (
        'Under 5 years', '5 to 9 years', '10 to 14 years', '15 to 19 years',
        '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years',
        '55 to 59 years', '60 to 64 years', '65 to 74 years', '75 to 84 years',
        '85 years and over', 'Median age (years)', 'Under 18 years',
        '16 years and over', '18 years and over', '21 years and over',
        '62 years and over', '65 years and over'
    )
    OR (label = 'Male' AND estimate = 122360385)
    OR (label = 'Female' AND estimate = 128908018)
    OR (label = 'Male' AND estimate = 22518603)
    OR (label = 'Female' AND estimate = 28265193)
    OR (label = 'Sex ratio (males per 100 females)' AND estimate = 94.9 AND error_margin = 0.1)
    OR (label = 'Sex ratio (males per 100 females)' AND estimate = 79.7 AND error_margin = 0.1)

In [39]:
%%bigquery
-- subcategory2 = '18 years and over' for that label and for the Male / Female /
-- sex-ratio rows identified by the listed estimate values.
UPDATE datamart.demographics
SET subcategory2 = '18 years and over'
WHERE label = '18 years and over'
    OR (label = 'Male' AND estimate = 122360385)
    OR (label = 'Female' AND estimate = 128908018)
    OR (label = 'Sex ratio (males per 100 females)' AND estimate = 94.9 AND error_margin = 0.1)

In [40]:
%%bigquery
-- subcategory2 = '65 years and over' for that label and for the Male / Female /
-- sex-ratio rows identified by the listed estimate values.
UPDATE datamart.demographics
SET subcategory2 = '65 years and over'
WHERE label = '65 years and over'
    OR (label = 'Male' AND estimate = 22518603)
    OR (label = 'Female' AND estimate = 28265193)
    OR (label = 'Sex ratio (males per 100 females)' AND estimate = 79.7 AND error_margin = 0.1)

In [41]:
%%bigquery
-- subcategory1 = 'one race' for the listed labels and for the race rows identified
-- by the listed estimate values.
UPDATE datamart.demographics
SET subcategory1 = 'one race'
WHERE label IN (
        'One race',
        'Cherokee tribal grouping', 'Chippewa tribal grouping',
        'Navajo tribal grouping', 'Sioux tribal grouping',
        'Asian Indian', 'Chinese', 'Filipino', 'Japanese', 'Korean',
        'Vietnamese', 'Other Asian',
        'Native Hawaiian', 'Guamanian or Chamorro', 'Samoan', 'Other Pacific Islander'
    )
    OR (label = 'White' AND estimate = 235377662)
    OR (label = 'Black or African American' AND estimate = 41234642)
    OR (label = 'American Indian and Alaska Native' AND estimate = 2750143)
    OR (label = 'Asian' AND estimate = 17924209)
    OR (label = 'Native Hawaiian and Other Pacific Islander' AND estimate = 599868)
    OR (label = 'Some other race' AND estimate = 16047369)

In [42]:
%%bigquery
-- subcategory2 = 'American Indian and Alaska Native' for the tribal-grouping labels
-- and the row of that label with estimate 2750143.
UPDATE datamart.demographics
SET subcategory2 = 'American Indian and Alaska Native'
WHERE label IN (
        'Cherokee tribal grouping', 'Chippewa tribal grouping',
        'Navajo tribal grouping', 'Sioux tribal grouping'
    )
    OR (label = 'American Indian and Alaska Native' AND estimate = 2750143)

In [43]:
%%bigquery
-- subcategory2 = 'Asian' for the listed labels and the 'Asian' row with estimate 17924209.
UPDATE datamart.demographics
SET subcategory2 = 'Asian'
WHERE label IN (
        'Asian Indian', 'Chinese', 'Filipino', 'Japanese', 'Korean',
        'Vietnamese', 'Other Asian'
    )
    OR (label = 'Asian' AND estimate = 17924209)

In [44]:
%%bigquery
-- subcategory2 = 'Native Hawaiian and Other Pacific Islander' for the listed labels
-- and the row of that label with estimate 599868.
UPDATE datamart.demographics
SET subcategory2 = 'Native Hawaiian and Other Pacific Islander'
WHERE label IN (
        'Native Hawaiian', 'Guamanian or Chamorro', 'Samoan', 'Other Pacific Islander'
    )
    OR (label = 'Native Hawaiian and Other Pacific Islander' AND estimate = 599868)

In [45]:
%%bigquery
-- subcategory1 = 'two or more races' for the listed combination labels and the
-- 'Two or more races' row with estimate 10763902.
UPDATE datamart.demographics
SET subcategory1 = 'two or more races'
WHERE label IN (
        'White and Black or African American',
        'White and American Indian and Alaska Native',
        'White and Asian',
        'Black or African American and American Indian and Alaska Native'
    )
    OR (label = 'Two or more races' AND estimate = 10763902)

In [46]:
%%bigquery
-- subcategory1 = 'Race alone or in combination with one or more other races' for the
-- race rows identified by the listed estimate values.
UPDATE datamart.demographics
SET subcategory1 = 'Race alone or in combination with one or more other races'
WHERE (label = 'White' AND estimate = 244597669)
    OR (label = 'Black or African American' AND estimate = 45612523)
    OR (label = 'American Indian and Alaska Native' AND estimate = 5643919)
    OR (label = 'Asian' AND estimate = 21408058)
    OR (label = 'Native Hawaiian and Other Pacific Islander' AND estimate = 1399393)
    OR (label = 'Some other race' AND estimate = 17859236)

In [47]:
%%bigquery
-- subcategory1 = 'Hispanic or Latino' for the listed labels.
UPDATE datamart.demographics
SET subcategory1 = 'Hispanic or Latino'
WHERE label IN (
    'Hispanic or Latino (of any race)',
    'Mexican',
    'Puerto Rican',
    'Cuban',
    'Other Hispanic or Latino'
)

In [48]:
%%bigquery
-- subcategory1 = 'Not Hispanic or Latino' for the listed labels and the
-- 'Two or more races' row identified by its exact values.
UPDATE datamart.demographics
SET subcategory1 = 'Not Hispanic or Latino'
WHERE label IN (
        'Not Hispanic or Latino',
        'White alone',
        'Black or African American alone',
        'American Indian and Alaska Native alone',
        'Asian alone',
        'Native Hawaiian and Other Pacific Islander alone',
        'Some other race alone',
        'Two races including Some other race',
        'Two races excluding Some other race, and Three or more races'
    )
    OR (label = 'Two or more races' AND estimate = 7941608 AND error_margin = 66801 AND percent = 2.4)

In [49]:
%%bigquery
-- subcategory2 = 'Two or more races' for the two 'Two races ...' labels and the
-- 'Two or more races' row identified by its exact values.
UPDATE datamart.demographics
SET subcategory2 = 'Two or more races'
WHERE label IN (
        'Two races including Some other race',
        'Two races excluding Some other race, and Three or more races'
    )
    OR (label = 'Two or more races' AND estimate = 7941608 AND error_margin = 66801 AND percent = 2.4)

In [50]:
%%bigquery
-- subcategory1 = 'Citizen, 18 and over population' for that label and the
-- Male / Female rows identified by the listed estimate values.
UPDATE datamart.demographics
SET subcategory1 = 'Citizen, 18 and over population'
WHERE label = 'Citizen, 18 and over population'
    OR (label = 'Male' AND estimate = 112003359)
    OR (label = 'Female' AND estimate = 118956432)

In [51]:
%%bigquery
-- subcategory2 = 'sex' for the Male / Female rows identified by the listed estimates.
UPDATE datamart.demographics
SET subcategory2 = 'sex'
WHERE (label = 'Male' AND estimate = 112003359)
    OR (label = 'Female' AND estimate = 118956432)

In [52]:
%%bigquery
-- category = 'race' for every row whose subcategory1 is one of the race groupings.
UPDATE datamart.demographics
SET category = 'race'
WHERE subcategory1 IN (
    'one race',
    'two or more races',
    'Race alone or in combination with one or more other races',
    'Hispanic or Latino',
    'Not Hispanic or Latino'
)

#### make fields null in percent column if they're not percentages

In [53]:
%%bigquery
-- Where the percent column merely repeats the estimate, blank it out.
UPDATE datamart.demographics
SET percent = NULL
WHERE estimate = percent

#### remove duplicate records

In [54]:
%%bigquery
-- Rewrite datamart.demographics keeping only one copy of fully identical rows.
CREATE OR REPLACE TABLE datamart.demographics AS
SELECT DISTINCT
    *
FROM datamart.demographics

In [55]:
%%bigquery
-- Drop the '18 years and over' and '65 years and over' rows that have no percent.
DELETE FROM datamart.demographics
WHERE (label = '18 years and over' AND percent IS NULL)
    OR (label = '65 years and over' AND percent IS NULL)

#### remove housing record

In [99]:
%%bigquery
-- Drop the 'Total housing units' row from the demographics table.
DELETE FROM datamart.demographics
WHERE label = 'Total housing units'

#### PK check: if same number then no duplicates

In [100]:
%%bigquery
-- Compare the number of distinct (category, subcategory1, subcategory2, label)
-- combinations with the total row count of datamart.demographics.
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.demographics) AS demographics_count
FROM (
    SELECT DISTINCT
        category,
        subcategory1,
        subcategory2,
        label
    FROM datamart.demographics
)

Unnamed: 0,distinct_PK,demographics_count
0,81,81


### FK check

In [101]:
%%bigquery
-- Count demographics rows whose country finds no match in datamart.locations.
SELECT COUNT(*) AS foreign_key_violations
FROM datamart.demographics AS d
LEFT JOIN datamart.locations AS l
    ON d.country = l.country
WHERE l.id IS NULL

Unnamed: 0,foreign_key_violations
0,0


### Transforms for table housing

#### create housing_temp

In [24]:
%%bigquery
-- Stage the ACS housing rows: tag each row with country 'US', add NULL placeholders
-- for category / subcategory, trim the label, cast the estimate to NUMERIC, and strip
-- every character other than digits and '.' from the margin-of-error and percent
-- text columns.
CREATE OR REPLACE TABLE datamart.housing_temp AS
SELECT
    'US' AS country,
    NULL AS category,
    NULL AS subcategory,
    TRIM(_Label_) AS label,
    CAST(United_States__Estimate AS NUMERIC) AS estimate,
    REGEXP_REPLACE(United_States__Margin_of_Error, '[^0-9.]', '') AS error_margin,
    REGEXP_REPLACE(United_States__Percent, '[^0-9.]', '') AS percent,
    REGEXP_REPLACE(United_States__Percent_Margin_of_Error, '[^0-9.]', '') AS error_margin_percent
FROM us_census_bureau_staging.acs_housing

#### update blank strings to nulls

In [25]:
%%bigquery
-- Replace empty-string error_margin values with NULL.
UPDATE datamart.housing_temp
SET error_margin = NULL
WHERE error_margin = ''

In [26]:
%%bigquery
-- Replace empty-string error_margin_percent values with NULL.
UPDATE datamart.housing_temp
SET error_margin_percent = NULL
WHERE error_margin_percent = ''

In [27]:
%%bigquery
-- Replace empty-string percent values with NULL.
UPDATE datamart.housing_temp
SET percent = NULL
WHERE percent = ''

#### create table housing

In [26]:
%%bigquery
-- Typed copy of housing_temp: the category placeholders become STRING and the
-- cleaned margin / percent text columns become NUMERIC.
CREATE OR REPLACE TABLE datamart.housing AS
SELECT
    country,
    CAST(category AS STRING) AS category,
    CAST(subcategory AS STRING) AS subcategory,
    label,
    estimate,
    CAST(error_margin AS NUMERIC) AS error_margin,
    CAST(percent AS NUMERIC) AS percent,
    CAST(error_margin_percent AS NUMERIC) AS error_margin_percent
FROM datamart.housing_temp

#### remove non-data rows

In [53]:
%%bigquery
-- Remove rows that carry no data at all, i.e. all four measure columns are NULL.
-- The last predicate previously repeated error_margin, so error_margin_percent was
-- never checked and a row holding only that value would have been deleted.
DELETE FROM datamart.housing
WHERE estimate IS NULL
    AND error_margin IS NULL
    AND percent IS NULL
    AND error_margin_percent IS NULL

#### update records with corresponding category and subcategory

In [6]:
%%bigquery
-- category = 'housing units' for the listed labels.
UPDATE datamart.housing
SET category = 'housing units'
WHERE label IN (
    'Total housing units', 'Occupied housing units', 'Vacant housing units',
    'Homeowner vacancy rate', 'Rental vacancy rate',
    '1-unit, detached', '1-unit, attached', '2 units', '3 or 4 units',
    '5 to 9 units', '10 to 19 units', '20 or more units',
    'Mobile home', 'Boat, RV, van, etc.',
    'Built 2014 or later', 'Built 2010 to 2013', 'Built 2000 to 2009',
    'Built 1990 to 1999', 'Built 1980 to 1989', 'Built 1970 to 1979',
    'Built 1960 to 1969', 'Built 1950 to 1959', 'Built 1940 to 1949',
    'Built 1939 or earlier',
    '1 room', '2 rooms', '3 rooms', '4 rooms', '5 rooms', '6 rooms',
    '7 rooms', '8 rooms', '9 rooms or more', 'Median rooms',
    'No bedroom', '1 bedroom', '2 bedrooms', '3 bedrooms', '4 bedrooms',
    '5 or more bedrooms'
)

In [15]:
%%bigquery
-- subcategory = 'housing occupancy' for the occupancy / vacancy labels.
UPDATE datamart.housing
SET subcategory = 'housing occupancy'
WHERE label IN (
    'Occupied housing units',
    'Vacant housing units',
    'Homeowner vacancy rate',
    'Rental vacancy rate'
)

In [7]:
%%bigquery
-- subcategory = 'units in structure' for the structure-size labels.
UPDATE datamart.housing
SET subcategory = 'units in structure'
WHERE label IN (
    '1-unit, detached', '1-unit, attached', '2 units', '3 or 4 units',
    '5 to 9 units', '10 to 19 units', '20 or more units',
    'Mobile home', 'Boat, RV, van, etc.'
)

In [8]:
%%bigquery
-- subcategory = 'housing tenure' for the owner / renter labels.
UPDATE datamart.housing
SET subcategory = 'housing tenure'
WHERE label IN (
    'Owner-occupied',
    'Renter-occupied',
    'Average household size of owner-occupied unit',
    'Average household size of renter-occupied unit'
)

In [29]:
%%bigquery
-- subcategory = 'built year' for the 'Built ...' labels.
UPDATE datamart.housing
SET subcategory = 'built year'
WHERE label IN (
    'Built 2014 or later', 'Built 2010 to 2013', 'Built 2000 to 2009',
    'Built 1990 to 1999', 'Built 1980 to 1989', 'Built 1970 to 1979',
    'Built 1960 to 1969', 'Built 1950 to 1959', 'Built 1940 to 1949',
    'Built 1939 or earlier'
)

In [30]:
%%bigquery
-- subcategory = 'rooms' for the room-count labels.
UPDATE datamart.housing
SET subcategory = 'rooms'
WHERE label IN (
    '1 room', '2 rooms', '3 rooms', '4 rooms', '5 rooms',
    '6 rooms', '7 rooms', '8 rooms', '9 rooms or more',
    'Median rooms'
)

In [31]:
%%bigquery
-- subcategory = 'bedrooms' for the bedroom-count labels.
UPDATE datamart.housing
SET subcategory = 'bedrooms'
WHERE label IN (
    'No bedroom', '1 bedroom', '2 bedrooms', '3 bedrooms',
    '4 bedrooms', '5 or more bedrooms'
)

In [32]:
%%bigquery
-- category = 'occupied housing units' for the listed labels.
UPDATE datamart.housing
SET category = 'occupied housing units'
WHERE label IN (
    'Owner-occupied', 'Renter-occupied',
    'Average household size of owner-occupied unit',
    'Average household size of renter-occupied unit',
    'Moved in 2017 or later', 'Moved in 2015 to 2016', 'Moved in 2010 to 2014',
    'Moved in 2000 to 2009', 'Moved in 1990 to 1999', 'Moved in 1989 and earlier',
    'No vehicles available', '1 vehicle available', '2 vehicles available',
    '3 or more vehicles available',
    'Utility gas', 'Bottled, tank, or LP gas', 'Electricity',
    'Fuel oil, kerosene, etc.', 'Coal or coke', 'Wood', 'Solar energy',
    'Other fuel', 'No fuel used',
    'Lacking complete plumbing facilities', 'Lacking complete kitchen facilities',
    'No telephone service available',
    '1.00 or less', '1.01 to 1.50', '1.51 or more'
)

In [33]:
%%bigquery
-- subcategory = 'moved in year' for the 'Moved in ...' labels.
UPDATE datamart.housing
SET subcategory = 'moved in year'
WHERE label IN (
    'Moved in 2017 or later', 'Moved in 2015 to 2016', 'Moved in 2010 to 2014',
    'Moved in 2000 to 2009', 'Moved in 1990 to 1999', 'Moved in 1989 and earlier'
)

In [34]:
%%bigquery
-- subcategory = 'vehicles' for the vehicle-availability labels.
UPDATE datamart.housing
SET subcategory = 'vehicles'
WHERE label IN (
    'No vehicles available',
    '1 vehicle available',
    '2 vehicles available',
    '3 or more vehicles available'
)

In [35]:
%%bigquery
-- subcategory = 'house heating fuel' for the fuel labels.
UPDATE datamart.housing
SET subcategory = 'house heating fuel'
WHERE label IN (
    'Utility gas', 'Bottled, tank, or LP gas', 'Electricity',
    'Fuel oil, kerosene, etc.', 'Coal or coke', 'Wood',
    'Solar energy', 'Other fuel', 'No fuel used'
)

In [36]:
%%bigquery
-- subcategory = 'selected characteristics' for the three 'lacking / no service' labels.
UPDATE datamart.housing
SET subcategory = 'selected characteristics'
WHERE label IN (
    'Lacking complete plumbing facilities',
    'Lacking complete kitchen facilities',
    'No telephone service available'
)

In [37]:
%%bigquery
-- subcategory = 'occupants per room' for the three ratio-bracket labels.
UPDATE datamart.housing
SET subcategory = 'occupants per room'
WHERE label IN (
    '1.00 or less',
    '1.01 to 1.50',
    '1.51 or more'
)

In [9]:
%%bigquery
-- category = 'owner-occupied units' for the listed labels and the 'Median (dollars)'
-- row identified by estimate 217500 / error_margin 180.
UPDATE datamart.housing
SET category = 'owner-occupied units'
WHERE label IN (
        'Owner-occupied units',
        'Less than $50,000', '$50,000 to $99,999', '$100,000 to $149,999',
        '$150,000 to $199,999', '$200,000 to $299,999', '$300,000 to $499,999',
        '$500,000 to $999,999', '$1,000,000 or more',
        'Housing units with a mortgage', 'Housing units without a mortgage'
    )
    OR (label = 'Median (dollars)' AND estimate = 217500 AND error_margin = 180)

In [39]:
%%bigquery
-- subcategory = 'value' for the price-bracket labels and the 'Median (dollars)' row
-- identified by estimate 217500 / error_margin 180.
UPDATE datamart.housing
SET subcategory = 'value'
WHERE label IN (
        'Less than $50,000', '$50,000 to $99,999', '$100,000 to $149,999',
        '$150,000 to $199,999', '$200,000 to $299,999', '$300,000 to $499,999',
        '$500,000 to $999,999', '$1,000,000 or more'
    )
    OR (label = 'Median (dollars)' AND estimate = 217500 AND error_margin = 180)

In [40]:
%%bigquery
-- subcategory = 'mortgage status' for the with / without mortgage labels.
UPDATE datamart.housing
SET subcategory = 'mortgage status'
WHERE label IN (
    'Housing units with a mortgage',
    'Housing units without a mortgage'
)

In [13]:
%%bigquery
-- category = 'housing units with mortgage'. Most rows are identified by their exact
-- estimate / error_margin / percent values in addition to the label; three labels
-- are matched by label alone.
UPDATE datamart.housing
SET category = 'housing units with mortgage'
WHERE (label = 'Less than $500' AND estimate = 569949 AND error_margin = 6363 AND percent = 1.2)
    OR (label = '$500 to $999' AND estimate = 8242060 AND error_margin = 44702 AND percent = 17)
    OR (label = '$1,000 to $1,499' AND estimate = 13248839 AND error_margin = 67426 AND percent = 27.4)
    OR (label = '$1,500 to $1,999' AND estimate = 10169830 AND error_margin = 54768 AND percent = 21)
    OR (label = '$2,000 to $2,499' AND estimate = 6274095 AND error_margin = 36485 AND percent = 13)
    OR (label = '$2,500 to $2,999' AND estimate = 3841335 AND error_margin = 23746 AND percent = 7.9)
    OR (label = '$3,000 or more' AND estimate = 6070519 AND error_margin = 31722 AND percent = 12.5)
    OR (label = 'Median (dollars)' AND estimate = 1595 AND error_margin = 2)
    OR label = 'Less than 20.0 percent'
    OR (label = '20.0 to 24.9 percent' AND estimate = 7584745 AND error_margin = 48798 AND percent = 15.7)
    OR (label = '25.0 to 29.9 percent' AND estimate = 5072173 AND error_margin = 29346 AND percent = 10.5)
    OR (label = '30.0 to 34.9 percent' AND estimate = 3330974 AND error_margin = 18266 AND percent = 6.9)
    OR (label = '35.0 percent or more' AND estimate = 10069038 AND error_margin = 23122 AND percent = 20.9)
    OR (label = 'Not computed' AND estimate = 233653 AND error_margin = 3535)
    OR label = 'Housing units with a mortgage'
    OR label = 'Housing units with a mortgage (excluding units where SMOCAPI cannot be computed)'

In [42]:
%%bigquery
-- subcategory = 'selected monthly owner costs' for the cost-bracket rows identified
-- by their exact estimate / error_margin / percent values.
UPDATE datamart.housing
SET subcategory = 'selected monthly owner costs'
WHERE (label = 'Less than $500' AND estimate = 569949 AND error_margin = 6363 AND percent = 1.2)
    OR (label = '$500 to $999' AND estimate = 8242060 AND error_margin = 44702 AND percent = 17)
    OR (label = '$1,000 to $1,499' AND estimate = 13248839 AND error_margin = 67426 AND percent = 27.4)
    OR (label = '$1,500 to $1,999' AND estimate = 10169830 AND error_margin = 54768 AND percent = 21)
    OR (label = '$2,000 to $2,499' AND estimate = 6274095 AND error_margin = 36485 AND percent = 13)
    OR (label = '$2,500 to $2,999' AND estimate = 3841335 AND error_margin = 23746 AND percent = 7.9)
    OR (label = '$3,000 or more' AND estimate = 6070519 AND error_margin = 31722 AND percent = 12.5)
    OR (label = 'Median (dollars)' AND estimate = 1595 AND error_margin = 2)

In [43]:
%%bigquery
-- Tag the owner-cost-to-income percentage brackets with their subcategory.
-- 'Less than 20.0 percent' is matched on label alone; the other brackets are
-- matched together with their exact figures.
UPDATE datamart.housing
SET subcategory = 'selected monthly owner costs as a percentage of household income'
WHERE
    label = 'Less than 20.0 percent'
    OR (label = '20.0 to 24.9 percent' AND estimate = 7584745 AND error_margin = 48798 AND percent = 15.7)
    OR (label = '25.0 to 29.9 percent' AND estimate = 5072173 AND error_margin = 29346 AND percent = 10.5)
    OR (label = '30.0 to 34.9 percent' AND estimate = 3330974 AND error_margin = 18266 AND percent = 6.9)
    OR (label = '35.0 percent or more' AND estimate = 10069038 AND error_margin = 23122 AND percent = 20.9)
    OR (label = 'Not computed' AND estimate = 233653 AND error_margin = 3535)

In [11]:
%%bigquery
-- Assign the 'housing units without mortgage' category.
-- Some labels are matched by text alone; the rest are matched together with
-- their exact figures.
UPDATE datamart.housing
SET category = 'housing units without mortgage'
WHERE
    label IN (
        'Less than $250',
        '$250 to $399',
        '$400 to $599',
        '$600 to $799',
        '$800 to $999',
        '$1,000 or more',
        'Less than 10.0 percent',
        '10.0 to 14.9 percent'
    )
    OR (label = 'Median (dollars)' AND estimate = 500 AND error_margin = 1)
    OR (label = '15.0 to 19.9 percent' AND estimate = 3154745 AND error_margin = 16364 AND percent = 11.1)
    OR (label = '20.0 to 24.9 percent' AND estimate = 1916515 AND error_margin = 11272 AND percent = 6.7)
    OR (label = '25.0 to 29.9 percent' AND estimate = 1235715 AND error_margin = 7526 AND percent = 4.3)
    OR (label = '30.0 to 34.9 percent' AND estimate = 836924 AND error_margin = 6646 AND percent = 2.9)
    OR (label = '35.0 percent or more' AND estimate = 3010014 AND error_margin = 15340 AND percent = 10.6)
    OR (label = 'Not computed' AND estimate = 394725 AND error_margin = 4268)
    -- Group header rows, matched by label only.
    OR label IN (
        'Housing units without a mortgage',
        'Housing unit without a mortgage (excluding units where SMOCAPI cannot be computed)'
    )

In [45]:
%%bigquery
-- Tag these dollar brackets, plus the median row with estimate 500, with the
-- owner-costs subcategory.
UPDATE datamart.housing
SET subcategory = 'selected monthly owner costs'
WHERE
    label IN (
        'Less than $250',
        '$250 to $399',
        '$400 to $599',
        '$600 to $799',
        '$800 to $999',
        '$1,000 or more'
    )
    OR (label = 'Median (dollars)' AND estimate = 500 AND error_margin = 1)

In [46]:
%%bigquery
-- Tag these owner-cost-to-income percentage brackets with their subcategory.
-- The two lowest brackets are matched by label alone; the remaining rows are
-- matched together with their exact figures.
UPDATE datamart.housing
SET subcategory = 'selected monthly owner costs as a percentage of household income'
WHERE
    label IN (
        'Less than 10.0 percent',
        '10.0 to 14.9 percent'
    )
    OR (label = '15.0 to 19.9 percent' AND estimate = 3154745 AND error_margin = 16364 AND percent = 11.1)
    OR (label = '20.0 to 24.9 percent' AND estimate = 1916515 AND error_margin = 11272 AND percent = 6.7)
    OR (label = '25.0 to 29.9 percent' AND estimate = 1235715 AND error_margin = 7526 AND percent = 4.3)
    OR (label = '30.0 to 34.9 percent' AND estimate = 836924 AND error_margin = 6646 AND percent = 2.9)
    OR (label = '35.0 percent or more' AND estimate = 3010014 AND error_margin = 15340 AND percent = 10.6)
    OR (label = 'Not computed' AND estimate = 394725 AND error_margin = 4268)

In [14]:
%%bigquery
-- Assign the 'occupied units paying rent' category to the rent dollar brackets,
-- the rent-to-income percentage brackets and the two group header rows.
-- Most labels are matched together with their exact figures.
UPDATE datamart.housing
SET category = 'occupied units paying rent'
WHERE
    -- Dollar brackets.
    (label = 'Less than $500' AND estimate = 3865037 AND error_margin = 13581 AND percent = 9.4)
    OR (label = '$500 to $999' AND estimate = 14956085 AND error_margin = 47591 AND percent = 36.2)
    OR (label = '$1,000 to $1,499' AND estimate = 12403502 AND error_margin = 50982 AND percent = 30)
    OR (label = '$1,500 to $1,999' AND estimate = 5796670 AND error_margin = 32779 AND percent = 14)
    OR (label = '$2,000 to $2,499' AND estimate = 2324841 AND error_margin = 14722 AND percent = 5.6)
    OR (label = '$2,500 to $2,999' AND estimate = 987766 AND error_margin = 8283 AND percent = 2.4)
    OR (label = '$3,000 or more' AND estimate = 977971 AND error_margin = 7308 AND percent = 2.4)
    OR (label = 'Median (dollars)' AND estimate = 1062 AND error_margin = 1)
    OR (label = 'No rent paid' AND estimate = 2169795 AND error_margin = 10848)
    -- Percentage-of-income brackets.
    OR label = 'Less than 15.0 percent'
    OR (label = '15.0 to 19.9 percent' AND estimate = 5195435 AND error_margin = 23159 AND percent = 12.9)
    OR (label = '20.0 to 24.9 percent' AND estimate = 5190296 AND error_margin = 21539 AND percent = 12.9)
    OR (label = '25.0 to 29.9 percent' AND estimate = 4681621 AND error_margin = 19362 AND percent = 11.6)
    OR (label = '30.0 to 34.9 percent' AND estimate = 3674673 AND error_margin = 17612 AND percent = 9.1)
    OR (label = '35.0 percent or more' AND estimate = 16328272 AND error_margin = 67971 AND percent = 40.5)
    OR (label = 'Not computed' AND estimate = 3115329 AND error_margin = 16031)
    -- Group header rows, matched by label only.
    OR label IN (
        'Occupied units paying rent',
        'Occupied units paying rent (excluding units where GRAPI cannot be computed)'
    )

In [48]:
%%bigquery
-- Tag the rent dollar brackets, the median rent row and the 'No rent paid' row
-- with the 'gross rent' subcategory. Every label is matched together with its
-- exact figures.
UPDATE datamart.housing
SET subcategory = 'gross rent'
WHERE
    (label = 'Less than $500' AND estimate = 3865037 AND error_margin = 13581 AND percent = 9.4)
    OR (label = '$500 to $999' AND estimate = 14956085 AND error_margin = 47591 AND percent = 36.2)
    OR (label = '$1,000 to $1,499' AND estimate = 12403502 AND error_margin = 50982 AND percent = 30)
    OR (label = '$1,500 to $1,999' AND estimate = 5796670 AND error_margin = 32779 AND percent = 14)
    OR (label = '$2,000 to $2,499' AND estimate = 2324841 AND error_margin = 14722 AND percent = 5.6)
    OR (label = '$2,500 to $2,999' AND estimate = 987766 AND error_margin = 8283 AND percent = 2.4)
    OR (label = '$3,000 or more' AND estimate = 977971 AND error_margin = 7308 AND percent = 2.4)
    -- These two rows are matched without a percent test.
    OR (label = 'Median (dollars)' AND estimate = 1062 AND error_margin = 1)
    OR (label = 'No rent paid' AND estimate = 2169795 AND error_margin = 10848)

In [49]:
%%bigquery
-- Tag the rent-to-income percentage brackets with their subcategory.
-- 'Less than 15.0 percent' is matched on label alone; the other brackets are
-- matched together with their exact figures.
UPDATE datamart.housing
SET subcategory = 'gross rent as a percentage of household income'
WHERE
    label = 'Less than 15.0 percent'
    OR (label = '15.0 to 19.9 percent' AND estimate = 5195435 AND error_margin = 23159 AND percent = 12.9)
    OR (label = '20.0 to 24.9 percent' AND estimate = 5190296 AND error_margin = 21539 AND percent = 12.9)
    OR (label = '25.0 to 29.9 percent' AND estimate = 4681621 AND error_margin = 19362 AND percent = 11.6)
    OR (label = '30.0 to 34.9 percent' AND estimate = 3674673 AND error_margin = 17612 AND percent = 9.1)
    OR (label = '35.0 percent or more' AND estimate = 16328272 AND error_margin = 67971 AND percent = 40.5)
    OR (label = 'Not computed' AND estimate = 3115329 AND error_margin = 16031)

#### make fields null in percent column if they're not percentages

In [16]:
%%bigquery
-- Blank out percent on rows where it simply repeats the estimate value.
UPDATE datamart.housing
SET percent = NULL
WHERE estimate = percent

#### remove duplicate records

In [51]:
%%bigquery
-- Rebuild the table in place, keeping a single copy of each fully identical row.
CREATE OR REPLACE TABLE datamart.housing AS
SELECT DISTINCT h.*
FROM datamart.housing AS h

In [105]:
%%bigquery
-- Drop the rows for these three labels whose percent is NULL.
DELETE FROM datamart.housing
WHERE
    percent IS NULL
    AND label IN (
        'Occupied housing units',
        'Housing units with a mortgage',
        'Housing units without a mortgage'
    )

#### PK check: if the two counts match, there are no duplicate keys

In [106]:
%%bigquery
-- Compare the number of distinct (category, subcategory, label) combinations
-- with the total row count; equal numbers mean no combination occurs twice.
WITH candidate_keys AS (
    SELECT DISTINCT
        category,
        subcategory,
        label
    FROM datamart.housing
)
SELECT
    (SELECT COUNT(*) FROM candidate_keys) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.housing) AS housing_count

Unnamed: 0,distinct_PK,housing_count
0,130,130


#### FK check

In [107]:
%%bigquery
-- Count housing rows whose country has no partner row in datamart.locations.
-- An unmatched LEFT JOIN leaves l.id NULL; a matched location whose id is
-- itself NULL would be counted as well.
SELECT
    COUNT(*) AS foreign_key_violations
FROM datamart.housing AS h
LEFT JOIN datamart.locations AS l
    ON h.country = l.country
WHERE
    l.id IS NULL

Unnamed: 0,foreign_key_violations
0,0


### Transforms for table economics

#### Create economics temp

In [1]:
%%bigquery
-- Stage the ACS economics rows in the datamart column layout.
--   * country is the constant 'US'; category and subcategory start out as NULL.
--   * label is trimmed; the estimate is cast to NUMERIC.
--   * the three remaining measures are stripped of every character other than
--     digits and '.', which can leave an empty string.
CREATE OR REPLACE TABLE datamart.economics_temp AS
SELECT
    'US' AS country,
    NULL AS category,
    NULL AS subcategory,
    TRIM(Label) AS label,
    CAST(United_States__Estimate AS NUMERIC) AS estimate,
    REGEXP_REPLACE(United_States__Margin_of_Error, '[^0-9.]', '') AS error_margin,
    REGEXP_REPLACE(United_States__Percent, '[^0-9.]', '') AS percent,
    REGEXP_REPLACE(United_States__Percent_Margin_of_Error, '[^0-9.]', '') AS error_margin_percent
FROM us_census_bureau_staging.acs_economics

#### Update blank strings to nulls

In [2]:
%%bigquery
-- Replace empty-string error_margin values with NULL.
UPDATE datamart.economics_temp
SET error_margin = NULL
WHERE error_margin = ''

In [3]:
%%bigquery
-- Replace empty-string error_margin_percent values with NULL.
UPDATE datamart.economics_temp
SET error_margin_percent = NULL
WHERE error_margin_percent = ''

In [4]:
%%bigquery
-- Replace empty-string percent values with NULL.
UPDATE datamart.economics_temp
SET percent = NULL
WHERE percent = ''

#### Create Table Economics

In [5]:
%%bigquery
-- Build the final economics table from the staging copy: category and
-- subcategory become STRING columns, and the three margin / percent columns
-- are cast to NUMERIC. country, label and estimate pass through unchanged.
CREATE OR REPLACE TABLE datamart.economics AS
SELECT
    country,
    CAST(category AS STRING) AS category,
    CAST(subcategory AS STRING) AS subcategory,
    label,
    estimate,
    CAST(error_margin AS NUMERIC) AS error_margin,
    CAST(percent AS NUMERIC) AS percent,
    CAST(error_margin_percent AS NUMERIC) AS error_margin_percent
FROM datamart.economics_temp

#### Remove non-data rows

In [6]:
%%bigquery
-- Remove rows that carry no data: all four measure columns must be NULL.
-- error_margin_percent is tested explicitly so that a row holding only a
-- percent margin is kept.
DELETE FROM datamart.economics
WHERE
    estimate IS NULL
    AND error_margin IS NULL
    AND percent IS NULL
    AND error_margin_percent IS NULL

### Transforms for table social

#### Create social temp

In [7]:
%%bigquery
-- Stage the ACS social rows in the datamart column layout.
--   * country is the constant 'US'; category and subcategory start out as NULL.
--   * label is the trimmed _Label_ column; the estimate is cast to NUMERIC.
--   * the three remaining measures are stripped of every character other than
--     digits and '.', which can leave an empty string.
CREATE OR REPLACE TABLE datamart.social_temp AS
SELECT
    'US' AS country,
    NULL AS category,
    NULL AS subcategory,
    TRIM(_Label_) AS label,
    CAST(United_States__Estimate AS NUMERIC) AS estimate,
    REGEXP_REPLACE(United_States__Margin_of_Error, '[^0-9.]', '') AS error_margin,
    REGEXP_REPLACE(United_States__Percent, '[^0-9.]', '') AS percent,
    REGEXP_REPLACE(United_States__Percent_Margin_of_Error, '[^0-9.]', '') AS error_margin_percent
FROM us_census_bureau_staging.acs_social

#### Update Blank Strings to nulls

In [8]:
%%bigquery
-- Replace empty-string error_margin values with NULL.
UPDATE datamart.social_temp
SET error_margin = NULL
WHERE error_margin = ''

In [9]:
%%bigquery
-- Replace empty-string error_margin_percent values with NULL.
UPDATE datamart.social_temp
SET error_margin_percent = NULL
WHERE error_margin_percent = ''

In [10]:
%%bigquery
-- Replace empty-string percent values with NULL.
UPDATE datamart.social_temp
SET percent = NULL
WHERE percent = ''

#### Create Table Social

In [11]:
%%bigquery
-- Build the final social table from the staging copy: category and
-- subcategory become STRING columns, and the three margin / percent columns
-- are cast to NUMERIC. country, label and estimate pass through unchanged.
CREATE OR REPLACE TABLE datamart.social AS
SELECT
    country,
    CAST(category AS STRING) AS category,
    CAST(subcategory AS STRING) AS subcategory,
    label,
    estimate,
    CAST(error_margin AS NUMERIC) AS error_margin,
    CAST(percent AS NUMERIC) AS percent,
    CAST(error_margin_percent AS NUMERIC) AS error_margin_percent
FROM datamart.social_temp

#### Remove non-data rows

In [12]:
%%bigquery
-- Remove rows that carry no data: all four measure columns must be NULL.
-- error_margin_percent is tested explicitly so that a row holding only a
-- percent margin is kept.
DELETE FROM datamart.social
WHERE
    estimate IS NULL
    AND error_margin IS NULL
    AND percent IS NULL
    AND error_margin_percent IS NULL

### SQL Queries

### Views