### Table Locations Pipeline [1/2]: Remove duplicate locations

##### Because table locations requires two Apache Beam pipelines, the Python files are named locations_beam_1.py and locations_beam_2.py

#### Get unique locations from duplicate latitude/longitude in table locations into locations_Beam

In [2]:
# Execute the first locations pipeline script inside this notebook session.
# NOTE(review): per the heading above, this is expected to populate
# datamart.locations_Beam -- confirm against locations_beam_1.py.
%run locations_beam_1.py

#### Create the locations table without the duplicates

In [1]:
%%bigquery
-- Build datamart.new_locations: every row of datamart.locations except the
-- "abandoned" duplicates.
-- A row is abandoned when all of the following hold:
--   * its latitude and longitude are both non-zero,
--   * some other row (different id) has the same latitude,
--   * some other row (different id) has the same longitude,
--   * its id is absent from datamart.locations_Beam.
-- NOTE(review): the latitude and longitude matches are tested independently,
-- so they may be satisfied by two different rows -- confirm this is the
-- intended definition of a duplicated coordinate pair.
CREATE OR REPLACE TABLE datamart.new_locations AS
WITH abandoned AS (
    SELECT lo.id
    FROM datamart.locations AS lo
    WHERE lo.latitude != 0
      AND lo.longitude != 0
      AND EXISTS (
            SELECT 1
            FROM datamart.locations AS same_lat
            WHERE same_lat.latitude = lo.latitude
              AND same_lat.id != lo.id
          )
      AND EXISTS (
            SELECT 1
            FROM datamart.locations AS same_long
            WHERE same_long.longitude = lo.longitude
              AND same_long.id != lo.id
          )
      AND lo.id NOT IN (SELECT b.id FROM datamart.locations_Beam AS b)
)
SELECT *
FROM datamart.locations AS l
WHERE l.id NOT IN (SELECT abandoned.id FROM abandoned)

#### Check that only duplicates were removed

In [2]:
%%bigquery
-- Sanity check: report the row count of datamart.locations, the number of
-- rows matching the "abandoned duplicate" predicate, and the row count of
-- datamart.new_locations, side by side in a single result row.
-- The alias spelling `orginal_count` is kept so the output column name does
-- not change.
WITH abandoned AS (
    SELECT lo.id
    FROM datamart.locations AS lo
    WHERE lo.latitude != 0
      AND lo.longitude != 0
      AND EXISTS (
            SELECT 1
            FROM datamart.locations AS same_lat
            WHERE same_lat.latitude = lo.latitude
              AND same_lat.id != lo.id
          )
      AND EXISTS (
            SELECT 1
            FROM datamart.locations AS same_long
            WHERE same_long.longitude = lo.longitude
              AND same_long.id != lo.id
          )
      AND lo.id NOT IN (SELECT b.id FROM datamart.locations_Beam AS b)
)
SELECT
    COUNT(*) AS orginal_count,
    (SELECT COUNT(*) FROM abandoned) AS num_duplicates,
    (SELECT COUNT(*) FROM datamart.new_locations) AS new_count
FROM datamart.locations

Unnamed: 0,orginal_count,num_duplicates,new_count
0,4413,108,4305


#### PK check: no duplicates if same number

In [3]:
%%bigquery
-- Primary-key check for datamart.new_locations: the number of distinct id
-- values is shown next to the total row count; equal numbers mean no id
-- appears more than once.
WITH distinct_ids AS (
    SELECT id
    FROM datamart.new_locations
    GROUP BY id
)
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.new_locations) AS locations_count
FROM distinct_ids

Unnamed: 0,distinct_PK,locations_count
0,4305,4305


### SQL Transforms: Update location_id of the orphaned records in table cases

##### Note: No Apache Beam pipeline was written for table cases, therefore the cases table is being modified directly rather than making a new table called cases_Beam

#### Create a junction table to match abandoned ids to kept ids

In [4]:
%%bigquery
CREATE OR REPLACE TABLE datamart.abandoned_junction AS
SELECT lo.id, lo.latitude, lo.longitude
FROM datamart.locations AS lo
WHERE (lo.latitude IN (SELECT lat.latitude FROM datamart.locations AS lat WHERE lo.latitude=lat.latitude AND lo.id!=lat.id) 
AND longitude IN (SELECT long.longitude FROM datamart.locations AS long WHERE lo.longitude=long.longitude AND lo.id!=long.id) 
AND lo.latitude!=0 
AND lo.longitude!=0)
AND lo.id NOT IN (SELECT b.id FROM datamart.locations_Beam AS b)

In [5]:
%%bigquery
-- Preview the kept-id / removed-id pairing: each abandoned row is matched to
-- the datamart.locations_Beam row with the same latitude and longitude.
-- Requires abandoned_junction to still carry its id/latitude/longitude
-- columns, i.e. this must run before the table is rebuilt as keep_id/rid_id.
SELECT
    b.id AS keep_id,
    a.id AS rid_id,
    a.latitude,
    a.longitude
FROM datamart.locations_Beam AS b
INNER JOIN datamart.abandoned_junction AS a
    ON a.latitude = b.latitude
   AND a.longitude = b.longitude


Unnamed: 0,keep_id,rid_id,latitude,longitude
0,80692389010230126,7902239636534001922,13.444300,144.793700
1,80692389010230126,-1927967959511792155,13.444300,144.793700
2,80692389010230126,-7958264304576551061,13.444300,144.793700
3,-6125909817942919488,-8744039356309243705,15.097900,145.673900
4,-6125909817942919488,3620381022065474036,15.097900,145.673900
...,...,...,...,...
103,-4001783368444537860,-286247227594574592,37.854472,-111.441876
104,-4001783368444537860,-4115263533047990398,37.854472,-111.441876
105,-3162112320542996385,8800898412804184444,60.244297,-151.538888
106,4683806234748901016,-6268268396950388964,62.313050,-149.574174


In [6]:
%%bigquery
-- Rebuild datamart.abandoned_junction in place as a two-column mapping
-- (keep_id, rid_id) by pairing each abandoned row with the locations_Beam
-- row that has the same latitude and longitude.
-- The statement reads the table it replaces, so it cannot be re-run on its
-- own: after one execution the id/latitude/longitude columns no longer exist.
CREATE OR REPLACE TABLE datamart.abandoned_junction AS
SELECT
    b.id AS keep_id,
    a.id AS rid_id
FROM datamart.locations_Beam AS b
INNER JOIN datamart.abandoned_junction AS a
    ON a.latitude = b.latitude
   AND a.longitude = b.longitude

#### Update table cases: change location_id from the abandoned location_id to the kept ones

In [7]:
%%bigquery
-- Re-point cases at the kept location: every case whose location_id is a
-- removed id (rid_id) in datamart.abandoned_junction gets the matching
-- keep_id.
-- The update is driven by the junction table itself rather than by
-- re-deriving the duplicate predicate. Previously a case whose location was
-- flagged as a duplicate but had no junction row received NULL from the
-- scalar subquery, silently erasing its location_id. Such rows are now left
-- untouched, so the FK check can surface them.
-- BigQuery raises an error if one case row matches more than one junction
-- row, just as the scalar subquery did.
UPDATE datamart.cases AS c
SET location_id = a.keep_id
FROM datamart.abandoned_junction AS a
WHERE c.location_id = a.rid_id

#### PK check: no duplicates if same number

In [8]:
%%bigquery
-- Primary-key check for datamart.cases: the number of distinct
-- (location_id, last_update) pairs is shown next to the total row count;
-- equal numbers mean no pair appears more than once.
WITH distinct_keys AS (
    SELECT
        location_id,
        last_update
    FROM datamart.cases
    GROUP BY location_id, last_update
)
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.cases) AS cases_count
FROM distinct_keys

Unnamed: 0,distinct_PK,cases_count
0,1475736,1475736


##### Remove duplicates based on confirmed, deaths, and recovered cases

In [9]:
%%bigquery
-- Rewrite datamart.cases keeping exactly one row per
-- (location_id, last_update): the row with the highest confirmed, then
-- deaths, then recovered counts.
-- ROW_NUMBER() is used instead of RANK() because RANK() assigns 1 to every
-- row in a tie, which would leave several rows for the same key. The
-- remaining columns are added as tie-breakers so the surviving row is chosen
-- deterministically unless the rows are identical in every listed column.
CREATE OR REPLACE TABLE datamart.cases AS
SELECT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM (
    SELECT
        c.*,
        ROW_NUMBER() OVER (
            PARTITION BY c.location_id, c.last_update
            ORDER BY
                c.confirmed DESC,
                c.deaths DESC,
                c.recovered DESC,
                c.active DESC,
                c.incident_rate DESC,
                c.case_fatality_ratio DESC
        ) AS row_num
    FROM datamart.cases AS c
)
WHERE row_num = 1

#### PK re-check: no duplicates if same number

In [10]:
%%bigquery
-- Repeat of the datamart.cases key check after de-duplication: distinct
-- (location_id, last_update) pairs versus total rows; the two numbers match
-- when every pair occurs once.
WITH distinct_keys AS (
    SELECT
        location_id,
        last_update
    FROM datamart.cases
    GROUP BY location_id, last_update
)
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.cases) AS cases_count
FROM distinct_keys

Unnamed: 0,distinct_PK,cases_count
0,1475736,1475736


#### FK check

In [11]:
%%bigquery
-- Foreign-key check: count the rows of datamart.cases whose location_id has
-- no matching id in datamart.new_locations (rows with a NULL location_id are
-- counted too, since they match nothing). Zero means every case points at an
-- existing location.
SELECT COUNT(*) AS foreign_key_violations
FROM datamart.cases AS c
WHERE NOT EXISTS (
    SELECT 1
    FROM datamart.new_locations AS l
    WHERE l.id = c.location_id
)

Unnamed: 0,foreign_key_violations
0,0


### Table Locations Pipeline [2/2]: Standardize state names

##### Because table locations requires two Apache Beam pipelines, the Python files are named locations_beam_1.py and locations_beam_2.py

#### Get locations with abbreviated state names from table locations and standardize them into locations_Beam

In [2]:
# Execute the second locations pipeline script inside this notebook session.
# NOTE(review): per the heading above, this is expected to (re)write
# datamart.locations_Beam with the standardized-state rows -- confirm against
# locations_beam_2.py.
%run locations_beam_2.py

#### Create the complete locations_Beam table by replacing the unstandardized state rows with the standardized rows

In [1]:
%%bigquery
-- Drop from datamart.new_locations every row whose id is present in
-- datamart.locations_Beam, leaving only the rows the Beam output does not
-- cover. The table is replaced in place (it is both source and target).
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL id in
-- locations_Beam makes the predicate non-true for every row and would
-- silently produce an empty table.
CREATE OR REPLACE TABLE datamart.new_locations AS
SELECT *
FROM datamart.new_locations AS l
WHERE NOT EXISTS (
    SELECT 1
    FROM datamart.locations_Beam AS b
    WHERE b.id = l.id
)

In [2]:
%%bigquery
-- Replace datamart.locations_Beam with the concatenation of
-- datamart.new_locations and the current contents of locations_Beam.
-- UNION ALL pairs columns by position, so both tables must list their
-- columns in the same order.
-- The statement reads the table it replaces: running it a second time
-- appends the new_locations rows again.
CREATE OR REPLACE TABLE datamart.locations_Beam AS
WITH
    carried_over AS (
        SELECT *
        FROM datamart.new_locations
    ),
    beam_rows AS (
        SELECT *
        FROM datamart.locations_Beam
    )
SELECT *
FROM carried_over
UNION ALL
SELECT *
FROM beam_rows

#### PK check: no duplicates if same number

In [3]:
%%bigquery
-- Primary-key check for datamart.locations_Beam: distinct id values versus
-- total rows; equal numbers mean no id appears more than once.
WITH distinct_ids AS (
    SELECT id
    FROM datamart.locations_Beam
    GROUP BY id
)
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.locations_Beam) AS locations_count
FROM distinct_ids

Unnamed: 0,distinct_PK,locations_count
0,4305,4305


### SQL Transforms on table locations_Beam

In [4]:
%%bigquery
-- List each state value found on US rows of datamart.locations_Beam with its
-- row count, ordered by state, to spot non-standard state names.
-- GROUP BY already yields one row per state, so no DISTINCT is needed.
SELECT
    state,
    COUNT(*) AS count
FROM datamart.locations_Beam
WHERE country = 'US'
GROUP BY state
ORDER BY state

Unnamed: 0,state,count
0,,1
1,CA (From Diamond Princess),1
2,D.C.,1
3,NE (From Diamond Princess),1
4,TX (From Diamond Princess),1
...,...,...
64,Washington,53
65,West Virginia,57
66,Wisconsin,76
67,Wuhan Evacuee,1


In [5]:
%%bigquery
-- Rename the ' D.C.' state value to 'District of Columbia'.
-- The match literal begins with a space and is reproduced exactly.
-- NOTE(review): presumably the stored values carry that leading space --
-- confirm against the data.
UPDATE datamart.locations_Beam AS loc
SET
    state = 'District of Columbia'
WHERE loc.state = ' D.C.'

In [6]:
%%bigquery
-- Fold the ' TX (From Diamond Princess)' state value into 'Texas' and set a
-- combined_key that keeps the Diamond Princess / Lackland detail.
-- The match literal begins with a space and is reproduced exactly.
UPDATE datamart.locations_Beam AS loc
SET
    state = 'Texas',
    combined_key = 'Diamond Princess, Lackland, Texas, US'
WHERE loc.state = ' TX (From Diamond Princess)'

In [7]:
%%bigquery
-- Fold the ' NE (From Diamond Princess)' state value into 'Nebraska' and set
-- a combined_key that keeps the Diamond Princess / Omaha detail.
-- The match literal begins with a space and is reproduced exactly.
UPDATE datamart.locations_Beam AS loc
SET
    state = 'Nebraska',
    combined_key = 'Diamond Princess, Omaha, Nebraska, US'
WHERE loc.state = ' NE (From Diamond Princess)'

In [8]:
%%bigquery
-- Fold the ' CA (From Diamond Princess)' state value into 'California' and
-- set a combined_key that keeps the Diamond Princess / Travis detail.
-- The match literal begins with a space and is reproduced exactly.
UPDATE datamart.locations_Beam AS loc
SET
    state = 'California',
    combined_key = 'Diamond Princess, Travis, California, US'
WHERE loc.state = ' CA (From Diamond Princess)'

In [9]:
%%bigquery
-- Re-list the state values on US rows of datamart.locations_Beam with their
-- row counts, ordered by state, to confirm the preceding state fix-ups.
-- GROUP BY already yields one row per state, so no DISTINCT is needed.
SELECT
    state,
    COUNT(*) AS count
FROM datamart.locations_Beam
WHERE country = 'US'
GROUP BY state
ORDER BY state

Unnamed: 0,state,count
0,,1
1,Alabama,71
2,Alaska,37
3,American Samoa,1
4,Arizona,22
...,...,...
60,Washington,53
61,West Virginia,57
62,Wisconsin,76
63,Wuhan Evacuee,1


#### PK check: no duplicates if same number

In [10]:
%%bigquery
-- Final primary-key check for datamart.locations_Beam after the state
-- updates: distinct id values versus total rows; equal numbers mean no id
-- appears more than once.
WITH distinct_ids AS (
    SELECT id
    FROM datamart.locations_Beam
    GROUP BY id
)
SELECT
    COUNT(*) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.locations_Beam) AS locations_count
FROM distinct_ids

Unnamed: 0,distinct_PK,locations_count
0,4305,4305


### Temp tables were deleted via BigQuery's UI