### Remove duplicate locations in table locations

#### Extract one unique location per duplicated latitude/longitude pair from table locations into locations_Beam

In [5]:
# Runs the external script locations_beam.py (not shown in this notebook).
# Presumably this is what populates datamart.locations_Beam, per the heading above - confirm against the script.
%run locations_beam.py

#### Create the locations table without the duplicates

In [1]:
%%bigquery
-- Rebuild the locations table without the redundant copies of duplicated coordinates.
-- A row is dropped only when ALL of the following hold:
--   (a) its latitude and longitude are both non-zero,
--   (b) one other row (different id) has the same latitude AND the same longitude,
--   (c) its id is not listed in locations_Beam (the ids that are kept).
-- The coordinate pair must match within a single other row. Testing latitude and
-- longitude with two independent subqueries would also flag a row whose latitude
-- matches one row and whose longitude matches a different row, even though its
-- coordinate pair is unique.
-- NOT EXISTS is used for the locations_Beam test so a NULL id in that table cannot
-- silently disable the filter the way NOT IN would.
CREATE OR REPLACE TABLE datamart.new_locations AS
SELECT *
FROM datamart.locations AS l
WHERE l.id NOT IN (
    SELECT lo.id
    FROM datamart.locations AS lo
    WHERE lo.latitude != 0
      AND lo.longitude != 0
      AND EXISTS (
          SELECT 1
          FROM datamart.locations AS twin
          WHERE twin.latitude = lo.latitude
            AND twin.longitude = lo.longitude
            AND twin.id != lo.id
      )
      AND NOT EXISTS (
          SELECT 1
          FROM datamart.locations_Beam AS b
          WHERE b.id = lo.id
      )
)

#### Check that only duplicates were removed

In [2]:
%%bigquery
-- Sanity check: orginal_count - num_duplicates should equal new_count.
-- num_duplicates uses exactly the same "abandoned row" predicate as the
-- new_locations build above, so the three numbers are directly comparable.
-- (The orginal_count spelling is kept so the output column name does not change.)
SELECT
    count(*) AS orginal_count,
    (
        SELECT count(*)
        FROM datamart.locations AS lo
        WHERE lo.latitude != 0
          AND lo.longitude != 0
          AND EXISTS (
              SELECT 1
              FROM datamart.locations AS twin
              WHERE twin.latitude = lo.latitude
                AND twin.longitude = lo.longitude
                AND twin.id != lo.id
          )
          AND NOT EXISTS (
              SELECT 1
              FROM datamart.locations_Beam AS b
              WHERE b.id = lo.id
          )
    ) AS num_duplicates,
    (
        SELECT count(*)
        FROM datamart.new_locations
    ) AS new_count
FROM datamart.locations

Unnamed: 0,orginal_count,num_duplicates,new_count
0,4413,108,4305


#### PK check: id is unique if both counts are the same

In [3]:
%%bigquery
-- Primary-key check for new_locations: id is unique when the number of
-- distinct ids equals the total number of rows.
WITH distinct_ids AS (
    SELECT DISTINCT id
    FROM datamart.new_locations
)
SELECT
    (SELECT COUNT(*) FROM distinct_ids) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.new_locations) AS locations_count

Unnamed: 0,distinct_PK,locations_count
0,4305,4305


### Update location_id of the orphaned records in new_cases

#### Make new_cases

In [4]:
%%bigquery
-- Working copy of the cases table; the statements that follow modify
-- new_cases and leave datamart.cases untouched.
CREATE OR REPLACE TABLE datamart.new_cases AS
SELECT src.*
FROM datamart.cases AS src

#### Create a junction table to match abandoned ids to kept ids

In [5]:
%%bigquery
-- Stage 1 of the junction table: the locations that are being abandoned,
-- together with their coordinates so they can later be joined to the kept
-- row in locations_Beam.
-- A location is abandoned when its coordinates are non-zero, one other row
-- (different id) shares BOTH its latitude and its longitude, and its id is
-- not listed in locations_Beam. Matching the pair within a single other row
-- avoids flagging a location whose latitude and longitude each happen to
-- appear on different, unrelated rows.
CREATE OR REPLACE TABLE datamart.abandoned_junction AS
SELECT lo.id, lo.latitude, lo.longitude
FROM datamart.locations AS lo
WHERE lo.latitude != 0
  AND lo.longitude != 0
  AND EXISTS (
      SELECT 1
      FROM datamart.locations AS twin
      WHERE twin.latitude = lo.latitude
        AND twin.longitude = lo.longitude
        AND twin.id != lo.id
  )
  AND NOT EXISTS (
      SELECT 1
      FROM datamart.locations_Beam AS b
      WHERE b.id = lo.id
  )

In [6]:
%%bigquery
-- Preview of the mapping: each abandoned id (rid_id) next to the id kept in
-- locations_Beam (keep_id) that has the same latitude and longitude.
SELECT
    kept.id AS keep_id,
    dropped.id AS rid_id,
    dropped.latitude,
    dropped.longitude
FROM datamart.locations_Beam AS kept
INNER JOIN datamart.abandoned_junction AS dropped
    ON kept.latitude = dropped.latitude
    AND kept.longitude = dropped.longitude


Unnamed: 0,keep_id,rid_id,latitude,longitude
0,80692389010230126,-7958264304576551061,13.444300,144.793700
1,80692389010230126,7902239636534001922,13.444300,144.793700
2,80692389010230126,-1927967959511792155,13.444300,144.793700
3,-6125909817942919488,-8744039356309243705,15.097900,145.673900
4,-6125909817942919488,3620381022065474036,15.097900,145.673900
...,...,...,...,...
103,-4001783368444537860,-286247227594574592,37.854472,-111.441876
104,-4001783368444537860,-4115263533047990398,37.854472,-111.441876
105,-3162112320542996385,8800898412804184444,60.244297,-151.538888
106,4683806234748901016,-6268268396950388964,62.313050,-149.574174


In [7]:
%%bigquery
-- Stage 2 of the junction table: replace abandoned_junction with the final
-- (keep_id, rid_id) mapping by matching coordinates against locations_Beam.
-- NOTE: this overwrites the table it reads from. After it runs the table no
-- longer has latitude/longitude columns, so the stage-1 statement must be
-- run again before this one can be re-executed.
CREATE OR REPLACE TABLE datamart.abandoned_junction AS
SELECT
    kept.id AS keep_id,
    dropped.id AS rid_id
FROM datamart.locations_Beam AS kept
INNER JOIN datamart.abandoned_junction AS dropped
    ON kept.latitude = dropped.latitude
    AND kept.longitude = dropped.longitude

#### Update new_cases: replace each abandoned location_id with the corresponding kept one

In [8]:
%%bigquery
-- Re-point cases from abandoned locations to the kept location.
-- Only rows whose location_id has a mapping in abandoned_junction are touched.
-- Selecting the rows with a separate duplicate predicate and then looking the
-- new id up in a scalar subquery would set location_id to NULL for any
-- selected row that has no mapping; joining on the mapping itself makes that
-- impossible.
-- If an rid_id ever maps to more than one keep_id, BigQuery rejects the
-- statement instead of picking one arbitrarily.
UPDATE datamart.new_cases AS c
SET location_id = a.keep_id
FROM datamart.abandoned_junction AS a
WHERE c.location_id = a.rid_id

#### PK check: (location_id, last_update) is unique if both counts are the same

In [9]:
%%bigquery
-- Primary-key check for new_cases: (location_id, last_update) is unique when
-- the number of distinct pairs equals the total number of rows.
WITH distinct_keys AS (
    SELECT DISTINCT location_id, last_update
    FROM datamart.new_cases
)
SELECT
    (SELECT COUNT(*) FROM distinct_keys) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.new_cases) AS cases_count

Unnamed: 0,distinct_PK,cases_count
0,1475736,1475767


##### Remove duplicates, keeping the row with the highest confirmed, deaths, and recovered counts

In [10]:
%%bigquery
-- Keep, for every (location_id, last_update), the row(s) ranked first when
-- ordered by confirmed, then deaths, then recovered (all descending).
-- RANK() gives tied rows the same rank, so rows that are equal on all three
-- ordering columns all receive rank 1 and all survive this step.
CREATE OR REPLACE TABLE datamart.new_cases AS
WITH ranked_cases AS (
    SELECT
        *,
        RANK() OVER (
            PARTITION BY location_id, last_update
            ORDER BY confirmed DESC, deaths DESC, recovered DESC
        ) AS report_rank
    FROM datamart.new_cases
)
SELECT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM ranked_cases
WHERE report_rank = 1

#### PK re-check: (location_id, last_update) is unique if both counts are the same

In [11]:
%%bigquery
-- Primary-key re-check after the rank-based clean-up: the key
-- (location_id, last_update) is unique when both counts are equal.
WITH distinct_keys AS (
    SELECT DISTINCT location_id, last_update
    FROM datamart.new_cases
)
SELECT
    (SELECT COUNT(*) FROM distinct_keys) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.new_cases) AS cases_count

Unnamed: 0,distinct_PK,cases_count
0,1475736,1475737


##### Investigate the remaining duplicate case

In [13]:
%%bigquery
-- List every (location_id, last_update) key that still occurs more than once,
-- with the number of rows sharing it.
SELECT
    location_id,
    last_update,
    COUNT(*) AS duplicate_cases
FROM datamart.new_cases
GROUP BY
    location_id,
    last_update
HAVING COUNT(*) > 1

Unnamed: 0,location_id,last_update,duplicate_cases
0,-278257842025680577,2020-03-08 17:33:03,2


In [16]:
%%bigquery
-- Show all columns of the rows behind the one remaining duplicate key.
SELECT nc.*
FROM datamart.new_cases AS nc
WHERE nc.last_update='2020-03-08 17:33:03'
  AND nc.location_id=-278257842025680577

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active,incident_rate,case_fatality_ratio
0,-278257842025680577,2020-03-08 17:33:03,6,0,0,,,
1,-278257842025680577,2020-03-08 17:33:03,6,0,0,,,


##### remove duplicate by SELECT DISTINCT: the single duplicate record is an exact copy that cannot be ranked

In [17]:
%%bigquery
-- Collapse rows that are identical in every column. The column list is the
-- full schema new_cases was given by the rank-based step, in the same order,
-- so this is the same as SELECT DISTINCT *.
CREATE OR REPLACE TABLE datamart.new_cases AS
SELECT DISTINCT
    location_id,
    last_update,
    confirmed,
    deaths,
    recovered,
    active,
    incident_rate,
    case_fatality_ratio
FROM datamart.new_cases

#### PK re-check after SELECT DISTINCT: (location_id, last_update) is unique if both counts are the same

In [18]:
%%bigquery
-- Final primary-key check after the exact-duplicate removal: the key
-- (location_id, last_update) is unique when both counts are equal.
WITH distinct_keys AS (
    SELECT DISTINCT location_id, last_update
    FROM datamart.new_cases
)
SELECT
    (SELECT COUNT(*) FROM distinct_keys) AS distinct_PK,
    (SELECT COUNT(*) FROM datamart.new_cases) AS cases_count

Unnamed: 0,distinct_PK,cases_count
0,1475736,1475736


#### FK check

In [19]:
%%bigquery
-- Foreign-key check: count the cases whose location_id has no matching id in
-- new_locations. Rows with a NULL location_id are counted as violations too,
-- because they match no location. Zero means every case points at a location
-- that still exists.
SELECT COUNT(*) AS foreign_key_violations
FROM datamart.new_cases AS c
WHERE NOT EXISTS (
    SELECT 1
    FROM datamart.new_locations AS l
    WHERE l.id = c.location_id
)

Unnamed: 0,foreign_key_violations
0,0


### Temp tables were deleted via BigQuery's UI