# How to query All of Us data?

This notebook is a collection of best practices for writing BigQuery queries to extract All of Us (AoU) data, based on questions frequently asked by users during office hours or through the User Support Hub. 

In [None]:
import os
import numpy as np
import pandas as pd
from google.cloud import bigquery
import seaborn as sns
import matplotlib.pyplot as plt


pd.set_option('max_colwidth', 800) # display up to 800 characters per column value before pandas truncates it

In [None]:
# Workspace bucket path, read from the WORKSPACE_BUCKET environment variable.
# os.getenv returns None (no error) when the variable is not set.
my_bucket=os.getenv('WORKSPACE_BUCKET')
my_bucket

In [None]:
# BigQuery dataset prefix used by every query in this notebook, read from the
# WORKSPACE_CDR environment variable. os.environ[...] raises KeyError if it is unset.
DATASET=os.environ["WORKSPACE_CDR"]
DATASET

# How to work with participant age 

In AoU, there are different types of ages that can be calculated including:

1. current_age = current_date - birth_date

2. age_at_event = event_start_date - birth_date

3. age_at_cdr = cdr_cutoff_date - birth_date

Note: Regarding the cdr_cutoff_date, please find specific info about current CDR cutoff date here:

https://support.researchallofus.org/hc/en-us/articles/360051661772-What-are-the-CDR-cutoff-dates-

4. age_at_consent is when the participant signs the primary consent form and can be calculated by using "consent_date - birth_date", which can also be considered as age_at_enrollment, as seen here: https://support.researchallofus.org/hc/en-us/articles/13176125767188-How-to-find-participant-enrollment-data

5. age_at_EHR_consent, question concept_id 1586099 (EHRConsentPII_ConsentPermission) and answer concept_id 1586100 (ConsentPermission_Yes) will be used for this purpose.

6. age_at_death= death_date - birth_date

## How to calculate all types of ages using OMOP tables

### Calculation of current_age and age_at_event in the condition table

In [None]:
# age_at_event and current_age for anorexia nervosa (concept_id 436675) records.
# Each date is formatted as the number YYYY.MMDD; the floored difference between
# two such numbers is the age in completed years.
# The two lines starting with "--" inside the query are SQL comments that show a
# DATE_DIFF(...)/365.25 alternative; they are not executed.
# NOTE(review): the subtraction happens in FLOAT64 - worth confirming that an
# event falling exactly on a birthday is not rounded down by one year.
query=f"""

SELECT DISTINCT p.person_id, co.condition_concept_id, c.concept_name,co.condition_start_date,
-- FLOOR(DATE_DIFF(DATE(condition_start_datetime),DATE(p.birth_datetime), DAY)/365.25) AS age_at_event,
-- FLOOR(DATE_DIFF(CURRENT_DATE(), EXTRACT(DATE FROM birth_datetime ), DAY)/365.25) AS current_age,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', co.condition_start_date) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  age_at_event,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', CURRENT_DATE()) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  current_age

FROM {DATASET}.condition_occurrence co
JOIN {DATASET}.person p USING (person_id)
JOIN {DATASET}.concept c
ON c.concept_id=co.condition_concept_id
WHERE concept_id IN (436675)--Anorexia nervosa 
LIMIT 5

"""
# Run the query and load the (at most 5) rows into a DataFrame.
df=pd.read_gbq(query, dialect='standard')
df.head()


### age_at_cdr

In [None]:
# CDR cutoff date as a 'YYYY-MM-DD' string; it is interpolated into the age_at_cdr query.
# NOTE(review): confirm this value matches the cutoff date of the CDR version in use.
cdr_cutoff_date='2022-01-01'

In [None]:
# age_at_cdr: age in completed years on the CDR cutoff date, shown next to
# current_age for comparison. Dates are formatted as the number YYYY.MMDD and
# the floored difference of two such numbers gives the age.
# The cutoff string is wrapped in DATE(...) so FORMAT_DATE receives an explicit
# DATE value instead of relying on coercion of a string literal; this matches
# the DATE('{cut_off_date}') form used by the enrollment queries.
query=f"""

SELECT DISTINCT p.person_id, DATE(birth_datetime) birth_date,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', DATE('{cdr_cutoff_date}')) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  age_at_cdr,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', CURRENT_DATE()) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  current_age
FROM {DATASET}.person p
LIMIT 5

"""
# Run the query and load the (at most 5) rows into a DataFrame.
df=pd.read_gbq(query, dialect='standard')
df.head()


### age_at_consent

In [None]:
# age_at_consent: per participant, the earliest observation_date among
# observations whose concept descends from the 'Consent PII' or
# 'Primary Consent Update' module, restricted to src_id 'PPI/PM' rows that
# carry a questionnaire_response_id. Age is the floored difference of the
# YYYY.MMDD representations of that date and the birth date.
# The birth_date output column holds the full birth_datetime value here.
# DISTINCT is redundant with GROUP BY 1,2 but harmless.
query=f"""

SELECT DISTINCT p.person_id, p.birth_datetime birth_date, MIN(observation_date) obs_date_min,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', MIN(observation_date)) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  age_at_consent    
FROM {DATASET}.concept 
JOIN {DATASET}.concept_ancestor ON (concept_id = ancestor_concept_id)
JOIN {DATASET}.observation ON (descendant_concept_id = observation_concept_id)
JOIN {DATASET}.observation_ext USING(observation_id)
JOIN {DATASET}.person p USING(person_id)
WHERE concept_class_id = 'Module'
AND concept_name IN ('Consent PII','Primary Consent Update')
AND src_id = 'PPI/PM'
AND questionnaire_response_id IS NOT NULL
GROUP BY 1,2
LIMIT 5
    
"""
# use the pandas function "read_gbq" to execute the query and pull the data into a dataframe, "df"
df = pd.read_gbq(query, dialect = "standard")
df.shape

In [None]:
# Display the age_at_consent result; the query is capped with LIMIT 5, so this is at most 5 rows.
df

### age_at_EHR_consent

In [None]:
# age_at_EHR_consent: for observations answering the EHR consent question
# (observation_concept_id 1586099) with "yes" (value_source_concept_id 1586100),
# compute the age on observation_date as the floored difference of the
# YYYY.MMDD representations of that date and the birth date.
# The birth_date output column holds the full birth_datetime value here.
# A participant with several matching observations yields several rows.
query = f"""

SELECT DISTINCT p.person_id, p.birth_datetime birth_date, observation_date,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', observation_date) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  age_at_EHR_consent
FROM {DATASET}.observation
JOIN {DATASET}.person p USING(person_id)
WHERE observation_concept_id = 1586099 -- EHR Consent PII: Consent Permission
AND value_source_concept_id = 1586100 -- ConsentPermission_Yes
LIMIT 5
"""
# Run the query and load the (at most 5) rows into a DataFrame.
df = pd.read_gbq(query, dialect='standard')
df.head()

### age_at_death

In [None]:
# age_at_death: age in completed years on the date of death, computed as the
# floored difference of the YYYY.MMDD representations of the death date and the
# birth date.
# FORMAT_DATE takes a DATE argument, so death_datetime is converted with
# DATE(...) exactly like birth_datetime (and like the death_date output column).
# RIGHT JOIN keeps every row of the death table, including any without a
# matching person row (those get NULL person columns and a NULL age).
query=f"""

SELECT DISTINCT p.person_id, DATE(birth_datetime) birth_date, DATE(death_datetime) death_date,
FLOOR(CAST(FORMAT_DATE('%Y.%m%d', DATE(death_datetime)) AS FLOAT64) - CAST(FORMAT_DATE('%Y.%m%d', DATE(birth_datetime)) AS FLOAT64))  age_at_death
-- alternative: FLOOR( DATE_DIFF(EXTRACT(DATE FROM death_datetime), EXTRACT(DATE FROM birth_datetime),DAY)/365.25) AS age_at_death
FROM {DATASET}.person p
RIGHT JOIN {DATASET}.death USING (person_id)
LIMIT 5

"""
# Run the query and load the (at most 5) rows into a DataFrame.
df=pd.read_gbq(query, dialect='standard')
df.head()


## Extract 'age_at_cdr' and 'age_at_consent' from cb_search_person table

In [None]:
# Read the age_at_cdr and age_at_consent columns directly from the
# cb_search_person table instead of computing them from the OMOP tables.
query = f"""
SELECT DISTINCT person_id , age_at_cdr,age_at_consent
FROM {DATASET}.cb_search_person 
LIMIT 5
"""
# Run the query and load the (at most 5) rows into a DataFrame.
df= pd.read_gbq(query, dialect='standard')
df.head()

# How to select participants who consent to donate their EHR

**How many participants have consented to share their EHR?**

In [None]:
# Count distinct participants per answer (value_source_value) to the EHR
# consent question, concept_id 1586099. GROUP BY 2 groups on the second
# SELECT column, permission_type.
query = f"""
SELECT COUNT( DISTINCT person_id) AS participant_count, value_source_value AS permission_type
FROM {DATASET}.observation
JOIN {DATASET}.concept ON observation_concept_id=concept_id
WHERE concept_id = 1586099 
GROUP BY 2

"""
# One row per answer type; head() shows the first five.
df1 = pd.read_gbq(query, dialect='standard')
df1.head()

Only choose people who consented to share their EHR (ie. ConsentPermission_Yes)

In [None]:
# person_ids with at least one "yes" answer (value_source_concept_id 1586100)
# to the EHR consent question (observation_concept_id 1586099).
# There is no LIMIT, so every matching participant is returned.
query = f"""
SELECT DISTINCT person_id
FROM {DATASET}.observation
WHERE observation_concept_id = 1586099 -- EHR Consent PII: Consent Permission
AND value_source_concept_id = 1586100 -- ConsentPermission_Yes
"""
df = pd.read_gbq(query, dialect='standard')
df.head()

# How to find people who registered for AoU and had certain conditions prior to a certain cutoff date

**Example 1: Find participants who registered before May 2020**

We will assume registration date = enrollment date = primary consent date.

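We need to find everyone whose enrollment date is earlier than the cutoff date (cut_off_date = '2020-05-01'). To find participants who enrolled after the cutoff instead, flip the comparison from `<` to `>`.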

In [None]:
# Participants with a 'Consent PII' module observation dated before 2020-05-01,
# together with the earliest such date (enrollment_date).
# DISTINCT is redundant with GROUP BY 1 but harmless.
query = f"""
SELECT DISTINCT person_id, min(observation_date) AS enrollment_date
FROM  {DATASET}.concept
JOIN {DATASET}.concept_ancestor ON concept_id = ancestor_concept_id
JOIN {DATASET}.observation ON descendant_concept_id = observation_concept_id 
WHERE concept_name = 'Consent PII' AND concept_class_id = 'Module'
AND observation_date < DATE('2020-05-01')
GROUP BY 1
LIMIT 5
"""
# use the pandas function "read_gbq" to execute the query and pull the data into a dataframe, "df"
df = pd.read_gbq(query,dialect = "standard")
df.head()

Or we can use a variable for this cut_off_date

In [None]:
# Enrollment cutoff as a 'YYYY-MM-DD' string; the queries below interpolate it into DATE('...').
cut_off_date='2020-05-01' 
cut_off_date

In [None]:
# Same enrollment query as the previous example, with the cutoff taken from the
# cut_off_date variable and without a LIMIT, so all matching participants are returned.
query = f"""
SELECT DISTINCT person_id, min(observation_date) AS enrollment_date
FROM  {DATASET}.concept
JOIN {DATASET}.concept_ancestor ON concept_id = ancestor_concept_id
JOIN {DATASET}.observation ON descendant_concept_id = observation_concept_id 
WHERE concept_name = 'Consent PII' AND concept_class_id = 'Module'
AND observation_date < DATE('{cut_off_date}')
GROUP BY 1
"""
# use the pandas function "read_gbq" to execute the query and pull the data into a dataframe, "df"
df = pd.read_gbq(query,dialect = "standard")
df.shape


**Example 2: Find participants who registered before May 2020 and had a certain condition within a certain time window**

This example code uses ‘depression’ as a keyword to define any depression-related conditions:

In [None]:
# Example 2: condition records whose concept name mentions "depression", with a
# start date from 2020-01-01 to 2022-01-01 (BETWEEN is inclusive), for
# participants whose earliest 'Consent PII' observation is before cut_off_date.
# The name is lower-cased before matching so that concept names which
# capitalise "Depression" are not silently missed by the LIKE pattern.
query = f"""
WITH person_enrolled_before_cutoff AS (
SELECT person_id, MIN(observation_date) AS enrollment_date
FROM  {DATASET}.concept
JOIN {DATASET}.concept_ancestor ON concept_id = ancestor_concept_id
JOIN {DATASET}.observation ON descendant_concept_id = observation_concept_id
WHERE concept_name = 'Consent PII' AND concept_class_id = 'Module'
AND observation_date < DATE('{cut_off_date}')
GROUP BY 1
)

-- keep only condition rows of participants in the enrollment cohort above

SELECT DISTINCT person_id,condition_concept_id,c1.concept_name,condition_start_datetime
FROM {DATASET}.condition_occurrence
JOIN {DATASET}.concept c1 ON condition_concept_id=c1.concept_id
WHERE LOWER(c1.concept_name) LIKE '%depression%'
AND DATE(condition_start_date) BETWEEN DATE('2020-01-01') AND DATE('2022-01-01')
AND person_id IN (SELECT person_id FROM person_enrolled_before_cutoff)
"""
# use the pandas function "read_gbq" to execute the query and pull the data into a dataframe, "df"
df = pd.read_gbq(query,dialect = "standard")
df.shape

In [None]:
# Preview the first rows of the depression cohort result.
df.head()

# How to find all participants who have EHR data

In [None]:
# Participants with at least one EHR-sourced row in any of seven OMOP tables.
# Every table <name> has a companion <name>_ext table, joined on <name>_id,
# whose src_id identifies the data source; rows whose src_id contains "ehr"
# (case-insensitive) count as EHR data.
# The per-table sub-query is generated from this list instead of being pasted
# seven times. A plain JOIN is used because the WHERE filter on the _ext table
# already discards unmatched rows, which is what the LEFT JOIN form reduced to.
ehr_tables = [
    "measurement",
    "condition_occurrence",
    "device_exposure",
    "drug_exposure",
    "observation",
    "procedure_occurrence",
    "visit_occurrence",
]

query = "\n    UNION DISTINCT\n".join(
    f"""
    SELECT DISTINCT t.person_id
    FROM `{DATASET}.{table}` AS t
    JOIN `{DATASET}.{table}_ext` AS t_ext ON t.{table}_id = t_ext.{table}_id
    WHERE LOWER(t_ext.src_id) LIKE '%ehr%'
    """
    for table in ehr_tables
)
# One row per participant; shape[0] is the number of participants with EHR data.
df1 = pd.read_gbq(query,dialect = "standard")
df1.shape

## How to query for all participants with EHR using a simpler SQL (non-OMOP):

In [None]:
# Shortcut: cb_search_person carries a has_ehr_data flag, so the participants
# with EHR data can be selected without touching the OMOP domain tables.
query = f"""
    SELECT DISTINCT person_id
    FROM {DATASET}.cb_search_person 
    WHERE has_ehr_data = 1
"""
# shape[0] is the number of flagged participants.
df1 = pd.read_gbq(query,dialect = "standard")
df1.shape

# How to select participants who experienced a particular event

**Example 1: We want to query participants who were diagnosed with anorexia nervosa (concept_id=436675) during adolescence**

In [None]:
# Condition records for anorexia nervosa (436675) or any of its descendants in
# concept_ancestor, with the participant's age at the condition start.
# Age is approximated as the number of days since birth divided by 365.25,
# floored; the outer query keeps ages 12 through 17 inclusive.
query=f"""
SELECT * from
(SELECT  
DISTINCT p.person_id, co.condition_concept_id, concept_name, DATE(co.condition_start_datetime) as start_date,
FLOOR(DATE_DIFF(DATE(condition_start_datetime),DATE(p.birth_datetime), DAY)/365.25) AS age_at_event,
FROM {DATASET}.condition_occurrence co
JOIN {DATASET}.person p USING (person_id)
JOIN {DATASET}.concept_ancestor ON condition_concept_id=descendant_concept_id
JOIN {DATASET}.concept ON concept_id=condition_concept_id
WHERE ancestor_concept_id =436675  --Anorexia nervosa 
)
WHERE age_at_event BETWEEN 12 AND 17
"""
df=pd.read_gbq(query, dialect='standard')
df.shape

In [None]:
# Preview the first rows of the anorexia nervosa result.
df.head()

**Example 2: We want to query participants who underwent appendectomy (concept_id=4198190) during adolescence**

In [None]:
# Procedure records for appendectomy (4198190) or any of its descendants in
# concept_ancestor, with the participant's age on procedure_date.
# Age is approximated as the number of days since birth divided by 365.25,
# floored; the outer query keeps ages 12 through 17 inclusive.
query=f"""
SELECT * from
(SELECT  
DISTINCT p.person_id, po.procedure_concept_id, concept_name,po.procedure_date,
FLOOR(DATE_DIFF(DATE(procedure_date),DATE(p.birth_datetime), DAY)/365.25) AS age_at_event,
FROM {DATASET}.procedure_occurrence po
JOIN {DATASET}.person p USING (person_id)
JOIN {DATASET}.concept_ancestor ON procedure_concept_id=descendant_concept_id
JOIN {DATASET}.concept ON concept_id=procedure_concept_id
WHERE ancestor_concept_id =4198190  
)
WHERE age_at_event BETWEEN 12 AND 17
"""
df=pd.read_gbq(query, dialect='standard')
df.shape

In [None]:
# Preview the first rows of the appendectomy result.
df.head()

**Example 3: How to find the length of stay for all participants who had an inpatient visit (in the visit_occurrence table) in the past five years for a given condition (in the condition_occurrence table)?**



In [None]:
# Length of stay (visit_end_date - visit_start_date, in days) for visits that
# are linked to an anorexia nervosa (436675) condition record through
# visit_occurrence_id, limited to the listed inpatient-type visit concepts and
# to visits starting between 2018-01-01 and 2022-04-22 (inclusive).
# The concept table is joined twice, so the two name columns get distinct
# aliases: a result with two columns both called concept_name is rejected by
# BigQuery and would be ambiguous in the DataFrame anyway.
query = f"""
SELECT DISTINCT co.person_id, condition_concept_id, condition_start_date,
c1.concept_name AS condition_concept_name,
visit_concept_id, c2.concept_name AS visit_concept_name,
visit_start_date, visit_end_date,
DATE_DIFF(DATE(visit_end_date), DATE(visit_start_date), DAY) AS length_of_stay_days
FROM {DATASET}.condition_occurrence co
JOIN {DATASET}.visit_occurrence using (visit_occurrence_id)
JOIN {DATASET}.concept c1 ON condition_concept_id=c1.concept_id
JOIN {DATASET}.concept c2 ON visit_concept_id=c2.concept_id
WHERE condition_concept_id IN (436675)
AND visit_concept_id IN (9201, 262, 8717)
AND co.visit_occurrence_id IS NOT NULL
-- AND c2.concept_name LIKE '%Inpatient%'
AND DATE(visit_start_date) BETWEEN DATE('2018-01-01') AND DATE('2022-04-22')
"""
df=pd.read_gbq(query, dialect='standard')
df.shape

In [None]:
# Preview the first rows of the length-of-stay result.
df.head()

# How to find participants with all types of cancer and malignancy

We will use ancestor_concept_id = 438112 (neoplastic disease), a parent concept_id of all neoplastic diseases (cancers) in the data per the OMOP hierarchy. 

In [None]:
# Condition records whose concept is a descendant of 438112 (neoplastic
# disease) in concept_ancestor, i.e. every condition under that parent concept.
# LIMIT 5 keeps the example small; remove it to retrieve all matching rows.
query = f"""
SELECT DISTINCT person_id, condition_concept_id,concept_name
FROM {DATASET}.condition_occurrence 
JOIN {DATASET}.concept_ancestor ON (condition_concept_id=descendant_concept_id)
JOIN {DATASET}.concept ON concept_id=condition_concept_id
WHERE ancestor_concept_id=  438112 ---neoplastic disease
LIMIT 5
"""
df1 = pd.read_gbq(query, dialect='standard')
df1.head()

#  How to query data from non-standard concept_ids and ICD codes

In the main OMOP data table, such as the condition_occurrence table, only standard concept_ids are used for the column 'condition_concept_id'. In contrast, non-standard concept_ids, including those that are mapped to ICD9/10 codes, can be found in the column 'condition_source_concept_id'. In order to use ICD9/10 codes to find records in the condition table, the best way is to find the standard concept_ids that are mapped to the ICD9/10 codes, and then use standard concept_ids to query the table. The example query in condition table is displayed in the following example:

**Example**
- Using the ICD codes: ‘concept_id IN (45605777, 44824250, 44823108, 1569178)’, which are the non-standard concept_ids mapped to ICD codes shown in the following sql:

In [None]:
# Look up four source concepts in the concept table. concept_code is returned
# as icd_code, and standard_concept shows whether each concept is standard.
# The next cell reads the icd_code column of this result.
query=f"""
SELECT concept_id,concept_name,domain_id,vocabulary_id,concept_code AS icd_code,standard_concept
FROM `{DATASET}.concept` 
WHERE concept_id IN  (45605777, 44824250, 44823108, 1569178) 
"""
df1 = pd.read_gbq(query, dialect = 'standard')
df1.head()


In this example, we assume that we have a long list of ICD codes that are available in a data frame (as shown above). Otherwise, the list needs to be read into a data frame first.

In [None]:
# Collect the ICD codes into a tuple (an ordered, immutable sequence).
# The query cell below interpolates this tuple into a SQL IN (...) clause.
icds = tuple(df1['icd_code'])
len(icds)

**Query the table**

In [None]:
# Render the ICD codes as a quoted, comma-separated SQL list.
# Interpolating the Python tuple itself only works for two or more codes: a
# single code renders as ('X',) and an empty tuple as (), and BigQuery rejects
# both. Building the list explicitly works for any non-empty number of codes.
if not icds:
    raise ValueError("icds is empty: provide at least one ICD code to look up")
icd_list = ", ".join("'" + str(code).replace("'", "\\'") + "'" for code in icds)

# Step 1 (CTE icd): find the standard concept_ids recorded alongside source
#   concepts whose concept_code is one of the ICD codes.
# Step 2: return condition rows that carry any of those standard concept_ids.
# NOTE(review): concept_code is matched without a vocabulary_id filter - confirm
# that the codes cannot collide with codes from a non-ICD vocabulary.
query=f"""
--to get standard concept_ids
WITH icd AS (
SELECT distinct c2.concept_id AS concept_id_standard
FROM {DATASET}.condition_occurrence
JOIN {DATASET}.concept c1 ON condition_source_concept_id=c1.concept_id -- non-standard concept_ids
JOIN {DATASET}.concept c2 ON condition_concept_id=c2.concept_id --standard concept_ids
WHERE c1.concept_code IN ({icd_list})
-- WHERE c1.concept_id IN (45605777, 44824250, 44823108, 1569178) -- can use this too
)

SELECT person_id,condition_concept_id,concept_name, condition_start_date
FROM {DATASET}.condition_occurrence
JOIN {DATASET}.concept ON concept_id=condition_concept_id
WHERE condition_concept_id IN (SELECT concept_id_standard FROM icd )
LIMIT 5 --remove this, if to get all rows
"""
df = pd.read_gbq(query, dialect = 'standard')
df.shape

In [None]:
# Display the ICD-based result; the query is capped with LIMIT 5, so this is at most 5 rows.
df

# How to get earliest and latest EHR dates to determine length of EHR records for each person_id




In [None]:
# EHR record span per participant.
#
# CTE "ehr": one sub-query per OMOP table (measurement, condition_occurrence,
# device_exposure, drug_exposure, observation, procedure_occurrence,
# visit_occurrence). Each keeps only rows whose _ext.src_id contains "ehr" and
# returns one row per participant with:
#   first_date - the earliest start/occurrence date in that table
#   last_date  - the latest date in that table; where the table has an end-date
#                column, the CASE picks the later of MAX(start) and MAX(end)
#                and falls back to whichever one is not NULL.
# The sub-queries are combined with UNION DISTINCT, so two tables that produce
# an identical (participant, first_date, last_date) row collapse into one.
#
# Final SELECT, per participant:
#   record_count    - number of rows in the CTE, i.e. at most one per table
#                     (up to 7); it is not the number of raw EHR records
#   first_ehr_date  - earliest first_date across tables
#   last_ehr_date   - latest last_date across tables
#   ehr_data_length - days between the two, plus 1 so both end days are counted
query = f"""

WITH ehr AS (
    SELECT  m.person_id AS participant, MIN( m.measurement_date) AS first_date, MAX( m.measurement_date) AS last_date
    FROM {DATASET}.measurement m
    LEFT JOIN {DATASET}.measurement_ext AS m_ext ON m.measurement_id = m_ext.measurement_id
    WHERE LOWER(m_ext.src_id) LIKE '%ehr%'--ehr data from measurement table
    GROUP BY 1

    UNION DISTINCT


    SELECT co.person_id AS participant, MIN(co.condition_start_date) AS first_date,
    CASE WHEN MAX(co.condition_start_date)>= MAX(co.condition_end_date) OR MAX( co.condition_end_date) IS NULL THEN MAX(co.condition_start_date)
    WHEN MAX(co.condition_start_date) < MAX(co.condition_end_date) OR MAX(co.condition_start_date) IS NULL THEN MAX(co.condition_end_date)
    END AS last_date -- select the max of the lastest record for both date columns
    FROM {DATASET}.condition_occurrence AS co
    LEFT JOIN {DATASET}.condition_occurrence_ext AS co_ext ON co.condition_occurrence_id = co_ext.condition_occurrence_id
    WHERE LOWER(co_ext.src_id) LIKE '%ehr%' -- ehr data from condition_occurrence table
    GROUP BY 1

    UNION DISTINCT
    
    SELECT d.person_id AS participant, MIN( d.device_exposure_start_date) AS first_date,
    CASE WHEN MAX(d.device_exposure_start_date)>= MAX(d.device_exposure_end_date) OR MAX(d.device_exposure_end_date) IS NULL THEN MAX(d.device_exposure_start_date)
    WHEN MAX(d.device_exposure_start_date) < MAX(d.device_exposure_end_date) OR  MAX(d.device_exposure_start_date) IS NULL THEN MAX(d.device_exposure_end_date)
    END AS last_date --select the max of the lastest record for both date columns
    FROM {DATASET}.device_exposure AS d
    LEFT JOIN {DATASET}.device_exposure_ext AS d_ext ON d.device_exposure_id = d_ext.device_exposure_id
    WHERE LOWER(d_ext.src_id) LIKE '%ehr%'-- ehr data from device_exposure table
    GROUP BY 1

    UNION DISTINCT

    SELECT de.person_id as participant, MIN(de.drug_exposure_start_date) AS first_date,  
    CASE WHEN MAX(de.drug_exposure_start_date) >= MAX(de.drug_exposure_end_date) OR MAX(de.drug_exposure_end_date) IS NULL THEN MAX(de.drug_exposure_start_date) 
    WHEN MAX(de.drug_exposure_start_date)< MAX(de.drug_exposure_end_date) OR MAX(de.drug_exposure_start_date) IS NULL THEN MAX(de.drug_exposure_end_date) 
    END AS last_date
    FROM {DATASET}.drug_exposure AS de
    LEFT JOIN {DATASET}.drug_exposure_ext AS de_ext ON de.drug_exposure_id = de_ext.drug_exposure_id
    WHERE LOWER(de_ext.src_id) LIKE '%ehr%' --ehr data from drug_exposure table
    GROUP BY 1

    UNION DISTINCT

    SELECT o.person_id AS participant, MIN(o.observation_date) AS first_date,
    MAX(o.observation_date) AS last_date
    FROM {DATASET}.observation AS o
    LEFT JOIN {DATASET}.observation_ext AS o_ext ON o.observation_id = o_ext.observation_id
    WHERE LOWER(o_ext.src_id) LIKE '%ehr%' --ehr data from observation table
    GROUP BY 1


    UNION DISTINCT

    SELECT po.person_id AS participant, MIN(po.procedure_date) AS first_date, 
    MAX(po.procedure_date) AS last_date 
    FROM {DATASET}.procedure_occurrence AS po
    LEFT JOIN {DATASET}.procedure_occurrence_ext AS po_ext ON po.procedure_occurrence_id = po_ext.procedure_occurrence_id
    WHERE LOWER(po_ext.src_id) LIKE '%ehr%' -- ehr data from procedure_occurrence table
    GROUP BY 1 


    UNION DISTINCT

    SELECT v.person_id AS participant, MIN( v.visit_start_date ) AS first_date, 
    CASE WHEN MAX ( v.visit_start_date )>= MAX( v.visit_end_date ) OR MAX( v.visit_end_date ) IS NULL THEN  MAX(v.visit_start_date)
    WHEN MAX ( v.visit_start_date ) < MAX ( v.visit_end_date ) OR MAX( v.visit_start_date ) IS NULL THEN MAX ( v.visit_end_date )
    END AS  last_date -- select between the end and start date by using max() 
    FROM {DATASET}.visit_occurrence AS v
    LEFT JOIN {DATASET}.visit_occurrence_ext AS v_ext ON v.visit_occurrence_id = v_ext.visit_occurrence_id
    WHERE LOWER(v_ext.src_id) LIKE '%ehr%'-- ehr data from visit occurrence table
    GROUP BY 1
    )

SELECT
participant, COUNT (participant) AS record_count, MIN (first_date) AS first_ehr_date, MAX (last_date) as last_ehr_date,
DATE_DIFF(DATE(MAX(last_date)), DATE(MIN(first_date)), day)+1 ehr_data_length  -- add 1 day
FROM ehr
GROUP BY 1
ORDER BY 1 DESC

"""

# One row per participant, ordered by participant id descending.
df = pd.read_gbq(query, dialect='standard')
df.head()

In [None]:
# Cast the day counts to float before plotting.
# NOTE(review): presumably needed because the column arrives from BigQuery with
# an integer dtype the density plot does not handle - confirm.
df['ehr_data_length']=df['ehr_data_length'].astype('float')

**Distribution of EHR length**

In [None]:
# Kernel density estimate of the per-participant EHR span (ehr_data_length, in days).
# The seaborn style is set BEFORE the figure is created: setting it after
# plotting does not restyle the figure already drawn, so the first run and a
# re-run of this cell would look different.
sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(12, 7))
# With data=df the column is passed by name rather than as a Series.
sns.kdeplot(data=df, x='ehr_data_length', color='purple', fill=True, ax=ax)
ax.set_title('Length density for EHR data for All of Us participants in days', fontsize=16)
ax.set_xlabel('EHR data length (days)')
# Fixed x-range; the lower bound is negative because the smoothed density
# extends slightly below the smallest observed value.
ax.set_xlim(-1000, 17000)
ax.grid(True)
plt.show()


In [None]:
# Preview the per-participant EHR summary, now with ehr_data_length as float.
df.head()

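**Distribution of record_count per participant, i.e. how many of the seven EHR tables contribute a row for that participant (this is not the number of raw EHR records)**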

In [None]:
# Histogram of the record_count column, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(df['record_count'], bins=20, color='purple')
ax.set_title("Counts of records per participant")
ax.grid()
plt.show()

# How to query csv files in the Google bucket by joining AoU data

**Load local data into temporary tables in BigQuery**

1. Users don’t have permission to create tables and are also unable to directly import a local csv file as a temporary table. 
2. However, there is a way to query such user-defined csv files, as long as the files are located in the bucket, more details can be seen  here: https://cloud.google.com/bigquery/docs/external-data-cloud-storage#python 
3. In the following example, we will assume there is a csv file named ‘0_pid.csv’ in the local or the current VM working directory. This file only has one column named ‘person_id’. Before running the example code, this file needs to be copied to the user’s bucket, i.e., {my_bucket}/data/0_pid.csv


In [None]:
# End-to-end example: write a small CSV of person_ids to the workspace bucket,
# expose it to BigQuery as an external table named "pid", and join it to the
# person table. Only the final expression (df.head()) is displayed; the bare
# names in the middle of the cell (my_bucket, DATASET, file_path) produce no output.
from google.cloud import bigquery

# --- BigQuery setup ---
# Project id from the environment; assigned here but not used further in this cell.
BILLING_PROJECT_ID = os.getenv('GOOGLE_PROJECT')
# Construct a BigQuery client object.
client = bigquery.Client()
# Dataset for the current workspace; used below as the query job's default dataset.
CDR = os.getenv('WORKSPACE_CDR')
# Workspace bucket that will hold the CSV file.
my_bucket = os.getenv('WORKSPACE_BUCKET')
my_bucket


# Same environment variable as CDR, read with os.environ (raises KeyError if unset).
DATASET=os.environ["WORKSPACE_CDR"]
DATASET

# Example data: up to 100 distinct person_ids from condition_occurrence.
query=f"""
SELECT DISTINCT person_id FROM {DATASET}.condition_occurrence
LIMIT 100
"""
df=pd.read_gbq(query, dialect='standard')

# Write the ids to a local CSV (header row, no index) and copy it to the bucket.
df.to_csv('0_pid.csv',index=False)
!gsutil cp 0_pid.csv {my_bucket}/data/0_pid.csv

# List the file to confirm it is now in the bucket.
!gsutil ls {my_bucket}/data/0_pid.csv

# Build the URI of the CSV for the external table definition.
filename='data/0_pid.csv'
file_path=os.path.join(my_bucket, filename)
file_path

# Equivalent alternative using plain string concatenation; this second
# assignment is the file_path value actually used below.
filename='/data/0_pid.csv'
file_path=my_bucket+filename
file_path

# Configure the external data source: a CSV with a single INT64 column,
# person_id, whose first row (the header) is skipped.
external_config = bigquery.ExternalConfig("CSV")
external_config.source_uris = [file_path]
external_config.schema = [
    bigquery.SchemaField("person_id", "INT64"),
]
external_config.options.skip_leading_rows = 1
# Name under which the CSV can be referenced inside the query.
table_id = "pid"
job_config = bigquery.QueryJobConfig(table_definitions={table_id: external_config})

# Join an OMOP table to the external table; "pid" can also be queried on its own.
# The table name "person" is unqualified, so it is resolved through the job's
# default dataset set just below.

query =f"""
SELECT * FROM person
JOIN pid USING (person_id)
"""
job_config.default_dataset = CDR
query_job = client.query(query,job_config=job_config)  # API request

# Fetch the query result into a DataFrame and preview it.
df= query_job.to_dataframe()
df.head()

**In the above example, this csv (table name: pid) can be queried in combination with any AoU data.**