In [None]:
import os
import subprocess

import numpy as np
import pandas as pd
import pandas_profiling
import plotnine
from plotnine import *  # Provides a ggplot-like interface to matplotlib.
from IPython.display import display

## Plot setup: make black-and-white, 11pt base size the default theme for every plot.
theme_set(theme_bw(base_size=11))

def get_boxplot_fun_data(df):
  """Builds the one-row annotation frame used to label a ggplot boxplot.

  Args:
    df: The values belonging to one box; anything the builtins max() and len()
      accept.
  Returns:
    A single-row data frame (index 0) whose column y holds max(df) and whose
    column label holds the text 'N = <len(df)>'.
  """
  highest_value = max(df)
  count_label = f'N = {len(df)}'
  annotation = pd.DataFrame(data={'y': highest_value, 'label': count_label}, index=[0])
  return annotation

# NOTE: if you get any errors from this cell, restart your kernel and run it again.


In [None]:
# Let wide/long frames render fully in notebook output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "person" and was generated for All of Us Registered Tier Dataset v4
#
# What it selects: birth datetime, person id, and the concept names for race, gender,
# ethnicity and sex at birth (each resolved through a LEFT JOIN on the concept table).
#
# Cohort filter: a person is kept when cb_search_all_events holds at least one row for them
# whose concept_id belongs to one of two CONDITION concept sets (is_standard = 1 and
# is_standard = 0). Each set is built from the listed concept ids by matching cb_criteria.path
# against the id of the listed criteria rows (presumably this pulls in their descendants in the
# criteria tree -- confirm against the cb_criteria schema).
#
# Every table name is prefixed with os.environ["WORKSPACE_CDR"], so this cell raises KeyError
# when that environment variable is not set.
dataset_64469718_person_sql = """
    SELECT
        person.BIRTH_DATETIME as DATE_OF_BIRTH,
        person.PERSON_ID,
        p_race_concept.concept_name as RACE,
        p_gender_concept.concept_name as GENDER,
        p_ethnicity_concept.concept_name as ETHNICITY,
        p_sex_at_birth_concept.concept_name as SEX_AT_BIRTH 
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept 
            on person.race_concept_id = p_race_concept.CONCEPT_ID 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept 
            on person.gender_concept_id = p_gender_concept.CONCEPT_ID 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept 
            on person.ethnicity_concept_id = p_ethnicity_concept.CONCEPT_ID 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept 
            on person.sex_at_birth_concept_id = p_sex_at_birth_concept.CONCEPT_ID  
    WHERE
        person.PERSON_ID IN (
            select
                person_id  
            from
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
            where
                cb_search_person.person_id in (
                    select
                        criteria.person_id 
                    from
                        (select
                            distinct person_id,
                            entry_date,
                            concept_id 
                        from
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        where
                            person_id in (
                                select
                                    person_id 
                                from
                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                where
                                    is_standard = 1 
                                    and concept_id in (
                                        select
                                            distinct c.concept_id 
                                        from
                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                        join
                                            (
                                                select
                                                    cast(cr.id as string) as id 
                                                from
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                where
                                                    domain_id = 'CONDITION' 
                                                    and is_standard = 1 
                                                    and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                    and is_selectable = 1 
                                                    and full_text like '%[condition_rank1]%'
                                            ) a 
                                                on (
                                                    c.path like concat('%.',
                                                a.id,
                                                '.%') 
                                                or c.path like concat('%.',
                                                a.id) 
                                                or c.path like concat(a.id,
                                                '.%')) 
                                            where
                                                domain_id = 'CONDITION' 
                                                and is_standard = 1 
                                                and is_selectable = 1
                                            )
                                        union
                                        all select
                                            person_id 
                                        from
                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                        where
                                            is_standard = 0 
                                            and concept_id in (
                                                select
                                                    distinct c.concept_id 
                                                from
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                join
                                                    (
                                                        select
                                                            cast(cr.id as string) as id 
                                                        from
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                        where
                                                            domain_id = 'CONDITION' 
                                                            and is_standard = 0 
                                                            and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                            and is_selectable = 1 
                                                            and full_text like '%[condition_rank1]%'
                                                    ) a 
                                                        on (
                                                            c.path like concat('%.',
                                                        a.id,
                                                        '.%') 
                                                        or c.path like concat('%.',
                                                        a.id) 
                                                        or c.path like concat(a.id,
                                                        '.%')) 
                                                    where
                                                        domain_id = 'CONDITION' 
                                                        and is_standard = 0 
                                                        and is_selectable = 1
                                                    )
                                            )
                                        ) criteria 
                                ) 
                        )"""

# Run the query on BigQuery (standard SQL dialect) and load the result into a DataFrame;
# progress is reported through a tqdm notebook progress bar.
dataset_64469718_person_df = pandas.read_gbq(dataset_64469718_person_sql, dialect="standard", progress_bar_type="tqdm_notebook")

# Preview the first five rows of the person extract.
dataset_64469718_person_df.head(5)

In [None]:
# Summarise the raw person extract: column names, dtypes and non-null counts.
dataset_64469718_person_df.info()

In [None]:
# Shorter working name for the person extract (same object, not a copy).
person_df = dataset_64469718_person_df

In [None]:
#!pip install pyjanitor
import janitor


In [None]:
# Normalise the column names with pyjanitor (the DataFrame method is registered by `import janitor`).
person_df = person_df.clean_names()

In [None]:
# This snippet assumes you run setup first (it needs `os` and `subprocess`)

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = person_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'person_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; stop here if it is missing instead of copying to a literal 'None/data/' path
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError(f'WORKSPACE_BUCKET is not set; cannot upload {destination_filename}.')

# copy csv file to the bucket.
# The argument list avoids shell quoting problems, and check=True raises
# subprocess.CalledProcessError when gsutil exits non-zero, so the success message
# below is only printed after the copy actually succeeded.
subprocess.run(['gsutil', 'cp', f'./{destination_filename}', f'{my_bucket}/data/'], check=True)
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
# Summarise person_df after the column clean-up: column names, dtypes and non-null counts.
person_df.info()

In [None]:
# Count rows whose person_id already appeared in an earlier row (True) versus first occurrences (False).
person_df.duplicated(subset=['person_id']).value_counts()

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "condition" and was generated for All of Us Registered Tier Dataset v4
#
# What it selects: condition start/end datetimes, stop reason, person id, and the concept names
# for condition type, condition status, the standard condition concept (plus its vocabulary id)
# and the visit concept of the linked visit_occurrence row (all via LEFT JOINs).
#
# Row filter on condition_occurrence:
#   (condition_concept_id in <standard CONDITION concept set>
#    OR condition_source_concept_id in <source CONDITION concept set>)
#   AND person_id in <cohort>
# where <cohort> is the same person filter used by the person-domain query of this dataset.
# The concept sets are built from the listed ids by matching cb_criteria.path against the id of
# the listed criteria rows (presumably this pulls in their descendants -- confirm against the
# cb_criteria schema).
#
# Every table name is prefixed with os.environ["WORKSPACE_CDR"], so this cell raises KeyError
# when that environment variable is not set.
dataset_64469718_condition_sql = """
    SELECT
        c_occurrence.CONDITION_START_DATETIME,
        c_occurrence.CONDITION_END_DATETIME,
        c_occurrence.STOP_REASON,
        c_occurrence.PERSON_ID,
        c_type.concept_name as CONDITION_TYPE_CONCEPT_NAME,
        c_status.concept_name as CONDITION_STATUS_CONCEPT_NAME,
        c_standard_concept.concept_name as STANDARD_CONCEPT_NAME,
        c_standard_concept.vocabulary_id as STANDARD_VOCABULARY,
        visit.concept_name as VISIT_OCCURRENCE_CONCEPT_NAME 
    from
        ( SELECT
            * 
        from
            `""" + os.environ["WORKSPACE_CDR"] + """.condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id in  (
                    select
                        distinct c.concept_id 
                    from
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                    join
                        (
                            select
                                cast(cr.id as string) as id 
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                            where
                                domain_id = 'CONDITION' 
                                and is_standard = 1 
                                and concept_id in (
                                    42538744, 43530693, 45768910, 46273487, 4138760, 45768911, 46270082, 4146581, 4279553, 312950, 4155469, 45768912, 45769350, 45769441, 4155468, 45769351, 4308356, 257581, 45768963, 45769442, 4233784, 4141978, 4143828, 4142738, 45771045, 45768964, 36684328, 4312524, 45772937, 45768965, 46274062, 443801, 4191479, 317009, 314754, 45769438, 313236, 4145356, 45773005, 4152913, 4145497
                                ) 
                                and is_selectable = 1 
                                and full_text like '%[condition_rank1]%'
                        ) a 
                            on (
                                c.path like concat('%.',
                            a.id,
                            '.%') 
                            or c.path like concat('%.',
                            a.id) 
                            or c.path like concat(a.id,
                            '.%')) 
                        where
                            domain_id = 'CONDITION' 
                            and is_standard = 1 
                            and is_selectable = 1
                        ) 
                        OR  condition_source_concept_id in  (
                            select
                                distinct c.concept_id 
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                            join
                                (
                                    select
                                        cast(cr.id as string) as id 
                                    from
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                    where
                                        domain_id = 'CONDITION' 
                                        and is_standard = 0 
                                        and concept_id in (
                                            1569489, 42538744, 44824289, 4146581, 313236, 44832423, 45768910, 35225323, 1569488, 46273487, 46270082, 4138760, 44828510, 45768911, 44832424, 45586675, 4155469, 44824287, 45769441, 45576951, 45591559, 45572169, 44837136, 45768963, 44824288, 4125022, 4155468, 4308356, 45562457, 45591558, 257581, 45557626, 312950, 45572168, 44833611, 45768964, 45771045, 45567266, 45601133, 4141978, 45572171, 45768965, 45567265, 45601134, 44831280, 44837135, 45772937, 314754, 4191479, 44821987, 1569491, 45548117, 46274062, 4152913, 45581860, 443801, 44820889, 45543270, 1569490, 4145356, 44821988, 4143828, 45581859, 45769438, 44826679, 45548116, 317009, 4145497, 45543269, 44823144
                                        ) 
                                        and is_selectable = 1 
                                        and full_text like '%[condition_rank1]%'
                                ) a 
                                    on (
                                        c.path like concat('%.',
                                    a.id,
                                    '.%') 
                                    or c.path like concat('%.',
                                    a.id) 
                                    or c.path like concat(a.id,
                                    '.%')) 
                                where
                                    domain_id = 'CONDITION' 
                                    and is_standard = 0 
                                    and is_selectable = 1
                                )
                        )  
                        AND (
                            c_occurrence.PERSON_ID IN (
                                select
                                    person_id  
                                from
                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                                where
                                    cb_search_person.person_id in (
                                        select
                                            criteria.person_id 
                                        from
                                            (select
                                                distinct person_id,
                                                entry_date,
                                                concept_id 
                                            from
                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                            where
                                                person_id in (
                                                    select
                                                        person_id 
                                                    from
                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                    where
                                                        is_standard = 1 
                                                        and concept_id in (
                                                            select
                                                                distinct c.concept_id 
                                                            from
                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                            join
                                                                (
                                                                    select
                                                                        cast(cr.id as string) as id 
                                                                    from
                                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                    where
                                                                        domain_id = 'CONDITION' 
                                                                        and is_standard = 1 
                                                                        and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                                        and is_selectable = 1 
                                                                        and full_text like '%[condition_rank1]%'
                                                                ) a 
                                                                    on (
                                                                        c.path like concat('%.',
                                                                    a.id,
                                                                    '.%') 
                                                                    or c.path like concat('%.',
                                                                    a.id) 
                                                                    or c.path like concat(a.id,
                                                                    '.%')) 
                                                                where
                                                                    domain_id = 'CONDITION' 
                                                                    and is_standard = 1 
                                                                    and is_selectable = 1
                                                                )
                                                            union
                                                            all select
                                                                person_id 
                                                            from
                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                            where
                                                                is_standard = 0 
                                                                and concept_id in (
                                                                    select
                                                                        distinct c.concept_id 
                                                                    from
                                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                                    join
                                                                        (
                                                                            select
                                                                                cast(cr.id as string) as id 
                                                                            from
                                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                            where
                                                                                domain_id = 'CONDITION' 
                                                                                and is_standard = 0 
                                                                                and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                                                and is_selectable = 1 
                                                                                and full_text like '%[condition_rank1]%'
                                                                        ) a 
                                                                            on (
                                                                                c.path like concat('%.',
                                                                            a.id,
                                                                            '.%') 
                                                                            or c.path like concat('%.',
                                                                            a.id) 
                                                                            or c.path like concat(a.id,
                                                                            '.%')) 
                                                                        where
                                                                            domain_id = 'CONDITION' 
                                                                            and is_standard = 0 
                                                                            and is_selectable = 1
                                                                        )
                                                                )
                                                            ) criteria 
                                                    ) 
                                            )
                                        )
                                ) c_occurrence 
                            left join
                                `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_type 
                                    on c_occurrence.CONDITION_TYPE_CONCEPT_ID = c_type.CONCEPT_ID 
                            left join
                                `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_status 
                                    on c_occurrence.CONDITION_STATUS_CONCEPT_ID = c_status.CONCEPT_ID 
                            left join
                                `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_standard_concept 
                                    on c_occurrence.CONDITION_CONCEPT_ID = c_standard_concept.CONCEPT_ID 
                            left join
                                `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
                                    on c_occurrence.VISIT_OCCURRENCE_ID = v.VISIT_OCCURRENCE_ID 
                            left join
                                `""" + os.environ["WORKSPACE_CDR"] + """.concept` visit 
                                    on v.visit_concept_id = visit.concept_id"""

# Run the query on BigQuery (standard SQL dialect) and load the result into a DataFrame;
# progress is reported through a tqdm notebook progress bar.
dataset_64469718_condition_df = pandas.read_gbq(dataset_64469718_condition_sql, dialect="standard", progress_bar_type="tqdm_notebook")

# Preview the first five rows of the condition extract.
dataset_64469718_condition_df.head(5)

In [None]:
# Shorter working name for the condition extract (same object, not a copy).
condition_df = dataset_64469718_condition_df

In [None]:
# Normalise the column names with pyjanitor (the DataFrame method is registered by `import janitor`).
condition_df = condition_df.clean_names()

In [None]:
# This snippet assumes you run setup first (it needs `os` and `subprocess`)

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = condition_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'condition_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; stop here if it is missing instead of copying to a literal 'None/data/' path
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError(f'WORKSPACE_BUCKET is not set; cannot upload {destination_filename}.')

# copy csv file to the bucket.
# The argument list avoids shell quoting problems, and check=True raises
# subprocess.CalledProcessError when gsutil exits non-zero, so the success message
# below is only printed after the copy actually succeeded.
subprocess.run(['gsutil', 'cp', f'./{destination_filename}', f'{my_bucket}/data/'], check=True)
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
# Count rows whose person_id already appeared in an earlier row (True) versus first occurrences (False).
condition_df.duplicated(subset=['person_id']).value_counts()

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "drug" and was generated for All of Us Registered Tier Dataset v4
#
# The SQL text is assembled by string concatenation: every table reference is prefixed with the
# value of the WORKSPACE_CDR environment variable (os.environ[...] raises KeyError if it is unset).
#
# Query outline:
#   * Inner SELECT: rows of `drug_exposure` where
#       - drug_concept_id is a descendant (via `cb_criteria_ancestor`) of the `cb_criteria` concepts
#         located under the listed DRUG concept ids, and
#       - PERSON_ID is in `cb_search_person`, limited to persons having `cb_search_all_events` rows
#         that match the listed CONDITION concept ids (standard ids UNION ALL source ids).
#   * Outer SELECT: left-joins `concept` to resolve route, drug type, standard and source concept
#     names, and left-joins `visit_occurrence` + `concept` to resolve the visit concept name.
dataset_64469718_drug_sql = """
    SELECT
        d_exposure.PERSON_ID,
        d_exposure.DRUG_EXPOSURE_START_DATETIME,
        d_exposure.VERBATIM_END_DATE,
        d_exposure.REFILLS,
        d_exposure.DAYS_SUPPLY,
        d_exposure.QUANTITY,
        d_exposure.STOP_REASON,
        d_exposure.DRUG_EXPOSURE_END_DATETIME,
        d_route.concept_name as ROUTE_CONCEPT_NAME,
        d_type.concept_name as DRUG_TYPE_CONCEPT_NAME,
        d_standard_concept.concept_name as STANDARD_CONCEPT_NAME,
        d_standard_concept.vocabulary_id as STANDARD_VOCABULARY,
        d_source_concept.concept_name as SOURCE_CONCEPT_NAME,
        d_visit.concept_name as VISIT_OCCURRENCE_CONCEPT_NAME 
    from
        ( SELECT
            * 
        from
            `""" + os.environ["WORKSPACE_CDR"] + """.drug_exposure` d_exposure 
        WHERE
            (
                drug_concept_id in  (
                    select
                        distinct ca.descendant_id 
                    from
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria_ancestor` ca 
                    join
                        (
                            select
                                distinct c.concept_id 
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                            join
                                (
                                    select
                                        cast(cr.id as string) as id 
                                    from
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                    where
                                        domain_id = 'DRUG' 
                                        and is_standard = 1 
                                        and concept_id in (
                                            1129625, 1140088, 1192218, 1326901, 1105775, 1181809, 1300153, 1134439, 43532539, 751698, 902938, 1196677, 1143374, 1154161, 1343916, 1183554, 711452, 1551099, 35603983, 1036525, 36883745, 1137529, 19029322, 905233, 43013634, 19126894, 1506270, 1111706, 19136048, 1237049, 1036059, 1110942, 35606631, 777221, 939259, 960900, 989878, 1163944, 1314928, 901656, 1115572, 1149380, 19050346, 19126511, 1112921, 19004810, 1101703, 914335, 919839, 19049024, 938205, 1154343, 19087208, 1147878, 734275, 792993, 19030493, 1780601, 915553, 1192710
                                        ) 
                                        and is_selectable = 1 
                                        and full_text like '%[drug_rank1]%'
                                ) a 
                                    on (
                                        c.path like concat('%.',
                                    a.id,
                                    '.%') 
                                    or c.path like concat('%.',
                                    a.id)) 
                                where
                                    domain_id = 'DRUG' 
                                    and is_standard = 1 
                                    and is_selectable = 1
                                ) b 
                                    on (
                                        ca.ancestor_id = b.concept_id
                                    )
                            )
                        )  
                        AND (
                            d_exposure.PERSON_ID IN (
                                select
                                    person_id  
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                            where
                                cb_search_person.person_id in (
                                    select
                                        criteria.person_id 
                                    from
                                        (select
                                            distinct person_id,
                                            entry_date,
                                            concept_id 
                                        from
                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                        where
                                            person_id in (
                                                select
                                                    person_id 
                                                from
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                where
                                                    is_standard = 1 
                                                    and concept_id in (
                                                        select
                                                            distinct c.concept_id 
                                                        from
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                        join
                                                            (
                                                                select
                                                                    cast(cr.id as string) as id 
                                                                from
                                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                where
                                                                    domain_id = 'CONDITION' 
                                                                    and is_standard = 1 
                                                                    and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                                    and is_selectable = 1 
                                                                    and full_text like '%[condition_rank1]%'
                                                            ) a 
                                                                on (
                                                                    c.path like concat('%.',
                                                                a.id,
                                                                '.%') 
                                                                or c.path like concat('%.',
                                                                a.id) 
                                                                or c.path like concat(a.id,
                                                                '.%')) 
                                                            where
                                                                domain_id = 'CONDITION' 
                                                                and is_standard = 1 
                                                                and is_selectable = 1
                                                            )
                                                        union
                                                        all select
                                                            person_id 
                                                        from
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                        where
                                                            is_standard = 0 
                                                            and concept_id in (
                                                                select
                                                                    distinct c.concept_id 
                                                                from
                                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                                join
                                                                    (
                                                                        select
                                                                            cast(cr.id as string) as id 
                                                                        from
                                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                        where
                                                                            domain_id = 'CONDITION' 
                                                                            and is_standard = 0 
                                                                            and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                                            and is_selectable = 1 
                                                                            and full_text like '%[condition_rank1]%'
                                                                    ) a 
                                                                        on (
                                                                            c.path like concat('%.',
                                                                        a.id,
                                                                        '.%') 
                                                                        or c.path like concat('%.',
                                                                        a.id) 
                                                                        or c.path like concat(a.id,
                                                                        '.%')) 
                                                                    where
                                                                        domain_id = 'CONDITION' 
                                                                        and is_standard = 0 
                                                                        and is_selectable = 1
                                                                    )
                                                            )
                                                        ) criteria 
                                                ) 
                                        )
                                    )
                            ) d_exposure 
                        LEFT JOIN
                            `""" + os.environ["WORKSPACE_CDR"] + """.concept` d_route 
                                on d_exposure.ROUTE_CONCEPT_ID = d_route.CONCEPT_ID 
                        LEFT JOIN
                            `""" + os.environ["WORKSPACE_CDR"] + """.concept` d_type 
                                on d_exposure.drug_type_concept_id = d_type.CONCEPT_ID 
                        left join
                            `""" + os.environ["WORKSPACE_CDR"] + """.concept` d_standard_concept 
                                on d_exposure.DRUG_CONCEPT_ID = d_standard_concept.CONCEPT_ID 
                        LEFT JOIN
                            `""" + os.environ["WORKSPACE_CDR"] + """.concept` d_source_concept 
                                on d_exposure.DRUG_SOURCE_CONCEPT_ID = d_source_concept.CONCEPT_ID 
                        left join
                            `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
                                on d_exposure.VISIT_OCCURRENCE_ID = v.VISIT_OCCURRENCE_ID 
                        LEFT JOIN
                            `""" + os.environ["WORKSPACE_CDR"] + """.concept` d_visit 
                                on v.VISIT_CONCEPT_ID = d_visit.CONCEPT_ID"""

# Run the drug query with the standard SQL dialect and load the result into a DataFrame.
dataset_64469718_drug_df = pandas.read_gbq(
    dataset_64469718_drug_sql,
    dialect="standard",
    progress_bar_type="tqdm_notebook",
)

# Preview the first five rows as the cell output.
dataset_64469718_drug_df.head(5)

In [None]:
# Shorter working name for the drug query result (same DataFrame object, not a copy).
drug_df = dataset_64469718_drug_df

In [None]:
# Normalize the column names; a later cell sorts by the column name `person_id`.
# NOTE(review): `janitor` must already be imported by an earlier cell - the import is not visible here.
drug_df = janitor.clean_names(drug_df)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = drug_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'drug_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name (os.getenv returns None when WORKSPACE_BUCKET is not set)
my_bucket = os.getenv('WORKSPACE_BUCKET')

if not my_bucket:
    # Without a bucket name the copy target would be the literal 'None/data/', so skip the upload.
    print(f'[ERROR] WORKSPACE_BUCKET is not set; {destination_filename} was saved locally but NOT uploaded.')
else:
    # copy csv file to the bucket; os.system returns the command's exit status (0 means success)
    copy_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
    if copy_status == 0:
        print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')
    else:
        # Report the failure instead of claiming success when gsutil did not complete.
        print(f'[ERROR] gsutil exited with status {copy_status}; {destination_filename} was NOT uploaded to your bucket.')


In [None]:
# Print a summary of drug_df (column names, non-null counts, dtypes) to check what the query returned.
drug_df.info()

In [None]:
# Show the first 60 rows ordered by person_id so each participant's records appear together.
(
    drug_df
    .sort_values(by='person_id')
    .head(n=60)
)

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "measurement" and was generated for All of Us Registered Tier Dataset v4
#
# The SQL text is assembled by string concatenation: every table reference is prefixed with the
# value of the WORKSPACE_CDR environment variable (os.environ[...] raises KeyError if it is unset).
#
# Query outline:
#   * Inner SELECT: rows of `measurement` where
#       - measurement_concept_id is one of the `cb_criteria` concepts located under the listed
#         MEASUREMENT concept ids, and
#       - PERSON_ID is in `cb_search_person`, limited to persons having `cb_search_all_events` rows
#         that match the listed CONDITION concept ids (standard ids UNION ALL source ids).
#   * Outer SELECT: left-joins `concept` to resolve the value, measurement type and standard
#     concept names, and left-joins `visit_occurrence` + `concept` to resolve the visit concept name.
dataset_64469718_measurement_sql = """
    SELECT
        measurement.RANGE_HIGH,
        measurement.RANGE_LOW,
        measurement.MEASUREMENT_DATETIME,
        measurement.PERSON_ID,
        measurement.VALUE_AS_NUMBER,
        m_value.concept_name as VALUE_AS_CONCEPT_NAME,
        m_type.concept_name as MEASUREMENT_TYPE_CONCEPT_NAME,
        m_standard_concept.concept_name as STANDARD_CONCEPT_NAME,
        m_standard_concept.vocabulary_id as STANDARD_VOCABULARY,
        m_visit.concept_name as VISIT_OCCURRENCE_CONCEPT_NAME 
    from
        ( SELECT
            * 
        from
            `""" + os.environ["WORKSPACE_CDR"] + """.measurement` measurement 
        WHERE
            (
                measurement_concept_id in  (
                    select
                        distinct c.concept_id 
                    from
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                    join
                        (
                            select
                                cast(cr.id as string) as id 
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                            where
                                domain_id = 'MEASUREMENT' 
                                and is_standard = 1 
                                and concept_id in (
                                    3034806, 3012030, 3020825, 3024128, 3018645, 3022675, 3035995, 3021304, 3013682, 3020874, 3033575, 3005333, 3023599, 3024929, 3021601, 3045716, 3009537, 3037235, 3027796, 3000131, 3023006, 3036277, 3002187, 3003888, 3013650, 3015731, 3005235, 3025260, 3011948, 3027300, 3036780, 3001915, 3014599, 3027597, 3007490, 3004327, 3024180, 3005136, 3024469, 3016723, 3001965, 3009744, 3009745, 3037475, 3010702, 3015632, 3012494, 3024171, 3004501, 3001247, 3002527, 3012932, 3020934, 3036312, 3023314, 3022100, 3024561, 3013429, 3036857, 3020630, 3000876, 3015411, 3009201, 3028615, 3006451, 3024395, 3008342, 3035362, 3016459, 3011397, 3026212, 3007070, 3000348, 3008450, 3016031, 3020416, 3010457, 37023425, 3028352, 3002479, 3024149, 3008401, 3011951, 3006906, 3012711, 3026915, 3023430, 3043730, 3019897, 3023646, 3001539, 3019550, 3007980, 3014133, 3006923, 3013043, 3007757, 3038205, 3024009, 3027114, 3022217, 3021226, 3022192, 3037511, 3006594, 3013721, 3027231, 3007015, 3022260, 3038553, 3006734, 3028668, 3036067, 3013101, 3013869, 3043111, 3011177, 3015942, 3025315, 3011360, 3033335, 3000963, 3027794, 40783188, 3018834, 3000905, 3023351, 3014126, 3015142, 40796119, 3015076, 3016835
                                ) 
                                and is_selectable = 1 
                                and full_text like '%[measurement_rank1]%'
                        ) a 
                            on (
                                c.path like concat('%.',
                            a.id,
                            '.%') 
                            or c.path like concat('%.',
                            a.id) 
                            or c.path like concat(a.id,
                            '.%')) 
                        where
                            domain_id = 'MEASUREMENT' 
                            and is_standard = 1 
                            and is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        select
                            person_id  
                        from
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                        where
                            cb_search_person.person_id in (
                                select
                                    criteria.person_id 
                                from
                                    (select
                                        distinct person_id,
                                        entry_date,
                                        concept_id 
                                    from
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                    where
                                        person_id in (
                                            select
                                                person_id 
                                            from
                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                            where
                                                is_standard = 1 
                                                and concept_id in (
                                                    select
                                                        distinct c.concept_id 
                                                    from
                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                    join
                                                        (
                                                            select
                                                                cast(cr.id as string) as id 
                                                            from
                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                            where
                                                                domain_id = 'CONDITION' 
                                                                and is_standard = 1 
                                                                and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                                and is_selectable = 1 
                                                                and full_text like '%[condition_rank1]%'
                                                        ) a 
                                                            on (
                                                                c.path like concat('%.',
                                                            a.id,
                                                            '.%') 
                                                            or c.path like concat('%.',
                                                            a.id) 
                                                            or c.path like concat(a.id,
                                                            '.%')) 
                                                        where
                                                            domain_id = 'CONDITION' 
                                                            and is_standard = 1 
                                                            and is_selectable = 1
                                                        )
                                                    union
                                                    all select
                                                        person_id 
                                                    from
                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                    where
                                                        is_standard = 0 
                                                        and concept_id in (
                                                            select
                                                                distinct c.concept_id 
                                                            from
                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                            join
                                                                (
                                                                    select
                                                                        cast(cr.id as string) as id 
                                                                    from
                                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                    where
                                                                        domain_id = 'CONDITION' 
                                                                        and is_standard = 0 
                                                                        and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                                        and is_selectable = 1 
                                                                        and full_text like '%[condition_rank1]%'
                                                                ) a 
                                                                    on (
                                                                        c.path like concat('%.',
                                                                    a.id,
                                                                    '.%') 
                                                                    or c.path like concat('%.',
                                                                    a.id) 
                                                                    or c.path like concat(a.id,
                                                                    '.%')) 
                                                                where
                                                                    domain_id = 'CONDITION' 
                                                                    and is_standard = 0 
                                                                    and is_selectable = 1
                                                                )
                                                        )
                                                    ) criteria 
                                            ) 
                                    )
                                )
                        ) measurement 
                    left join
                        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_value 
                            on measurement.value_as_concept_id = m_value.concept_id 
                    left join
                        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_type 
                            on measurement.measurement_type_concept_id = m_type.concept_id 
                    left join
                        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_standard_concept 
                            on measurement.measurement_concept_id = m_standard_concept.concept_id 
                    left join
                        `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
                            on measurement.visit_occurrence_id = v.visit_occurrence_id 
                    left join
                        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_visit 
                            on v.visit_concept_id = m_visit.concept_id"""

# Run the measurement query with the standard SQL dialect and load the result into a DataFrame.
dataset_64469718_measurement_df = pandas.read_gbq(
    dataset_64469718_measurement_sql,
    dialect="standard",
    progress_bar_type="tqdm_notebook",
)

# Preview the first five rows as the cell output.
dataset_64469718_measurement_df.head(5)

In [None]:
# Shorter working name for the measurement query result (same DataFrame object, not a copy).
measurement_df = dataset_64469718_measurement_df

In [None]:
# Normalize the column names; a later cell sorts by the column name `person_id`.
# NOTE(review): `janitor` must already be imported by an earlier cell - the import is not visible here.
measurement_df = janitor.clean_names(measurement_df)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = measurement_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'measurement_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name (os.getenv returns None when WORKSPACE_BUCKET is not set)
my_bucket = os.getenv('WORKSPACE_BUCKET')

if not my_bucket:
    # Without a bucket name the copy target would be the literal 'None/data/', so skip the upload.
    print(f'[ERROR] WORKSPACE_BUCKET is not set; {destination_filename} was saved locally but NOT uploaded.')
else:
    # copy csv file to the bucket; os.system returns the command's exit status (0 means success)
    copy_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
    if copy_status == 0:
        print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')
    else:
        # Report the failure instead of claiming success when gsutil did not complete.
        print(f'[ERROR] gsutil exited with status {copy_status}; {destination_filename} was NOT uploaded to your bucket.')


In [None]:
# Print a summary of measurement_df (column names, non-null counts, dtypes) to check what the query returned.
measurement_df.info()

In [None]:
# Show the first 60 rows ordered by person_id so each participant's records appear together.
(
    measurement_df
    .sort_values(by='person_id')
    .head(n=60)
)

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "observation" and was generated for All of Us Registered Tier Dataset v4
#
# The SQL text is assembled by splicing os.environ["WORKSPACE_CDR"] (used as the
# prefix of every table name) between literal fragments, so a KeyError is raised
# here if that environment variable is not set.
#
# Query outline:
#   * Inner SELECT: every column of `ds_observation` for rows whose
#     observation_concept_id is in the listed ids AND whose PERSON_ID passes the
#     cohort filter.
#   * Cohort filter: person_ids in `cb_search_person` that have events in
#     `cb_search_all_events` whose concept_id is selected from `cb_criteria`
#     (domain CONDITION, selectable) -- criteria rows whose `path` contains, as
#     a dot-separated segment, the id of one of the listed rank-1 condition
#     criteria. It is evaluated once with is_standard = 1 and once with
#     is_standard = 0, combined with UNION ALL.
#   * Outer LEFT JOINs: look up concept_name in `concept` for the unit, value,
#     type, qualifier and observation concept ids, and (through
#     `visit_occurrence`) for the visit concept id.
# NOTE(review): the concept-id lists presumably encode the asthma cohort implied
# by the dataset name -- confirm against the cohort definition.
dataset_64469718_observation_sql = """
    SELECT
        observation.VALUE_AS_STRING,
        observation.OBSERVATION_DATETIME,
        observation.PERSON_ID,
        observation.VALUE_AS_NUMBER,
        o_unit.concept_name as UNIT_CONCEPT_NAME,
        o_value.concept_name as VALUE_AS_CONCEPT_NAME,
        o_type.concept_name as OBSERVATION_TYPE_CONCEPT_NAME,
        o_qualifier.concept_name as QUALIFIER_CONCEPT_NAME,
        o_standard_concept.concept_name as STANDARD_CONCEPT_NAME,
        o_standard_concept.vocabulary_id as STANDARD_VOCABULARY,
        o_visit.concept_name as VISIT_OCCURRENCE_CONCEPT_NAME 
    from
        ( SELECT
            * 
        from
            `""" + os.environ["WORKSPACE_CDR"] + """.ds_observation` observation 
        WHERE
            (
                observation_concept_id in (
                    1585636, 40766240, 1586182, 3035281, 1586213, 3022304, 40764347, 40771091, 3046344, 3046853, 40771090, 40766609, 1585389, 1586166, 4087925, 3007191, 1586140, 3046965, 1585370, 1585886, 1586174, 40766306, 46235933, 43530559
                )
            )  
            AND (
                observation.PERSON_ID IN (
                    select
                        person_id  
                    from
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                    where
                        cb_search_person.person_id in (
                            select
                                criteria.person_id 
                            from
                                (select
                                    distinct person_id,
                                    entry_date,
                                    concept_id 
                                from
                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                where
                                    person_id in (
                                        select
                                            person_id 
                                        from
                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                        where
                                            is_standard = 1 
                                            and concept_id in (
                                                select
                                                    distinct c.concept_id 
                                                from
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                join
                                                    (
                                                        select
                                                            cast(cr.id as string) as id 
                                                        from
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                        where
                                                            domain_id = 'CONDITION' 
                                                            and is_standard = 1 
                                                            and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                            and is_selectable = 1 
                                                            and full_text like '%[condition_rank1]%'
                                                    ) a 
                                                        on (
                                                            c.path like concat('%.',
                                                        a.id,
                                                        '.%') 
                                                        or c.path like concat('%.',
                                                        a.id) 
                                                        or c.path like concat(a.id,
                                                        '.%')) 
                                                    where
                                                        domain_id = 'CONDITION' 
                                                        and is_standard = 1 
                                                        and is_selectable = 1
                                                    )
                                                union
                                                all select
                                                    person_id 
                                                from
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                                where
                                                    is_standard = 0 
                                                    and concept_id in (
                                                        select
                                                            distinct c.concept_id 
                                                        from
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                        join
                                                            (
                                                                select
                                                                    cast(cr.id as string) as id 
                                                                from
                                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                                where
                                                                    domain_id = 'CONDITION' 
                                                                    and is_standard = 0 
                                                                    and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                                    and is_selectable = 1 
                                                                    and full_text like '%[condition_rank1]%'
                                                            ) a 
                                                                on (
                                                                    c.path like concat('%.',
                                                                a.id,
                                                                '.%') 
                                                                or c.path like concat('%.',
                                                                a.id) 
                                                                or c.path like concat(a.id,
                                                                '.%')) 
                                                            where
                                                                domain_id = 'CONDITION' 
                                                                and is_standard = 0 
                                                                and is_selectable = 1
                                                            )
                                                    )
                                                ) criteria 
                                        ) 
                                )
                            )
                    ) observation 
                LEFT JOIN
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_unit 
                        on observation.unit_concept_id = o_unit.CONCEPT_ID 
                LEFT JOIN
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_value 
                        on observation.value_as_concept_id = o_value.CONCEPT_ID 
                LEFT JOIN
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_type 
                        on observation.OBSERVATION_TYPE_CONCEPT_ID = o_type.CONCEPT_ID 
                LEFT JOIN
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_qualifier 
                        on observation.qualifier_concept_id = o_qualifier.CONCEPT_ID 
                LEFT JOIN
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_standard_concept 
                        on observation.OBSERVATION_CONCEPT_ID = o_standard_concept.CONCEPT_ID 
                left join
                    `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
                        on observation.VISIT_OCCURRENCE_ID = v.VISIT_OCCURRENCE_ID 
                left join
                    `""" + os.environ["WORKSPACE_CDR"] + """.concept` o_visit 
                        on v.visit_concept_id = o_visit.concept_id"""

# Execute the observation query with the standard SQL dialect; progress is
# reported through the tqdm notebook progress bar.
dataset_64469718_observation_df = pandas.read_gbq(
    dataset_64469718_observation_sql,
    dialect="standard",
    progress_bar_type="tqdm_notebook",
)

# Preview the first five rows of the result.
dataset_64469718_observation_df.head(n=5)

In [None]:
# Shorter working name; this binds the same DataFrame object (no copy is made).
observation_df = dataset_64469718_observation_df

In [None]:
# Normalise the column names with janitor.clean_names and keep the result
# under the same working name.
observation_df = janitor.clean_names(
    observation_df
)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket
import subprocess

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = observation_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'observation_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; stop here with a clear error rather than copying to 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file to the bucket.')

# copy csv file to the bucket. The arguments are passed as a list (no shell
# quoting involved) and check=True raises CalledProcessError when gsutil exits
# non-zero, so the success message is only printed after a real upload.
subprocess.run(
    ['gsutil', 'cp', f'./{destination_filename}', f'{my_bucket}/data/'],
    check=True,
)
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
# Show the first 60 rows after ordering the observation table by person_id.
(
    observation_df
    .sort_values(by='person_id')
    .head(n=60)
)

In [None]:
import pandas
import os

# This query represents dataset "Asthma_exacerb_predict" for domain "survey" and was generated for All of Us Registered Tier Dataset v4
#
# The SQL text is assembled by splicing os.environ["WORKSPACE_CDR"] (used as the
# prefix of every table name) between literal fragments, so a KeyError is raised
# here if that environment variable is not set.
#
# Query outline:
#   * SELECT: question, answer, survey_datetime, person_id and survey from
#     `ds_survey`, limited to the three listed question_concept_id values.
#   * Cohort filter on answer.PERSON_ID: person_ids in `cb_search_person` that
#     have events in `cb_search_all_events` whose concept_id is selected from
#     `cb_criteria` (domain CONDITION, selectable) -- criteria rows whose `path`
#     contains, as a dot-separated segment, the id of one of the listed rank-1
#     condition criteria. It is evaluated once with is_standard = 1 and once
#     with is_standard = 0, combined with UNION ALL.
# NOTE(review): the concept-id lists presumably encode the asthma cohort implied
# by the dataset name -- confirm against the cohort definition.
dataset_64469718_survey_sql = """
    SELECT
        answer.question,
        answer.answer,
        answer.survey_datetime,
        answer.person_id,
        answer.survey  
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.ds_survey` answer   
    WHERE
        (
            question_concept_id in (
                43528873, 43530546, 43530388
            )
        )  
        AND (
            answer.PERSON_ID IN (
                select
                    person_id  
                from
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                where
                    cb_search_person.person_id in (
                        select
                            criteria.person_id 
                        from
                            (select
                                distinct person_id,
                                entry_date,
                                concept_id 
                            from
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                            where
                                person_id in (
                                    select
                                        person_id 
                                    from
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                    where
                                        is_standard = 1 
                                        and concept_id in (
                                            select
                                                distinct c.concept_id 
                                            from
                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                            join
                                                (
                                                    select
                                                        cast(cr.id as string) as id 
                                                    from
                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                    where
                                                        domain_id = 'CONDITION' 
                                                        and is_standard = 1 
                                                        and concept_id in (764949, 46274062, 4271333, 46270573, 4146581, 46273487, 46270028, 45768963, 42539549, 46269770, 4142738, 45768910, 443801, 4145356, 4152913, 45769350, 46273454, 4279553, 45769443, 317009, 314754, 4138760, 252946, 46269783, 46269790, 252658, 46269786, 45768965, 4051466, 46273462, 4245676, 252942, 4215802, 45772937, 40481763, 37108581, 4075237, 256448, 46269802, 4141978, 764677, 46270322, 4125022, 46269776, 45768964, 4233784, 4155468, 37310241, 46273635, 4207479, 4123253, 45769352, 4143828, 42535716, 257581, 46273452, 45771045, 46269778, 312950, 45769351, 4309833, 45768911, 36684328, 4308356, 313236, 46269789, 45768912, 761844, 42536208, 4145497, 4312524, 46270030, 46269787, 40483397, 45769438, 45769441, 43530693, 45773005, 4206340, 45766728, 4194289, 46270082, 45769442, 42538744, 4191479, 46269784, 4155469, 37116845) 
                                                        and is_selectable = 1 
                                                        and full_text like '%[condition_rank1]%'
                                                ) a 
                                                    on (
                                                        c.path like concat('%.',
                                                    a.id,
                                                    '.%') 
                                                    or c.path like concat('%.',
                                                    a.id) 
                                                    or c.path like concat(a.id,
                                                    '.%')) 
                                                where
                                                    domain_id = 'CONDITION' 
                                                    and is_standard = 1 
                                                    and is_selectable = 1
                                                )
                                            union
                                            all select
                                                person_id 
                                            from
                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                            where
                                                is_standard = 0 
                                                and concept_id in (
                                                    select
                                                        distinct c.concept_id 
                                                    from
                                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                    join
                                                        (
                                                            select
                                                                cast(cr.id as string) as id 
                                                            from
                                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                            where
                                                                domain_id = 'CONDITION' 
                                                                and is_standard = 0 
                                                                and concept_id in (46270322, 4155468, 46269789, 45572168, 45768965, 313236, 45768911, 4207479, 46273454, 44820889, 46270573, 44834769, 45601133, 45771045, 46269802, 4194289, 45567266, 44824288, 1569490, 45768912, 44829012, 4215802, 46269784, 44821988, 46270030, 42539549, 312950, 46269786, 764949, 4051466, 4245676, 45543269, 45543270, 45557626, 45769351, 4143828, 4152913, 45769442, 46273635, 45772937, 45548117, 4191479, 44832423, 4125022, 44833611, 45591559, 45576951, 4271333, 317009, 37310241, 4206340, 45548116, 45572170, 45562457, 4146581, 4123253, 256448, 46269776, 45768964, 46269787, 46273452, 44823144, 4312524, 4279553, 43530693, 42535716, 44830115, 46273487, 45769438, 4138760, 4155469, 36684328, 44831280, 1569491, 46269770, 40483397, 1569488, 44831278, 45576952, 45567265, 44824289, 44826679, 4141978, 45601134, 4145356, 43530700, 45773005, 4144757, 45581859, 45769441, 46269778, 4309833, 46269783, 4142738, 44831279, 4308356, 37116845, 44832424, 45581860, 257581, 44828510, 45768963, 764677, 44837136, 252658, 4145497, 46270028, 1569489, 46269790, 46273462, 45766728, 46270082, 44837135, 42536208, 42538744, 45768910, 37108581, 44821987, 35225323, 314754, 45586675, 40410639, 44824287, 761844, 45769352, 4075237, 45591558, 443801, 45572169, 45572171, 46274062, 4233784) 
                                                                and is_selectable = 1 
                                                                and full_text like '%[condition_rank1]%'
                                                        ) a 
                                                            on (
                                                                c.path like concat('%.',
                                                            a.id,
                                                            '.%') 
                                                            or c.path like concat('%.',
                                                            a.id) 
                                                            or c.path like concat(a.id,
                                                            '.%')) 
                                                        where
                                                            domain_id = 'CONDITION' 
                                                            and is_standard = 0 
                                                            and is_selectable = 1
                                                        )
                                                )
                                            ) criteria 
                                    ) 
                            )
                        )"""

# Execute the survey query with the standard SQL dialect; progress is reported
# through the tqdm notebook progress bar.
dataset_64469718_survey_df = pandas.read_gbq(
    dataset_64469718_survey_sql,
    dialect="standard",
    progress_bar_type="tqdm_notebook",
)

# Preview the first five rows of the result.
dataset_64469718_survey_df.head(n=5)

In [None]:
# Shorter working name; this binds the same DataFrame object (no copy is made).
survey_df = dataset_64469718_survey_df

In [None]:
# Normalise the column names with janitor.clean_names and keep the result
# under the same working name.
survey_df = janitor.clean_names(
    survey_df
)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket
import subprocess

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = survey_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'survey_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; stop here with a clear error rather than copying to 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file to the bucket.')

# copy csv file to the bucket. The arguments are passed as a list (no shell
# quoting involved) and check=True raises CalledProcessError when gsutil exits
# non-zero, so the success message is only printed after a real upload.
subprocess.run(
    ['gsutil', 'cp', f'./{destination_filename}', f'{my_bucket}/data/'],
    check=True,
)
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
# Print the column names, non-null counts and dtypes of the survey table.
survey_df.info()

In [None]:
# Show the first 60 rows after ordering the survey table by person_id.
(
    survey_df
    .sort_values(by='person_id')
    .head(n=60)
)

# Loading each saved table from the bucket into the workspace as a data frame

In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'person_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'condition_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'measurement_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'drug_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'survey_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


In [None]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe
import subprocess

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'observation_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; stop here with a clear error rather than copying from 'None/data/'
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the csv file from the bucket.')

# copy csv file from the bucket to the current working space. check=True raises
# CalledProcessError when gsutil exits non-zero, so a failed download can never
# be followed by silently reading a stale local copy of the file.
subprocess.run(
    ['gsutil', 'cp', f'{my_bucket}/data/{name_of_file_in_bucket}', '.'],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()


# Cleaning each table separately

In [None]:
import pandas as pd

# Load each table from the CSV file of the same name in the current working
# directory, one DataFrame per table.
person_df = pd.read_csv('person_df.csv')
condition_df = pd.read_csv('condition_df.csv')
measurement_df = pd.read_csv('measurement_df.csv')
drug_df = pd.read_csv('drug_df.csv')
survey_df = pd.read_csv('survey_df.csv')
observation_df = pd.read_csv('observation_df.csv')

## cleaning person data

In [None]:
import pandas as pd

# Read the person table from its CSV file in the current working directory.
person_df = pd.read_csv(
    'person_df.csv'
)

In [None]:
# Display the last 60 rows of the person table.
person_df.tail(n=60)

## Cleaning and reshaping observation data

In [None]:
# Read the observation table from its CSV file in the current working directory.
observation_df = pd.read_csv(
    'observation_df.csv'
)

In [None]:
# Number of distinct person_id values present in the observation table.
observation_df['person_id'].nunique()

In [None]:
# Remove every row whose answer is the literal 'PMI: Skip'. Rows are dropped by
# index label, exactly as selected by the boolean mask.
is_skipped = observation_df['value_as_concept_name'] == 'PMI: Skip'
observation_df = observation_df.drop(
    index=observation_df.index[is_skipped.to_numpy()]
)

In [None]:
# Long -> wide: one row per (observation_datetime, person_id), one column per
# standard_concept_name; when a cell has several answers the first one is kept.
observation_df = (
    observation_df
    .pivot_table(
        index=['observation_datetime', 'person_id'],
        columns='standard_concept_name',
        values='value_as_concept_name',
        aggfunc='first',
    )
    .reset_index()
)

In [None]:
# Short column names for the pivoted concept-name columns.
# NOTE(review): a few aliases look misspelled or mislabelled ('electronic_cigarate_smoke',
# 'resreational_drug', and '..._7yrs' for a "past 7 days" question); they are kept as-is
# because later cells or saved csv files may already depend on them -- confirm before renaming.
observation_column_aliases = {
    'Alcohol: 6 or More Drinks Occurrence': 'alcohol6or_more',
    'Are you covered by health insurance or some other kind of health care plan [PhenX]': 'Health_insurance_cov',
    'Cigar Smoking: Cigar Smoke Participant': 'cigar_smoke',
    'Current occupational status [SAMHSA]': 'Current_occupation_status',
    'Electronic Smoking: Electric Smoke Participant': 'electronic_cigarate_smoke',
    'Health Insurance: Health Insurance Type': 'Health_insurance_type',
    'Home Own: Current Home Own': 'current_home_own_rent',
    'Hookah Smoking: Hookah Smoke Participant': 'hookah_smoke',
    'How often have you been bothered by emotional problems such as feeling anxious, depressed or irritable in past 7 days [PROMIS]': 'howmuch_emotional_problem_7yrs',
    'Insurance: Healthcare Coverage': 'insurance_healthcare_coverage',
    'Living Situation: Stable House Concern': 'stable_house_concern',
    'Marital status': 'marital_status',
    'Race': 'race',
    'Race: What Race Ethnicity': 'race_ethnicity',
    'Recreational Drug Use: Which Drugs Used': 'resreational_drug',
    'Sex': 'sex',
    'Smoked at least 100 cigarettes in entire life': 'smoked100cigarettes',
    'Total combined household income range in last year': 'hh_income_last_yr',
    'What is the highest grade or level of schooling you completed [SAMHSA]': 'highest_grade_edu',
}
observation_df = observation_df.rename(columns=observation_column_aliases)

In [None]:
# Preview the first 60 rows ordered by person and then by observation time.
observation_df.sort_values(by=['person_id','observation_datetime']).head(60)

In [None]:
# Drop the columns that are not carried into the wide observation table.
# NOTE(review): the original list named 'race' twice; the repeated entry may have been meant
# to be a different column (e.g. 'race_ethnicity') -- confirm. Removing the repeat itself
# does not change which columns are dropped.
observation_df = observation_df.drop(columns=['sex', 'race', 'Age'])

In [None]:
# observation_df_w is a second name bound to the same dataframe object (no copy is made).
observation_df_w=observation_df

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = observation_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'observation_df_wide.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## Cleaning and reshaping survey data

In [None]:
# Reload the survey table from its csv export.
survey_df = pd.read_csv('survey_df.csv')

In [None]:
# Remove every survey row whose answer is the 'PMI: Skip' placeholder.
skipped_rows = survey_df.index[survey_df['answer'] == 'PMI: Skip']
survey_df = survey_df.drop(index=skipped_rows)

In [None]:
# Long -> wide: one row per (survey_datetime, person_id), one column per question;
# when a cell has several answers the first one is kept.
survey_df_w = (
    survey_df
    .pivot_table(
        index=['survey_datetime', 'person_id'],
        columns='question',
        values='answer',
        aggfunc='first',
    )
    .reset_index()
)

In [None]:
# Preview the first 60 long-format survey rows ordered by person and then by survey time.
survey_df.sort_values(by=['person_id','survey_datetime']).head(60)

In [None]:
# Print the column summary of the wide survey table.
survey_df_w.info()

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = survey_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'survey_df_wide.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## cleaning condition data and creating exacerbation variable

In [None]:
# Reload the condition table from its csv export and print its column summary.
condition_df = pd.read_csv('condition_df.csv')
condition_df.info()

In [None]:
# Number of distinct person_id values in the condition table.
condition_df['person_id'].nunique()

In [None]:
# Distinct values found in the visit_occurrence_concept_name column.
condition_df['visit_occurrence_concept_name'].unique()

In [None]:
# Distinct condition names; the exacerbation flag created further down is derived from this column.
condition_df['standard_concept_name'].unique()

In [None]:
import numpy as np

# Flag rows whose condition name mentions an exacerbation (1) versus all other rows (0).
# case=False also catches names that start with a capital ("Exacerbation of ..."), and
# na=False makes a missing name count as "no match"; without it the NaN produced by
# str.contains is truthy inside np.where and the row would be flagged as 1.
mentions_exacerbation = condition_df['standard_concept_name'].str.contains(
    'exacerbation', case=False, na=False, regex=False
)
condition_df['exacerbation'] = np.where(mentions_exacerbation, 1, 0)

In [None]:
# Preview the first 60 rows ordered by person and then by condition start time.
condition_df.sort_values(by=['person_id','condition_start_datetime']).head(60)

In [None]:
# How many rows carry each value of the exacerbation flag.
condition_df['exacerbation'].value_counts()

In [None]:
# Parse both condition timestamp columns into pandas datetimes.
for datetime_column in ('condition_start_datetime', 'condition_end_datetime'):
    condition_df[datetime_column] = pd.to_datetime(condition_df[datetime_column])

## subsetting, cleaning, and pivoting (wide) measurement data

In [None]:
import pandas as pd

# Reload the measurement table from its csv export.
measurement_df = pd.read_csv('measurement_df.csv')

In [None]:
# Columns removed from the measurement table before it is filtered and reshaped.
dropped_measurement_columns = [
    'range_high',
    'range_low',
    'value_as_concept_name',
    'measurement_type_concept_name',
    'standard_vocabulary',
    'visit_occurrence_concept_name',
]
measurement_df = measurement_df.drop(columns=dropped_measurement_columns)


In [None]:
# Keep only the rows that have a value_as_number, then show the remaining (rows, columns).
has_numeric_value = measurement_df['value_as_number'].notna()
measurement_df = measurement_df[has_numeric_value]
measurement_df.shape

In [None]:
# Keep only the measurements of interest, selected by substring of standard_concept_name.
# Each entry is (text to look for, case_sensitive).
# The text is matched literally (regex=False): as regular expressions, names such as
# 'Albumin [Mass/volume] in Serum or Plasma' treat '[Mass/volume]' as a character class and
# therefore never match the real concept names, silently discarding those rows.
# NOTE(review): 'Basophills' looks misspelled (concept names are presumably 'Basophils ...');
# it is left unchanged here because the labelling cell that follows uses the same spelling and
# both have to be changed together -- confirm and fix in both places.
measurement_name_filters = [
    ('Monocytes', True),
    ('Leukocytes', True),
    ('Neutrophils', True),
    ('Eosinophils', True),
    ('Lymphocytes', True),
    ('Basophills', True),
    ('BMI', True),
    ('Albumin [Mass/volume] in Serum or Plasma', True),
    ('Bilirubin.total [Mass/volume] in Serum or Plasma', True),
    ('Bilirubin.total [Mass/volume] in Blood', True),
    ('IgE', False),
    ('Hemoglobin [Mass/volume] in Blood', True),
    ('Nitrite', False),
    ('Protein', False),
    ('Urobilinogen', False),
]

concept_names = measurement_df['standard_concept_name']
keep_row = pd.Series(False, index=measurement_df.index)
for name_fragment, case_sensitive in measurement_name_filters:
    # na=False: a row without a concept name can never be one of the wanted measurements.
    keep_row |= concept_names.str.contains(name_fragment, case=case_sensitive, regex=False, na=False)

measurement_df = measurement_df[keep_row]

In [None]:
import numpy as np

# Collapse the detailed concept names into one short label per analyte.
# Each entry is (text to look for, case_sensitive, label); np.select assigns the label of the
# FIRST entry that matches, so the order of this list decides ties.
# NOTE(review): 'Basophills' looks misspelled (concept names are presumably 'Basophils ...');
# it is kept because the filtering cell before this one uses the same spelling -- confirm and
# fix in both places together.
measurement_label_rules = [
    ('Monocytes', True, 'Monocytes'),
    ('Leukocytes', True, 'Leukocytes'),
    ('Neutrophils', True, 'Neutrophils'),
    ('Eosinophils', True, 'Eosinophils'),
    ('Lymphocytes', True, 'Lymphocytes'),
    ('Basophills', True, 'Basophills'),
    ('BMI', True, 'BMI'),
    ('Albumin', True, 'Albumin'),
    ('Bilirubin', True, 'Bilirubin'),
    ('IgE', False, 'IgE'),
    ('Hemoglobin', True, 'Hemoglobin'),
    ('Nitrite', False, 'Nitrite'),
    ('Protein', False, 'Protein'),
    ('Urobilinogen', False, 'Urobilinogen'),
]

concept_names = measurement_df['standard_concept_name']
# na=False keeps every condition a plain boolean array even when a concept name is missing.
conditions = [
    concept_names.str.contains(name_fragment, case=case_sensitive, regex=False, na=False)
    for name_fragment, case_sensitive, _ in measurement_label_rules
]
values = [label for _, _, label in measurement_label_rules]

# An explicit string default keeps the labels and the fallback the same type; np.select's
# implicit default is the integer 0, which does not combine cleanly with string labels.
measurement_df['variable'] = np.select(conditions, values, default='other')

In [None]:
# Long -> wide: one row per (measurement_datetime, person_id), one column per label in
# 'variable'. No aggfunc is given, so pivot_table's default (the mean) combines values that
# land in the same cell.
measurement_df = (
    measurement_df
    .pivot_table(
        index=['measurement_datetime', 'person_id'],
        columns='variable',
        values='value_as_number',
    )
    .reset_index()
)

In [None]:
# Preview the first 60 rows ordered by person and then by measurement time.
measurement_df.sort_values(by=['person_id','measurement_datetime']).head(60)

In [None]:
# measurement_df_w is a second name bound to the same dataframe object (no copy is made).
measurement_df_w=measurement_df

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = measurement_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'measurement_df_wide.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## cleaning drug data

In [None]:
import pandas as pd

# Reload the drug exposure table from its csv export.
drug_df = pd.read_csv('drug_df.csv')

In [None]:
# Preview the first 60 rows ordered by person and then by drug exposure start time.
drug_df.sort_values(by=['person_id','drug_exposure_start_datetime']).head(60)

In [None]:
# Print every distinct drug name, one per line, so none is cut off by the notebook's
# array display.
for val in drug_df['standard_concept_name'].unique():
    print(val)

In [None]:
# Replace drug_df with the result of DataFrame.convert_dtypes().
drug_df=drug_df.convert_dtypes()

In [None]:

# Keep only the rows that have a drug name, then show the remaining (rows, columns).
has_drug_name = drug_df['standard_concept_name'].notna()
drug_df = drug_df[has_drug_name]
drug_df.shape

In [None]:
import numpy as np

# Ingredient keywords for each drug class. A drug name belongs to a class when it contains
# any of the class keywords (case-insensitive substring match).
SABA = ['albuterol', 'ipratropium', 'Levalbuterol', 'Terbutaline', 'Pirbuterol', 'Salbutamol']
Inhaled_corticosteroids = ['budesonide', 'Beclomethasone', 'Pulmicort', 'Ciclesonide', 'Flunisolide',
                           'Fluticasone', 'Mometasone']
# 'Olodaterol' and 'vilanterol' were misspelled ('Olodaerol' never matched anything).
# Arformoterol and Carmoterol are long-acting beta agonists, so they are listed here rather
# than with the leukotriene modifiers. 'Fluticasone' was removed from this list: it is an
# inhaled corticosteroid and that class is tested first, so the entry could never take effect.
LABA = ['formoterol', 'salmeterol', 'vilanterol', 'Indacaterol', 'Olodaterol',
        'Arformoterol', 'Carmoterol']
Leukotriene_modifiers = ['Montelukast', 'Zafirlukast', 'Zileuton']
Methylxanthines_and_Monoclonals = ['Theophylline', 'Omalizumab', 'Aminophylline']
# 'methylprednisolone' was removed from this list: it is a corticosteroid, and because this
# class is tested before the systemic corticosteroids it was being labelled an anti-histamine.
# Repeated 'Levocetirizine' entries were collapsed to one.
Anti_histamines = ['Azelastine', 'Phenylephrine', 'diphenhydramine',
                   'Chlorpheniramine', 'Desloratadine', 'Fexofenadine', 'Levocetirizine',
                   'Loratadine', 'Cetirizine', 'pseudoephedrine', 'Oxymetazoline',
                   'Tetrahydrozoline', 'Olopatadine', 'Ketotifen', 'Triprolidine', 'Cromoglicic',
                   'Xylometazoline', 'Antazoline']
Syst_corticosteroids = ['cortisone', 'prednisone', 'prednisolone', 'methylprednisolone',
                        'dexamethasone', 'betamethasone', 'hydrocortisone', 'Triamcinolone']
Sympathomimetics = ['Phenylpropanolamine', 'Amphetamine', 'Methoxamine', 'Epinephrine', 'Metaraminol',
                    'Labetalol', 'Phenylephrine', 'Norepinephrine', 'Midodrine', 'Phentermine']

# Class label -> keywords. np.select assigns the FIRST class whose condition is true, so the
# order of this mapping decides the label of drugs that match several classes
# (e.g. combination products).
drug_class_keywords = {
    'SABA': SABA,
    'Inhaled corticosteroids': Inhaled_corticosteroids,
    'LABA': LABA,
    'Leukotriene modifiers': Leukotriene_modifiers,
    'Methylxanthines and Monoclonals': Methylxanthines_and_Monoclonals,
    'Anti-histamines': Anti_histamines,
    'Syst_corticosteroids': Syst_corticosteroids,
    'Sympathomimetics': Sympathomimetics,
}

drug_names = drug_df['standard_concept_name'].astype(str)
conditions = [
    drug_names.str.contains('|'.join(keywords), case=False, na=False)
    for keywords in drug_class_keywords.values()
]
choices = list(drug_class_keywords)

# Names that match no class are labelled 'other'.
drug_df['drug_class'] = np.select(conditions, choices, 'other')

In [None]:
# Number of rows assigned to each drug class.
# (Uncomment the next line to inspect the names that fell into 'other'.)
#drug_df[drug_df['drug_class']=='other'].head(60)
drug_df['drug_class'].value_counts()

In [None]:
# drug_df_w is a second name bound to the same dataframe object (no copy is made).
# NOTE(review): no pivot happens between loading drug_df and this line, so the "_w"/"wide"
# naming appears to be kept only for symmetry with the other tables -- confirm.
drug_df_w=drug_df

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = drug_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'drug_df_wide.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
#!pip install --upgrade pip
#!pip install MedCodes
from medcodes.drugs.standardization import Drug

# Merging the cleaned data frames

## merge person data to condition data

In [None]:
import pandas as pd

# Reload both tables from disk and join them on person_id; the inner join keeps only
# people that appear in both files.
person_df = pd.read_csv('person_df.csv')
condition_df = pd.read_csv('condition_df.csv')
person_cond = person_df.merge(condition_df, on='person_id', how='inner')

In [None]:
# Print the column summary of the person + condition table.
person_cond.info()

## merge person_condition data to observation data

In [None]:
# Load the wide observation table saved by the cleaning section.
observation_df_wide = pd.read_csv('observation_df_wide.csv')

In [None]:
# Print the column summary of the wide observation table.
observation_df_wide.info()

In [None]:
# For each person, count how many rows share each observation_datetime.
observation_df_wide.groupby('person_id').observation_datetime.value_counts()

In [None]:

# Parse the timestamp columns of both tables into pandas datetimes.
person_cond['condition_start_datetime'] = pd.to_datetime(
    person_cond['condition_start_datetime']
)
observation_df_wide['observation_datetime'] = pd.to_datetime(
    observation_df_wide['observation_datetime']
)

In [None]:
# Order the person + condition rows by condition start time.
person_cond = person_cond.sort_values(by='condition_start_datetime')

In [None]:

# Attach the wide observation columns to every condition row of the same person.
# NOTE(review): the join key is person_id only, so each condition row is paired with every
# observation row of that person -- confirm this row multiplication is intended.
person_cond_obsv = person_cond.merge(observation_df_wide, on='person_id', how='inner')

In [None]:
# Preview how condition and observation timestamps line up per person (first 100 rows).
person_cond_obsv[['person_id','condition_start_datetime',
                  'observation_datetime']].sort_values(by=['person_id',
                                                           'condition_start_datetime',
                                                           'observation_datetime']).head(100)

In [None]:
# Display the merged table ordered by person, condition start and observation time
# (the sorted result is only shown, not assigned back).
person_cond_obsv.sort_values(by=['person_id','condition_start_datetime','observation_datetime'])

## merge person_condition_observ data to measurement data

In [None]:
# Order the rows by condition start time ahead of the time-based merge further down.
person_cond_obsv = person_cond_obsv.sort_values(by='condition_start_datetime')

In [None]:
# Make sure condition_start_datetime is a pandas datetime column.
person_cond_obsv['condition_start_datetime']=pd.to_datetime(person_cond_obsv['condition_start_datetime'])

In [None]:
# Remove the stop_reason column from the merged table.
person_cond_obsv = person_cond_obsv.drop(columns=['stop_reason'])

In [None]:
import numpy as np

# Flag rows whose condition name mentions an exacerbation (1) versus all other rows (0).
# case=False also catches names that start with a capital ("Exacerbation of ..."), and
# na=False makes a missing name count as "no match"; without it the NaN produced by
# str.contains is truthy inside np.where and the row would be flagged as 1.
mentions_exacerbation = person_cond_obsv['standard_concept_name'].str.contains(
    'exacerbation', case=False, na=False, regex=False
)
person_cond_obsv['exacerbation'] = np.where(mentions_exacerbation, 1, 0)

In [None]:
# Load the wide measurement table saved by the cleaning section.
measurement_df_wide = pd.read_csv('measurement_df_wide.csv')

In [None]:
# Parse the measurement timestamp into a pandas datetime.
measurement_df_wide['measurement_datetime'] = pd.to_datetime(
    measurement_df_wide['measurement_datetime']
)

In [None]:
# For every measurement, attach the same person's next condition row (direction='forward',
# exact timestamp matches allowed) that starts within five years of the measurement.
# pd.Timedelta does not accept the ambiguous year unit ("5y"), so the window is spelled out
# in days, approximating a year as 365.25 days.
measurement_to_condition_window = pd.Timedelta(days=365.25 * 5)

# merge_asof requires both inputs to be sorted on their asof key; sorting here makes the cell
# independent of how the frames were ordered earlier. The stable mergesort keeps the existing
# relative order of rows that share a timestamp.
person_cond_obsv_measu = pd.merge_asof(
    measurement_df_wide.sort_values('measurement_datetime', kind='mergesort'),
    person_cond_obsv.sort_values('condition_start_datetime', kind='mergesort'),
    left_on='measurement_datetime',
    right_on='condition_start_datetime',
    left_by=['person_id'],
    right_by=['person_id'],
    allow_exact_matches=True,
    direction='forward',
    tolerance=measurement_to_condition_window,
)

In [None]:
# Preview the key timestamp columns of the first 500 rows, ordered by person, measurement
# time and condition start.
person_cond_obsv_measu.sort_values(by=['person_id','measurement_datetime',
                     'condition_start_datetime']).head(500)[['person_id','measurement_datetime','observation_datetime',
                                                             'condition_start_datetime','standard_concept_name']]


In [None]:
# Spot-check the merge result for one participant (person_id 1000109).
person_cond_obsv_measu[person_cond_obsv_measu['person_id']==1000109][['person_id','measurement_datetime','observation_datetime',
                                  'condition_start_datetime','standard_concept_name']]

In [None]:
# Preview only the measurements that were matched to a condition row (rows without a
# condition_start_datetime are left out of this view; nothing is assigned back).
person_cond_obsv_measu.dropna(subset=['condition_start_datetime']).sort_values(by=['person_id',
                                                                 'condition_start_datetime']) [['person_id',
                                                                                                'measurement_datetime','observation_datetime',
                                                                                                'condition_start_datetime','standard_concept_name']].head(200)

In [None]:
# Keep only the rows that have a condition_start_datetime.
has_condition_start = person_cond_obsv_measu['condition_start_datetime'].notna()
person_cond_obsv_measu = person_cond_obsv_measu[has_condition_start]

In [None]:
# Print the column summary of the person + condition + observation + measurement table.
person_cond_obsv_measu.info()

## merge person_condition_observation_measurement data to drug data

In [None]:
# Load the classified drug table saved by the cleaning section.
drug_df_wide = pd.read_csv('drug_df_wide.csv')

In [None]:
# Parse the drug exposure start timestamp into a pandas datetime.
drug_df_wide['drug_exposure_start_datetime'] = pd.to_datetime(
    drug_df_wide['drug_exposure_start_datetime']
)

In [None]:
# Order the drug rows by exposure start time ahead of the time-based merge further down.
drug_df_wide = drug_df_wide.sort_values(by='drug_exposure_start_datetime')

In [None]:
# Order the rows by condition start time ahead of the time-based merge further down.
person_cond_obsv_measu = person_cond_obsv_measu.sort_values(by='condition_start_datetime')

In [None]:
# For every condition row, attach the same person's most recent drug exposure
# (direction='backward', exact timestamp matches allowed) that started at most ten years
# before the condition.
# pd.Timedelta does not accept the ambiguous year unit ("10y"), so the window is spelled out
# in days, approximating a year as 365.25 days.
condition_to_drug_window = pd.Timedelta(days=365.25 * 10)

# merge_asof requires both inputs to be sorted on their asof key; sorting here makes the cell
# independent of how the frames were ordered earlier. The stable mergesort keeps the existing
# relative order of rows that share a timestamp.
person_cond_obsv_measu_drug = pd.merge_asof(
    person_cond_obsv_measu.sort_values('condition_start_datetime', kind='mergesort'),
    drug_df_wide.sort_values('drug_exposure_start_datetime', kind='mergesort'),
    left_on='condition_start_datetime',
    right_on='drug_exposure_start_datetime',
    left_by=['person_id'],
    right_by=['person_id'],
    allow_exact_matches=True,
    direction='backward',
    tolerance=condition_to_drug_window,
)

In [None]:
# Print the column summary of the table after the drug merge.
person_cond_obsv_measu_drug.info()

In [None]:
# Preview the key columns of the first 200 rows after the drug merge. Both merged frames had
# a standard_concept_name column, so pandas suffixed them: _x comes from the left
# (condition) side and _y from the right (drug) side.
person_cond_obsv_measu_drug.sort_values(by=['person_id', 'drug_exposure_start_datetime',
                          'condition_start_datetime']) [['person_id',
                                                         'measurement_datetime','observation_datetime','drug_class',
                                                         'condition_start_datetime','standard_concept_name_x',
                                                         'standard_concept_name_y','drug_exposure_start_datetime']].head(200)

In [None]:
# Spot-check the drug merge for one participant (person_id 1000151).
person_cond_obsv_measu_drug[person_cond_obsv_measu_drug['person_id']==1000151][['person_id','measurement_datetime',
                                            'observation_datetime','drug_class',
                                            'condition_start_datetime','standard_concept_name_x',
                                            'standard_concept_name_y','drug_exposure_start_datetime']].head(200)


In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = person_cond_obsv_measu_drug

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'asthma_exaceb_df.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## merge person_condition_observation_measurement_drug data to survey data

In [None]:
# Load the wide survey table saved by the cleaning section.
survey_df_wide = pd.read_csv('survey_df_wide.csv')

In [None]:
# Number of distinct person_id values in the wide survey table.
survey_df_wide.person_id.nunique()

In [None]:
# Smallest survey_datetime value. The column has not been converted with pd.to_datetime in
# this section, so this may be a string comparison -- TODO confirm.
survey_df_wide.survey_datetime.min()

In [None]:
# NOTE(review): person_co_dr_mr_svy is not assigned in any cell visible in this part of the
# notebook (no merge with survey_df_wide appears above); confirm that the cell producing it
# exists and runs before this one, otherwise this raises NameError on a fresh kernel.
person_co_dr_mr_svy.info()

In [None]:

# Spot-check the survey-merged table for one participant (person_id 1000109), including
# three asthma survey questions.
# NOTE(review): person_co_dr_mr_svy is not assigned in any cell visible in this part of the
# notebook -- confirm it is created before this cell runs.
person_co_dr_mr_svy[person_co_dr_mr_svy['person_id']==1000109][['person_id','measurement_datetime',
                                            'observation_datetime','survey_datetime',
                                            'condition_start_datetime','standard_concept_name_x','drug_class',
                                            'standard_concept_name_y','drug_exposure_start_datetime',
                                                                'Respiratory: Asthma Currently',
                                                               'Respiratory: How Old Were You Asthma',
                                                               'Respiratory: Rx Meds for Asthma']].head(200)

# saving to workspace in google bucket

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
# NOTE(review): person_co_dr_mr_svy_obs is not assigned in any cell visible in this part of
# the notebook -- confirm the cell that builds it exists and runs before this one.
my_dataframe = person_co_dr_mr_svy_obs

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'person_cond_drug_mearsu_svy_obs.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system hands back the command's status,
# and anything other than 0 means gsutil did not complete the copy
exit_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if exit_status != 0:
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (status {exit_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')
