# Running queries on the source data

## Install required packages

Before executing the cells below, run the following command in a terminal session to create a conda environment (`omop-source`) containing the required R packages:

`conda create -n omop-source r-glue r-tidyverse r-data.table r-dbi  r-rpostgres r-irkernel -y`

### Connect to the SQL database

In [None]:
library(tidyverse)
library(data.table)
library(DBI)
library(RPostgres)

# Connection settings for the PostgreSQL server.
# Replace every "<...>" placeholder with the value for your environment
# before running this cell. Each assignment is a complete statement, so an
# unfilled setting can no longer silently pick up the value of the next line.
DBNAME <- "<database name>"
HOST <- "<host>"
PORT <- "<port>"
PASSWORD <- "<password>"
USER <- 'jupyter_notebook'

# Fail early, with a clear message, if any placeholder was left unfilled.
# Wrapped in local() so the temporary copies of the settings are discarded.
local({
  settings <- c(DBNAME = DBNAME, HOST = HOST, PORT = PORT, PASSWORD = PASSWORD)
  unfilled <- names(settings)[grepl("^<.*>$", settings)]
  if (length(unfilled) > 0) {
    stop(
      "Fill in the connection settings before connecting: ",
      paste(unfilled, collapse = ", "),
      call. = FALSE
    )
  }
})

# Open the database connection. The handle is named `connectionection`
# because the query cells in this notebook refer to it by that name.
connectionection <- DBI::dbConnect(
    RPostgres::Postgres(),
    dbname = DBNAME,
    host = HOST,
    port = PORT,
    password = PASSWORD,
    user = USER,
    # Sets the default schema. Other schemas (for example omop_data) can be
    # queried by specifying the schema in the query.
    options = "-c search_path=source_data_100kv13_covidv4"
)

In [None]:
# List every schema visible on the server.
# Uses `connectionection`, the handle created in the connection cell; the
# previously referenced name `connection` is never defined in this notebook.
all_schemas <- DBI::dbGetQuery(
  connectionection,
  "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA"
)
all_schemas

### List all tables in the source_data_100kv13_covidv4 schema

In [None]:
# Look up every table registered under the source_data_100kv13_covidv4 schema
# in the database catalogue and return schema/table name pairs.
sql <- "
SELECT table_schema, table_name 
FROM information_schema.tables 
WHERE table_schema = 'source_data_100kv13_covidv4'
"

DBI::dbGetQuery(conn = connectionection, statement = sql)

### Query 1: Normalised Rare Disease Terms Cohort 

In [None]:
# Query 1: distinct (participant, disease) pairs for participants recorded
# with any of the four listed normalised specific disease terms.
sql <- "
SELECT DISTINCT participant_id, normalised_specific_disease
FROM rare_diseases_participant_disease
WHERE normalised_specific_disease IN (
    'Familial pulmonary fibrosis','Familial primary spontaneous pneumothorax',
    'Familial and multiple pulmonary arteriovenous malformations','Hereditary haemorrhagic telangiectasia'
)
"

# The statement that is actually sent to the database.
sql_to_run <- sql

DBI::dbGetQuery(conn = connectionection, statement = sql_to_run)

### Query 2: Normalised Rare Disease Terms and Participant Phenotypic Sex Cohort 

In [None]:
# Query 2: participants with any of the listed normalised disease terms,
# restricted to those whose phenotypic sex is recorded as 'Female'.
#
# INNER JOIN is used because the WHERE clause filters on a `participant`
# column: rows without a matching participant record could never pass that
# filter, so an outer join would return exactly the same rows while
# suggesting otherwise.
sql <- "
SELECT DISTINCT rd.participant_id, rd.normalised_specific_disease, par.participant_phenotypic_sex
FROM rare_diseases_participant_disease AS rd
INNER JOIN participant AS par
    ON rd.participant_id = par.participant_id
WHERE rd.normalised_specific_disease IN (
    'Familial pulmonary fibrosis','Familial primary spontaneous pneumothorax',
    'Familial and multiple pulmonary arteriovenous malformations','Hereditary haemorrhagic telangiectasia'
)
AND par.participant_phenotypic_sex IN ('Female')
"

# The statement that is actually sent to the database.
sql_to_run <- sql

dbGetQuery(connectionection, sql_to_run)

### Query 3: Normalised Rare Disease Terms, Participant Phenotypic Sex, Age of Onset, and Predicted Ancestry Cohort

In [None]:
# Query 3: extends Query 2 with two further filters: normalised age of onset
# of at least 50 and a predicted European ancestry value of at least 0.95.
#
# Both joins are INNER JOINs because the WHERE clause filters on columns of
# `participant` and `aggregate_gvcf_sample_stats`: rows lacking a match in
# either table could never satisfy those filters, so outer joins would return
# exactly the same rows.
sql <- "
SELECT DISTINCT rd.participant_id, rd.normalised_specific_disease, par.participant_phenotypic_sex, rd.normalised_age_of_onset,
                agg.pred_european_ancestries
FROM rare_diseases_participant_disease AS rd
INNER JOIN participant AS par
    ON rd.participant_id = par.participant_id
INNER JOIN aggregate_gvcf_sample_stats AS agg
    ON rd.participant_id = agg.participant_id
WHERE rd.normalised_specific_disease IN (
    'Familial pulmonary fibrosis','Familial primary spontaneous pneumothorax',
    'Familial and multiple pulmonary arteriovenous malformations','Hereditary haemorrhagic telangiectasia'
)
AND par.participant_phenotypic_sex IN ('Female')
AND rd.normalised_age_of_onset >= 50
AND agg.pred_european_ancestries >= 0.95
"

# The statement that is actually sent to the database.
sql_to_run <- sql

dbGetQuery(connectionection, sql_to_run)

### Query 4: HPO terms cohort

In [None]:
# Query 4: distinct participants that have at least one of the three listed
# HPO term ids recorded with hpo_present = 'Yes'.
sql <- "
SELECT DISTINCT participant_id
FROM rare_diseases_participant_phenotype
WHERE normalised_hpo_id IN ('HP:0002206','HP:0006530','HP:0002094')
AND hpo_present IN ('Yes')
"

# The statement that is actually sent to the database.
sql_to_run <- sql

DBI::dbGetQuery(conn = connectionection, statement = sql_to_run)

### Query 5: ICD10 terms cohort

In [None]:
# Query 5: distinct participants in hes_apc whose first, second or third
# diagnosis field holds the code 'J841'.
# The LIKE patterns contain no % or _ wildcards, so each one matches only the
# exact string 'J841'.
sql <- "
SELECT DISTINCT participant_id
FROM hes_apc
WHERE diag01 LIKE 'J841'
OR diag02 LIKE 'J841'
OR diag03 LIKE 'J841'
"

# The statement that is actually sent to the database.
sql_to_run <- sql

DBI::dbGetQuery(conn = connectionection, statement = sql_to_run)

### Query 6: ICD10 terms cohort - Part 2

In [None]:
# Query 6: distinct participants with the code 'J841' in any of the first
# three diagnosis fields of either hes_apc or hes_op.
#
# The two tables are combined with UNION rather than joined:
#   * a LEFT JOIN starting from hes_apc silently drops participants that
#     appear only in hes_op, even when their hes_op rows match;
#   * joining on participant_id alone pairs every hes_apc row of a participant
#     with every one of their hes_op rows before filtering.
# UNION already removes duplicate participant ids, so DISTINCT is not needed.
sql <- "
SELECT participant_id
FROM hes_apc
WHERE diag01 IN ('J841')
OR diag02 IN ('J841')
OR diag03 IN ('J841')
UNION
SELECT participant_id
FROM hes_op
WHERE diag01 IN ('J841')
OR diag02 IN ('J841')
OR diag03 IN ('J841')
"

# The statement that is actually sent to the database.
sql_to_run <- sql

dbGetQuery(connectionection, sql_to_run)

### Query 7: Cancer type and status

In [None]:
# Query 7: distinct participants with disease_type 'BREAST' in cancer_analysis
# and an av_tumour row whose er_status is 'P'.
#
# INNER JOIN is used because the WHERE clause filters on an `av_tumour`
# column: rows without a matching tumour record could never pass that filter,
# so an outer join would return exactly the same rows.
#
# NOTE(review): the tables are joined on participant_id only, so for a
# participant with several av_tumour rows the er_status is not tied to a
# particular cancer_analysis row. Confirm this is the intended matching.
sql <- "
SELECT DISTINCT ca.participant_id
FROM cancer_analysis AS ca
INNER JOIN av_tumour AS av
    ON ca.participant_id = av.participant_id
WHERE ca.disease_type IN ('BREAST')
AND av.er_status = 'P'
"

# The statement that is actually sent to the database.
sql_to_run <- sql

dbGetQuery(connectionection, sql_to_run)

### Query 8: Cancer treatment (SACT analysis group) cohort - Part 1

In [None]:
# Query 8: distinct participants in the sact table whose analysis_group is
# either 'NIVOLUMAB' or 'PEMBROLIZUMAB'.
sql <- "
SELECT DISTINCT participant_id
FROM sact
WHERE analysis_group IN ('NIVOLUMAB','PEMBROLIZUMAB')
"

# The statement that is actually sent to the database.
sql_to_run <- sql

DBI::dbGetQuery(conn = connectionection, statement = sql_to_run)

### Query 9: Cancer treatment (SACT analysis group) and cancer type cohort - Part 2

In [None]:
# Query 9: distinct participants in the sact table whose analysis_group is
# 'NIVOLUMAB' or 'PEMBROLIZUMAB' and who have a cancer_analysis row with
# disease_type 'BLADDER'.
#
# INNER JOIN is used because the WHERE clause filters on a `cancer_analysis`
# column: rows without a matching cancer_analysis record could never pass
# that filter, so an outer join would return exactly the same rows.
sql <- "
SELECT DISTINCT sa.participant_id
FROM sact AS sa
INNER JOIN cancer_analysis AS ca
    ON sa.participant_id = ca.participant_id
WHERE sa.analysis_group IN ('NIVOLUMAB','PEMBROLIZUMAB')
AND ca.disease_type = 'BLADDER'
"

# The statement that is actually sent to the database.
sql_to_run <- sql

dbGetQuery(connectionection, sql_to_run)