In [1]:
import pickle
import os
import pandas as pd
from datetime import datetime
from collections import Counter
# import hail as hl
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

# Demography

In [2]:
bucket = os.getenv("WORKSPACE_BUCKET")

save_path = f'{bucket}/data/aou_demog.csv'
demog = pd.read_csv(save_path)

In [3]:
mlh1 = demog[demog['group']=='MLH1']
msh2 = demog[demog['group']=='MSH2']
msh6 = demog[demog['group']=='MSH6']
pms2 = demog[demog['group']=='PMS2']

print(len(mlh1))
print(len(msh2))
print(len(msh6))
print(len(pms2))

63
55
212
286


In [4]:
from google.cloud import storage
import pickle

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket and the path where your file is stored
bucket_name = 'fc-secure-9b1ab35f-6336-4ab5-aadc-2d39277e3d9b'
file_path = 'data/pat_with_med.pickle'

# Get the bucket
bucket = client.bucket(bucket_name)

# Create a Blob (file) object
blob = bucket.blob(file_path)

# Download the file's content as a bytes object
remove_serialized = blob.download_as_bytes()

# Deserialize the bytes object to get the original data
pat_concepts = pickle.loads(remove_serialized)

In [5]:
mlh1_pat = list(demog[demog['group']=='MLH1'].person_id)
msh2_pat = list(demog[demog['group']=='MSH2'].person_id)
msh6_pat = list(demog[demog['group']=='MSH6'].person_id)
pms2_pat = list(demog[demog['group']=='PMS2'].person_id)

In [6]:
mmr = list(set(mlh1_pat + msh2_pat + msh6_pat + pms2_pat))

In [7]:
len(mmr)

615

In [8]:
mmr_with_med_record = list(set(list(pat_concepts.keys()))&set(mmr))

In [9]:
len(mmr_with_med_record)

457

In [10]:
ctrl = list(demog[demog['group']=='control'].person_id)

In [11]:
len(ctrl)

217209

In [12]:
ctrl_with_med_record = list(set(list(pat_concepts.keys()))&set(ctrl))

In [13]:
len(ctrl_with_med_record)

162513

# LS carrier with family history

In [14]:
dataset_15423695_survey_sql = f"""
    SELECT *
    FROM `{os.environ["WORKSPACE_CDR"]}.ds_survey` answer
    WHERE answer.person_id IN ({', '.join(map(str, mmr_with_med_record))})
"""

survey = pd.read_gbq(
    dataset_15423695_survey_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

Downloading:   0%|          | 0/106098 [00:00<?, ?rows/s]

In [15]:
survey_grk = survey.groupby('survey')
survey_type = survey_grk.get_group('Personal and Family Health History')
ans = survey_type['answer'].unique()
qs = []
for a in ans:
    qs.append(a.split('-')[0].split(':')[0])
    
questions = list(set(qs))
pids = survey_type['person_id'].unique()

dataset_dict = {}
for pid in pids:
    dataset_dict[pid] = {}
    for q in questions:
        dataset_dict[pid][q] = []
        
grk = survey_type.groupby('person_id')
check = []
for pid in pids:
    pat_dat = grk.get_group(pid)
    pat_qa = pat_dat['answer']
    for qa in pat_qa:
        q = qa.split('-')[0].split(':')[0]
        a = qa.split('-')[-1].split(':')[-1]
        dataset_dict[pid][q].append(a)
        if len(dataset_dict[pid][q])>1:
            check.append((pid, qa)) # Why do some patients provide different answers for the same question

In [16]:
dataset_df = pd.DataFrame.from_dict(dataset_dict, orient='index')

In [17]:
survey_pat = list(survey_type.person_id.unique())

mlh1_survey,msh2_survey,msh6_survey,pms2_survey = 0, 0, 0, 0
for k in survey_pat:
    if k in mlh1_pat:
        mlh1_survey += 1
    if k in msh2_pat:
        msh2_survey += 1
    if k in msh6_pat:
        msh6_survey += 1
    if k in pms2_pat:
        pms2_survey += 1
        

print(mlh1_survey)
print(msh2_survey)
print(msh6_survey)
print(pms2_survey)

print(mlh1_survey/len(mlh1_pat))
print(msh2_survey/len(msh2_pat))
print(msh6_survey/len(msh6_pat))
print(pms2_survey/len(pms2_pat))

30
15
95
97
0.47619047619047616
0.2727272727272727
0.4481132075471698
0.33916083916083917


In [18]:
30+15+95+97

237

In [19]:
cancer_related_q = []
for i in dataset_df.columns:
    if 'cancer' in i:
        if 'family' in i:
            cancer_related_q.append(i)
cancer_related_q

['Including yourself, who in your family has had thyroid cancer? ',
 'Including yourself, who in your family has had blood or soft tissue cancer? ',
 'Including yourself, who in your family has had stomach cancer? ',
 'Including yourself, who in your family has had cervical cancer? ',
 'Including yourself, who in your family has had ovarian cancer? ',
 'Including yourself, who in your family has had esophageal cancer? ',
 'Including yourself, who in your family has had lung cancer? ',
 'Including yourself, who in your family has had head and neck cancer? (This includes cancers of the mouth, sinuses, nose, or throat. This does not include brain cancer.) ',
 'Including yourself, who in your family has had brain cancer? ',
 'Including yourself, who in your family has had bladder cancer? ',
 'Including yourself, who in your family has had other cancer(s)? ',
 'Including yourself, who in your family has had colon cancer/rectal cancer? ',
 'Including yourself, who in your family has had endo

In [21]:
idx = [2,4,9,11,12,15,17] # every run the order of list changes. Change idx to have a list of prostate, colon, endometrial, pancreatic, bladder, ovarian, and stomach cancer
LSq = [cancer_related_q[i] for i in idx]
LSq

['Including yourself, who in your family has had stomach cancer? ',
 'Including yourself, who in your family has had ovarian cancer? ',
 'Including yourself, who in your family has had bladder cancer? ',
 'Including yourself, who in your family has had colon cancer/rectal cancer? ',
 'Including yourself, who in your family has had endometrial cancer? ',
 'Including yourself, who in your family has had prostate cancer? ',
 'Including yourself, who in your family has had pancreatic cancer? ']

In [22]:
Q_res = {}
for Q in LSq:
    res = {}
    for i, row in dataset_df.iterrows():
        if len(row[Q])>0:
            res[i] = row[Q]
    Q_res[Q] = res

In [23]:
pat_survey = {}
for i in mmr_with_med_record:
    pat_survey[i] = []
for q in LSq:
    res = Q_res[q]
    for k,v in res.items():
        pat_survey[k].extend(v)

In [24]:
for k, v in pat_survey.items():
    unique_v = set(v)
    pat_survey[k] = [i for i in unique_v if i!=' Self']

In [25]:
mlh1_fh, msh2_fh, msh6_fh, pms2_fh = {},{},{},{}
for k,v in pat_survey.items():
    if len(v)>0:
        if k in mlh1_pat:
            mlh1_fh[k]=v
        if k in msh2_pat:
            msh2_fh[k]=v
        if k in msh6_pat:
            msh6_fh[k]=v
        if k in pms2_pat:
            pms2_fh[k]=v

In [26]:
# Participants reported family history of LS-associated cancers 
print(len(mlh1_fh))
print(len(msh2_fh))
print(len(msh6_fh))
print(len(pms2_fh))

17
6
51
38


In [27]:
17+6+51+38

112

In [32]:
def make2df(mydict,g):
    dat = pd.DataFrame(list(mydict.keys()))
    dat.columns = ['person_id']
    dat['status'] = g
    dat['fam'] = list(mydict.values())
    return dat

In [29]:
LS_fam = pd.concat([make2df(mlh1_fh, 'MLH1'),make2df(msh2_fh, 'MSH2'),make2df(msh6_fh, 'MSH6'),make2df(pms2_fh, 'PMS2')])

In [35]:
save_path = f'{bucket}/data/LS_fam.csv'
LS_fam.to_csv(save_path)

# nonLS carrier with family history

In [14]:
len(ctrl_with_med_record)

162513

In [15]:
#ctrl too large. Need to split the loading process
dataset_15423695_survey_sql = f"""
    SELECT *
    FROM `{os.environ["WORKSPACE_CDR"]}.ds_survey` answer
    WHERE answer.person_id IN ({', '.join(map(str, ctrl_with_med_record[0:80000]))})
"""

survey0 = pd.read_gbq(
    dataset_15423695_survey_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

Downloading:   0%|          | 0/18046814 [00:00<?, ?rows/s]

In [16]:
#ctrl too large. Need to split the loading process
dataset_15423695_survey_sql = f"""
    SELECT *
    FROM `{os.environ["WORKSPACE_CDR"]}.ds_survey` answer
    WHERE answer.person_id IN ({', '.join(map(str, ctrl_with_med_record[80000:]))})
"""

survey1 = pd.read_gbq(
    dataset_15423695_survey_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

Downloading:   0%|          | 0/18457317 [00:00<?, ?rows/s]

In [17]:
survey_ctrl = pd.concat([survey0, survey1])

In [18]:
survey_grk = survey_ctrl.groupby('survey')

In [19]:
survey_type = survey_grk.get_group('Personal and Family Health History')

In [20]:
ans = survey_type['answer'].unique()

In [21]:
qs = []
for a in ans:
    qs.append(a.split('-')[0].split(':')[0])
    
questions = list(set(qs))
pids = survey_type['person_id'].unique()

dataset_dict = {}
for pid in pids:
    dataset_dict[pid] = {}
    for q in questions:
        dataset_dict[pid][q] = []
        
grk = survey_type.groupby('person_id')
check = []
for pid in tqdm(pids):
    pat_dat = grk.get_group(pid)
    pat_qa = pat_dat['answer']
    for qa in pat_qa:
        q = qa.split('-')[0].split(':')[0]
        a = qa.split('-')[-1].split(':')[-1]
        dataset_dict[pid][q].append(a)
        if len(dataset_dict[pid][q])>1:
            check.append((pid, qa)) # Why do some patients provide different answers for the same question

100%|██████████| 81221/81221 [01:39<00:00, 813.33it/s] 


In [22]:
dataset_df = pd.DataFrame.from_dict(dataset_dict, orient='index')

In [23]:
survey_pat = list(survey_type.person_id.unique())
len(survey_pat)

81221

In [24]:
cancer_related_q = []
for i in dataset_df.columns:
    if 'cancer' in i:
        if 'family' in i:
            cancer_related_q.append(i)
cancer_related_q

['Including yourself, who in your family has had prostate cancer? ',
 'Including yourself, who in your family has had lung cancer? ',
 'Including yourself, who in your family has had cervical cancer? ',
 'Including yourself, who in your family has had pancreatic cancer? ',
 'Including yourself, who in your family has had skin cancer? ',
 'Including yourself, who in your family has had esophageal cancer? ',
 'Including yourself, who in your family has had bone cancer? ',
 'Including yourself, who in your family has had endocrine cancer? ',
 'Including yourself, who in your family has had stomach cancer? ',
 'Including yourself, who in your family has had ovarian cancer? ',
 'Including yourself, who in your family has had eye cancer? ',
 'Including yourself, who in your family has had bladder cancer? ',
 'Including yourself, who in your family has had kidney cancer? ',
 'Including yourself, who in your family has had blood or soft tissue cancer? ',
 'Including yourself, who in your famil

In [25]:
idx = [0,3,8,9,11,15,17] # every run the order of list changes. Change idx to have a list of prostate, colon, endometrial, pancreatic, bladder, ovarian, and stomach cancer
LSq = [cancer_related_q[i] for i in idx]
LSq

['Including yourself, who in your family has had prostate cancer? ',
 'Including yourself, who in your family has had pancreatic cancer? ',
 'Including yourself, who in your family has had stomach cancer? ',
 'Including yourself, who in your family has had ovarian cancer? ',
 'Including yourself, who in your family has had bladder cancer? ',
 'Including yourself, who in your family has had endometrial cancer? ',
 'Including yourself, who in your family has had colon cancer/rectal cancer? ']

In [26]:
Q_res = {}
for Q in LSq:
    res = {}
    for i, row in dataset_df.iterrows():
        if len(row[Q])>0:
            res[i] = row[Q]
    Q_res[Q] = res

In [27]:
pat_survey = {}
for i in ctrl_with_med_record:
    pat_survey[i] = []
for q in LSq:
    res = Q_res[q]
    for k,v in res.items():
        pat_survey[k].extend(v)

In [28]:
for k, v in pat_survey.items():
    unique_v = set(v)
    pat_survey[k] = [i for i in unique_v if i!=' Self']

In [29]:
ctrl_fh = {}
for k,v in pat_survey.items():
    if len(v)>0:
        ctrl_fh[k]=v

In [30]:
len(ctrl_fh)

27530

In [33]:
nonLS_fam = make2df(ctrl_fh, 'control')

In [34]:
bucket = os.getenv("WORKSPACE_BUCKET")

save_path = f'{bucket}/data/nonLS_fam.csv'
nonLS_fam.to_csv(save_path)