In [1]:
import os
import pandas as pd
from datetime import datetime
from collections import Counter
import hail as hl
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
import pickle

In [5]:
bucket = os.getenv("WORKSPACE_BUCKET")

save_path = f'{bucket}/data/aou_demog.csv'
grouped_demog = pd.read_csv(save_path)

In [6]:
patid = list(grouped_demog['person_id'].unique())

In [7]:
num_sublists = len(patid) // 1000 + (1 if len(patid) % 1000 else 0)

sublists = [patid[i * 1000: (i + 1) * 1000] for i in range(num_sublists)]

# collect conditions from EHR

In [8]:
dataset = os.getenv("WORKSPACE_CDR")

dfs = []
for lst in tqdm(sublists):
    concepts_sql = f"""
        SELECT co.person_id, co.condition_concept_id, c.concept_name, co.condition_start_date
        FROM `{dataset}.condition_occurrence` co
        JOIN `{dataset}.concept` c ON co.condition_concept_id = c.concept_id
        WHERE co.person_id IN ({', '.join(map(str, lst))})
    """

    concepts_df = pd.read_gbq(
        concepts_sql,
        dialect="standard",
        use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ))
    filter_df = concepts_df[['person_id','concept_name']]
    filter_df = filter_df[~filter_df.duplicated()]
    dfs.append(filter_df)

100%|██████████| 218/218 [33:02<00:00,  9.09s/it]


In [9]:
patlist = []
for d in dfs:
    patlist = list(set(patlist+list(d['person_id'].unique())))

In [10]:
len(patlist)

163002

# identify patients with cancer

In [11]:
patC = {}
for df in tqdm(dfs):
    df['concept_name'] = df['concept_name'].str.lower()
    for i, row in df.iterrows():
        if row['person_id'] not in list(patC.keys()):
            patC[row['person_id']]=[]
        if 'canc' in row['concept_name'] or 'malig' in row['concept_name'] or 'adenoc' in row['concept_name']  or 'tumo' in row['concept_name']:
            if row['concept_name'] not in patC[row['person_id']]:
                patC[row['person_id']].append(row['concept_name'])

100%|██████████| 218/218 [7:41:52<00:00, 127.12s/it]  


In [13]:
from google.cloud import storage
import pickle
import io

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket and the path where you want to save the file
bucket_name = 'fc-secure-9b1ab35f-6336-4ab5-aadc-2d39277e3d9b'
file_path = 'data/pat_with_med.pickle'

# Get the bucket
bucket = client.bucket(bucket_name)

# Create a Blob (file) object
blob = bucket.blob(file_path)

# Serialize your object to a bytes object using pickle
remove_serialized = pickle.dumps(patC)

# Upload the serialized object to the bucket
blob.upload_from_string(remove_serialized, content_type='application/octet-stream')

In [None]:
# from google.cloud import storage
# import pickle

# # Initialize the Google Cloud Storage client
# client = storage.Client()

# # Define the bucket and the path where your file is stored
# bucket_name = 'fc-secure-9b1ab35f-6336-4ab5-aadc-2d39277e3d9b'
# file_path = 'data/pat_with_med.pickle'

# # Get the bucket
# bucket = client.bucket(bucket_name)

# # Create a Blob (file) object
# blob = bucket.blob(file_path)

# # Download the file's content as a bytes object
# remove_serialized = blob.download_as_bytes()

# # Deserialize the bytes object to get the original data
# test = pickle.loads(remove_serialized)