In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
from tqdm import tqdm
import json
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import generate_drug_diag_code as gen_code

In [3]:
# get patients data
pt = pd.read_csv('..\..\data\include_pt_28apr24.csv')
pt_list = pt['person_id']
print(len(pt_list))

361529


In [6]:
with open('diagnosis_dict.json', 'r') as file:
    diagnosis_dict = json.load(file)
    file.close()

In [7]:
# Due to memory problem, we will query in batch
# We will query 10000 at a time

n_per_batch = 10000
n_batches = len(pt_list) // n_per_batch + 1

condition_df=None

for i in tqdm(range(n_batches)):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)

    sql_query = gen_code.create_combined_conditions_query(diagnosis_dict, pt_list[i*n_per_batch:(i+1)*n_per_batch])

    if condition_df is None:
        print("no exisiting dataframe, creating new one")
        condition_df = pd.read_sql(sql_query, conn)
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(condition_df))
    else:
        next_condition_df = pd.read_sql(sql_query, conn)
        
        condition_df = pd.concat([condition_df, next_condition_df])
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(condition_df))


  0%|          | 0/37 [00:00<?, ?it/s]

starting batch  1  of  37 subject  0  to  9999
no exisiting dataframe, creating new one


  3%|▎         | 1/37 [00:03<01:55,  3.21s/it]

finising batch  1  of  37 number of records 5932
starting batch  2  of  37 subject  10000  to  19999


  5%|▌         | 2/37 [00:06<02:01,  3.48s/it]

finising batch  2  of  37 number of records 11925
starting batch  3  of  37 subject  20000  to  29999


  8%|▊         | 3/37 [00:09<01:49,  3.22s/it]

finising batch  3  of  37 number of records 17837
starting batch  4  of  37 subject  30000  to  39999


 11%|█         | 4/37 [00:12<01:46,  3.21s/it]

finising batch  4  of  37 number of records 23584
starting batch  5  of  37 subject  40000  to  49999


 14%|█▎        | 5/37 [00:15<01:39,  3.10s/it]

finising batch  5  of  37 number of records 29511
starting batch  6  of  37 subject  50000  to  59999


 16%|█▌        | 6/37 [00:18<01:33,  3.02s/it]

finising batch  6  of  37 number of records 35520
starting batch  7  of  37 subject  60000  to  69999


 19%|█▉        | 7/37 [00:21<01:30,  3.03s/it]

finising batch  7  of  37 number of records 41283
starting batch  8  of  37 subject  70000  to  79999


 22%|██▏       | 8/37 [00:24<01:28,  3.05s/it]

finising batch  8  of  37 number of records 46867
starting batch  9  of  37 subject  80000  to  89999


 24%|██▍       | 9/37 [00:27<01:25,  3.04s/it]

finising batch  9  of  37 number of records 52532
starting batch  10  of  37 subject  90000  to  99999


 27%|██▋       | 10/37 [00:32<01:31,  3.39s/it]

finising batch  10  of  37 number of records 58120
starting batch  11  of  37 subject  100000  to  109999


 30%|██▉       | 11/37 [00:35<01:28,  3.39s/it]

finising batch  11  of  37 number of records 63774
starting batch  12  of  37 subject  110000  to  119999


 32%|███▏      | 12/37 [00:38<01:20,  3.23s/it]

finising batch  12  of  37 number of records 69440
starting batch  13  of  37 subject  120000  to  129999


 35%|███▌      | 13/37 [00:41<01:18,  3.29s/it]

finising batch  13  of  37 number of records 75006
starting batch  14  of  37 subject  130000  to  139999


 38%|███▊      | 14/37 [00:44<01:14,  3.22s/it]

finising batch  14  of  37 number of records 80490
starting batch  15  of  37 subject  140000  to  149999


 41%|████      | 15/37 [00:47<01:09,  3.16s/it]

finising batch  15  of  37 number of records 85985
starting batch  16  of  37 subject  150000  to  159999


 43%|████▎     | 16/37 [00:50<01:04,  3.05s/it]

finising batch  16  of  37 number of records 91314
starting batch  17  of  37 subject  160000  to  169999


 46%|████▌     | 17/37 [00:54<01:05,  3.30s/it]

finising batch  17  of  37 number of records 96577
starting batch  18  of  37 subject  170000  to  179999


 49%|████▊     | 18/37 [01:03<01:35,  5.04s/it]

finising batch  18  of  37 number of records 101732
starting batch  19  of  37 subject  180000  to  189999


 51%|█████▏    | 19/37 [01:10<01:41,  5.64s/it]

finising batch  19  of  37 number of records 106908
starting batch  20  of  37 subject  190000  to  199999


 54%|█████▍    | 20/37 [01:18<01:47,  6.32s/it]

finising batch  20  of  37 number of records 111928
starting batch  21  of  37 subject  200000  to  209999


 57%|█████▋    | 21/37 [01:25<01:41,  6.36s/it]

finising batch  21  of  37 number of records 116887
starting batch  22  of  37 subject  210000  to  219999


 59%|█████▉    | 22/37 [01:31<01:33,  6.26s/it]

finising batch  22  of  37 number of records 121765
starting batch  23  of  37 subject  220000  to  229999


 62%|██████▏   | 23/37 [01:36<01:24,  6.06s/it]

finising batch  23  of  37 number of records 126423
starting batch  24  of  37 subject  230000  to  239999


 65%|██████▍   | 24/37 [01:42<01:16,  5.87s/it]

finising batch  24  of  37 number of records 130702
starting batch  25  of  37 subject  240000  to  249999


 68%|██████▊   | 25/37 [01:46<01:06,  5.56s/it]

finising batch  25  of  37 number of records 134925
starting batch  26  of  37 subject  250000  to  259999


 70%|███████   | 26/37 [01:51<00:56,  5.13s/it]

finising batch  26  of  37 number of records 139179
starting batch  27  of  37 subject  260000  to  269999


 73%|███████▎  | 27/37 [01:54<00:45,  4.54s/it]

finising batch  27  of  37 number of records 143380
starting batch  28  of  37 subject  270000  to  279999


 76%|███████▌  | 28/37 [01:57<00:36,  4.11s/it]

finising batch  28  of  37 number of records 147502
starting batch  29  of  37 subject  280000  to  289999


 78%|███████▊  | 29/37 [02:01<00:32,  4.04s/it]

finising batch  29  of  37 number of records 151683
starting batch  30  of  37 subject  290000  to  299999


 81%|████████  | 30/37 [02:05<00:28,  4.10s/it]

finising batch  30  of  37 number of records 155810
starting batch  31  of  37 subject  300000  to  309999


 84%|████████▍ | 31/37 [02:08<00:23,  3.84s/it]

finising batch  31  of  37 number of records 159911
starting batch  32  of  37 subject  310000  to  319999


 86%|████████▋ | 32/37 [02:12<00:19,  3.95s/it]

finising batch  32  of  37 number of records 163928
starting batch  33  of  37 subject  320000  to  329999


 89%|████████▉ | 33/37 [02:17<00:16,  4.23s/it]

finising batch  33  of  37 number of records 167888
starting batch  34  of  37 subject  330000  to  339999


 92%|█████████▏| 34/37 [02:20<00:11,  3.67s/it]

finising batch  34  of  37 number of records 172060
starting batch  35  of  37 subject  340000  to  349999


 95%|█████████▍| 35/37 [02:22<00:06,  3.25s/it]

finising batch  35  of  37 number of records 176004
starting batch  36  of  37 subject  350000  to  359999


 97%|█████████▋| 36/37 [02:24<00:02,  2.81s/it]

finising batch  36  of  37 number of records 179890
starting batch  37  of  37 subject  360000  to  369999


100%|██████████| 37/37 [02:25<00:00,  3.94s/it]

finising batch  37  of  37 number of records 180469





In [9]:
condition_df.head()

Unnamed: 0,first_dm_date,first_acne_date,first_tachyarrythmia_date,first_palpitation_date,first_af_date,first_bph_date,first_cirrhosis_date,first_hairloss_date,first_hf_date,first_pregnancy_hypertension_date,first_hyperthyroid_date,first_migraine_date,first_ckd_date,first_tremor_date,person_id
0,2006-08-05,,,,,,,2007-04-25,,,,,,,378271
1,2009-10-21,,,,,,,2013-06-07,,,,,,,378807
2,2006-09-07,,,,,,,,,,,,,,379109
3,2003-04-03,,,,,,,,,,,,,,379209
4,2015-05-12,,2010-08-22,,,,,,2007-03-07,,,,2007-03-07,,379243


In [4]:
# ("2858","1495"),"Y",null)) as [K-sparing diuretics]
# ("948","3276"),"Y",null)) as [Calcium Channel Blocker (non-Dihydropyridine)]
# ("236","368","369","1116","2004","2113","2586","3418","2586"),"Y",null)) as [Beta Blocker]
# ("1017","2531","3026"),"Y",null)) as [Alpha Blocker]
# ("496","1069","1538","1778","2631","2652","2654","3371","3372","3373","3718"),"Y",null)) as [ACEI]
# ("1338","3148"),"Y",null)) as [Loop diuretics]
# ("1975","721"),"Y",null)) as [Alpha II Agonist]
# ("488","489","1645","1646","1802","1803","2225","3009","3010","3263","3264","3265","3365","3405","3406","3408","3423","3478","3524","3550","3560"),"Y",null)) as [ARB]

with open('drug_dict.json', 'r') as file:
    drug_dict = json.load(file)
    file.close()

In [5]:
n_per_batch = 10000
n_batches = len(pt_list) // n_per_batch + 1

drug_df=None

for i in tqdm(range(n_batches)):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)

    sql_query = gen_code.create_combined_drugs_query(drug_dict, pt_list[i*n_per_batch:(i+1)*n_per_batch])

    if drug_df is None:
        print("no exisiting dataframe, creating new one")
        drug_df = pd.read_sql(sql_query, conn)
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(drug_df))
    else:
        next_drug_df = pd.read_sql(sql_query, conn)
        
        drug_df = pd.concat([drug_df, next_drug_df])
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(drug_df))

  0%|          | 0/37 [00:00<?, ?it/s]

starting batch  1  of  37 subject  0  to  9999
no exisiting dataframe, creating new one


  3%|▎         | 1/37 [00:27<16:15, 27.09s/it]

finising batch  1  of  37 number of records 7049
starting batch  2  of  37 subject  10000  to  19999


  5%|▌         | 2/37 [00:38<10:35, 18.15s/it]

finising batch  2  of  37 number of records 14185
starting batch  3  of  37 subject  20000  to  29999


  8%|▊         | 3/37 [00:49<08:13, 14.50s/it]

finising batch  3  of  37 number of records 21287
starting batch  4  of  37 subject  30000  to  39999


 11%|█         | 4/37 [00:55<06:15, 11.38s/it]

finising batch  4  of  37 number of records 28311
starting batch  5  of  37 subject  40000  to  49999


 14%|█▎        | 5/37 [01:03<05:23, 10.10s/it]

finising batch  5  of  37 number of records 35461
starting batch  6  of  37 subject  50000  to  59999


 16%|█▌        | 6/37 [01:08<04:21,  8.44s/it]

finising batch  6  of  37 number of records 42618
starting batch  7  of  37 subject  60000  to  69999


 19%|█▉        | 7/37 [01:15<03:55,  7.85s/it]

finising batch  7  of  37 number of records 49603
starting batch  8  of  37 subject  70000  to  79999


 22%|██▏       | 8/37 [01:20<03:23,  7.00s/it]

finising batch  8  of  37 number of records 56512
starting batch  9  of  37 subject  80000  to  89999


 24%|██▍       | 9/37 [01:25<02:55,  6.28s/it]

finising batch  9  of  37 number of records 63461
starting batch  10  of  37 subject  90000  to  99999


 27%|██▋       | 10/37 [01:30<02:38,  5.88s/it]

finising batch  10  of  37 number of records 70338
starting batch  11  of  37 subject  100000  to  109999


 30%|██▉       | 11/37 [01:35<02:30,  5.78s/it]

finising batch  11  of  37 number of records 77304
starting batch  12  of  37 subject  110000  to  119999


 32%|███▏      | 12/37 [01:41<02:21,  5.67s/it]

finising batch  12  of  37 number of records 84268
starting batch  13  of  37 subject  120000  to  129999


 35%|███▌      | 13/37 [01:45<02:06,  5.29s/it]

finising batch  13  of  37 number of records 91115
starting batch  14  of  37 subject  130000  to  139999


 38%|███▊      | 14/37 [01:50<01:55,  5.00s/it]

finising batch  14  of  37 number of records 98052
starting batch  15  of  37 subject  140000  to  149999


 41%|████      | 15/37 [01:55<01:56,  5.28s/it]

finising batch  15  of  37 number of records 104913
starting batch  16  of  37 subject  150000  to  159999


 43%|████▎     | 16/37 [02:00<01:49,  5.20s/it]

finising batch  16  of  37 number of records 111457
starting batch  17  of  37 subject  160000  to  169999


 46%|████▌     | 17/37 [02:06<01:47,  5.36s/it]

finising batch  17  of  37 number of records 118108
starting batch  18  of  37 subject  170000  to  179999


 49%|████▊     | 18/37 [02:29<03:21, 10.63s/it]

finising batch  18  of  37 number of records 124712
starting batch  19  of  37 subject  180000  to  189999


 51%|█████▏    | 19/37 [02:37<02:57,  9.85s/it]

finising batch  19  of  37 number of records 131141
starting batch  20  of  37 subject  190000  to  199999


 54%|█████▍    | 20/37 [02:44<02:32,  8.96s/it]

finising batch  20  of  37 number of records 137568
starting batch  21  of  37 subject  200000  to  209999


 57%|█████▋    | 21/37 [02:50<02:09,  8.12s/it]

finising batch  21  of  37 number of records 143899
starting batch  22  of  37 subject  210000  to  219999


 59%|█████▉    | 22/37 [02:55<01:49,  7.28s/it]

finising batch  22  of  37 number of records 150201
starting batch  23  of  37 subject  220000  to  229999


 62%|██████▏   | 23/37 [03:02<01:37,  6.94s/it]

finising batch  23  of  37 number of records 156166
starting batch  24  of  37 subject  230000  to  239999


 65%|██████▍   | 24/37 [03:09<01:31,  7.07s/it]

finising batch  24  of  37 number of records 161535
starting batch  25  of  37 subject  240000  to  249999


 68%|██████▊   | 25/37 [03:15<01:22,  6.87s/it]

finising batch  25  of  37 number of records 166805
starting batch  26  of  37 subject  250000  to  259999


 70%|███████   | 26/37 [03:23<01:18,  7.17s/it]

finising batch  26  of  37 number of records 171983
starting batch  27  of  37 subject  260000  to  269999


 73%|███████▎  | 27/37 [03:31<01:13,  7.35s/it]

finising batch  27  of  37 number of records 177125
starting batch  28  of  37 subject  270000  to  279999


 76%|███████▌  | 28/37 [03:39<01:08,  7.59s/it]

finising batch  28  of  37 number of records 182084
starting batch  29  of  37 subject  280000  to  289999


 78%|███████▊  | 29/37 [03:46<00:59,  7.47s/it]

finising batch  29  of  37 number of records 187158
starting batch  30  of  37 subject  290000  to  299999


 81%|████████  | 30/37 [03:54<00:53,  7.64s/it]

finising batch  30  of  37 number of records 192014
starting batch  31  of  37 subject  300000  to  309999


 84%|████████▍ | 31/37 [04:02<00:46,  7.73s/it]

finising batch  31  of  37 number of records 196832
starting batch  32  of  37 subject  310000  to  319999


 86%|████████▋ | 32/37 [04:12<00:41,  8.34s/it]

finising batch  32  of  37 number of records 201487
starting batch  33  of  37 subject  320000  to  329999


 89%|████████▉ | 33/37 [04:21<00:34,  8.61s/it]

finising batch  33  of  37 number of records 206065
starting batch  34  of  37 subject  330000  to  339999


 92%|█████████▏| 34/37 [04:29<00:24,  8.23s/it]

finising batch  34  of  37 number of records 210844
starting batch  35  of  37 subject  340000  to  349999


 95%|█████████▍| 35/37 [04:36<00:15,  7.95s/it]

finising batch  35  of  37 number of records 215073
starting batch  36  of  37 subject  350000  to  359999


 97%|█████████▋| 36/37 [04:43<00:07,  7.74s/it]

finising batch  36  of  37 number of records 219175
starting batch  37  of  37 subject  360000  to  369999


100%|██████████| 37/37 [04:47<00:00,  7.78s/it]

finising batch  37  of  37 number of records 219779





In [10]:
drug_df.head()

Unnamed: 0,first_sglt2_date,first_glp_date,first_k_sparing_date,first_nonDHP_date,first_bb_date,first_alpha_date,first_acei_date,first_loop_date,first_alpha2_date,first_arb_date,person_id
0,,,,2018-07-09,2013-07-12,,,,,2020-03-06,378472
1,,,,,2017-02-27,,,,,,379301
2,,,,,,,2013-05-21,,,,380087
3,,,,,,,,2017-04-19,,2017-04-19,380837
4,,,,,2013-10-10,,2014-09-11,,,,382845


In [11]:
print("number of records in condition_df", len(condition_df))
print("number of records in drug_df", len(drug_df))

number of records in condition_df 180469
number of records in drug_df 219779


In [15]:
pt_df = pt[['person_id', 'age_at_first_drug', 'age_at_first_diag', 'criteria']]

In [17]:
pt_df = pd.merge(pt_df, drug_df, on='person_id', how='left')
pt_df = pd.merge(pt_df, condition_df, on='person_id', how='left')

In [53]:
def create_exclusion_criteria(df):
    new_df = df[['person_id']]

    new_df['hyperthyroid_bb'] = (~df['first_hyperthyroid_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
    new_df['af_bb'] = (~df['first_af_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
    
    new_df['hf_k_sparing'] = (~df['first_hf_date'].isna() & ~df['first_k_sparing_date'].isna()).astype(int)
    new_df['acne_k_sparing'] = (~df['first_acne_date'].isna() & ~df['first_acne_date'].isna()).astype(int)
    new_df['hair_k_sparing'] = (~df['first_hairloss_date'].isna() & ~df['first_k_sparing_date'].isna()).astype(int)
    new_df['cirr_k_sparing'] = (~df['first_cirrhosis_date'].isna() & ~df['first_k_sparing_date'].isna()).astype(int)

    new_df['hf_loop'] = (~df['first_hf_date'].isna() & ~df['first_loop_date'].isna()).astype(int)

    new_df['bph_alpha'] = (~df['first_bph_date'].isna() & ~df['first_alpha_date'].isna()).astype(int)

    new_df['hf_acei'] = (~df['first_hf_date'].isna() & ~df['first_acei_date'].isna()).astype(int)
    new_df['hf_arb'] = (~df['first_hf_date'].isna() & ~df['first_arb_date'].isna()).astype(int)
    new_df['arrythmia_nondhp'] = (~df['first_tachyarrythmia_date'].isna() & ~df['first_nonDHP_date'].isna()).astype(int)
    
    new_df['preg_alpha2'] = (~df['first_pregnancy_hypertension_date'].isna() & ~df['first_alpha2_date'].isna()).astype(int)
    
    # new_df['migraine_bb'] = (~df['first_migraine_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
    # new_df['cirr_bb'] = (~df['first_cirrhosis_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
    # new_df['ckd_acei'] = (~df['first_ckd_date'].isna() & ~df['first_acei_date'].isna()).astype(int)
    # new_df['ckd_arb'] = (~df['first_ckd_date'].isna() & ~df['first_arb_date'].isna()).astype(int)
    
    # new_df['et_bb'] = (~df['first_tremor_date'].isna() & ~df['first_arb_date'].isna()).astype(int)
    new_df['summary'] = new_df.drop('person_id', axis=1).sum(axis=1)

    return new_df

In [54]:
drug_only = pt_df[pt_df['criteria'] == 'drug']

In [55]:
exclusion_df = create_exclusion_criteria(drug_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['hyperthyroid_bb'] = (~df['first_hyperthyroid_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['af_bb'] = (~df['first_af_date'].isna() & ~df['first_bb_date'].isna()).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_d

In [56]:
exclusion_df

Unnamed: 0,person_id,hyperthyroid_bb,af_bb,hf_k_sparing,acne_k_sparing,hair_k_sparing,cirr_k_sparing,hf_loop,bph_alpha,hf_acei,hf_arb,arrythmia_nondhp,preg_alpha2,summary
3,377927,0,0,0,0,0,0,1,0,0,0,0,0,1
18,378360,0,0,0,1,0,0,0,0,0,0,0,0,1
19,378368,0,0,0,0,0,0,1,0,1,0,0,0,2
43,378869,0,0,0,0,0,0,0,0,0,0,0,0,0
77,379267,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361524,4833866,0,0,0,0,0,0,0,0,0,0,0,0,0
361525,4834098,0,0,0,0,0,0,0,0,0,0,0,0,0
361526,4834464,0,0,0,0,0,0,0,0,0,0,0,0,0
361527,4835063,0,0,0,0,0,0,0,0,0,0,0,0,0


In [57]:
exclusion_df.to_csv('..\..\data\exclusion_drug_only_28apr24_edited.csv', index=False)

In [58]:
exclusion_df[exclusion_df['summary'] >=1]

Unnamed: 0,person_id,hyperthyroid_bb,af_bb,hf_k_sparing,acne_k_sparing,hair_k_sparing,cirr_k_sparing,hf_loop,bph_alpha,hf_acei,hf_arb,arrythmia_nondhp,preg_alpha2,summary
3,377927,0,0,0,0,0,0,1,0,0,0,0,0,1
18,378360,0,0,0,1,0,0,0,0,0,0,0,0,1
19,378368,0,0,0,0,0,0,1,0,1,0,0,0,2
80,379301,0,0,0,1,0,0,0,0,0,0,0,0,1
89,379481,0,0,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361396,4810105,0,0,0,0,0,0,1,0,0,0,0,0,1
361440,4811154,0,0,0,0,0,1,0,0,0,0,0,0,1
361493,4812101,0,0,0,0,0,0,1,0,0,0,0,0,1
361506,4812316,0,0,0,0,0,0,1,0,0,0,0,0,1
