In [1]:
import sys
sys.path.insert(1, '../')
from utils.packages import *

  from .autonotebook import tqdm as notebook_tqdm


## Synthea

In [29]:
global_params = {
    'max_seq_len': 64,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 10,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda' #change this to run on cuda #'cuda:0'
}

dataset_name = 'Synthea/Final_cohorts/'

feature_types = {'diagnosis':True, 'medications':True, 'procedures':True}

if (feature_types['diagnosis'] and feature_types['medications'] and not (feature_types['procedures'])):
    print("Do only use diagnosis")
    code_voc = 'MLM_diagnosmedcodes.npy'
    age_voc = 'MLM_age.npy'

elif (feature_types['diagnosis'] and not feature_types['medications']):
    code_voc = 'MLM_diagnoscodes.npy'
    age_voc = 'MLM_age.npy'

else:
    code_voc = 'MLM_diagnosproccodes.npy'
    age_voc = 'MLM_age.npy'

files = {'code':'../data/vocabularies/' + dataset_name + code_voc,
             'age':'../data/vocabularies/' + dataset_name + age_voc,
        }

tokenizer = EHRTokenizer(task='MLM', filenames=files)

model_config = {
        'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
        'hidden_size': 288, #tune.choice([100, 150, 288]), #288, # word embedding and seg embedding hidden size
        'seg_vocab_size': 2, # number of vocab for seg embedding
        'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
        'gender_vocab_size': 3,
        'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
        'hidden_dropout_prob': 0.1, # dropout rate
        'num_hidden_layers': 6, #4, # number of multi-head attention layers required
        'num_attention_heads': 12, # number of attention heads
        'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
        'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
        'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
        'initializer_range': 0.02, # parameter weight initializer range
        'use_prior':False,
        'reg':0.1,
        'age':True,
        'gender':False,
        'epochs':20,
    }

In [30]:
stats_path = '../data/train_stats/Synthea/'
condfiles = {'dd':stats_path + 'dd_cond_probs.empirical.p', 
             'dp':stats_path + 'dp_cond_probs.empirical.p',
             'dm':stats_path + 'dm_cond_probs.empirical.p',
             'pp':stats_path + 'pp_cond_probs.empirical.p', 
             'pd':stats_path + 'pd_cond_probs.empirical.p',
             'pm':stats_path + 'pd_cond_probs.empirical.p',
             'mm':stats_path + 'mm_cond_probs.empirical.p', 
             'md':stats_path + 'md_cond_probs.empirical.p',
             'mp':stats_path + 'mp_cond_probs.empirical.p',
            }

In [31]:
folderpath = '../data/pytorch_datasets/' + dataset_name
path='../data/datasets/' + dataset_name
train = pd.read_parquet(path + 'train.parquet')
traind = EHRDataset(train, max_len=train_params['max_len_seq'], feature_types=feature_types, conditional_files=condfiles, save_folder=folderpath, tokenizer=tokenizer, run_type='vizontology_dmp')

Use diagnosis: True
Use medications: True
Use procedures: True
Transforming data
Saving data


In [36]:
trainloader = torch.utils.data.DataLoader(traind, batch_size=1, shuffle=True)

In [33]:
dm_path = condfiles['dm']

In [59]:
md_path = condfiles['md']

In [66]:
mp_path = condfiles['mp']

In [70]:
dd_path= condfiles['dd']

In [71]:
dm = pickle.load(open(dm_path, 'rb'))
md = pickle.load(open(md_path, 'rb'))
mp = pickle.load(open(mp_path, 'rb'))
dd = pickle.load(open(dd_path, 'rb'))

In [47]:
patient= next(iter(trainloader))

In [48]:
age, gender, code, position, segment, mask, label, prior_guide, tokens = patient

In [49]:
sentence = tokenizer.convert_ids_to_tokens(tokens.numpy().squeeze(), 'code')

In [50]:
firstvist = []

for token in sentence:
    if token == '[CLS]':
        continue
    
    if token == '[SEP]':
        break
    firstvist.append(token)

In [51]:
firstvist

['59621000',
 '224299000',
 '160904001',
 '423315002',
 '73595000',
 '314076',
 '428211000124100',
 '710824005',
 '454711000124102',
 '763302001',
 '171207006']

In [65]:
dm['0, 423315002,314076']

0.6458539615758532

In [69]:
mp['0, 314076,763302001']

0.11168784033525

In [52]:
for key in dm:
    print(key)
    break

0, 44054006,308136


Codes:

59621000: Essential hypertension

224299000: Received higher education

160904001:  Finding of job details 

423315002: Limited social contact

73595000: Stress 

314076: lisinopril 10 MG Oral Tablet

428211000124100: Assessment of substance use

710824005: Assessment of health and social care needs

454711000124102: Depression screening 

763302001: Alcohol Use Disorders Identification Test

171207006: Depression screening 





## MIMIC

In [2]:
global_params = {
    'max_seq_len': 64,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 10,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda' #change this to run on cuda #'cuda:0'
}

dataset_name = 'MIMIC/'

feature_types = {'diagnosis':True, 'medications':True, 'procedures':True}

if (feature_types['diagnosis'] and feature_types['medications'] and not (feature_types['procedures'])):
    print("Do only use diagnosis")
    code_voc = 'MLM_diagnosmedcodes.npy'
    age_voc = 'MLM_age.npy'

elif (feature_types['diagnosis'] and not feature_types['medications']):
    code_voc = 'MLM_diagnoscodes.npy'
    age_voc = 'MLM_age.npy'

else:
    code_voc = 'MLM_diagnosproccodes.npy'
    age_voc = 'MLM_age.npy'

files = {'code':'../data/vocabularies/' + dataset_name + code_voc,
             'age':'../data/vocabularies/' + dataset_name + age_voc,
        }

tokenizer = EHRTokenizer(task='MLM', filenames=files)

model_config = {
        'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
        'hidden_size': 288, #tune.choice([100, 150, 288]), #288, # word embedding and seg embedding hidden size
        'seg_vocab_size': 2, # number of vocab for seg embedding
        'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
        'gender_vocab_size': 3,
        'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
        'hidden_dropout_prob': 0.1, # dropout rate
        'num_hidden_layers': 6, #4, # number of multi-head attention layers required
        'num_attention_heads': 12, # number of attention heads
        'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
        'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
        'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
        'initializer_range': 0.02, # parameter weight initializer range
        'use_prior':False,
        'reg':0.1,
        'age':True,
        'gender':False,
        'epochs':20,
    }

In [3]:
stats_path = '../data/train_stats/MIMIC2/'
condfiles = {'dd':stats_path + 'dd_cond_probs.empirical.p', 
             'dp':stats_path + 'dp_cond_probs.empirical.p',
             'dm':stats_path + 'dm_cond_probs.empirical.p',
             'pp':stats_path + 'pp_cond_probs.empirical.p', 
             'pd':stats_path + 'pd_cond_probs.empirical.p',
             'pm':stats_path + 'pd_cond_probs.empirical.p',
             'mm':stats_path + 'mm_cond_probs.empirical.p', 
             'md':stats_path + 'md_cond_probs.empirical.p',
             'mp':stats_path + 'mp_cond_probs.empirical.p',
            }

In [4]:
folderpath = '../data/pytorch_datasets/' + dataset_name
path='../data/datasets/' + dataset_name
train = pd.read_parquet(path + 'train.parquet')
traind = EHRDataset(train, max_len=train_params['max_len_seq'], feature_types=feature_types, conditional_files=condfiles, save_folder=folderpath, tokenizer=tokenizer, run_type='vizontology_dmp')

Use diagnosis: True
Use medications: True
Use procedures: True
Transforming data
Saving data


In [5]:
trainloader = torch.utils.data.DataLoader(traind, batch_size=1, shuffle=True)

In [6]:
dm_path = condfiles['dm']
md_path = condfiles['md']
mp_path = condfiles['mp']

In [7]:
dm = pickle.load(open(dm_path, 'rb'))
md = pickle.load(open(md_path, 'rb'))
mp = pickle.load(open(mp_path, 'rb'))

In [11]:
patient= next(iter(trainloader))

In [12]:
age, gender, code, position, segment, mask, label, prior_guide, tokens = patient
sentence = tokenizer.convert_ids_to_tokens(tokens.numpy().squeeze(), 'code')
firstvist = []

for token in sentence:
    if token == '[CLS]':
        continue
    
    if token == '[SEP]':
        break
    firstvist.append(token)

In [13]:
firstvist

['END013',
 'INJ030',
 'CIR007',
 'DIG004',
 'FAC016',
 'NVS006',
 'END010',
 'END011',
 'SYM013',
 'INJ028',
 'INJ028',
 'EXT027',
 'FAC021',
 'FAC021',
 'FAC009',
 '2244',
 '0550',
 '3065',
 '7510',
 '0492',
 '0004',
 '1988',
 '6729',
 '6277',
 '0426',
 '4943',
 '0615',
 '0262',
 '0659',
 '0054',
 '1985',
 '0421',
 '6235',
 '0703',
 '4902',
 'CAR024']

In [20]:
dm['0, DIG004,3065']

0.08882037741244973

In [22]:
mp['0, 3065,CAR024']

0.42889701614229647

Codes:

END013: Pituitary disorders 

INJ030: Drug induced or toxic related condition

CIR007: Essential hypertension

DIG004: Esophageal disorders

NVS006: Other specified hereditary and degenerative nervous system conditions

2244: Febuxostat

3065: Fenofibrate

7510: Naproxen Sodium

6729: Magnesium Sulfate

CAR024: Venous and arterial catheter placement