## This notebook generates simulated data from the input hyperparameter values. The simulated data is stored in a folder called "simulated_data", inside sub-folders named after the hyperparameter values used (number of patients / tokens / vocabulary size).

In [15]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pickle as pkl

In [4]:
# Number of topics (K). `simulation` uses this name as the default value of its
# `num_topics` parameter, so this cell must run before `simulation` is defined.
num_topics = 50
# Fix the seed of NumPy's global random state, which every np.random.* call
# in this notebook draws from.
np.random.seed(0)

In [16]:
def frequency(words):
    """Count how often each distinct word occurs in `words`.

    Returns a pair of parallel lists `(counts, distinct_words)`, both ordered
    by the first appearance of each word in the input.
    """
    tally = Counter(words)
    distinct_words = [word for word in tally]
    counts = [tally[word] for word in distinct_words]
    return counts, distinct_words

In [24]:
def simulation(alpha, beta, num_patients, num_tokens, vocab_size, num_topics=num_topics):
    """Simulate patient records over several note types with an LDA-style model.

    Generative process:
      * theta[d] ~ Dirichlet(alpha)        topic mixture of patient d
      * phi[t][k] ~ Dirichlet(beta[t])     word distribution of topic k for note type t
      * for every patient and note type, draw a topic and then a word from that
        topic's phi row, once per token.

    Parameters
    ----------
    alpha : float
        Symmetric Dirichlet concentration of the patient-topic mixture.
    beta : dict or sequence of float
        Symmetric Dirichlet concentration of the topic-word distributions, one
        value per note type. A dict maps typeId -> concentration; any other
        sequence is read as the concentrations of types 1..len(beta).
    num_patients : int
        Number of patients (D). patIds run from 1 to num_patients.
    num_tokens : int
        Tokens sampled per patient in total (M); they are split evenly, rounded
        up, across the note types.
    vocab_size : int
        Vocabulary size (V) of every note type. pheIds run from 1 to vocab_size.
    num_topics : int
        Number of topics (K).

    Returns
    -------
    mxr_data : pandas.DataFrame
        Columns patId, typeId, pheId, stateId, freq; one row per
        (patient, note type, distinct sampled word). stateId is always 0.
    mxr_meta : pandas.DataFrame
        Columns typeId, pheId, statecnt; exactly one row per (note type, word)
        for the whole vocabulary, sorted by typeId then pheId. statecnt is 1.
    words_by_topic : dict
        typeId -> array of shape [num_topics x vocab_size] (phi).
    patients_by_topics : numpy.ndarray
        Array of shape [num_patients x num_topics] (theta).

    Notes
    -----
    A fixed number of *tokens* is drawn per patient and note type. Requiring a
    fixed number of *distinct* words instead is not practical with sparse
    priors (small alpha/beta), because most of the probability mass sits on a
    handful of words and the sampling loop would effectively never finish.
    """
    # Normalise `beta` to {typeId: concentration}. Iterating a dict directly
    # would yield its keys (the type ids), not the concentrations.
    if isinstance(beta, dict):
        beta_by_type = dict(beta)
    else:
        beta_by_type = {position + 1: value for position, value in enumerate(beta)}
    types = list(beta_by_type)
    if not types:
        raise ValueError("beta must define at least one note type")

    # theta matrix: one topic mixture per patient. [num_patients x K]
    patients_by_topics = np.random.dirichlet([alpha] * num_topics, size=num_patients)

    # phi matrices: one [K x vocab_size] topic-word matrix per note type
    words_by_topic = {}
    for type_id in types:
        words_by_topic[type_id] = np.random.dirichlet(
            [beta_by_type[type_id]] * vocab_size, size=num_topics)

    mixehr_sim_data = {'patId': [], 'typeId': [], 'pheId': [], 'stateId': [], 'freq': []}

    for patient in tqdm(range(1, num_patients + 1)):  # patId has to start from 1 and not 0
        theta = patients_by_topics[patient - 1]

        # To simulate missing note types, restrict `patient_types` per patient, e.g.
        #   patient_types = [1] if np.random.random() > 0.5 else [2]
        patient_types = types
        tokens_per_type = int(np.ceil(num_tokens / len(patient_types)))

        for type_id in patient_types:
            # Drawing all tokens at once is distributionally the same as drawing
            # them one by one: first how many tokens each topic emits, then which
            # words each topic emits.
            tokens_per_topic = np.random.multinomial(tokens_per_type, theta)
            word_counts = np.zeros(vocab_size, dtype=np.int64)
            for topic in np.flatnonzero(tokens_per_topic):
                word_counts += np.random.multinomial(
                    int(tokens_per_topic[topic]), words_by_topic[type_id][topic])

            sampled = np.flatnonzero(word_counts)  # 0-based ids of the words drawn
            num_lines = len(sampled)

            # One mxr_data row per distinct word of THIS patient and THIS type only.
            mixehr_sim_data['patId'].extend([patient] * num_lines)
            mixehr_sim_data['typeId'].extend([type_id] * num_lines)
            mixehr_sim_data['pheId'].extend((sampled + 1).tolist())  # pheId is 1-based
            mixehr_sim_data['stateId'].extend([0] * num_lines)  # no state counts so always 0
            mixehr_sim_data['freq'].extend(word_counts[sampled].tolist())

    # mxr_meta lists every word of every note type exactly once, whether or not
    # it was sampled for any patient.
    mixehr_sim_meta = {'typeId': [], 'pheId': [], 'statecnt': []}
    for type_id in types:
        mixehr_sim_meta['typeId'].extend([type_id] * vocab_size)
        mixehr_sim_meta['pheId'].extend(range(1, vocab_size + 1))
        mixehr_sim_meta['statecnt'].extend([1] * vocab_size)

    mxr_data = pd.DataFrame(data=mixehr_sim_data)
    mxr_meta = pd.DataFrame(data=mixehr_sim_meta)
    mxr_meta = mxr_meta.sort_values(by=['typeId', 'pheId'])

    return mxr_data, mxr_meta, words_by_topic, patients_by_topics

In [19]:
# defining hyperparameters
alpha = 0.01 # hyparparameter for the patient-topic mixture
beta_t = {1: 0.01,
          2: 0.01,
          3: 0.01,
          4: 0.01} # hyperparameter for the topic-word mixture
num_topics = 50 # K
num_patients = [1000, 4000, 8000] # D
num_tokens = [1000, 1500, 2000] # M
vocab_sizes = [2000, 2500, 3000] # V

In [81]:
def remove_less_than_5(data, meta, type_ids=(1, 2), min_patients=5):
    """Drop words that occur in too few distinct patients.

    For each note type in `type_ids`, every word (pheId) that appears in fewer
    than `min_patients` distinct patients is removed from both `data` and
    `meta`. Rows of other note types are left untouched. Words of a listed
    type that never appear in `data` at all are NOT removed from `meta`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have columns 'patId', 'typeId' and 'pheId'.
    meta : pandas.DataFrame
        Must have columns 'typeId' and 'pheId'.
    type_ids : iterable
        Note types to filter. Defaults to (1, 2).
    min_patients : int
        Minimum number of distinct patients a word needs in order to be kept.

    Returns
    -------
    (data, meta) : tuple of pandas.DataFrame
        Filtered copies; the inputs are not modified.
    """
    for type_id in type_ids:
        type_rows = data[data['typeId'] == type_id]
        type_meta = meta[meta['typeId'] == type_id]

        # number of distinct patients per word within this note type
        patients_per_word = type_rows.groupby('pheId')['patId'].nunique()
        rare_words = patients_per_word.index[patients_per_word < min_patients]

        data = data.drop(type_rows.index[type_rows['pheId'].isin(rare_words)], axis=0)
        meta = meta.drop(type_meta.index[type_meta['pheId'].isin(rare_words)], axis=0)
    return data, meta

In [25]:
# Run the simulation for every combination of the hyperparameter grids and
# store each result under simulated_data/<D>pats/<M>tokens/<V>vocab/.
for patients in num_patients:
    for tokens in num_tokens:
        for vocab in vocab_sizes:
            file_name = "_"+str(patients)+"pats_"+str(tokens)+"tokens_"+str(vocab)+"vocab"
            folder_name = 'simulated_data/'+ str(patients)+"pats/"+str(tokens)+"tokens/"+str(vocab)+"vocab/"

            # exist_ok avoids the check-then-create race and makes re-runs safe
            os.makedirs(folder_name, exist_ok=True)

            print(folder_name)
            # pass num_topics explicitly so the value from the config cell is honoured
            mxr_data, mxr_meta, phi, theta = simulation(alpha, beta_t, num_patients=patients, num_tokens=tokens,
                                                        vocab_size=vocab, num_topics=num_topics)
            mxr_data.to_csv(folder_name+"sim_mxr_data"+file_name+".txt", index=False,header=None, sep=' ')
            mxr_meta.to_csv(folder_name+"sim_mxr_meta"+file_name+".txt",index=False,header=None, sep=' ')

            # context managers make sure the pickle files are flushed and closed
            with open(folder_name+"sim_phi"+file_name+".pkl","wb") as phi_file:
                pkl.dump(phi, phi_file)
            # NOTE: theta is written as a pickle even though the file is named ".csv";
            # the name is left unchanged in case other code loads it by this path.
            with open(folder_name+"sim_theta"+file_name+".csv","wb") as theta_file:
                pkl.dump(theta, theta_file)

simulated_data/4000pats/300tokens/2500vocab/


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


