## This notebook contains the code to create baseline data and meta files for training the heterogenous_ehr model. The baseline treats the same word appearing in different note types as a single word and sums its frequencies across note types. This corresponds to the LDA model.

In [59]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.notebook import tqdm_notebook as tqdm

In [60]:
def baseline1(data, meta, vocab_size=2500):
    """Build the single-type ("baseline 1") data and meta tables.

    The same word coming from different note types is treated as one word:
    all rows of ``data`` that share a (patient, word) pair are merged into a
    single row whose frequency is the sum of their frequencies, and every
    output row is assigned to note type 1 with state 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with integer column labels. Column 0 is used as the patient id,
        column 2 as the word (phenotype) id and column 4 as the frequency.
        The remaining columns are ignored.
    meta : object
        Unused; accepted only so that existing call sites keep working.
    vocab_size : int, default 2500
        Words ``1..vocab_size`` are always listed in the meta table, whether
        or not they occur in ``data``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``mxr_data`` with columns patId, typeId, pheId, stateId, freq, ordered
        by patId and then pheId; ``mxr_meta`` with columns typeId, pheId,
        statecnt, one row per word, ordered by pheId.
    """
    # Every word is mapped onto one note type / one state. Defining these up
    # front keeps them available even when `data` has no rows.
    type_id = 1
    state_id = 0

    # Total frequency per (patient, word). A pair that occurs in only one row
    # keeps that row's frequency; it is not reset to 1.
    totals = data.groupby([0, 2], sort=True)[4].sum().reset_index()

    mxr_data = totals.rename(columns={0: 'patId', 2: 'pheId', 4: 'freq'})
    mxr_data['typeId'] = type_id
    mxr_data['stateId'] = state_id
    mxr_data = mxr_data[['patId', 'typeId', 'pheId', 'stateId', 'freq']]

    # The meta table lists every observed word plus the rest of the vocabulary.
    observed_words = set(totals[2].tolist())
    all_words = sorted(observed_words | set(range(1, vocab_size + 1)))

    mxr_meta = pd.DataFrame({'pheId': all_words})
    mxr_meta.insert(0, 'typeId', type_id)
    mxr_meta['statecnt'] = 1

    return mxr_data, mxr_meta

In [61]:
# Create the baseline ("b1") data/meta files for the type2 and type4
# simulated data sets. Inputs and outputs live in the same directory.
sim_dir = "simulated_data/4000pats/300tokens/2500vocab/"
sim_tag = "_4000pats_300tokens_2500vocab.txt"

for type_id in (2, 4):
    type_tag = f"_type{type_id}_new"

    # Space-separated, header-less input files.
    file_d = f"{sim_dir}sim_mxr_data{type_tag}{sim_tag}"
    file_m = f"{sim_dir}sim_mxr_meta{type_tag}{sim_tag}"
    data = pd.read_csv(file_d, header=None, sep=' ')
    meta = pd.read_csv(file_m, header=None, sep=' ')

    d1, m1 = baseline1(data, meta)

    # Written in the same layout as the inputs: no header row, no index.
    d1.to_csv(f"{sim_dir}sim_mxr_data{type_tag}_b1{sim_tag}",
              header=None, index=False, sep=' ')
    m1.to_csv(f"{sim_dir}sim_mxr_meta{type_tag}_b1{sim_tag}",
              header=None, index=False, sep=' ')

HBox(children=(IntProgress(value=0, max=2500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2500), HTML(value='')))


