# Column Class

In [22]:
class Column(object):
    """Base description of one table column that can be rendered as text.

    Attributes:
        name: Column identifier.
        attribute: Label used for the column inside generated sentences.
        type: One of "binary", "categorical" or "numerical".
        verb: Verb placed between the attribute and its value.

    Subclasses provide ``create_descriptive_sentence`` and
    ``create_basic_sentence``; ``create_sentence`` dispatches between them.
    """

    def __init__(self, name, attribute, col_type, verb):
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb

    def is_binary(self):
        """Return True when the column type is "binary"."""
        return self.type == "binary"

    def is_categorical(self):
        """Return True when the column type is "categorical"."""
        return self.type == "categorical"

    def is_numerical(self):
        """Return True when the column type is "numerical"."""
        return self.type == "numerical"

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Render ``value`` as a descriptive sentence; implemented by subclasses."""
        raise NotImplementedError(
            type(self).__name__ + " must implement create_descriptive_sentence"
        )

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Render ``value`` in the short "attribute: value" form; implemented by subclasses."""
        raise NotImplementedError(
            type(self).__name__ + " must implement create_basic_sentence"
        )

    def create_sentence(self, value, prefix, missing_word, replace_numbers, descriptive):
        """Render ``value`` as text for this column.

        Args:
            value: Cell value to describe.
            prefix: Text placed in front of descriptive sentences.
            missing_word: Text used for missing values ("" means emit nothing).
            replace_numbers: Forwarded to the subclass renderer.
            descriptive: True for the descriptive form, False for the basic form.

        Returns:
            The sentence produced by the subclass renderer.
        """
        # The subclasses name their renderers create_*_sentence; the previous
        # fn_descriptive / fn_basic attributes were never defined anywhere.
        if descriptive:
            return self.create_descriptive_sentence(value, prefix, missing_word, replace_numbers)
        return self.create_basic_sentence(value, prefix, missing_word, replace_numbers)
        
        
class Binary_Column(Column):
    """Column holding a yes/no flag, rendered with a positive or a negative verb.

    Attributes:
        neg_verb: Verb used when the flag is false (e.g. "is not").
    """

    # Accepted spellings after ``str(value).lower()``. "1.0" / "0.0" cover flags
    # that are stored as floats.
    _TRUE_TOKENS = ("1", "1.0", "true")
    _FALSE_TOKENS = ("0", "0.0", "false")

    def __init__(self, name, attribute, verb, neg_verb):
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb)

    def _as_flag(self, value):
        """Return True / False for a recognised flag value, None for anything else."""
        token = str(value).lower()
        if token in self._TRUE_TOKENS:
            return True
        if token in self._FALSE_TOKENS:
            return False
        return None

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<prefix><verb> <attribute>" (or the negated form).

        Unrecognised / missing values yield an empty string; ``missing_word``
        and ``replace_numbers`` are not used by this form.
        """
        flag = self._as_flag(value)
        if flag is None:
            return ""
        chosen_verb = self.verb if flag else self.neg_verb
        return prefix + chosen_verb + " " + self.attribute

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<verb> <attribute>: yes" / "<neg_verb> <attribute>: no".

        For an unrecognised value the result is "<verb> <attribute>: <missing_word>"
        unless ``missing_word`` is the empty string, in which case "" is returned.
        ``prefix`` and ``replace_numbers`` are not used by this form.
        """
        flag = self._as_flag(value)
        if flag is True:
            return self.verb + " " + self.attribute + ": yes"
        if flag is False:
            return self.neg_verb + " " + self.attribute + ": no"
        if missing_word != "":
            # str() so that a non-string placeholder (e.g. NaN) cannot break the concatenation.
            return self.verb + " " + self.attribute + ": " + str(missing_word)
        return ""
        
class Categorical_Column(Column):
    """Column whose value is shown verbatim as a category label."""

    # Lower-cased string forms that are treated as "no value".
    _MISSING_TOKENS = ("nan", "", "none", "missing")

    def __init__(self, name, attribute, verb):
        super().__init__(name, attribute, "categorical", verb)

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<prefix><attribute> <verb> <value>".

        A missing value is replaced by ``missing_word``; when ``missing_word``
        is "" nothing is emitted and "" is returned.
        """
        lead = prefix + self.attribute + " " + self.verb + " "
        if str(value).lower() not in self._MISSING_TOKENS:
            return lead + str(value)
        if missing_word != "":
            return lead + missing_word
        return ""

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<attribute>: <value>", with the same missing-value rule as above."""
        lead = self.attribute + ": "
        if str(value).lower() not in self._MISSING_TOKENS:
            return lead + str(value)
        if missing_word != "":
            return lead + missing_word
        return ""
    
class Numerical_Column(Column):
    """Numeric column whose values are shown raw or as a coarse level word.

    Attributes:
        avg: Reference centre used by ``encode_number``.
        sd: Reference spread used by ``encode_number``.
    """

    # Lower-cased string forms that are treated as "no value".
    _MISSING_TOKENS = ("nan", "", "none", "missing")

    def __init__(self, name, attribute, verb, avg, sd):
        super().__init__(name, attribute, "numerical", verb)
        self.avg = avg
        self.sd = sd

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<prefix><attribute> <verb> <value or level>".

        A missing value is replaced by ``missing_word``; when ``missing_word``
        is "" the result is "".
        """
        lead = prefix + self.attribute + " " + self.verb + " "
        if str(value).lower() not in self._MISSING_TOKENS:
            return lead + str(self.encode_number(value, replace_numbers))
        if missing_word != "":
            return lead + missing_word
        return ""

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<attribute>: <value or level>", with the same missing-value rule."""
        lead = self.attribute + ": "
        if str(value).lower() not in self._MISSING_TOKENS:
            return lead + str(self.encode_number(value, replace_numbers))
        if missing_word != "":
            return lead + missing_word
        return ""

    def encode_number(self, value, replace_numbers):
        """Map ``value`` to a level word relative to ``avg`` and ``sd``.

        With ``replace_numbers`` false the value is returned unchanged.
        Otherwise: below avg-2sd -> "very low"; [avg-2sd, avg-sd) -> "low";
        (avg+sd, avg+2sd] -> "high"; above avg+2sd -> "very high";
        everything else -> "normal".
        """
        if not replace_numbers:
            return value

        far_low = self.avg - 2*self.sd
        near_low = self.avg - self.sd
        near_high = self.avg + self.sd
        far_high = self.avg + 2*self.sd

        if far_low > value:
            return "very low"
        if far_low <= value < near_low:
            return "low"
        if far_high >= value > near_high:
            return "high"
        if far_high < value:
            return "very high"
        return "normal"

## Column Testing

In [23]:
# Binary column "hisp": sentences read "is hispanic" / "is not hispanic".
bin_col = Binary_Column("hisp", "hispanic", "is", "is not")

In [26]:
# NaN value with an empty missing_word: no sentence is produced (empty string).
bin_col.create_descriptive_sentence(np.nan, "the patient ","", True)

''

In [25]:
# NaN value in the basic form.
# NOTE(review): missing_word is np.nan here rather than a string -- confirm this is intended.
bin_col.create_basic_sentence(np.nan, "the patient ",np.nan, True)

'is hispanic: nan'

In [13]:
# NaN value with missing_word "is missing": the descriptive form still returns "",
# because Binary_Column.create_descriptive_sentence never uses missing_word.
bin_col.create_descriptive_sentence(np.nan, "the patient ","is missing", True)

''

In [11]:
# Categorical column "nat", labelled "nationality" in generated sentences.
cat_col = Categorical_Column("nat", "nationality", "is")

In [12]:
# Numerical column "tmp", labelled "temperature", with avg=10 and sd=3 (the bounds used by encode_number).
num_col = Numerical_Column("tmp", "temperature",  "is", 10, 3)

# Tabular Class

In [2]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

#One table per patient per tabular data structure
class Table(object):
    """One tabular data structure belonging to a single patient.

    Attributes:
        name: Table identifier.
        headers: Column labels of ``df`` captured at construction time.
        columns: Column descriptors; each must expose ``name`` and
            ``create_sentence(value, prefix, missing_word, replace_numbers, descriptive)``.
        metadata: Text placed at the start of every generated row description.
        df: pandas DataFrame holding the data; the ``text`` and ``embeddings``
            columns are added to it in place.
        time_col: Name of the column in ``df`` used for time filtering.
    """

    def __init__(self, name, df, columns, metadata, time_col):
        self.name = name
        self.headers = df.columns
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.time_col = time_col

    def create_weighted_text(self, prefix, missing_word, replace_numbers, descriptive):
        """Build one text description per row and store it in ``df["text"]``.

        Each description is ``metadata`` followed by every non-empty column
        sentence, each terminated by ", ".
        """
        text = []
        for row_pos in range(self.df.shape[0]):
            text_i = self.metadata

            for column in self.columns:
                # Positional access within the named column. ``self.df[row, name]``
                # is a key lookup in pandas and raises KeyError.
                value = self.df[column.name].iloc[row_pos]
                sentence = column.create_sentence(value, prefix, missing_word, replace_numbers, descriptive)
                # Columns return "" for values they choose not to describe;
                # skip those so the text does not fill up with ", , ".
                if sentence:
                    text_i += sentence + ", "

            text.append(text_i)

        self.df["text"] = text

    def create_weighted_embeddings(self):
        """Embed every row text and store the flattened vectors in ``df["embeddings"]``.

        Requires ``create_weighted_text`` to have been called first (reads ``df["text"]``).
        """
        embeddings = []

        for text in self.df["text"]:
            full_embedding = get_biobert_embeddings(text)[0]
            embeddings.append(full_embedding.reshape(-1))

        self.df["embeddings"] = embeddings

    def create_timebounded_embeddings(self, start_hr, end_hr):
        """Return the rows whose time lies in [start_hr, end_hr], with a ``weights`` column.

        Args:
            start_hr: Inclusive lower bound on ``time_col``, or None for no lower bound.
            end_hr: Inclusive upper bound on ``time_col``, or None for no upper bound.

        Returns:
            A new DataFrame (``self.df`` is left untouched) with the selected rows
            and the weights produced by ``create_time_weights``.
        """
        timebounded_df = self.df

        if start_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col] >= start_hr]
        if end_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col] <= end_hr]

        # Work on a copy so that adding a column neither writes into a slice of
        # self.df nor mutates self.df when no bound was given.
        timebounded_df = timebounded_df.copy()
        timebounded_df["weights"] = create_time_weights(timebounded_df[self.time_col])

        return timebounded_df
    
    

# Patient Class

In [None]:
class Patient(object):
    """Container that groups all tables of one patient under a single id.

    Every table passed to the constructor becomes an attribute named after
    ``table.name``.

    Attributes:
        id: Patient identifier.
        time_col: Name of the time column handed to the constructor.
    """

    def __init__(self, tables, pat_id, time_col):
        self.id = pat_id
        # Previously accepted but dropped; kept so it survives pickling.
        self.time_col = time_col

        for table in tables:
            setattr(self, table.name, table)

    def create_embeddings(self, start_hr = None, end_hr = None, single_embedding=True):
        """Placeholder: the method had no body yet.

        Raises:
            NotImplementedError: always, until the embedding logic is written.
        """
        raise NotImplementedError("Patient.create_embeddings is not implemented yet")

    def save_patient_object(obj, filepath):
        """Pickle ``obj`` to ``filepath``.

        Works both as ``patient.save_patient_object(path)`` (``obj`` is the
        instance) and as ``Patient.save_patient_object(obj, path)``.
        """
        with open(filepath, 'wb') as output:
            pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

    # LOAD SINGLE PATIENT ICU STAY RECORDS FOR MIMIC-IV
    @staticmethod
    def load_patient_object(filepath):
        """Return the object pickled at ``filepath``.

        SECURITY: ``pickle.load`` executes arbitrary code contained in the file;
        only load files that this pipeline wrote itself.
        """
        with open(filepath, 'rb') as source:
            return pickle.load(source)

In [None]:
def create_tables():
    """Placeholder for building the per-patient tables.

    Raises:
        NotImplementedError: always; the body was an empty TODO, which is not
            valid Python on its own.
    """
    #TO_DO
    raise NotImplementedError("create_tables is not implemented yet")
    
def get_columns(attributes_map, verb_map, type_map):
    """Placeholder for building column descriptors from the three maps.

    Raises:
        NotImplementedError: always; the body was an empty TODO, which is not
            valid Python on its own.
    """
    #TO-DO
    raise NotImplementedError("get_columns is not implemented yet")

# Utils

In [None]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

def get_biobert_embeddings(text):
    """Embed ``text`` with the BioBERT model.

    Uses the module-level ``biobert_tokenizer`` and ``biobert_model`` objects.
    NOTE(review): neither is defined in the visible part of this notebook; they
    must be loaded before this function is called.

    Args:
        text: Input text (str).

    Returns:
        embeddings: Pooled model output as a numpy array; per the original
            notes its dimensionality is (1, 768).
        hidden_embeddings: Last hidden layer as a numpy array; per the original
            notes its dimensionality is (token_size, 768).

    Example:
        embeddings, hidden_embeddings = get_biobert_embeddings(text)
    """
    tokens_pt = biobert_tokenizer(text, return_tensors="pt")
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = biobert_model(**tokens_pt)
    last_hidden_state = outputs.last_hidden_state
    pooler_output = outputs.pooler_output
    hidden_embeddings = last_hidden_state.detach().numpy()
    embeddings = pooler_output.detach().numpy()

    return embeddings, hidden_embeddings

def create_time_weights(timestamps):
    """Return one weight per timestamp; currently every weight is 1/len(timestamps).

    An empty input yields an empty list.
    """
    #TO-DO
    count = len(timestamps)
    if count == 0:
        return []
    uniform = 1/count
    return [uniform] * count


# cleans up column strings 
def strip_df(df):
    """Strip leading/trailing whitespace from every string column of ``df``.

    The frame is modified in place and also returned. Columns that do not
    support the ``.str`` accessor are left untouched.
    """
    for col in df.columns:
        try:
            df[col] = df[col].str.strip()
        except AttributeError:
            # Raised when the column has no .str accessor (non-string data).
            # Anything else is a real error and is no longer swallowed.
            continue
    return df

# Some dataframes have columns that contain a single unique value; such columns carry no information, so we drop them
def unique_col(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    Each dropped column is printed together with its single value.
    """
    constant_cols = []
    for col_name in df.columns:
        distinct = df[col_name].unique()
        if len(distinct) != 1:
            continue
        print(col_name, distinct)
        constant_cols.append(col_name)

    keep_mask = ~df.columns.isin(constant_cols)
    return df[df.columns[keep_mask]]

def convert_to_days(x):
    """Return ``x`` as an int number of days.

    ``x`` is cast to day resolution with ``astype('timedelta64[D]')`` and the
    result is divided by one day.
    """
    one_day = np.timedelta64(1, 'D')
    whole_days = x.astype('timedelta64[D]')
    return int(whole_days/one_day)

# Difference between two timestamps, in whole hours
def date_diff_hrs(t1, t0):
    """Return ``t1 - t0`` in hours, rounded to the nearest integer with ``round``."""
    elapsed_seconds = (t1-t0).total_seconds()
    return round(elapsed_seconds/3600)  # Result in hrs

def compute_delta_time(df, starttime, time_col):
    """Add a ``delta_time`` column: hours between ``df[time_col]`` and ``starttime``.

    Args:
        df: DataFrame to extend (modified in place).
        starttime: Reference timestamp subtracted from every entry.
        time_col: Name of the timestamp column in ``df``.

    Returns:
        The same DataFrame, with the new ``delta_time`` column.
    """
    # Look up the column named by the parameter; the previous code indexed the
    # literal string 'time_col'. Series.apply also copes with an empty frame.
    df['delta_time'] = df[time_col].apply(lambda stamp: date_diff_hrs(stamp, starttime))
    return df

# Patient Filter

In [1]:
# Not all patients have ALL data modalities, this filters so that the patient has records in all modalities in 
# df_lis
def take_patient_intersection(df_lis, id_col):
    """Return the set of ids that occur in ``id_col`` of every DataFrame in ``df_lis``.

    Args:
        df_lis: Sequence of DataFrames, each containing the column ``id_col``.
        id_col: Name of the identifier column.

    Returns:
        A set of ids; empty when ``df_lis`` is empty.
    """
    if len(df_lis) == 0:
        return set()

    intersection_id = set(df_lis[0][id_col].unique())

    # Iterate the remaining frames directly. The previous while-loop never
    # advanced its counter and its bound ran one past the end of the list.
    for df in df_lis[1:]:
        intersection_id &= set(df[id_col].unique())

    return intersection_id

In [None]:
def filter_na(df, df_col):
    """Return the rows of ``df`` whose ``df_col`` entry is not missing."""
    has_value = df[df_col].notna()
    return df[has_value]

# Data Preprocessing

In [2]:
# Change categorical with string to serial numbers
def to_cat_from_str(df, df_col):
    """Replace string-valued columns by integer category codes.

    For every column in ``df_col`` whose first non-missing value is a string,
    a ``<column>_CODE`` column with ``pd.Categorical(...).codes`` is added to
    ``df`` in place (missing entries get code -1).

    Args:
        df: DataFrame to encode.
        df_col: Iterable of column names to consider.

    Returns:
        A DataFrame with one column per entry of ``df_col``: the ``_CODE``
        column for string columns, the original column otherwise.
    """
    new_col_lis = []
    for col in df_col:
        # Decide from the first non-missing value: looking at row 0 only
        # crashed on empty columns and misclassified string columns whose
        # first entry is NaN.
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], str):
            code_col = col + '_CODE'
            df[code_col] = pd.Categorical(df[col]).codes
            new_col_lis.append(code_col)
        else:
            new_col_lis.append(col)

    return df[new_col_lis]

In [None]:
# Height was in the format of 5' 3'', change it to cm
def change_height_from_str(age_list):
    """Convert height strings in feet and inches to centimetres.

    Args:
        age_list: Iterable of height entries such as ``5' 3''`` or ``5' 3"``
            (despite the parameter name, these are heights).

    Returns:
        A list with one value per entry: the height in cm rounded to 3
        decimals, or NaN when the entry cannot be parsed (missing value,
        fewer than two parts, non-numeric parts).
    """
    quote_marks = "'\""
    demo_cm_lis = []
    for entry in age_list:
        try:
            parts = entry.split()
            # Drop every trailing quote mark. Removing just the last character
            # left "3'" for inches written with two apostrophes, so those
            # heights always ended up as NaN.
            feet = float(parts[0].rstrip(quote_marks))
            inches = float(parts[1].rstrip(quote_marks))
        except (AttributeError, IndexError, ValueError):
            # AttributeError: entry is not a string (e.g. NaN);
            # IndexError: fewer than two parts; ValueError: not a number.
            demo_cm_lis.append(np.nan)
            continue
        demo_cm_lis.append(round(inches * 2.54 + feet * 30.48, 3))

    return demo_cm_lis

In [7]:
def retrieve_meta_info():
    """Return the introductory description text for each data modality.

    Returns:
        dict mapping the modality key ('demo', 'encounter', 'medication',
        'problem', 'sign', 'social') to a one-paragraph description.
    """
    # The texts are built with implicit string concatenation. The previous
    # "...\" line continuations put a literal "..." and the indentation
    # whitespace of the following line into every description.
    dict_meta = {
        'demo': (
            'The following is the demographics information of this patient, which describes '
            'information such as name, date of birth and address, along with insurance information.'
        ),
        'encounter': (
            'The following is the encounter information of this patient, which describes the '
            'medical information submitted by health care providers (physicians, hospitals, Ancillaries, etc.) '
            'which documents both the clinical conditions, services and items delivered to the member to treat '
            'their conditions.'
        ),
        'medication': (
            'The following is the medication information of this patient, which describes the '
            'chemicals that are used to cure, halt, or prevent disease; ease symptoms; or help in the diagnosis '
            'of illnesses.'
        ),
        'problem': (
            'The following is the problem information of this patient, which describes the '
            'disease, condition, or injury from a patient signs and symptoms.'
        ),
        'sign': (
            'The following is the signs information of this patient, which describes the '
            'physical response linked medical fact or characteristic that is detected by a physician, nurse, '
            'or medical device during the examination of a patient.'
        ),
        # The original text read "describes the ... the circumstances"; the
        # duplicated article is dropped.
        'social': (
            'The following is the social information of this patient, which describes the '
            'circumstances of the places where people reside, work, learn, and engage in recreation.'
        ),
    }

    return dict_meta

# Loading Packages

In [5]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import sys

# For RRT

In [3]:
def load_data(df_name, id_col):
    """Read the CSV at ``df_name``, strip its string columns and drop constant columns.

    ``id_col`` is accepted but not used by this function.
    """
    raw = pd.read_csv(df_name)
    return unique_col(strip_df(raw))

In [None]:
def restrict_to_patient(df, id_to_include):
    """Return the rows of ``df`` whose id is contained in ``id_to_include``.

    The id column is taken from the module-level ``id_col`` variable.

    Args:
        df: DataFrame containing the ``id_col`` column.
        id_to_include: Collection of ids to keep.

    Returns:
        The filtered DataFrame. The previous version computed it but returned
        nothing, so callers always received None.
    """
    return df[df[id_col].isin(id_to_include)]

In [None]:
def change_time(df, df_col):
    df[df_col] = pd.to_datetime(df[df_col])

In [12]:
# Columns to keep for each RRT table, keyed by table name.
# The keys are strings: the previous bare names (demo, encounter, ...) referred
# to variables that are either undefined or DataFrames, neither of which can
# serve as a dictionary key.
# NOTE(review): the medication table is called 'medicine' here, while the
# loaded frame is named `med` and retrieve_meta_info uses 'medication' -- confirm
# which spelling callers should use.
dict_RRT_col_include = {
    'demo': ['PAT_CLASS', 'SEX_CODE', 'ETHNICITY', 'age', 'RACE_CODE', 'FIRST_THREE_DIGITS_ZIP',
             'HEIGHT_CM', 'LOC_NAME', 'WEIGHT'],
    'encounter': ['BP_DIASTOLIC', 'BMI', 'WEIGHT', 'BP_SYSTOLIC', 'TEMPERATURE', 'BSA',
                  'PULSE', 'RESPIRATIONS', 'HEIGHT_CM'],
    'medicine': ['DOSE', 'DOSE_UNIT', 'MED_NAME'],
    'problem': ['DX_NAME', 'DX_GROUP', 'CURRENT_ICD10_LIST', 'PRINCIPAL_PROB_YN'],
    'social': ['TOBACCO_PAK_PER_DY', 'TOBACCO_COMMENT', 'SMOKING_STATUS', 'ALCOHOL_OZ_PER_WK',
               'USES_ALCOHOL', 'ALCOHOL_COMMENT', 'ILLICIT_DRUG_USER', 'ILLICIT_DRUG_FREQ',
               'CIGARETTES_YN', 'PIPES_YN', 'SNUFF_YN', 'CHEW_YN', 'IV_DRUG_USER_YN'],
    'sign': ['MEAS_VALUE', 'FLO_MEAS_NAME'],
}

In [13]:
# Name of the timestamp column of each RRT table, keyed by table name.
# The keys are strings: the previous bare names (demo, encounter, ...) referred
# to variables that are either undefined or DataFrames, neither of which can
# serve as a dictionary key.
dict_RRT_col_time = {
    'demo': 'INP_ADM_DATE',
    'encounter': 'CONTACT_DATE',
    'medicine': 'PERFORMEDFROMDTM',
    'problem': 'NOTED_DATE',
    'social': 'CONTACT_DATE',
    'sign': 'dtDateRec',
}

In [None]:
# Name of the identifier column used to match records across the RRT tables.
id_col = 'PAT_ENC_CSN_GUID'

In [7]:
# Load every RRT table from its text export (load_data also strips strings and drops constant columns).
# NOTE(review): `demo` is read from a file named Full_Encounter.txt, in a different
# directory than the other tables -- confirm this is the intended demographics source.
demo = load_data('Data/Full_Encounter.txt', 'PAT_ENC_CSN_GUID')
encounter = load_data('../../RRR-full/Full_Encounter.txt', 'PAT_ENC_CSN_GUID')
med = load_data('../../RRR-full/Full_Meds.txt', 'PAT_ENC_CSN_GUID')
problem = load_data('../../RRR-full/Full_ProblemList.txt', 'PAT_ENC_CSN_GUID')
sign = load_data('../../RRR-full/Full_SignsSymptoms.txt', 'PAT_ENC_CSN_GUID')
social = load_data('../../RRR-full/Full_SocialHx.txt', 'PAT_ENC_CSN_GUID')

In [None]:
# Ids that appear in all six tables.
id_to_include = take_patient_intersection([demo, encounter, med, problem, sign, social], id_col)

In [None]:
# Convert the feet/inches height strings to centimetres. The result is stored
# under 'HEIGHT_CM', the spelling listed in dict_RRT_col_include; the previous
# 'HEIGHT_cm' did not match it.
demo['HEIGHT_CM'] = change_height_from_str(demo['HEIGHT'])
encounter['HEIGHT_CM'] = change_height_from_str(encounter['HEIGHT'])

In [None]:
# Add integer category codes for the string-valued columns of every table.
# NOTE(review): dict_RRT_col_include is indexed with the DataFrame object itself,
# which cannot work as a dictionary key (DataFrames are unhashable) -- presumably
# the table name is meant. The frame returned by to_cat_from_str is also discarded.
for df in [demo, encounter, med, problem, sign, social]:
    df_col = dict_RRT_col_include[df]
    to_cat_from_str(df, df_col)

## Constructs patient objects 

In [None]:
# Build and pickle one Patient object for every patient that has records in all
# time-stamped tables within the [admit_time, end_time_point] window.
# NOTE(review): `redcap`, `restrict_df_by_redcap` and a module-level
# `save_patient_object` are not defined in the visible part of this notebook, and
# Patient(...) is called with 11 positional arguments while the Patient class
# above takes (tables, pat_id, time_col) -- confirm which API this cell targets.
total = 0
for id_is in tqdm(id_to_include):
    patient_redcap = redcap[redcap[id_col] == id_is]
    end_time_point = patient_redcap['DateTime'].values[0]

    patient_encounter = encounter[encounter[id_col] == id_is]
    admit_time = patient_encounter['HOSP_ADMSN_TIME'].values[0]
    # Shift the admission time by DateOffset(-1) (presumably one day earlier -- TODO confirm).
    admit_time = (pd.Series(admit_time) + pd.DateOffset(-1)).values[0]
    dischrg_time = patient_encounter['HOSP_DISCHRG_TIME'].values[0]

    patient_demo = demo[demo[id_col] == id_is]
    patient_med = med[med[id_col] == id_is]
    patient_prob = problem[problem[id_col] == id_is]
    patient_sign = sign[sign[id_col] == id_is]
    patient_social = social[social[id_col] == id_is]

    # Rows without a timestamp cannot be placed in the time window.
    patient_med = filter_na(patient_med, 'PERFORMEDFROMDTM')
    patient_prob = filter_na(patient_prob, 'NOTED_DATE')
    patient_sign = filter_na(patient_sign, 'dtDateRec')
    patient_social = filter_na(patient_social, 'CONTACT_DATE')

    # Skip the patient as soon as one modality is empty. Previously the code
    # fell through to the second check, which then read *_time frames that were
    # undefined (first patient) or left over from the previous patient.
    if len(patient_med) == 0 or len(patient_prob) == 0 or len(patient_sign) == 0 or len(patient_social) == 0:
        continue

    patient_med_time = restrict_df_by_redcap(patient_med, 'PERFORMEDFROMDTM', end_time_point, admit_time)
    patient_prob_time = restrict_df_by_redcap(patient_prob, 'NOTED_DATE', end_time_point, admit_time)
    patient_sign_time = restrict_df_by_redcap(patient_sign, 'dtDateRec', end_time_point, admit_time)
    patient_social_time = restrict_df_by_redcap(patient_social, 'CONTACT_DATE', end_time_point, admit_time)

    if len(patient_med_time) == 0 or len(patient_prob_time) == 0 or len(patient_sign_time) == 0 or len(patient_social_time) == 0:
        continue

    total += 1
    patient_obj = Patient(id_is, end_time_point, admit_time, dischrg_time, patient_demo, patient_encounter,
                          patient_redcap, patient_sign_time, patient_med_time, patient_prob_time,
                          patient_social_time)

    save_patient_object(patient_obj, f'tabtext_paper_obj/obj_{id_is}.pkl')

## Make strings out of the objects 

In [None]:
# use the above functions