In [133]:
EMB_COL = "embeddings"
TEXT_COL = "text"
WEIGHT_COL = "weight"

# Column Class

In [217]:
class Column(object):
    def __init__(self, name, attribute, col_type, verb):
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb
        
    def is_binary(self):
        return self.type == "binary"
    
    def is_categorical(self):
        return self.type == "categorical"
    
    def is_numerical(self):
        return self.type == "numerical"
    
    def create_sentence(self, value, prefix, missing_word, replace_numbers, descriptive):
        if descriptive:
            return self.create_descriptive_sentence(value, prefix, missing_word, replace_numbers)
        else:
            return self.create_basic_sentence(value, prefix, missing_word, replace_numbers)
        
        
class Binary_Column(Column):
    def __init__(self, name, attribute, verb, neg_verb):
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb)
        

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        sentence = ""
        if str(value).lower()  in ["1", "0", "true", "false"]:
            if int(value) == 1:
                sentence = prefix + self.verb + " " + self.attribute
            elif int(value) == 0:
                sentence = prefix + self.neg_verb + " " + self.attribute
        return sentence
            

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        sentence = ""
        if str(value).lower()  in ["1", "0", "true", "false"]:
            if int(value) == 1:
                sentence = self.verb + " " + self.attribute + ": yes" 
            elif int(value) == 0:
                sentence = self.neg_verb + " " + self.attribute +" : no"
        elif missing_word != "":
            sentence = self.verb + " " + self.attribute + ": " + missing_word
        return sentence
        
class Categorical_Column(Column):
    def __init__(self, name, attribute, verb):
        super().__init__(name, attribute, "categorical", verb)

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = prefix + self.attribute + " " + self.verb + " " + str(value)
        elif  missing_word != "":
            sentence = prefix + self.attribute + " " + self.verb + " " + missing_word
        return sentence
            

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = self.attribute + ": " + str(value)
        elif missing_word != "":
            sentence = self.attribute + ": " + missing_word
        return sentence
    
class Numerical_Column(Column):
    def __init__(self, name, attribute, verb, avg, sd):
        self.avg = avg
        self.sd = sd
        super().__init__(name, attribute, "numerical", verb)
        
        
    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            col_value = self.encode_number(value, replace_numbers)
            sentence = prefix + self.attribute + " " + self.verb + " " + str(col_value) 
        elif  missing_word != "":
            sentence = prefix + self.attribute + " " + self.verb + " " + missing_word 
        return sentence
            

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            col_value = self.encode_number(value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        elif missing_word != "":
            sentence = self.attribute + ": " + missing_word
        return sentence
    
    def encode_number(self, value, replace_numbers):
        new_value = value
        if replace_numbers:
            if self.avg - 2*self.sd > value:
                new_value = "very low"
            elif self.avg - 2*self.sd <= value < self.avg - self.sd:
                new_value = "low"
            elif self.avg + 2*self.sd >= value > self.avg + self.sd:
                new_value = "high"
            elif self.avg + 2*self.sd < value:
                new_value = "very high"
            else:
                new_value = "normal"
        return new_value

## Column Testing

In [218]:
bin_col = Binary_Column("hisp", "hispanic", "is", "is not")

In [219]:
bin_col.create_descriptive_sentence("1", "the patient ","is missing", True)

'the patient is hispanic'

In [220]:
cat_col = Categorical_Column("nat", "nationality", "is")

In [221]:
num_col = Numerical_Column("tmp", "temperature",  "is", 10, 3)

# Tabular Class

In [222]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

#One table per patient per tabular data structure
class Table(object):
    def __init__(self, name, df, columns, metadata, time_col):
        self.name = name
        self.headers = df.columns
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.time_col = time_col

    def is_temporal(self):
        return self.time_col is not None
    
    def is_static(self):
        return self.time_col is None

        
    def create_text(self, prefix, missing_word, replace_numbers, descriptive):
        text = []
        for t_i in range(self.df.shape[0]):
            text_i = self.metadata
            
            for column in self.columns:
                print
                value = self.df.iloc[t_i][column.name]
                col_text = column.create_sentence(value, prefix, missing_word, replace_numbers, descriptive)
                if len(col_text) >0:
                    col_text += ", "
                text_i += col_text
                
            text.append(text_i)
    
        self.df[TEXT_COL] = text 

    
    def create_embeddings(self):
        embeddings = []

        for i in range(self.df.shape[0]):
            text = self.df.iloc[i][TEXT_COL]
            full_embedding = get_biobert_embeddings(text)[0]
            embeddings.append(full_embedding.reshape(-1))

        self.df[EMB_COL] = embeddings
        
    def get_timebounded_df(self, start_hr, end_hr):
        
        if self.time_col is None:
            return self.df
        
        else:
            timebounded_df = self.df.copy()

            if start_hr is not None:
                timebounded_df = timebounded_df[timebounded_df[time_col]>= start_hr]
            if end_hr is not None:
                timebounded_df = timebounded_df[timebounded_df[time_col]<= end_hr]

            return timebound_df
    
    

# Patient Class

In [417]:
import functools
from functools import reduce

class Patient(object):
    def __init__(self, tables, pat_id, time_col):
        self.id = pat_id
        self.time_col = time_col   
        self.tables = tables  
        
    def get_tables_name(self):
        table_names = []
        for table in self.tables:
            table_names.append(table.name)
        return table_names

        
    def create_timed_data(self, prefix, missing_word, replace_numbers, descriptive, merge_tables_text=True):
        for table in self.tables:
            table.create_text(prefix, missing_word, replace_numbers, descriptive)
            
        timed_data = reduce(lambda t1, t2: merge_text(t1, t2, self.time_col), self.tables).df
        
        if merge_tables_text:
            timed_data[EMB_COL] = create_embeddings(timed_data)
        
        else:
            for table in self.tables:
                table.create_embeddings()

            emb_data = reduce(lambda t1, t2: merge_emb(t1, t2, self.time_col), self.tables).df
            timed_data[EMB_COL + "_per_table"] = emb_data[EMB_COL]
        self.timed_data = timed_data
        return timed_data
    

    def get_timebounded_embeddings(self, weight_fn, start_hr = None, end_hr = None, merge_tables_text=True):
        timebounded_df = self.timed_data.copy()
        
        if merge_tables_text:
            timebounded_df = timebounded_df[[EMB_COL, self.time_col]]
        else:
            timebounded_df = timebounded_df[[EMB_COL + "_per_table", self.time_col]]
            
        if start_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col]>= start_hr]
        if end_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col]<= end_hr]

        timebounded_df[WEIGHT_COL] = weight_fn(timebounded_df[self.time_col])
        return timebounded_df    

# Creating Patient Objects

In [340]:
def get_patients(tables_info, id_col, time_col):
    unique_ids = get_unique_ids(tables_info, id_col)
    patients = []
    for pat_id in unique_ids:
        tables = create_patient_tables(tables_info, pat_id, id_col, time_col)
        patient = Patient(tables, pat_id, time_col)
        patients.append(patient)
    return patients
    
def create_columns(attributes_info):
    columns = []
    for col_name in attributes_info:
        col_attribute = attributes_info[col_name]["attribute"]
        col_verb = attributes_info[col_name]["column_verb"]
        col_type = attributes_info[col_name]["column_type"]
        if col_type == "binary":
            col_neg_verb = attributes_info[col_name]["column_neg_verb"]
            column = Binary_Column(col_name, col_attribute, col_verb, col_neg_verb)
        elif col_type == "categorical":
            column = Categorical_Column(col_name, col_attribute, col_verb)
        else:
            avg = attributes_info[col_name]["avg"]
            sd = attributes_info[col_name]["sd"]
            column = Numerical_Column(col_name, col_attribute, col_verb, avg, sd)
        columns.append(column)
    return columns

def create_patient_tables(tables_info, pat_id, id_col, time_col):
    pat_tables = []
    for i in range(len(tables_info)):
        
        table_df = tables_info[i]["df"]
        table_name = tables_info[i]["name"]
        attributes_info = tables_info[i]["attributes_info"]
        columns = create_columns(attributes_info)
        metadata = tables_info[i]["metadata"]
        
        if pat_id in table_df[id_col].unique():
            pat_table_df = table_df[table_df[id_col]== pat_id]
            if time_col not in table_df.columns:
                table = Table(table_name, pat_table_df, columns, metadata, None)
            else:
                table = Table(table_name, pat_table_df, columns, metadata, time_col)
            pat_tables.append(table)
    return pat_tables

def get_unique_ids(tables_info, id_key):
    unique_ids = set()
    for i in range(len(tables_info)):
        table_df = tables_info[i]["df"]
        table_ids = table_df[id_key].unique()
        unique_ids.update(table_ids)
    return unique_ids

# Utils

In [392]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

biobert_path = '/home/gridsan/kimvc/haim_shared/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'
biobert_tokenizer = AutoTokenizer.from_pretrained(biobert_path)
biobert_model = AutoModel.from_pretrained(biobert_path)

def get_biobert_embeddings(text):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   embeddings -> Final Biobert embeddings with vector dimensionality = (1,768)
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (token_size,768)

    # %% EXAMPLE OF USE
    # embeddings, hidden_embeddings = get_biobert_embeddings(text)

    tokens_pt = biobert_tokenizer(text, return_tensors="pt")
    outputs = biobert_model(**tokens_pt)
    last_hidden_state = outputs.last_hidden_state
    pooler_output = outputs.pooler_output
    hidden_embeddings = last_hidden_state.detach().numpy()
    embeddings = pooler_output.detach().numpy()

    return embeddings, hidden_embeddings

def create_embeddings(df):
    embeddings = []

    for i in range(df.shape[0]):
        text = df.iloc[i][TEXT_COL]
        full_embedding = get_biobert_embeddings(text)[0]
        embeddings.append(full_embedding.reshape(-1))

    return embeddings

def create_time_weights(timestamps):
    #TO-DO
    n = len(timestamps)
    return [1/n for i in range(n)]

def merge_text(table1, table2, time_col):
    new_df = pd.DataFrame()
    new_time_col = time_col
    if table1.is_static() and table2.is_static():
        df = table1.df.copy()
        df[TEXT_COL] = table1.df[TEXT_COL] + table2.df[TEXT_COL]
        new_df = df[TEXT_COL]
        new_time_col = None
    elif table1.is_static():
        df = table2.df.copy()
        df[TEXT_COL] = table1.df.iloc[0][TEXT_COL] + table2.df[TEXT_COL]
        new_df = df[[time_col, TEXT_COL]]
    elif table2.is_static():
        df = table1.df.copy()
        df[TEXT_COL] = table1.df[TEXT_COL] + table2.df.iloc[0][TEXT_COL]
        new_df = df[[time_col, TEXT_COL]]
    else:
        df = table1.df.copy()
        df = df.merge(table2.df, how="outer", on=time_col)
        print()
        df = df.fillna("")
        df[TEXT_COL] = df[TEXT_COL + "_x"] + df[TEXT_COL + "_y"]
        new_df = df[[time_col, TEXT_COL]]
    table = Table("Merged Table", new_df, table1.columns + table2.columns, "", new_time_col)
    return table

    
def merge_emb(table1, table2, time_col):
    new_df = pd.DataFrame()
    new_time_col = time_col
    if table1.is_static() and table2.is_static():
        df = table1.df.copy()
        df[EMB_COL] = np.concatenate((df1[EMB_COL][0] ,  df2[EMB_COL][0]))
        new_df = df[EMB_COL]
        new_time_col = None
    elif table1.is_static():
        df = table2.df.copy()
        df[EMB_COL] = df[EMB_COL].apply(lambda x: np.concatenate((table1.df.iloc[0][EMB_COL], x)))
        new_df = df[[time_col, EMB_COL]]
    elif table2.is_static():
        df = table1.df.copy()
        df[EMB_COL] = df[EMB_COL].apply(lambda x: np.concatenate((x, table2.df.iloc[0][EMB_COL])))
        new_df = df[[time_col, EMB_COL]]
    else:
        df = table1.df.copy()
        df = df.merge(table2.df, how="outer", on=time_col)
        df[EMB_COL + "_x"] = df[EMB_COL + "_x"].apply(lambda d: d if isinstance(d, np.ndarray) else [])
        df[EMB_COL + "_y"] = df[EMB_COL + "_y"].apply(lambda d: d if isinstance(d, np.ndarray) else [])
        df[EMB_COL] = [np.concatenate((df[EMB_COL + "_x"][i], df[EMB_COL + "_y"][i])) for i in range(len(df))]
        new_df = df[[time_col, EMB_COL]]
    table = Table("Merged Table", new_df, table1.columns + table2.columns, "", new_time_col)
    return table
    
def get_attributes_info(df, info_file_path):
    attributes_info = {}
    info_file = pd.read_csv(info_file_path)
    for i in range(info_file.shape[0]):
        col_name, attribute, verb, neg_verb, col_type = info_file.iloc[i]
        sd, avg = None, None
        if col_type == "numerical":
            col_values = df[[col_name]].astype(np.float)
            col_values = col_values[col_name][pd.notnull(col_values[col_name])]
            avg = col_values.mean()
            sd = col_values.std()
        attributes_info[col_name] = {"attribute": attribute,
                                    "column_verb": verb,
                                    "column_neg_verb": neg_verb,
                                    "column_type": col_type,
                                    "avg": avg,
                                    "sd": sd}
    return attributes_info
                                            

Some weights of the model checkpoint at /home/gridsan/kimvc/haim_shared/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [273]:
def fix_RASS_LAST(rass_col):
    new_rass = []
    for i in range(len(rass_col)):
        if str(rass_col[i]).lower() in ["nan", "none", ""]:
            new_rass.append(str(rass_col[i]))
        else:
            new_rass.append(rass_col[i].split(">")[1])
    return new_rass

def fix_ipa_ambulating(ambulating_col):
    new_amb = []
    amb_map = {0:"not difficult", 1:"moderately difficult", 2:"cannot ambulate independentely"}
    for i in range(len(ambulating_col)):
        if str(ambulating_col[i]).lower() in ["nan", "none", ""]:
            new_amb.append(str(ambulating_col[i]))
        else:
            new_amb.append(amb_map[float(ambulating_col[i])])
    return new_amb
        

In [10]:
import datetime
from datetime import timedelta

In [359]:
event_df = pd.read_csv("../allData_downloaded/adtEventData.csv", "|")
clin_df = pd.read_csv("../allData_downloaded/clindocData.csv", "|")
enc_df = pd.read_csv("../allData_downloaded/encounterData.csv", "|")

event_df = event_df[(event_df['PAT_ENC_CSN_ID'] == 100087860068) | (event_df['PAT_ENC_CSN_ID'] ==100083064488)]
clin_df = clin_df[(clin_df['PAT_ENC_CSN_ID'] == 100087860068) | (clin_df['PAT_ENC_CSN_ID'] ==100083064488)]
enc_df = enc_df[(enc_df['PAT_ENC_CSN_ID'] == 100087860068) | (enc_df['PAT_ENC_CSN_ID'] ==100083064488)]

event_df['time'] = pd.to_datetime(event_df['EFFECTIVE_DTTM'], infer_datetime_format=True)#.dt.date 
clin_df['time'] = pd.to_datetime(clin_df['CALENDAR_DT'], infer_datetime_format=True)#.dt.date

clin_df["RASS_LAST"] = fix_RASS_LAST(clin_df["RASS_LAST"].values)
clin_df["IPA_DIFFICULTY_AMBULATING"] = fix_ipa_ambulating(clin_df["IPA_DIFFICULTY_AMBULATING"].values)

events = {}
events["df"] = event_df
events["name"] = "ADT_Events"
events["attributes_info"] = get_attributes_info(event_df, "adtEventDataColumnsInfo.csv")
events["metadata"] = "The following is the information for admission, discharge and transfer events. "

clinical = {}
clinical["df"] = clin_df
clinical["name"] = "Clinical_Documents"
clinical["attributes_info"] = get_attributes_info(clin_df, "clinDocDataColumnsInfo.csv")
clinical["metadata"] = "The following is the clinical information. "

encounter = {}
encounter["df"] = enc_df
encounter["name"] = "Encounter_Information"
encounter["attributes_info"] = get_attributes_info(enc_df, "encounterDataColumnsInfo.csv")
encounter["metadata"] = "The following is the encounter information. "


tables_info = [encounter, events, clinical]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [428]:
patients = get_patients(tables_info, 'PAT_ENC_CSN_ID', 'time')

In [429]:
first_patient = patients[0]

In [435]:
t = first_patient.create_timed_data("the patient ", "missing", False, True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[TEXT_COL] = text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[EMB_COL] = embeddings





In [436]:
first_patient.get_timebounded_embeddings(create_time_weights, start_hr = t.iloc[2]["time"], end_hr = t.iloc[5]["time"], merge_tables_text=False)

Unnamed: 0,embeddings_per_table,time,weight
2,"[-0.18052378296852112, -0.15505649149417877, 0...",2020-01-07 19:43:00,0.5
5,"[-0.12036321312189102, -0.11608690768480301, 0...",2020-01-08 00:00:00,0.5
