# Column Class

In [7]:
class Column(object):
    """Base description of a table column that can be rendered as text.

    Subclasses supply ``create_descriptive_sentence`` and
    ``create_basic_sentence``; ``create_sentence`` chooses between them.
    """

    def __init__(self, name, attribute, col_type, verb):
        """Store the column description.

        Args:
            name: Identifier of the column.
            attribute: Wording for the column, used when building sentences.
            col_type: Kind of column; the ``is_*`` helpers compare it with
                "binary", "categorical" and "numerical".
            verb: Verb used when building sentences.
        """
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb

    def is_binary(self):
        """Return True when the column type is "binary"."""
        return self.type == "binary"

    def is_categorical(self):
        """Return True when the column type is "categorical"."""
        return self.type == "categorical"

    def is_numerical(self):
        """Return True when the column type is "numerical"."""
        return self.type == "numerical"

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Render ``value`` as a descriptive sentence; subclasses override."""
        raise NotImplementedError(
            "subclasses must implement create_descriptive_sentence"
        )

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Render ``value`` as a basic sentence; subclasses override."""
        raise NotImplementedError(
            "subclasses must implement create_basic_sentence"
        )

    def create_sentence(self, value, prefix, missing_word, replace_numbers, descriptive):
        """Render ``value`` using the descriptive or the basic form.

        Args:
            value: Cell value to render.
            prefix: Text passed through to the sentence builder.
            missing_word: Text passed through for missing values.
            replace_numbers: Flag passed through to the sentence builder.
            descriptive: When truthy use ``create_descriptive_sentence``,
                otherwise ``create_basic_sentence``.

        Returns:
            Whatever the selected sentence builder returns.
        """
        # Dispatch to the method names the subclasses actually define.
        if descriptive:
            return self.create_descriptive_sentence(
                value, prefix, missing_word, replace_numbers
            )
        return self.create_basic_sentence(
            value, prefix, missing_word, replace_numbers
        )
        
        
class Binary_Column(Column):
    """Column holding a yes/no flag, with a verb for each outcome."""

    def __init__(self, name, attribute, verb, neg_verb):
        """Store the description of a binary column.

        Args:
            name: Identifier of the column.
            attribute: Wording for the column, used when building sentences.
            verb: Verb used when the flag is set.
            neg_verb: Verb used when the flag is not set.
        """
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb)

    @staticmethod
    def _parse_flag(value):
        """Return True/False for a recognised flag value, otherwise None.

        Recognised (case-insensitively, via ``str(value)``): "1"/"true" and
        "0"/"false". Parsing the text directly avoids ``int("true")``, which
        raises ValueError.
        """
        # NOTE(review): floats such as 1.0 stringify to "1.0" and are treated
        # as unrecognised, as before — confirm whether that is intended.
        token = str(value).lower()
        if token in ("1", "true"):
            return True
        if token in ("0", "false"):
            return False
        return None

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return ``prefix + verb-or-neg_verb + " " + attribute`` for a flag.

        ``missing_word`` and ``replace_numbers`` are accepted but not used
        here; an unrecognised value yields an empty string.
        """
        flag = self._parse_flag(value)
        if flag is None:
            return ""
        chosen_verb = self.verb if flag else self.neg_verb
        return prefix + chosen_verb + " " + self.attribute

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<verb> <attribute>: yes" or "<neg_verb> <attribute>: no".

        For an unrecognised value the result is
        "<verb> <attribute>: <missing_word>" when ``missing_word`` is not
        empty, otherwise an empty string. ``prefix`` and ``replace_numbers``
        are accepted but not used here.
        """
        flag = self._parse_flag(value)
        if flag is True:
            return self.verb + " " + self.attribute + ": yes"
        if flag is False:
            return self.neg_verb + " " + self.attribute + ": no"
        if missing_word != "":
            return self.verb + " " + self.attribute + ": " + missing_word
        return ""
        
class Categorical_Column(Column):
    """Column whose values are rendered verbatim as category labels."""

    def __init__(self, name, attribute, verb):
        """Register the column with the fixed type "categorical"."""
        super().__init__(name, attribute, "categorical", verb)

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<prefix><attribute> <verb> <value>".

        A value whose lower-cased string form is "nan", "", "none" or
        "missing" is replaced by ``missing_word``; if ``missing_word`` is
        empty as well, the result is an empty string.
        ``replace_numbers`` is accepted but not used here.
        """
        is_missing = str(value).lower() in ["nan", "", "none", "missing"]
        if is_missing and missing_word == "":
            return ""
        shown = missing_word if is_missing else str(value)
        return prefix + " ".join([self.attribute, self.verb, shown])

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<attribute>: <value>".

        Missing values (same markers as the descriptive form) are rendered
        with ``missing_word``, or as an empty string when it is empty.
        ``prefix`` and ``replace_numbers`` are accepted but not used here.
        """
        is_missing = str(value).lower() in ["nan", "", "none", "missing"]
        if not is_missing:
            return self.attribute + ": " + str(value)
        if missing_word != "":
            return self.attribute + ": " + missing_word
        return ""
    
class Numerical_Column(Column):
    """Column of numbers, optionally rendered as coarse level labels."""

    def __init__(self, name, attribute, verb, avg, sd):
        """Store the description of a numerical column.

        Args:
            name: Identifier of the column.
            attribute: Wording for the column, used when building sentences.
            verb: Verb used when building descriptive sentences.
            avg: Centre used by ``encode_number`` to bucket values.
            sd: Spread used by ``encode_number`` to bucket values.
        """
        self.avg = avg
        self.sd = sd
        super().__init__(name, attribute, "numerical", verb)

    @staticmethod
    def _is_missing(value):
        """Return True when ``value`` stringifies to a missing marker."""
        return str(value).lower() in ["nan", "", "none", "missing"]

    def create_descriptive_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<prefix><attribute> <verb> <value or label>".

        Missing values are rendered with ``missing_word``; if that is empty
        the result is an empty string.
        """
        if not self._is_missing(value):
            col_value = self.encode_number(value, replace_numbers)
            return prefix + self.attribute + " " + self.verb + " " + str(col_value)
        if missing_word != "":
            return prefix + self.attribute + " " + self.verb + " " + missing_word
        return ""

    def create_basic_sentence(self, value, prefix, missing_word, replace_numbers):
        """Return "<attribute>: <value or label>".

        Missing values are rendered with ``missing_word``; if that is empty
        the result is an empty string. ``prefix`` is not used here.
        """
        if not self._is_missing(value):
            col_value = self.encode_number(value, replace_numbers)
            return self.attribute + ": " + str(col_value)
        if missing_word != "":
            return self.attribute + ": " + missing_word
        return ""

    def encode_number(self, value, replace_numbers):
        """Return ``value`` unchanged, or a level label for it.

        Args:
            value: Number to encode; a numeric string is also accepted.
            replace_numbers: When falsy, ``value`` is returned as is.

        Returns:
            ``value`` itself, or one of "very low", "low", "normal", "high",
            "very high" depending on where it falls relative to
            ``avg ± sd`` and ``avg ± 2*sd``.

        Raises:
            ValueError: If ``value`` is a string that is not a number.
        """
        if not replace_numbers:
            return value

        # The callers let non-missing strings through, and comparing a str
        # with a float raises TypeError, so convert text to a number first.
        number = float(value) if isinstance(value, str) else value

        if number < self.avg - 2 * self.sd:
            return "very low"
        if number < self.avg - self.sd:
            return "low"
        if number > self.avg + 2 * self.sd:
            return "very high"
        if number > self.avg + self.sd:
            return "high"
        return "normal"

## Column Testing

In [8]:
# Sample binary column used by the checks below.
bin_col = Binary_Column(
    name="hisp",
    attribute="hispanic",
    verb="is",
    neg_verb="is not",
)

In [10]:
# Descriptive rendering of a set flag.
bin_col.create_descriptive_sentence(
    value="1",
    prefix="the patient ",
    missing_word="is missing",
    replace_numbers=True,
)

'the patient is hispanic'

In [108]:
# Sample categorical column.
cat_col = Categorical_Column(name="nat", attribute="nationality", verb="is")

In [109]:
# Sample numerical column with avg 10 and sd 3.
num_col = Numerical_Column(
    name="tmp",
    attribute="temperature",
    verb="is",
    avg=10,
    sd=3,
)

# Tabular Class

In [2]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

#One table per patient per tabular data structure
class Table(object):
    """One patient's rows from one tabular data structure.

    Wraps a DataFrame together with the column descriptions used to turn
    each row into text.
    """

    def __init__(self, name, df, columns, metadata, time_col):
        """Store the table and its description.

        Args:
            name: Name of the table.
            df: DataFrame holding the rows; it is modified in place by
                ``create_weighted_text`` and ``create_weighted_embeddings``.
            columns: Iterable of column descriptions; each must expose
                ``name`` (a label in ``df``) and ``create_sentence``.
            metadata: Text placed at the start of every row's text.
            time_col: Label of the column in ``df`` used for time filtering.
        """
        self.name = name
        self.headers = df.columns
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.time_col = time_col

    def create_weighted_text(self, prefix, missing_word, replace_numbers, descriptive):
        """Build one text per row and store it in ``self.df["text"]``.

        Each text is ``metadata`` followed by every column's sentence, each
        sentence followed by ", ". The arguments are passed unchanged to
        each column's ``create_sentence``.
        """
        text = []
        for row_pos in range(self.df.shape[0]):
            text_i = self.metadata

            for column in self.columns:
                # Select the column, then the row by position. Indexing the
                # frame with `df[row, col]` is read as one tuple column label
                # and raises KeyError.
                value = self.df[column.name].iloc[row_pos]
                text_i += column.create_sentence(value, prefix, missing_word, replace_numbers, descriptive) + ", "

            text.append(text_i)

        self.df["text"] = text

    def create_weighted_embeddings(self):
        """Embed every row's text and store it in ``self.df["embeddings"]``.

        Reads ``self.df["text"]``, so ``create_weighted_text`` must have been
        called first. Each entry is the first item returned by
        ``get_biobert_embeddings`` flattened to one dimension.
        """
        embeddings = []

        # The row texts live in the "text" column written by
        # create_weighted_text; there is no separate weighted_text attribute.
        for text in self.df["text"]:
            full_embedding = get_biobert_embeddings(text)[0]
            embeddings.append(full_embedding.reshape(-1))

        self.df["embeddings"] = embeddings

    def create_timebounded_embeddings(self, start_hr, end_hr):
        """Return the rows inside a time window with a "weights" column.

        Args:
            start_hr: Inclusive lower bound on ``time_col``; None for no
                lower bound.
            end_hr: Inclusive upper bound on ``time_col``; None for no upper
                bound.

        Returns:
            A new DataFrame with the selected rows and a "weights" column
            produced by ``create_time_weights``. ``self.df`` is not modified.
        """
        timebounded_df = self.df

        if start_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col] >= start_hr]
        if end_hr is not None:
            timebounded_df = timebounded_df[timebounded_df[self.time_col] <= end_hr]

        # Work on a copy so the new column is not written into a slice of
        # (or, when no bound is given, directly into) self.df.
        timebounded_df = timebounded_df.copy()
        timebounded_df["weights"] = create_time_weights(timebounded_df[self.time_col])

        return timebounded_df
    
    

# Patient Class

In [None]:
class Patient(object):
    """A patient's tables, each exposed as an attribute named after it."""

    def __init__(self, tables, pat_id, time_col):
        """Attach every table to the patient.

        Args:
            tables: Iterable of objects with a ``name`` attribute; each is
                stored on the instance under that name.
            pat_id: Patient identifier, stored as ``self.id``.
            time_col: Accepted but currently not used.
        """
        self.id = pat_id

        # NOTE(review): a table named "id" would overwrite the identifier.
        for table in tables:
            setattr(self, table.name, table)

    def create_embeddings(self, start_hr=None, end_hr=None, single_embedding=True):
        """Placeholder: not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        # The method had no body, which is a syntax error; fail explicitly
        # until the implementation is written.
        raise NotImplementedError("Patient.create_embeddings is not implemented yet")
        
        
    
    
        

In [None]:
def create_tables():
    """Placeholder: not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    #TO_DO
    # A comment alone is not a function body; fail explicitly until written.
    raise NotImplementedError("create_tables is not implemented yet")
    
def get_columns(attributes_map, verb_map, type_map):
    """Placeholder: not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    #TO-DO
    # A comment alone is not a function body; fail explicitly until written.
    raise NotImplementedError("get_columns is not implemented yet")

# Utils

In [None]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging
import torch
import numpy as np

def get_biobert_embeddings(text):
    """Embed ``text`` with ``biobert_tokenizer`` and ``biobert_model``.

    Both names are looked up at module level; they are not defined in this
    function.

    Args:
        text: Input text (str).

    Returns:
        A tuple ``(embeddings, hidden_embeddings)`` of NumPy arrays.
        ``embeddings`` is the model's pooler output, described by the
        original author as having dimensionality (1, 768).
        ``hidden_embeddings`` is the last hidden state, described as
        (token_size, 768) — TODO confirm whether a leading batch axis is
        present.

    Example:
        embeddings, hidden_embeddings = get_biobert_embeddings(text)
    """
    # NOTE(review): no truncation is requested, so very long texts may exceed
    # the model's input limit — confirm against the tokenizer configuration.
    tokens_pt = biobert_tokenizer(text, return_tensors="pt")

    # Inference only: do not record operations for autograd.
    with torch.no_grad():
        outputs = biobert_model(**tokens_pt)

    hidden_embeddings = outputs.last_hidden_state.detach().numpy()
    embeddings = outputs.pooler_output.detach().numpy()

    return embeddings, hidden_embeddings

def create_time_weights(timestamps):
    """Return one equal weight per timestamp, each being 1 / count.

    An empty input yields an empty list.
    """
    #TO-DO
    count = len(timestamps)
    if count == 0:
        return []
    return [1 / count] * count