In [1]:
from datetime import datetime
import json
import os

import scipy
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
import pandas as pd

import pysnooper

import torch
from torch.utils.data import DataLoader
import torch.nn as nn

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
LOCATION = "../new_pressure_injury_data" 

lab_data_df = pd.read_csv(os.path.join(LOCATION, "LAB_DATA.csv"))

path = os.path.join(LOCATION, "new_indices_of_positive.json")
with open(path) as f:
   pos_locs = [tuple(x) for x in json.load(f)]

In [4]:
pos_indices = lab_data_df.DE_ID_PAT_ID.isin((a for a,b in pos_locs))

In [5]:
len(lab_data_df["DE_ID_PAT_ID"].drop_duplicates()), len(lab_data_df[["DE_ID_PAT_ID", "ENCOUNTER_COUNTER"]].drop_duplicates())

(80257, 129398)

In [7]:
lab_data_df["VALUE_FLOAT"] = pd.to_numeric(lab_data_df['VALUE'], errors='coerce')

# drop encounters with more than two tests
locs_with_more_than_two_tests = lab_data_df \
    .groupby(['DE_ID_PAT_ID', 'ENCOUNTER_COUNTER']) \
    .COMPONENT.count() \
    .to_frame().rename(columns={"COMPONENT": "NUM_TESTS"}) \
    .query("NUM_TESTS > 2").index

lab_data_df_processed = lab_data_df \
    .set_index(['DE_ID_PAT_ID', 'ENCOUNTER_COUNTER']) \
    .loc[locs_with_more_than_two_tests] \
    .reset_index() \
    \
    .set_index(['DE_ID_PAT_ID', 'ENCOUNTER_COUNTER','RESULT_DAY', 'COMPONENT'])[["VALUE_FLOAT"]] \
    .dropna(axis='rows') \
    .query("COMPONENT in ['HGB', 'ALBUMIN', 'HEMOGLOBIN A1C', 'PREALBUMIN']") \
    .sort_values('RESULT_DAY') \
    .groupby(['DE_ID_PAT_ID', 'ENCOUNTER_COUNTER','RESULT_DAY', 'COMPONENT']).VALUE_FLOAT.agg("mean") \
    .unstack()
# lab_data_df_processed

In [8]:
def reindex_with_more_NaN(df, first_day, last_day):
    """
    Reindex with days from (frist_day) to (last_day)
    """
    indices = df.index.droplevel("RESULT_DAY").drop_duplicates()
    day_range = pd.RangeIndex(first_day, last_day + 1)
    return df.reindex(
        pd.DataFrame(None, index=indices, columns=day_range) \
            .reset_index() \
            .melt(id_vars=["DE_ID_PAT_ID", "ENCOUNTER_COUNTER"], value_vars=day_range, var_name='RESULT_DAY') \
            .set_index(["DE_ID_PAT_ID", "ENCOUNTER_COUNTER", "RESULT_DAY"]) \
            .sort_index(level = 0).index
     )

first_day=1
last_day=10

X = reindex_with_more_NaN(lab_data_df_processed, first_day, last_day)  # lab_data_df_processed_reindexed

### Train test split

In [9]:
def to_numpy_array(df, first_day, last_day):
    return np.array_split(
        df.values,
        len(df)/(last_day-first_day+1)
    )

def split_and_impute(df, pos_locs, imputer, scaler=None):
    X_train_IDs, X_test_IDs = train_test_split(df.index.get_level_values(0).drop_duplicates())

    X_train_df = df.loc[X_train_IDs]
    X_test_df = df.loc[X_test_IDs]

    y_train = X_train_df.index.droplevel(2) \
        .drop_duplicates() \
        .isin(pos_locs) \
        .astype(np.int32)

    y_test = X_test_df.index.droplevel(2) \
        .drop_duplicates() \
        .isin(pos_locs) \
        .astype(np.int32)

    # impute
    imputer.fit(X_train_df)

    X_train_df = pd.DataFrame(imputer.transform(X_train_df)) \
        .set_index(X_train_df.index)
    X_test_df = pd.DataFrame(imputer.transform(X_test_df)) \
        .set_index(X_test_df.index)

    # scale
    if scaler is not None:
        X_train_df = pd.DataFrame(scaler.fit_transform(X_train_df))
        X_train_df = pd.DataFrame(scaler.transform(X_test_df))

    X_train = to_numpy_array(X_train_df, first_day, last_day)
    X_test = to_numpy_array(X_test_df, first_day, last_day)
    
    
    return (
        *(torch.tensor(arr, dtype=torch.float32, requires_grad=True) for arr in [X_train, X_test]),
        *(torch.tensor(arr, dtype=torch.float32, requires_grad=False) for arr in [y_train, y_test])
    )

# X_train, X_test, y_train, y_test = split_and_impute(
#     X,
#     pos_locs,
#     SimpleImputer(missing_values=np.nan, strategy='mean')
# )

# (X_train.shape, sum(y_train)), (X_test.shape, sum(y_test))

### Creating fake data

In [10]:
# pd.MultiIndex.from_tuples(pos_locs) \
def fake_locs(pos_locs):
    """this just adds 1s to each row in the index"""
    if isinstance(pos_locs, pd.MultiIndex):
        AAA = pos_locs.to_frame()
    else:
        AAA = pd.DataFrame(pos_locs)
    AAA[2] = [1]*len(pos_locs)
    return pd.MultiIndex.from_frame(AAA)

new_locs = X.index.droplevel("RESULT_DAY").drop_duplicates().intersection(pos_locs)
new_X = X*0

# reduced size of dataset...
trimmed_locs = new_X.index.droplevel("RESULT_DAY").drop_duplicates()[:1000].union(new_locs)
hope = pd.DataFrame([None]*len(trimmed_locs), index=fake_locs(trimmed_locs)) \
    .reset_index() \
    .rename(columns={2: "RESULT_DAY"}) \
    .set_index(["DE_ID_PAT_ID", "ENCOUNTER_COUNTER", "RESULT_DAY"])
new_X = new_X.loc[reindex_with_more_NaN(hope, 1, 10).index]


new_X.at[fake_locs(new_locs)] = 100
print(new_X.value_counts())

X_train, X_test, y_train, y_test = split_and_impute(
    new_X,
    new_locs,
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    preprocessing.MinMaxScaler()
)

ALBUMIN  HEMOGLOBIN A1C  HGB    PREALBUMIN
100.0    100.0           100.0  100.0         244
0.0      0.0             0.0    0.0             2
dtype: int64


  *(torch.tensor(arr, dtype=torch.float32, requires_grad=True) for arr in [X_train, X_test]),


In [12]:
train_dataloader = DataLoader(list(zip(X_train, y_train.unsqueeze(1))), batch_size=32, shuffle=True)
test_dataloader = DataLoader(list(zip(X_test, y_test.unsqueeze(1))), batch_size=32, shuffle=True)

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' # ?'cuda:2'
# !nvidia-smi

In [36]:
# class GRUModel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob=0):
#         super(GRUModel, self).__init__()

#         # Defining the number of layers and the nodes in each layer
#         self.layer_dim = layer_dim
#         self.hidden_dim = hidden_dim

#         # GRU layers
#         self.gru = nn.GRU(
#             input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
#         )

#         # Fully connected layer
#         self.fc = nn.Linear(hidden_dim, output_dim)

#     def forward(self, x):
#         # Initializing hidden state for first input with zeros
#         h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)
#         # Forward propagation by passing in the input and hidden state into the model
#         out, _ = self.gru(x, h0.detach())
#         # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
#         # so that it can fit into the fully connected layer
#         out = out[:, -1, :]
#         # Convert the final state to our desired output shape (batch_size, output_dim)
#         out = self.fc(out)
#         return out
class LinearModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob=0):
        super(LinearModel, self).__init__()

        # Defining the number of layers and the nodes in each layer
        self.layer_dim = layer_dim
        self.hidden_dim = hidden_dim

        

        self.l0 = nn.Linear(4, 1)
        self.l1 = nn.Linear(10, 1)

        # Fully connected layer
        # self.fc = nn.Linear(32, output_dim)
        
        #  Sigmoid ?
        self.sig = nn.Sigmoid()
        
    def forward(self, x):
        # Initializing hidden state for first input with zeros
        # h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)
        # Forward propagation by passing in the input and hidden state into the model
        # out, _ = self.gru(x, h0.detach())
        # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
        # so that it can fit into the fully connected layer
        # print(out.size())
        # out = out[:, -1, :]
        # Convert the final state to our desired output shape (batch_size, output_dim)
        # print(out.size())
        
        ##
        # print("x.size() ->", x.size())
        # take last from
        # out = x[:, -1, :]
        
        # print("selec", out.size())
        
        out = self.l0(x).squeeze()
        # print("after l0 ->", out.size())
        
        out = self.l1(out)
        # print("after l1 ->", out.size())

        # out = self.fc(out)
        # print("after fc ->", out.size())
        
        out = self.sig(out)
        # print("after sig ->",out.size())
        
        return out
    
    
class Optimization:
    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []
    
    # @pysnooper.snoop()
    # @pysnooper.snoop(watch=('self.model.state_dict()'))
    def train_step(self, x, y):
        # Sets model to train mode
        self.model.train()

        # Makes predictions
        yhat = self.model(x)

        # Computes loss
        # print("before", self.model.state_dict())
        loss = self.loss_fn(y, yhat).squeeze()
        # Computes gradients
        loss.backward()
        
        # Updates parameters and zeroes gradients
        self.optimizer.step()
        
        self.optimizer.zero_grad()
        
        # Returns the loss
        return loss.item()
    
    # @pysnooper.snoop()
    def train(self, train_loader, val_loader, batch_size=32, n_epochs=1, n_features=1):
        # model_path = f'models/{self.model}_{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'

        for epoch in range(1, n_epochs + 1):
            batch_losses = []
            # print("batches(x100)->", end="")
            for batches, (x_batch, y_batch) in enumerate(train_loader):
                
                # x_batch = x_batch.view([batch_size, -1, n_features]).to(device)
                x_batch = x_batch.to(device)

                # print("??", x_batch.shape)
                
                y_batch = y_batch.to(device)
                loss = self.train_step(x_batch, y_batch)
                
                batch_losses.append(loss)

                if batches>0 and batches % 200 == 0:
                    print(f">[batch[{batches}]] Batch loss: {loss}")
                
            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)
            # print("<-done")
            
            with torch.no_grad():
                batch_val_losses = []
                for x_val, y_val in val_loader:
                    # x_val = x_val.view([batch_size, -1, n_features]).to(device)
                    x_val = x_val.to(device)
                    y_val = y_val.to(device)
                    self.model.eval()
                    yhat = self.model(x_val)
                    val_loss = self.loss_fn(y_val, yhat).item()
                    batch_val_losses.append(val_loss)
                validation_loss = np.mean(batch_val_losses)
                self.val_losses.append(validation_loss)

            if (epoch <= 10) | (epoch % 50 == 0):
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

        # torch.save(self.model.state_dict(), model_path)

    def evaluate(self, test_loader, batch_size=1, n_features=1):
        with torch.no_grad():
            predictions = []
            values = []
            for x_test, y_test in test_loader:
                # x_test = x_test.view([batch_size, -1, n_features]).to(device)
    
                x_test = x_test.to(device)
                y_test = y_test.to(device)
                self.model.eval()
                yhat = self.model(x_test)
                predictions.append(yhat.to(device).detach().cpu().numpy())
                values.append(y_test.to(device).detach().cpu().numpy())

        return predictions, values

In [37]:
bidirectional = 1
input_size = X_train.shape[2]
H_in = 16  # size of hidden state
H_out = 1
num_layers = 1

model = nn.Sequential(
          LinearModel(input_size, H_in, num_layers, H_out),
        ).to(device)

print(model)

#BCEWithLogitsLoss no
# weight= torch.tensor()[.5], dtype=torch.float).to(device)
opt = Optimization(
    LinearModel(input_size, H_in, num_layers, H_out).to(device),
    nn.BCELoss(),#weight),
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
)
print(opt.model.state_dict())
opt.train(train_dataloader, test_dataloader, n_epochs=300)

Sequential(
  (0): LinearModel(
    (l0): Linear(in_features=4, out_features=1, bias=True)
    (l1): Linear(in_features=10, out_features=1, bias=True)
    (sig): Sigmoid()
  )
)
OrderedDict([('l0.weight', tensor([[-0.1348,  0.2135, -0.1253,  0.0395]], device='cuda:0')), ('l0.bias', tensor([0.3954], device='cuda:0')), ('l1.weight', tensor([[ 0.2106, -0.1290,  0.2045, -0.2302,  0.2786,  0.2670,  0.2277, -0.0564,
          0.1304, -0.1761]], device='cuda:0')), ('l1.bias', tensor([0.2071], device='cuda:0'))])
[1/300] Training loss: 58.4632	 Validation loss: 78.0981
[2/300] Training loss: 58.1466	 Validation loss: 79.2114
[3/300] Training loss: 58.4648	 Validation loss: 76.9850
[4/300] Training loss: 58.7997	 Validation loss: 80.3258
[5/300] Training loss: 58.1537	 Validation loss: 79.2113
[6/300] Training loss: 57.8448	 Validation loss: 78.0997
[7/300] Training loss: 58.4689	 Validation loss: 76.9834
[8/300] Training loss: 58.4743	 Validation loss: 79.2117
[9/300] Training loss: 58.1653	 V

In [None]:
p, v = opt.evaluate(test_dataloader)

p_c = np.concatenate(p).squeeze()
v_c = np.concatenate(v).squeeze()