# Import statements and data loading

In [None]:
import pandas as pd
import numpy as np
# import torch
# import torch.optim as optim
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# import shap

# from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, mean_squared_error
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler 

# data = pd.read_csv("../clean_data/nafl/combined.large.nafl.csv")

In [None]:
# Build the feature matrix X and the regression target Y.

# 'Outcome' and 'Censored' are dropped up front; this notebook predicts
# DaysUntilFirstProgression, so that column is kept in `data` for now.
data = data.drop(columns=['Outcome', 'Censored'])

# Both frames are indexed by StudyID, so their rows share the same labels.
Y = data[['StudyID', 'DaysUntilFirstProgression']].set_index('StudyID')
X = data.drop(
    columns=['DaysUntilFirstProgression', 'mean_BMI_category', 'last_BMI_category']
).set_index('StudyID')

# Numerical features: three fixed columns plus every column whose name
# contains the substring 'Lab' (a substring match, not a prefix match).
numerical_feat = ['mean_BMI', 'last_BMI', 'FirstNAFL.Age.90']
lab_feat = [column for column in X.columns if 'Lab' in column]
numerical_feat += lab_feat

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")
print(f"Shape of X: {X.shape}. Shape of Y: {Y.shape}.")

# setup the model using saved weights

In [None]:
# curate the dataset
class MAFLDDataset(Dataset): # must contain init, len, and getitem
    """Dataset pairing feature rows with their targets as float32 tensors.

    X: features, one row per sample (list, NumPy array, Pandas DataFrame/Series or tensor)
    y: targets, same number of rows as X (same accepted types)

    Indexing returns the tuple (X[idx], y[idx]); len() is the number of rows in X.
    """

    def __init__(self, X, y):
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)

    @staticmethod
    def _as_float_tensor(values):
        """Return a float32 tensor copy of values."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct UserWarning; this is
            # the copy form that warning recommends.
            return values.clone().detach().to(torch.float32)
        if isinstance(values, (pd.DataFrame, pd.Series)):
            # torch.tensor() cannot read pandas objects directly; go through
            # NumPy. astype also handles object arrays produced by mixed
            # bool/float columns.
            values = values.to_numpy().astype(np.float32)
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# dataset = MAFLDDataset(X_torch, Y_torch)
# train_loader = DataLoader(dataset, batch_size=64, shuffle=True) # batch size 64

In [None]:
# define by subclassing nn.Module and initialize the neural network layers in __init__.
class NeuralNetwork(nn.Module):
    """Fully connected regression network with a single linear output.

    The layers are Linear -> ReLU pairs of widths 1024, 512, 128, 64 and 32,
    followed by a final Linear(32, 1) with no activation.

    input_dim: number of input features. When omitted, the width of the
        module-level feature table X is used (the original behaviour), so
        NeuralNetwork() keeps working unchanged.
    """

    HIDDEN_SIZES = (1024, 512, 128, 64, 32)

    def __init__(self, input_dim=None):
        super().__init__() # inherit init from parent class
        if input_dim is None:
            # Backward-compatible default: size the first layer from the global X.
            input_dim = X.shape[1]

        # Layers are appended in the same order as the original hand-written
        # stack, so the Sequential indices (and therefore the state-dict keys
        # linear_relu_stack.0 ... linear_relu_stack.10) are unchanged.
        layers = []
        in_features = input_dim
        for out_features in self.HIDDEN_SIZES:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            in_features = out_features
        layers.append(nn.Linear(in_features, 1)) # no activation follows this layer
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-activated) network output for the batch x."""
        return self.linear_relu_stack(x)

In [None]:
# create an instance of NeuralNetwork, move it to the device, and restore the saved weights
model = NeuralNetwork().to(device)
# map_location remaps the stored tensors onto the active device; without it a
# checkpoint saved from a GPU run fails to load when only the CPU is available.
model.load_state_dict(torch.load("numeric_nn_scaled_x_and_y.pth", map_location=device))

# define loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3) # start with this baseline learning rate

# scale the data

In [None]:
# Hold out 30% of the rows as the test split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Scaler shared by every standardize_numerical() call: it is fitted when
# training_set=True and reused as-is when training_set=False.
scaler = StandardScaler()

def standardize_numerical(dataframe, num_feat=numerical_feat, training_set=True):
    """
    Standardize the numerical columns of a feature table and leave the other columns untouched.

    dataframe: Pandas DataFrame containing every column named in num_feat
    num_feat: list of numerical column names to standardize (defaults to numerical_feat)
    training_set: if True, fit the module-level scaler on these rows and transform them;
        if False, only apply the scaler as it was previously fitted

    Returns: a DataFrame with the same index as the input. The standardized numerical
        columns come first (in num_feat order), followed by the remaining columns
        unchanged, so the column order can differ from the input's.
    """
    apply_scaler = scaler.fit_transform if training_set else scaler.transform

    numeric_part = pd.DataFrame(
        apply_scaler(dataframe[num_feat]),
        columns=num_feat,
        index=dataframe.index,
    )
    other_part = dataframe.drop(columns=num_feat)

    return pd.concat([numeric_part, other_part], axis=1)

In [None]:
# standardize our features
# Order matters: the training call fits the shared scaler, and the test call
# (training_set=False) then only applies those already-fitted statistics.
X_train_scaled = standardize_numerical(X_train, training_set=True)
X_test_scaled = standardize_numerical(X_test, training_set=False)

In [None]:
# Separate scaler for the target, fitted on the training targets only.
# The targets are reshaped into a single column before fitting.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(
    np.reshape(y_train.to_numpy(), (-1, 1))
)

# Top features from the linear and logistic regression models

In [None]:
# Feature names recorded for the linear regression model
# (presumably the most negative / most positive coefficients - TODO confirm).
neg_feat_lin_reg = [
    'MedType_Code_HCPCS_J0456',
    'MedType_Code_EPIC-MED_17380',
    'MedType_Code_EPIC-MED_26226',
    'MedType_Code_EPIC-MED_6004080010',
    'Gender_Legal_Sex_Unknown-U',
    'MedType_Code_HCPCS_J2720',
    'MedType_Code_HCPCS_J2590',
    'MedType_Code_EPIC-MED_29132',
    'MedType_Code_EPIC-MED_18302',
    'MedType_Code_EPIC-PRC_47501945',
]
pos_feat_lin_reg = [
    'MedType_Code_EPIC-PRC_47561667',
    'MedType_Code_EPIC-MED_98249',
    'MedType_Code_HCPCS_J1756',
    'MedType_Code_EPIC-MED_6677',
    'MedType_Code_HCPCS_J1453',
    'MedType_Code_HCPCS_J7507',
    'MedType_Code_EPIC-PRC_77100001',
    'MedType_Code_HCPCS_J2185',
    'MedType_Code_HCPCS_C9113',
    'MedType_Code_EPIC-MED_21063',
]

In [None]:
# Feature names recorded for the logistic regression model.
# NOTE(review): both lists are identical to the linear regression lists -
# confirm this is intended and not a copy-paste leftover.
neg_feat_log_reg = [
    'MedType_Code_HCPCS_J0456',
    'MedType_Code_EPIC-MED_17380',
    'MedType_Code_EPIC-MED_26226',
    'MedType_Code_EPIC-MED_6004080010',
    'Gender_Legal_Sex_Unknown-U',
    'MedType_Code_HCPCS_J2720',
    'MedType_Code_HCPCS_J2590',
    'MedType_Code_EPIC-MED_29132',
    'MedType_Code_EPIC-MED_18302',
    'MedType_Code_EPIC-PRC_47501945',
]
pos_feat_log_reg = [
    'MedType_Code_EPIC-PRC_47561667',
    'MedType_Code_EPIC-MED_98249',
    'MedType_Code_HCPCS_J1756',
    'MedType_Code_EPIC-MED_6677',
    'MedType_Code_HCPCS_J1453',
    'MedType_Code_HCPCS_J7507',
    'MedType_Code_EPIC-PRC_77100001',
    'MedType_Code_HCPCS_J2185',
    'MedType_Code_HCPCS_C9113',
    'MedType_Code_EPIC-MED_21063',
]

# build shap explainer

In [None]:
def model_wrapper(array):
    ''' Wrapper around the torch model() call to pass into a shap explainer.

        Runs the module-level model on the given feature rows and maps the
        predictions back through scaler_y.inverse_transform.

        array: Pandas DataFrame or NumPy array of feature rows, already in the
            form the model expects
        Returns: NumPy array of predictions after scaler_y.inverse_transform
    '''
    if isinstance(array, pd.DataFrame):
        array = array.to_numpy()
    features = torch.tensor(array.astype('float32')).to(device)
    model.eval()
    # Inference only: skip autograd bookkeeping so repeated explainer calls do
    # not build and hold a computation graph for every batch.
    with torch.no_grad():
        y_hat_scaled = model(features)
    # NumPy conversion requires a CPU tensor.
    y_pred_rescaled = scaler_y.inverse_transform(y_hat_scaled.cpu().numpy())
    return y_pred_rescaled

# Take the names from the scaled frame, not from X: standardize_numerical()
# moves the numerical columns to the front, so X.columns can be in a different
# order than the columns of the arrays handed to the SHAP explainer.
feature_names = X_train_scaled.columns

In [None]:
# DeepExplainer background data: the scaled training features as a float32 tensor on `device`.
explainer = shap.DeepExplainer(
    model,
    torch.tensor(X_train_scaled.to_numpy().astype(np.float32), device=device),
)

In [None]:
# SHAP values for the scaled test features (passed as a float32 tensor on `device`).
shap_values = explainer.shap_values(
    torch.tensor(X_test_scaled.to_numpy().astype(np.float32), device=device)
)

# getting human-readable names from the shap features

In [None]:
# Feature names to translate into human-readable descriptions
# (presumably copied from earlier SHAP output - TODO confirm).
shap_positive = [
    'Lab_19153-6',
    'Lab_2093-3',
    'MedType_Code_EPIC-MED_10328',
    'Lab_3094-0',
    'Lab_2089-1',
    'Lab_1968-7',
    'Lab_2695-5',
    'MedType_Code_EPIC-MED_10012',
    'MedType_Code_EPIC-MED_10368',
    'MedType_Code_EPIC-MED_27698',
]

shap_negative = [
    'Lab_777-3',
    'Lab_2336-6',
    'Lab_4679-7',
    'Lab_2502-3',
    'Lab_2284-8',
    'Lab_789-8',
    'Code_Z12.5',
    'MedType_Code_LMR_576',
    'MedType_Code_EPIC-MED_693',
    'Lab_XC5-9',
]

shap_top_10 = [
    'Lab_4679-7',
    'Lab_14338-8',
    'Lab_2132-9',
    'Lab_6768-6',
    'Code_Z23',
    'Lab_6690-2',
    'Lab_2093-3',
    'MedType_Code_EPIC-MED_10328',
    'Lab_13457-7',
    'Lab_2571-8',
]

In [None]:
# Set up the lookup tables used by the translate functions: one table per
# feature family, mapping the model's feature name ("Code") to its description.
#
# The code columns are read as strings. Left to type inference, numeric-looking
# codes can be parsed as numbers, and the string concatenations below would
# then fail.
med_df = pd.read_csv("/nobackup/users/ericason/mlhc-final-project/data/NAFLpatients_Jan2025request/Med_all.use.final.txt", delimiter="\t", header=0, dtype={"Code_Type": str, "Code": str})
lab_df = pd.read_csv("/nobackup/users/ericason/mlhc-final-project/data/NAFLpatients_Jan2025request/Lab_all.use.final.txt", delimiter="\t", header=0, dtype={"Loinc_Code": str})
dia_df = pd.read_csv("/nobackup/users/ericason/mlhc-final-project/data/NAFLpatients_Jan2025request/Dia_all.use.final.txt", delimiter="\t", header=0, dtype={"Code": str})

# Medications: feature names look like MedType_Code_<Code_Type>_<Code>.
med_codes = "MedType_Code_" + med_df["Code_Type"] + "_" + med_df["Code"]
med_codes_df = pd.concat([med_codes, med_df["Medication"]], axis=1)
med_codes_df.columns = ["Code", "Medication"]
med_codes_df = med_codes_df.drop_duplicates() # drop duplicate (code, medication) pairs

# Lab tests: feature names look like Lab_<Loinc_Code>.
lab_codes = "Lab_" + lab_df["Loinc_Code"]
lab_codes_df = pd.concat([lab_codes, lab_df["Test_Description"]], axis=1)
lab_codes_df.columns = ["Code", "Lab Test"]
lab_codes_df = lab_codes_df.drop_duplicates() # drop duplicate (code, lab test) pairs

# Diagnoses: feature names look like Code_<Code>.
dia_codes = "Code_" + dia_df["Code"]
dia_codes_df = pd.concat([dia_codes, dia_df["Diagnosis_Name"]], axis=1)
dia_codes_df.columns = ["Code", "Diagnosis"]
dia_codes_df = dia_codes_df.drop_duplicates() # drop duplicate (code, diagnosis) pairs

In [None]:
def translate_codes(input):
    """
    Look up a human-readable description for each codified feature name.

    The lookup table is chosen by substring, checked in this order: 'Med'
    (medications), then 'Lab' (lab tests), then 'Code' (diagnoses). Names
    matching none of these get the description "Unknown".

    input: iterable of feature-name strings
    Returns: Pandas DataFrame with one row per input name, in input order, and the
        columns 'code' (the name) and 'description' (the value returned by
        translate_helper, or "Unknown")
    """
    def describe(feature):
        # 'Med' must be tested before 'Code': medication names also contain 'Code'.
        if 'Med' in feature:
            return translate_helper(feature, med_codes_df)
        if 'Lab' in feature:
            return translate_helper(feature, lab_codes_df)
        if 'Code' in feature:
            return translate_helper(feature, dia_codes_df)
        return "Unknown"

    rows = [
        {'code': feature, 'description': describe(feature)}
        for feature in input
    ]
    return pd.DataFrame(rows)
            
def translate_helper(code, df):
    """
    Return the descriptions stored in df for one feature name.

    code: feature name to look up in the 'Code' column
    df: lookup DataFrame whose second column holds the description
    Returns: Pandas Series with the second-column value of every row whose 'Code'
        equals code (empty when there is no match, several entries when the
        code appears more than once)
    """
    is_match = df['Code'] == code
    return df.loc[is_match].iloc[:, 1]

In [None]:
# Translate a hand-picked list of diagnosis and lab feature names.
foo = [
    'Code_R53.83',
    'Code_R79.89',
    'Lab_2078-4',
    'Lab_3094-0',
    'Code_E11.9',
    'Lab_2132-9',
    'Lab_2571-8',
    'Code_M79.672',
    'Code_175',
    'Code_R53.81',
]

bar = translate_codes(foo)
# Bare expression on the last line so the notebook displays the table.
bar

In [None]:
# Widen displayed columns to 3000 characters so long description cells are not truncated.
pd.options.display.max_colwidth = 3000