## Setting up the model

In [1]:
# Name of the data set to run. It selects the configuration file
# configs/config_<data_set>.json and is reused in output folder and file names.
data_set = "Breast Cancer"

# Imputation strategy for the tabular data; passed on to save_model_info and get_model_info.
imputer = "zero_imp"

In [2]:
import ast
import datetime
import itertools
import json
import math
import os.path
import sys
import warnings
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, logging, LongformerModel, LongformerTokenizer, AutoModelForMaskedLM
import torch
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer

warnings.filterwarnings("ignore")
# Keep this after the transformers import: both statements bind the name `logging`,
# and the standard-library module must be the one left in scope.
import logging


# Read the JSON configuration that belongs to the selected data set.
config_path = 'configs/config_' + data_set + '.json'
with open(config_path) as config_file:
    UCI_config = json.load(config_file)

# Unpack the configuration entries into the notebook-level names used by later cells.
EXAMPLE_PATH, TABLES_FILE, COLUMNS_PATH = (
    UCI_config[key] for key in ("EXAMPLE_PATH", "TABLES_FILE", "COLUMNS_PATH")
)
ID_COL, TARGET_COL = UCI_config["ID_COL"], UCI_config["TARGET_COL"]
TARGET_FILE = UCI_config["TARGET_INFO_FILE"]
split_seed, split_ratio = UCI_config["TARGET_SPLIT_SEED"], UCI_config["TEST_SPLIT_RATIO"]

# No time column is used in this notebook.
TIME_COL = None

# Put the project's source folder first on the import path so the helper modules below resolve.
sys.path.insert(0, './../../src')
# NOTE(review): wildcard imports hide where names come from. Later cells call
# save_model_info, get_model_info, get_and_save_pickle_patients, get_and_save_features,
# fine_tune, load_embeddings and train_xgb, none of which are defined in this notebook,
# so they presumably come from these modules — confirm and import them by name.
from get_data_info import *
from get_patients import *
from get_features import *
from train_models import *

# Raw-data location: the example path followed by the configured RAW_DATA_PATH.
DATA_PATH = EXAMPLE_PATH + UCI_config["RAW_DATA_PATH"] 
# Bundle of locations passed as `paths` to save_model_info and get_model_info.
paths = [EXAMPLE_PATH, DATA_PATH, TABLES_FILE, COLUMNS_PATH]

  from .autonotebook import tqdm as notebook_tqdm
2025-03-16 19:13:28.919936: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-16 19:13:29.057391: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-16 19:13:30.720537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/pkg/nccl/nccl-2.18.1-cuda1

In [4]:
# Read the table that holds the prediction target of every subject.
targets_df = pd.read_csv(TARGET_FILE)

# Hold out part of the subjects, stratifying on the target column.
train_df, test_df = train_test_split(
    targets_df,
    test_size=split_ratio,
    random_state=split_seed,
    stratify=targets_df[TARGET_COL],
)

# Distinct subject identifiers on each side of the split.
training_ids, testing_ids = (part[ID_COL].unique() for part in (train_df, test_df))

# Store the model information together with the split under the data set's name.
save_model_info(
    paths, ID_COL, TIME_COL, imputer, training_ids, testing_ids, model_name=data_set
)

In [5]:
#Choose text and llm settings
# The first five values are embedded in `sent_name` (the output sub-folder name) and are
# forwarded to get_and_save_pickle_patients.
# NOTE(review): their exact effect on the generated text is decided inside that helper,
# which is not visible in this notebook — confirm before changing them.
prefix = ""
missing = ""
replace = True
descriptive = True
meta = True
# `clinical`, `long` and `biogpt` select the language model in the cells that set `llm_name`;
# `biogpt` must not be combined with `long` or `clinical` (asserted there).
clinical = True
long = True
biogpt = False

## Fine-Tuning the Model
We use the textual data from the training set to fine-tune the pretrained language model (in this case, Clinical Longformer).

In [6]:
# Resolve which pretrained LLM is fine-tuned, based on the flags chosen above.
llm_name = ""
original_llm_path = ""

if long:
    # Both Longformer variants live in a folder named after the model.
    llm_name = "ClinicalLongformer" if clinical else "Longformer"
    original_llm_path = "./../../LLMs/" + llm_name + "/"
if biogpt:
    # BioGPT cannot be combined with the Longformer flags.
    # The `finetuned` flag is deliberately not checked here: it is only defined in the
    # embeddings section further down, so reading it at this point raised a NameError
    # on a fresh kernel. That section performs the check itself.
    assert not long, "biogpt requires long == False"
    assert not clinical, "biogpt requires clinical == False"
    llm_name = "BioGPT"

# Output folder for the training split and the sub-folder name that encodes the text settings.
folder_name = "Training/" + llm_name + "/" + data_set + "/"
sent_name = "RAW_DATA_" + str(prefix) +"_"+ str(missing) +"_"+ str(replace) +"_"+ str(descriptive) +"_"+ str(meta)

In [7]:
# Build and store the text representation of the training subjects.
tables_info, global_imputer, all_ids = get_model_info(
    paths, ID_COL, TIME_COL, imputer, "Training", None, model_name=data_set
)

# Only the "text" feature type is requested at this stage and no fine-tuned model path is given.
get_and_save_pickle_patients(
    tables_info, ID_COL, TIME_COL, all_ids,
    prefix, missing, replace, descriptive, meta,
    global_imputer, folder_name, EXAMPLE_PATH, "RAW_DATA",
    clinical, long, biogpt, "", ["text"],
)
get_and_save_features(
    all_ids, TIME_COL, ID_COL, ["text"], None,
    folder_name, EXAMPLE_PATH, sent_name, job_id="0",
)

In [8]:
# Load the text features written by the previous step and keep the id and text columns.
text_features_csv = folder_name + "text/" + sent_name + "/Features/0.csv"
X_text = pd.read_csv(text_features_csv, index_col=0)
X_train = X_text[[ID_COL, "text"]]

# Fine-tune the pretrained model on the training text.
fine_tune(X_train, original_llm_path, data_set, batch_size=4)

Some weights of the model checkpoint at ./../../LLMs/ClinicalLongformer/model were not used when initializing LongformerForMaskedLM: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using cuda_amp half precision backend                         
The following columns in the training set don't have a corresponding argument in `LongformerForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `LongformerForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examp

Epoch,Training Loss,Validation Loss
1,1.1806,0.181715
2,0.1737,0.128674
3,0.105,0.069346
4,0.0691,0.050183
5,0.0508,0.036963
6,0.051,0.033436
7,0.0483,0.056211


The following columns in the evaluation set don't have a corresponding argument in `LongformerForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `LongformerForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `LongformerForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `LongformerForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `LongformerForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `LongformerForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
The following columns in the evaluation set do

Saving model checkpoint to Breast Cancer_finetuned
Configuration saved in Breast Cancer_finetuned/config.json


>>> Perplexity: 1.02


Model weights saved in Breast Cancer_finetuned/pytorch_model.bin


## Create Tabtext Embeddings
We next create embeddings for the training and testing sets using the fine-tuned LLM.

In [13]:
# Select the fine-tuned or the original pretrained model
finetuned = True
feature_types = ["sep_imputations", "sep_embeddings"]

llm_name = ""
# Always defined, because the embedding cell passes it on unconditionally. The empty
# string is the same value the fine-tuning section passes in this argument position
# when no fine-tuned model is involved.
finetuned_path = ""

if long:
    llm_name = "ClinicalLongformer" if clinical else "Longformer"
    if finetuned:
        llm_name += "Finetuned"
        # Folder the fine-tuning step saved its model to (see its "Saving model checkpoint" output).
        finetuned_path = data_set + "_finetuned"
if biogpt:
    # BioGPT cannot be combined with the Longformer flags or with fine-tuning.
    assert(long==False)
    assert(clinical==False)
    assert(finetuned==False)
    llm_name = "BioGPT"

In [14]:
import logging
# Only show warnings and errors from transformers (hides INFO messages like "loading file ...").
logging.getLogger("transformers").setLevel(logging.WARNING)

# Sub-folder name that encodes the text settings; it is the same for both splits.
sent_name = "_".join(
    ["RAW_DATA", str(prefix), str(missing), str(replace), str(descriptive), str(meta)]
)

# Run the same pipeline for the training split and then for the testing split.
for split in ("Training", "Testing"):
    folder_name = split + "/" + llm_name + "/" + data_set + "/"
    tables_info, global_imputer, all_ids = get_model_info(
        paths, ID_COL, TIME_COL, imputer, split, None, model_name=data_set
    )
    get_and_save_pickle_patients(
        tables_info, ID_COL, TIME_COL, all_ids,
        prefix, missing, replace, descriptive, meta,
        global_imputer, folder_name, EXAMPLE_PATH, "RAW_DATA",
        clinical, long, biogpt, finetuned_path, feature_types,
    )
    get_and_save_features(
        all_ids, TIME_COL, ID_COL, feature_types, None,
        folder_name, EXAMPLE_PATH, sent_name, job_id="0",
    )

## Training for Downstream Task

In [15]:
def _features_dir(split, feature_type):
    """Return the folder holding the saved feature files of one split and feature type."""
    return split + "/" + llm_name + "/" + data_set + "/" + feature_type + "/" + sent_name + "/Features/"


# Embedding and tabular (imputed) features of the training split.
X_emb_train = load_embeddings(_features_dir("Training", "sep_embeddings"), start=0, num_files=1)
X_tab_train = load_embeddings(_features_dir("Training", "sep_imputations"), start=0, num_files=1)

# The same two feature sets for the testing split.
X_emb_test = load_embeddings(_features_dir("Testing", "sep_embeddings"), start=0, num_files=1)
X_tab_test = load_embeddings(_features_dir("Testing", "sep_imputations"), start=0, num_files=1)

# Keep only id and target, then replace the target labels by integer codes.
targets_df = pd.read_csv(TARGET_FILE)[[ID_COL, TARGET_COL]]
le = LabelEncoder()
target_encoded = le.fit_transform(targets_df[TARGET_COL])
targets_df[TARGET_COL] = target_encoded
num_classes = len(le.classes_)

In [16]:
# Join tabular features, embeddings and targets on the id column. Inner joins keep only
# the ids present in all three tables. The id column is dropped afterwards.
train_features = X_tab_train.merge(X_emb_train, on=[ID_COL], how="inner")
merged_train = train_features.merge(targets_df, on=[ID_COL], how="inner").drop(columns=[ID_COL])

valtest_features = X_tab_test.merge(X_emb_test, on=[ID_COL], how="inner")
merged_valtest = valtest_features.merge(targets_df, on=[ID_COL], how="inner").drop(columns=[ID_COL])

In [17]:
def get_valid_cols(method, merged_columns, tab_columns):
    """Select the columns of the merged table that a given feature set uses.

    method: 'tabular' keeps the columns found in `tab_columns` plus the target column,
        'merged' keeps every column, and 'language' keeps every column that is not in
        `tab_columns`. Any other value yields an empty list.
    merged_columns: column labels of the merged table, in the order to preserve.
    tab_columns: column labels of the tabular feature table.

    Returns a list of column labels in the order they appear in `merged_columns`.
    """
    if method == 'merged':
        return list(merged_columns)
    if method == 'tabular':
        return [col for col in merged_columns if col in tab_columns or col == TARGET_COL]
    if method == 'language':
        return [col for col in merged_columns if col not in tab_columns]
    return []

In [18]:
# Split the held-out data into a testing part and a validation part.
# train_test_split returns (train part, test part), so with test_size=0.25 `merged_test`
# receives 75% of the held-out rows and `merged_val` the remaining 25%.
# NOTE(review): this comment previously described the split as 15% validation and 15%
# testing, which an uneven 75/25 split cannot produce — confirm the intended proportions.
merged_test, merged_val = train_test_split(merged_valtest, test_size=0.25, random_state=split_seed, stratify=merged_valtest[TARGET_COL])

In [19]:
# Grid search: for every feature set and hyper-parameter combination, train once to score
# on the validation set and once on train+validation to score on the test set, then append
# one row per combination to that feature set's results file.

# Names of the values written per row; they become the header line of each results file.
column_list = ["target", "val_auc",  "test_auc",  "train_size", "val_size", "test_size",
               "val_acc", "test_acc", "n_est", "max_param", "lr", "lambda", "seed"]

# All result files of this run share one folder; create it once instead of on every iteration.
results_dir = EXAMPLE_PATH + 'Results/' + llm_name + '/' + sent_name + "/"
os.makedirs(results_dir, exist_ok=True)

# Same combinations, in the same order, as four nested loops over these lists.
param_grid = list(itertools.product(
    [100, 200, 300],                # n_est
    [3, 5, 7],                      # max_param
    [0.05, 0.1, 0.3],               # lr
    [0.01, 0.001, 1e-4, 1e-5, 0],   # lambda
))

for method in ["tabular", "merged", "language"]:
    folder_name = data_set + "_" + method
    results_file = results_dir + folder_name + ".csv"
    valid_cols = get_valid_cols(method, merged_train.columns, X_tab_train.columns)
    df_train, df_val, df_test = merged_train[valid_cols], merged_val[valid_cols], merged_test[valid_cols]

    # Write the header line only when starting a new file; otherwise rows are appended
    # to the results of earlier runs.
    if not os.path.isfile(results_file):
        pd.DataFrame([column_list]).to_csv(results_file, header=False)

    for n_est, max_param, lr, reg_lambda in param_grid:
        val_auc, val_acc, _ = train_xgb(df_train, df_val, TARGET_COL, n_est, max_param, lr, reg_lambda, num_classes)
        # A fresh concatenation per call, in case train_xgb modifies its input.
        test_auc, test_acc, _ = train_xgb(pd.concat([df_train, df_val], axis=0), df_test, TARGET_COL, n_est, max_param, lr, reg_lambda, num_classes)

        results = [TARGET_COL, val_auc, test_auc, str(df_train.shape), str(df_val.shape), str(df_test.shape),
                   val_acc, test_acc, n_est, max_param, lr, reg_lambda, split_seed]

        df_results = pd.DataFrame(np.array([results]))
        df_results.to_csv(results_file, mode='a', header=False)

## Evaluate Performance

In [20]:
# Number of split seeds (0 .. num_seeds-1) whose results are averaged below.
num_seeds = 1

# The training cell writes its results under EXAMPLE_PATH, so read them from the same place.
results_dir = EXAMPLE_PATH + 'Results/' + llm_name + '/' + sent_name + "/"
df_tab = pd.read_csv(results_dir + data_set + '_tabular.csv')
df_lang = pd.read_csv(results_dir + data_set + '_language.csv')
df_merged = pd.read_csv(results_dir + data_set + '_merged.csv')


def _total_rows(df_results):
    """Add up the row counts of the train/val/test shapes stored in the first result row."""
    first_run = df_results.iloc[0]
    # The shapes were stored as strings such as "(455, 30)"; literal_eval parses them
    # without executing arbitrary code from the file.
    return sum(ast.literal_eval(first_run[col])[0] for col in ('train_size', 'val_size', 'test_size'))


def _best_test_auc(df_seed):
    """Mean test AUC over the runs that reach the highest validation AUC."""
    best_val_auc = df_seed['val_auc'].max()
    return df_seed.loc[df_seed['val_auc'] == best_val_auc, 'test_auc'].mean()


# All three feature sets must have been evaluated on the same number of rows.
tab_size = _total_rows(df_tab)
lang_size = _total_rows(df_lang)
merged_size = _total_rows(df_merged)

assert(tab_size == lang_size)
assert(merged_size == lang_size)

avg_tabular = 0
avg_language = 0
avg_merged = 0

# Loop over exactly the seeds that the averages below are divided by.
for i in range(num_seeds):
    dfi_tab = df_tab[df_tab['seed']==i]
    dfi_lang = df_lang[df_lang['seed']==i]
    dfi_merged = df_merged[df_merged['seed']==i]

    # Without this check a missing seed would silently turn every average into NaN.
    if dfi_tab.empty or dfi_lang.empty or dfi_merged.empty:
        raise ValueError("No results found for seed " + str(i) + " in " + results_dir)

    auc_tab = _best_test_auc(dfi_tab)
    auc_lang = _best_test_auc(dfi_lang)
    auc_merged = _best_test_auc(dfi_merged)

    print(auc_tab, auc_lang,auc_merged )

    avg_tabular += auc_tab
    avg_language += auc_lang
    avg_merged += auc_merged

avg_tabular = round(avg_tabular/num_seeds, 3)
avg_language = round(avg_language/num_seeds, 3)
avg_merged = round(avg_merged/num_seeds, 3)

0.6705653021442495 0.72046783625731 0.7087719298245614


In [21]:
# Report the seed-averaged test AUC of each feature set.
for label, score in (("tabular", avg_tabular), ("language", avg_language), ("merged", avg_merged)):
    print(label + " AUC: ", score)

tabular AUC:  0.671
language AUC:  0.72
merged AUC:  0.709
