In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np

In [4]:
# Hugging Face checkpoint id shared by the tokenizer and the encoder so the
# two can never drift apart.
CLINICAL_BERT_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"

# Both objects are module-level globals used by vectorize_clinical_note below.
tokenizer = AutoTokenizer.from_pretrained(CLINICAL_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(CLINICAL_BERT_CHECKPOINT)

def vectorize_clinical_note(text):
    """Embed clinical note text as the mean of its token hidden states.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    512 tokens) and run through the module-level ``model`` without gradient
    tracking. The last hidden states are then averaged over the *real*
    tokens only: positions whose attention mask is 0 (padding) are excluded,
    so short notes are not diluted by ``[PAD]`` embeddings.

    Args:
        text: the note text passed to the tokenizer (a string, or a list of
            strings to embed as a batch).

    Returns:
        A ``torch.Tensor`` with one row per input text and one column per
        hidden dimension of the model (shape ``(1, hidden)`` for a single
        string).
    """
    # Pad only up to the longest sequence in the call; padded positions are
    # masked out of the pooling below, so padding to 512 is pure overhead.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # (batch, seq) -> (batch, seq, 1) so it broadcasts over the hidden axis.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp guards against a division by zero if a row had no tokens at all.
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    embeddings = summed / token_counts
    return embeddings

import json

# Saved subject-id arrays for the two cohorts.
pancan_subj_path = '../subj_hadm_process/pancan_subj.npy'
safe_subj_path = '../subj_hadm_process/safe_subj.npy'

# JSON mapping whose values are the admission ids to keep; change the file
# name here if the mapping lives in a different file. Keys and values are
# both coerced to int (JSON object keys are always strings).
with open('../subj_hadm_process/hadms.json', 'r') as f:
    json_data = {int(key): int(value) for key, value in json.load(f).items()}

# Admission ids that notes must belong to in order to be considered.
valid_hadms = list(json_data.values())
print(len(valid_hadms))

# NOTE: allow_pickle=True unpickles object arrays; only load trusted files.
pancan_subj, safe_subj = (
    np.load(subj_path, allow_pickle=True).tolist()
    for subj_path in (pancan_subj_path, safe_subj_path)
)

# Discharge-note table; read in full into memory.
csv_file_path = '/Users/kushagragarwal2443/Documents/CMU/mimic-iv-note-deidentified-free-text-clinical-notes-2.2/note/discharge.csv'
df_notes = pd.read_csv(csv_file_path)

# Function to filter for the last note per patient and vectorize, adding zero vector if not present
def filter_and_vectorize_last_notes(df, patient_list, id_col='subject_id', note_col='text', time_col='charttime', vector_length=768, hadm_col='hadm_id', hadm_ids=None):
    """Embed the latest qualifying note of each patient.

    Notes are restricted to patients in ``patient_list`` and to admissions in
    ``hadm_ids``. For every patient the note that sorts last by ``time_col``
    is embedded with ``vectorize_clinical_note``; patients without any
    qualifying note keep an all-zero vector.

    Args:
        df: notes DataFrame containing ``id_col``, ``note_col``, ``time_col``
            and ``hadm_col``.
        patient_list: patient ids to produce vectors for.
        id_col: column holding the patient id.
        note_col: column holding the note text.
        time_col: column used to order a patient's notes.
        vector_length: length of the zero vector used for patients without a
            note; should match the embedding size.
        hadm_col: column holding the admission id.
        hadm_ids: admission ids a note must belong to. ``None`` (default)
            uses the module-level ``valid_hadms``.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per distinct id in
        ``patient_list``, in order of first occurrence (duplicate ids
        collapse into a single row). An empty ``patient_list`` yields an
        array of shape ``(0, vector_length)``.
    """
    if hadm_ids is None:
        hadm_ids = valid_hadms

    # Start every patient at the zero vector; dict insertion order fixes the
    # row order of the result.
    patient_vectors = {patient_id: np.zeros((vector_length,)) for patient_id in patient_list}
    if not patient_vectors:
        # np.array([]) would be 1-D; keep the result 2-D for callers.
        return np.zeros((0, vector_length))

    # Keep only notes of the requested patients that belong to an allowed admission.
    wanted = df[id_col].isin(patient_list) & df[hadm_col].isin(hadm_ids)
    df_filtered = df[wanted]

    # After sorting, the last row of each patient is their most recent note.
    # Dropping the earlier ones means the model runs once per patient rather
    # than once per note.
    df_last_notes = (
        df_filtered.sort_values(by=[id_col, time_col])
        .drop_duplicates(subset=id_col, keep='last')
    )

    for patient_id, note_text in zip(df_last_notes[id_col], df_last_notes[note_col]):
        # Flatten the (1, hidden) embedding to a 1-D vector.
        patient_vectors[patient_id] = vectorize_clinical_note(note_text).numpy().flatten()

    # Stack the per-patient vectors into a single 2-D array.
    vectors = np.array(list(patient_vectors.values()))

    return vectors

# Vectorize the final clinical note for each patient in both groups, ensuring a zero vector for missing subjects
# Column names of the notes table, shared by both cohort runs.
note_columns = {'id_col': 'subject_id', 'note_col': 'text', 'time_col': 'charttime'}

# Cohort 1: embed and persist.
pancan_vectors = filter_and_vectorize_last_notes(df_notes, pancan_subj, **note_columns)
np.save('./pancan_vectors.npy', pancan_vectors)

# Cohort 2: embed and persist.
safe_vectors = filter_and_vectorize_last_notes(df_notes, safe_subj, **note_columns)
np.save('./safe_vectors.npy', safe_vectors)

# Row counts of the two saved matrices.
print(len(pancan_vectors), len(safe_vectors))

17116
8473 8643
