### Read data and create Y-labels

In [1]:
import pandas as pd
import os

# Build the path to the discharge-notes CSV under the current working
# directory, load it, and preview the first row.
folder_name, file_name = "alldata/note", "discharge.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)
df.head(1)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...


In [2]:
# Create the Y label: within each subject (rows ordered by charttime),
# Y = 1 when the following row has a non-missing charttime, otherwise 0.
df['charttime'] = pd.to_datetime(df['charttime'])
df = df.sort_values(by=['subject_id', 'charttime'])
# charttime of the subject's next row; missing on each subject's last row
next_charttime = df.groupby('subject_id')['charttime'].shift(-1)
df['Y'] = next_charttime.notna().astype(int)
df.head(1)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,Y
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07,2180-05-09 15:26:00,\nName: ___ Unit No: _...,1


### ClinicalBert - import and load the model
The model is already pre-trained on clinical notes (MIMIC-III); it is used as-is here, without any fine-tuning on our data.

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Bio_ClinicalBERT checkpoint (ClinicalBERT pretrained on MIMIC-III):
# load the encoder and its matching tokenizer from the same checkpoint name.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import nltk
# Fetch NLTK's 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably this is the resource sent_tokenize needs on the
# installed NLTK version — confirm, since 'punkt' is also downloaded later.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kalyanilimaye/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Sample 10k rows

In [3]:
# Draw 10,000 random rows; the fixed random_state makes the draw reproducible.
# The index is then renumbered from 0 so it no longer refers to rows of `df`.
sampled_df = df.sample(n=10000, random_state=42)
sampled_df = sampled_df.reset_index(drop=True)

In [14]:
# Preview the first sampled row to confirm the columns (including Y) carried over
sampled_df.head(1)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,Y
0,10202247-DS-15,10202247,28736349,DS,15,2173-11-11,2173-11-15 13:25:00,\nName: ___ Unit No: __...,1


Check whether PyTorch's MPS backend is available, so the model can run faster on a Mac that has no dedicated GPU.

In [34]:
# Report whether PyTorch's MPS backend can be used on this machine
torch.backends.mps.is_available()

True

### Hierarchical embedding function

In [32]:
import nltk
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel

nltk.download('punkt')  # sentence-tokenizer data used for sentence splitting


# Use the MPS backend when PyTorch reports it available, otherwise the CPU,
# and move the already-loaded model onto that device.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

def get_hierarchical_embedding(text, model, tokenizer, max_length=512):
    """
    Embed a document with a hierarchical (sentence -> chunk) mean-pooling scheme.

    Steps:
      1) Split the text into sentences with ``sent_tokenize``.
      2) Tokenize each sentence and cut it into chunks of at most
         ``max_length`` tokens, special tokens included.
      3) Mean-pool the last hidden state of each chunk over its tokens, then
         average the chunk vectors to get a sentence-level embedding.
      4) Average all sentence embeddings to get a document-level embedding
         (every sentence has equal weight, regardless of its length).

    Every chunk is framed with the tokenizer's CLS/SEP tokens (when the
    tokenizer defines them), so chunks cut from a long sentence have the same
    shape as a normally tokenized short sentence.

    Args:
        text: Raw document text. Non-string values (e.g. NaN from pandas) and
            blank strings are treated as empty documents.
        model: Encoder whose forward output exposes ``last_hidden_state`` and
            whose ``config.hidden_size`` gives the embedding width. Inputs are
            created on the device the model's parameters live on.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of tokens per chunk fed to the model.

    Returns:
        1-D NumPy array of length ``model.config.hidden_size``; all zeros when
        the text yields no sentences.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens once
            the special tokens are accounted for.
    """
    # Missing or blank notes get the same zero vector as a text without sentences.
    if not isinstance(text, str) or not text.strip():
        return torch.zeros(model.config.hidden_size).numpy()

    # Follow the model's own device rather than a notebook-level global.
    model_device = next(model.parameters()).device

    # Special tokens are re-attached per chunk, so reserve room for them.
    prefix = [tokenizer.cls_token_id] if tokenizer.cls_token_id is not None else []
    suffix = [tokenizer.sep_token_id] if tokenizer.sep_token_id is not None else []
    content_budget = max_length - len(prefix) - len(suffix)
    if content_budget < 1:
        raise ValueError(
            f"max_length={max_length} leaves no room for content tokens "
            f"next to {len(prefix) + len(suffix)} special tokens"
        )

    sentence_embeddings = []
    for sent in sent_tokenize(text):
        # Tokenize without truncation and without special tokens; the
        # specials are added back around every chunk below.
        content_ids = tokenizer(
            sent, add_special_tokens=False, truncation=False, padding=False
        )["input_ids"]

        chunk_embeddings = []
        # max(..., 1) keeps one specials-only chunk for a sentence that has no
        # content tokens, matching what a plain tokenizer call would produce.
        for start in range(0, max(len(content_ids), 1), content_budget):
            chunk_ids = prefix + content_ids[start:start + content_budget] + suffix
            if not chunk_ids:
                continue  # nothing to feed the model

            input_ids = torch.tensor([chunk_ids], device=model_device)
            with torch.no_grad():
                output = model(input_ids=input_ids)

            # Mean over the token axis -> (1, hidden_size)
            chunk_embeddings.append(output.last_hidden_state.mean(dim=1))

        if chunk_embeddings:
            # (num_chunks, hidden_size) -> (hidden_size,)
            sentence_embeddings.append(torch.cat(chunk_embeddings).mean(dim=0))

    if not sentence_embeddings:
        return torch.zeros(model.config.hidden_size).numpy()

    # (num_sentences, hidden_size) -> (hidden_size,)
    doc_embedding = torch.stack(sentence_embeddings).mean(dim=0)
    return doc_embedding.cpu().numpy()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kalyanilimaye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Run the embedding function on the sampled data and save the result

In [35]:
import time

import numpy as np  # np.array is used below; no visible earlier cell imports numpy

start_time = time.time()

# Embed every sampled note. Only the text column is needed, so iterate over
# it directly instead of materialising each row with iterrows().
embeddings = []
for text in sampled_df['text']:
    emb = get_hierarchical_embedding(text, model, tokenizer)
    embeddings.append(emb)

end_time = time.time()
# Report the actual number of rows processed rather than a hard-coded count.
print(f"Time taken for {len(sampled_df)} rows: {end_time - start_time:.2f} seconds")

# One row per note, one column per embedding dimension.
embedding_df = pd.DataFrame(np.array(embeddings))

# .values attaches the labels by position; both frames share the same row order.
embedding_df['Y'] = sampled_df['Y'].values
save_path = os.path.join("alldata/note", "semantic_BertHier_FullText.csv")
embedding_df.to_csv(save_path, index=False)
embedding_df.head(2)

Time taken for 10000 rows: 19264.25 seconds


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,Y
0,0.186606,0.085287,-0.136578,0.114657,0.078886,-0.039044,0.111851,0.070599,0.084644,-0.01935,...,0.015664,-0.259481,0.042405,0.033597,0.030835,0.149914,0.223261,0.122565,-0.01984,1
1,0.206593,0.026372,0.024923,0.170108,0.071886,-0.061216,0.113776,0.092842,0.062146,0.004418,...,-0.015256,-0.130282,-0.003898,0.104951,-0.032151,0.072405,0.173873,0.109439,-0.069683,1
