In [None]:
"""
Notebook: Query-Focused EHR Summarization Reproduction
Course: CS598 - Deep Learning for Healthcare (Spring 2025)
Authors: Abhitej Bokka (abhitej2), Liam Shen (liams4)
Paper: Query-Focused EHR Summarization to Aid Imaging Diagnosis
Paper Link: https://arxiv.org/abs/2004.04645

Purpose:
This notebook reproduces the core findings of the paper by implementing a transformer-based
extractive summarization model trained on MIMIC-III clinical notes. The system learns to identify
sentences relevant to future ICD diagnostic codes (within a 30-day window) using distantly supervised labels.

Main Steps:
- Preprocess MIMIC-III discharge summaries using sentence segmentation and ICD code parsing.
- Create distant supervision labels based on future ICD codes.
- Train a transformer-based sentence classifier (e.g., ClinicalBERT).
- Evaluate model performance using metrics like precision, recall, and F1 score.

Environment:
This notebook is designed for use in a Python environment with support for Hugging Face Transformers,
PyTorch, scikit-learn, pandas, spaCy, and NLTK. It is compatible with both CPU and GPU setups (e.g., Colab, local, or cluster).

Instructions:
- Run this cell first to install the core dependencies (spaCy is installed in the next cell).
- Ensure access to MIMIC-III and relevant derived datasets prior to execution.
"""
# Install dependencies
!pip install transformers
!pip install nltk
!pip install scikit-learn
!pip install torch




In [3]:
# Setup spaCy
# Install or upgrade spaCy through a shell escape before importing it.
!pip install -U spacy
import spacy

# Download English tokenizer model
# NOTE: the installer output recorded with this cell recommends restarting the
# runtime if the freshly installed package cannot be loaded.
!python -m spacy download en_core_web_sm
# `nlp` is the pipeline object that the sentence-splitting cell calls `nlp.pipe` on.
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
# The upload widget only exists on Google Colab. On any other runtime the
# import fails, so skip the upload and rely on the CSV files already being in
# the working directory (the loading cell reads them by relative path).
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    uploaded = files.upload()
else:
    # Nothing was uploaded; keep the name defined for any later reference.
    uploaded = {}


KeyboardInterrupt: 

In [4]:
# Cell 1 - Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Notes sample; CHARTTIME is converted to a datetime column right after loading
notes_df = pd.read_csv("sample10000.csv")
notes_df["CHARTTIME"] = pd.to_datetime(notes_df["CHARTTIME"])

# Diagnosis codes and admissions (admit/discharge timestamps parsed on read)
diagnoses_df = pd.read_csv("diagnoses_icd.csv")
admissions_df = pd.read_csv("admissions.csv", parse_dates=["ADMITTIME", "DISCHTIME"])

# Inner-join each note to its admission so the admit/discharge window is
# available on the same row; notes without a matching HADM_ID are dropped.
merged_df = notes_df.merge(
    admissions_df[["HADM_ID", "ADMITTIME", "DISCHTIME"]],
    on="HADM_ID",
    how="inner",
)

# Simulate CHARTTIME if missing or null
def simulate_charttime(row):
    """Return a chart time for one merged note/admission row.

    A recorded, non-null ``CHARTTIME`` is returned unchanged. Only when it is
    missing is a timestamp drawn uniformly at random between ``ADMITTIME`` and
    ``DISCHTIME``.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) with ``ADMITTIME`` and
            ``DISCHTIME`` entries and, optionally, ``CHARTTIME``.

    Returns:
        The recorded chart time, a simulated one, or ``pd.NaT`` when the chart
        time is missing and the admission window is incomplete.
    """
    # Keep real timestamps: overwriting them would discard data and turn any
    # later "inside the admission window" filter into a no-op.
    recorded = row.get("CHARTTIME")
    if pd.notnull(recorded):
        return recorded

    admit, discharge = row["ADMITTIME"], row["DISCHTIME"]
    if pd.notnull(admit) and pd.notnull(discharge):
        # np.random.rand() is in [0, 1), so the result lies in [admit, discharge).
        return admit + (discharge - admit) * np.random.rand()
    return pd.NaT

# Fill the CHARTTIME column row by row
merged_df["CHARTTIME"] = merged_df.apply(simulate_charttime, axis=1)

# Keep notes charted inside their admission window (both ends inclusive),
# retain only the columns used downstream, and cap the result at 1000 notes.
in_window = merged_df["CHARTTIME"].between(merged_df["ADMITTIME"], merged_df["DISCHTIME"])
filtered_notes = (
    merged_df.loc[in_window, ["SUBJECT_ID", "HADM_ID", "CHARTTIME", "TEXT"]]
    .reset_index(drop=True)
    .iloc[:1000]
)

print(filtered_notes)




     SUBJECT_ID  HADM_ID                     CHARTTIME  \
0         22532   167853 2151-07-25 20:14:44.678075742   
1         13702   107527 2118-06-07 23:25:40.351525092   
2         13702   167118 2119-05-18 22:10:39.036324439   
3         13702   196489 2124-07-28 03:41:41.477604415   
4         26880   135453 2162-03-10 12:29:25.456240980   
..          ...      ...                           ...   
995       15198   137838 2184-10-21 07:00:28.151380541   
996        2712   198266 2101-02-12 07:31:01.764739907   
997        2712   157537 2103-07-18 06:04:59.855792922   
998       18511   123903 2191-03-03 05:02:30.455869792   
999       76558   186116 2157-12-02 02:30:54.011481652   

                                                  TEXT  
0    Admission Date:  [**2151-7-16**]       Dischar...  
1    Admission Date:  [**2118-6-2**]       Discharg...  
2    Admission Date:  [**2119-5-4**]              D...  
3    Admission Date:  [**2124-7-21**]              ...  
4    Admission Dat

In [6]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

def split_into_sentences_spacy(df):
    """Split each note into sentences with the module-level spaCy pipeline ``nlp``.

    Args:
        df: DataFrame with a ``TEXT`` column (missing values are treated as
            empty strings) and a ``HADM_ID`` column.

    Returns:
        DataFrame with columns ``HADM_ID`` and ``sentence``, one row per
        non-empty sentence. The columns are present even when no sentence is
        produced, so downstream column access keeps working.
    """
    texts = df['TEXT'].fillna('').tolist()  # Replace NaN with empty string
    hadm_ids = df['HADM_ID'].tolist()

    sentence_data = []
    # Batch process texts using nlp.pipe
    docs = nlp.pipe(texts, batch_size=32, n_process=1)
    for idx, (doc, hadm_id) in enumerate(zip(docs, hadm_ids)):
        # Report progress every 100 notes instead of once per note.
        if idx % 100 == 0:
            print(f"Processing row {idx}")
        for sent in doc.sents:
            text = sent.text.strip()
            # Skip spans that are only whitespace; they carry nothing to classify.
            if text:
                sentence_data.append({"HADM_ID": hadm_id, "sentence": text})
    return pd.DataFrame(sentence_data, columns=["HADM_ID", "sentence"])

# Segment every filtered note; the result has one row per sentence, keyed by HADM_ID.
sentences_df = split_into_sentences_spacy(filtered_notes)
# Print the resulting frame.
print(sentences_df)


Processing row 0
...Processing row 999
        HADM_ID                                           sentence
0        167853                                    Admission Date:
1        167853              [**2151-7-16**]       Discharge Date:
2        167853  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRAD...
3        167853   This also\nmoderate-sized left pleural effusion.
4        167853  HEAD CT:  Head CT showed no intracranial hemor...
...         ...                                                ...
143564   186116   [**Last Name (STitle) 3649**] in\nthe next week.
143565   186116  Please call [**Telephone/Fax (1) 3070**] to sc...
143566   186116  Please keep your previously scheduled appointm...
143567   186116                                          Provider:
143568   186116  [**First Name11 (Name Pattern1) **] [**Last Na...

[143569 rows x 2 columns]


In [7]:
# Admission id -> list of ICD-9 codes recorded for that admission
hadm_to_icd = (
    diagnoses_df
    .groupby("HADM_ID")["ICD9_CODE"]
    .apply(list)
    .to_dict()
)

# Give every sentence the code list of its admission
sentences_df["labels"] = sentences_df["HADM_ID"].map(hadm_to_icd)

# Discard sentences whose admission has no codes, then expand to one row per
# (sentence, code) pair: each sentence is repeated once for every code of its
# admission.
sentences_df = (
    sentences_df
    .dropna(subset=["labels"])
    .explode("labels")
    .reset_index(drop=True)
)
print(len(sentences_df))


1973548


In [20]:
# Inspect the diagnoses table loaded from diagnoses_icd.csv.
print(diagnoses_df)

        ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE
0         1297         109   172335      1.0     40301
1         1298         109   172335      2.0       486
2         1299         109   172335      3.0     58281
3         1300         109   172335      4.0      5855
4         1301         109   172335      5.0      4254
...        ...         ...      ...      ...       ...
651042  639798       97503   188195      2.0     20280
651043  639799       97503   188195      3.0     V5869
651044  639800       97503   188195      4.0     V1279
651045  639801       97503   188195      5.0      5275
651046  639802       97503   188195      6.0      5569

[651047 rows x 5 columns]


In [8]:
class SentenceDataset(Dataset):
    """Torch dataset that tokenizes one sentence per item and pairs it with a class index.

    Args:
        dataframe: Frame with a ``sentence`` column (text) and a ``labels``
            column (one label per row).
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returning a mapping of
            tensors with a leading batch dimension of 1.
        label2idx: Mapping from label value to integer class index.
        max_length: Fixed token length every sentence is padded/truncated to.
            Defaults to 128.

    Raises:
        KeyError: If a label in ``dataframe`` is missing from ``label2idx``.
    """

    def __init__(self, dataframe, tokenizer, label2idx, max_length=128):
        self.sentences = dataframe["sentence"].tolist()
        self.labels = [label2idx[label] for label in dataframe["labels"]]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenizer tensors for sentence ``idx`` plus its ``label``."""
        encoded = self.tokenizer(
            self.sentences[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length is 1.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        # The cross-entropy loss expects integer (long) class indices.
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [9]:
class ClinicalBertClassifier(nn.Module):
    """Sentence classifier: pretrained encoder plus a linear head on the first-token state.

    Args:
        num_labels: Number of output classes (width of the linear head).
        model_name: Checkpoint identifier handed to ``AutoModel.from_pretrained``.
            Defaults to the Bio_ClinicalBERT checkpoint that was previously
            hard-coded, so existing calls behave the same.
    """

    def __init__(self, num_labels, model_name="emilyalsentzer/Bio_ClinicalBERT"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw class logits, one row per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_output)


In [11]:
# Restrict the task to the 20 most frequent labels
label_counts = sentences_df["labels"].value_counts()
top_labels = label_counts.nlargest(20).index

# Drop every row whose label is outside that set
keep_mask = sentences_df["labels"].isin(top_labels)
sentences_df = sentences_df.loc[keep_mask].reset_index(drop=True)

# Index the surviving labels in sorted order, plus the reverse lookup
sorted_labels = sorted(sentences_df["labels"].unique())
label2idx = dict(zip(sorted_labels, range(len(sorted_labels))))
idx2label = {idx: label for label, idx in label2idx.items()}

# Tokenizer matching the Bio_ClinicalBERT encoder
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
from sklearn.model_selection import train_test_split

# Work on a fixed random subset of 1000 (sentence, label) rows
sampled_df = sentences_df.sample(n=1000, random_state=42).reset_index(drop=True)

# 70/30 split, stratified so both parts keep the label proportions
train_df, test_df = train_test_split(
    sampled_df,
    test_size=0.3,
    random_state=42,
    stratify=sampled_df["labels"],
)

# The training part has fewer than 1000 rows, so this keeps all of them and
# only reshuffles their order.
n_train = min(1000, len(train_df))
train_df = train_df.sample(n=n_train, random_state=42).reset_index(drop=True)

# Wrap both frames as tokenizing datasets
train_dataset, test_dataset = (
    SentenceDataset(frame, tokenizer, label2idx) for frame in (train_df, test_df)
)

# Shuffle only the training batches
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=8)



In [14]:
from sklearn.utils.class_weight import compute_class_weight
import time

# Seed torch so classifier-head initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Train on a GPU when one is available; otherwise everything stays on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "balanced" weights are inversely proportional to class frequency in the
# training labels.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(list(label2idx.values())),
    y=[label2idx[label] for label in train_df["labels"]]
)

class_weights = torch.tensor(class_weights, dtype=torch.float)


# Initialize model, optimizer, loss (loss weights must live on the model's device)
model = ClinicalBertClassifier(num_labels=len(label2idx)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(device))

# Training loop with timing
for epoch in range(3):
    start_time = time.time()

    model.train()
    total_loss = 0.0
    for batch in train_loader:
        # Batches come off the loader on the CPU; move them next to the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Sum of per-batch losses over the epoch (not an average).
        total_loss += loss.item()

    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Time: {elapsed:.2f} seconds")

# The evaluation cells feed CPU tensors straight from the loader into `model`,
# so hand the trained model back on the CPU.
model = model.to("cpu")



Epoch 1, Loss: 132.8522, Time: 521.84 seconds
Epoch 2, Loss: 125.1680, Time: 509.90 seconds
Epoch 3, Loss: 112.9669, Time: 501.92 seconds


In [15]:
# Switch to inference mode and gather true / predicted class indices
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(batch["input_ids"], batch["attention_mask"]).cpu().numpy()
        # Predicted class = index of the largest logit in each row
        y_pred.extend(logits.argmax(axis=1))
        y_true.extend(batch["label"].cpu().numpy())

# Report on every class index that occurs among the true or predicted labels
eval_labels = sorted(set(y_true) | set(y_pred))
target_names = [idx2label[idx] for idx in eval_labels]

report = classification_report(
    y_true,
    y_pred,
    labels=eval_labels,
    target_names=target_names,
    zero_division=0,
)
print(report)


              precision    recall  f1-score   support

        2449       0.00      0.00      0.00         8
       25000       0.00      0.00      0.00        17
        2720       0.00      0.00      0.00        13
        2724       0.00      0.00      0.00        14
        2762       0.00      0.00      0.00         7
        2859       0.00      0.00      0.00         9
        4019       0.12      0.03      0.04        38
       40390       0.00      0.00      0.00        10
       41401       0.06      0.06      0.06        17
       42731       0.10      0.04      0.06        25
        4280       0.25      0.06      0.10        32
         486       0.00      0.00      0.00        14
         496       0.07      0.18      0.10        11
        5070       0.07      0.30      0.11        10
       51881       0.09      0.06      0.07        17
       53081       0.13      0.18      0.15        11
        5849       0.00      0.00      0.00        20
        5990       0.08    

In [17]:
import torch.nn.functional as F

# Gather true labels and per-class probabilities over the test set
model.eval()
y_true, y_probs = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(batch["input_ids"], batch["attention_mask"])
        # Softmax over the class dimension turns logits into probabilities
        probs = F.softmax(logits, dim=1).cpu().numpy()

        y_probs.extend(probs)
        y_true.extend(batch["label"].cpu().numpy())

y_true, y_probs = np.array(y_true), np.array(y_probs)

# One-vs-rest multiclass AUROC; scikit-learn raises ValueError when the score
# cannot be computed, which is reported instead of aborting the cell.
try:
    auroc = roc_auc_score(y_true, y_probs, multi_class='ovr')
except ValueError as e:
    print("AUROC could not be computed:", e)
else:
    print(f"\nMulticlass AUROC (OVR): {auroc:.4f}")



Multiclass AUROC (OVR): 0.4902


In [18]:
from transformers import AutoModel
import torch.nn as nn

class ClinicalBertAvgPoolClassifier(nn.Module):
    """Sentence classifier: pretrained encoder, masked mean pooling, dropout, linear head.

    Args:
        num_labels: Number of output classes (width of the linear head).
        model_name: Checkpoint identifier handed to ``AutoModel.from_pretrained``.
            Defaults to the Bio_ClinicalBERT checkpoint that was previously
            hard-coded, so existing calls behave the same.
    """

    def __init__(self, num_labels, model_name="emilyalsentzer/Bio_ClinicalBERT"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw class logits computed from the mean of the unmasked token states."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

        # Cast the mask to the hidden-state dtype so the pooling arithmetic
        # (product, sum, clamp, division) is done in floating point rather
        # than depending on integer-to-float promotion inside clamp().
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)  # (batch_size, seq_len, 1)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
        avg_pooled = summed / counts

        logits = self.classifier(self.dropout(avg_pooled))
        return logits


In [19]:
from sklearn.utils.class_weight import compute_class_weight
import time

# Seed torch so classifier-head initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Train on a GPU when one is available; otherwise everything stays on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "balanced" weights are inversely proportional to class frequency in the
# training labels.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(list(label2idx.values())),
    y=[label2idx[label] for label in train_df["labels"]]
)

class_weights = torch.tensor(class_weights, dtype=torch.float)


# Initialize model, optimizer, loss (loss weights must live on the model's device)
model = ClinicalBertAvgPoolClassifier(num_labels=len(label2idx)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(device))

# Training loop with timing
for epoch in range(3):
    start_time = time.time()

    model.train()
    total_loss = 0.0
    for batch in train_loader:
        # Batches come off the loader on the CPU; move them next to the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Sum of per-batch losses over the epoch (not an average).
        total_loss += loss.item()

    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Time: {elapsed:.2f} seconds")

# The evaluation cell that follows feeds CPU tensors straight from the loader
# into `model`, so hand the trained model back on the CPU.
model = model.to("cpu")



Epoch 1, Loss: 132.8941, Time: 516.45 seconds
Epoch 2, Loss: 127.8139, Time: 505.54 seconds
Epoch 3, Loss: 121.7167, Time: 512.59 seconds


In [20]:
import torch.nn.functional as F

# Gather true labels and per-class probabilities over the test set
model.eval()
y_true, y_probs = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(batch["input_ids"], batch["attention_mask"])
        # Softmax over the class dimension turns logits into probabilities
        probs = F.softmax(logits, dim=1).cpu().numpy()

        y_probs.extend(probs)
        y_true.extend(batch["label"].cpu().numpy())

y_true, y_probs = np.array(y_true), np.array(y_probs)

# One-vs-rest multiclass AUROC; scikit-learn raises ValueError when the score
# cannot be computed, which is reported instead of aborting the cell.
try:
    auroc = roc_auc_score(y_true, y_probs, multi_class='ovr')
except ValueError as e:
    print("AUROC could not be computed:", e)
else:
    print(f"\nMulticlass AUROC (OVR): {auroc:.4f}")



Multiclass AUROC (OVR): 0.4882
