In [1]:
# Cell 0 - Install dependencies
!pip install transformers
!pip install nltk
!pip install scikit-learn
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Cell 2 - Setup spaCy
!pip install -U spacy
import spacy

# Download English tokenizer model
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
# Upload input file(s) into the Colab working directory; the recorded output
# shows admissions.csv being saved this way.
# NOTE(review): a later cell also reads testing_noteevents.csv and
# diagnoses_icd.csv — presumably uploaded in an earlier run of this cell; confirm.
from google.colab import files

uploaded = files.upload()


Saving admissions.csv to admissions.csv


In [4]:
# Cell 1 - Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import sent_tokenize
# Fetch the NLTK "punkt" package, presumably for the sent_tokenize import above.
# NOTE(review): the visible cells split sentences with spaCy, not sent_tokenize —
# confirm this download is still needed.
nltk.download('punkt')
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# --- Read the three input tables from the working directory ---
notes_df = pd.read_csv("testing_noteevents.csv")
diagnoses_df = pd.read_csv("diagnoses_icd.csv")
admissions_df = pd.read_csv(
    "admissions.csv",
    parse_dates=["admittime", "dischtime"],
)

# CHARTTIME arrives as text; convert it to datetime.
notes_df["CHARTTIME"] = pd.to_datetime(notes_df["CHARTTIME"])

# Attach each note's admission/discharge window. The inner join keeps only
# notes whose HADM_ID has a matching hadm_id in the admissions table.
merged_df = notes_df.merge(
    admissions_df[["hadm_id", "admittime", "dischtime"]],
    how="inner",
    left_on="HADM_ID",
    right_on="hadm_id",
)

# The stay-period filter below is currently disabled, so every merged note is kept.
filtered_notes = merged_df

# Filter notes within the stay period
# filtered_notes = merged_df[
#     (merged_df["CHARTTIME"] >= merged_df["admittime"]) &
#     (merged_df["CHARTTIME"] <= merged_df["dischtime"])
# ][["SUBJECT_ID", "HADM_ID", "CHARTTIME", "TEXT"]].dropna().reset_index(drop=True)

print(filtered_notes)


    ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE           CHARTTIME  \
0      293    10019385  20297618  2102-06-20 2102-06-20 08:20:00   
1     2088    10014354  22741225  2100-10-19 2100-10-19 05:23:00   
2     3100    10019003  26703331  2102-05-04 2102-05-04 18:32:00   
3     1788    10014354  26013492  2100-11-06 2100-11-06 20:52:00   
4     3483    10023117  21607814  2101-11-01 2101-11-01 18:03:00   
..     ...         ...       ...         ...                 ...   
95    4203    10002930  23720373  2100-09-13 2100-09-13 04:08:00   
96    2373    10000032  22595853  2100-05-17 2100-05-17 15:51:00   
97    3227    10003400  26467376  2101-06-28 2101-06-28 19:04:00   
98    2709    10027445  26275841  2100-11-21 2100-11-21 15:06:00   
99    4434    10002428  25797028  2102-04-15 2102-04-15 17:13:00   

              STORETIME           CATEGORY     DESCRIPTION  CGID  ISERROR  \
0   2102-06-20 09:12:00  Discharge summary  Simulated note   NaN      NaN   
1   2100-10-19 06:24:00  Disc

In [9]:
# Load the small English spaCy pipeline used by the sentence splitter below.
# NOTE(review): this repeats the load from the spaCy setup cell — presumably so
# the cell still works after the runtime restart the installer recommends; confirm.
import spacy
nlp = spacy.load("en_core_web_sm")

def split_into_sentences_spacy(df):
    """Split each note in ``df`` into sentences with the module-level ``nlp`` pipeline.

    Args:
        df: DataFrame with a ``HADM_ID`` column and a ``TEXT`` column holding
            the note text. Rows whose ``TEXT`` is null are skipped.

    Returns:
        A DataFrame with one row per sentence and exactly two columns,
        ``HADM_ID`` and ``sentence`` (the sentence text with surrounding
        whitespace removed). The two columns are present even when no
        sentence was produced, so downstream column access never fails.
    """
    sentence_data = []
    # Iterate over the two needed columns directly instead of building a
    # Series for every row with iterrows().
    for hadm_id, text in zip(df["HADM_ID"], df["TEXT"]):
        if pd.isnull(text):
            continue
        for sent in nlp(text).sents:
            sentence = sent.text.strip()
            # Whitespace-only spans carry no content for the classifier.
            if not sentence:
                continue
            sentence_data.append({"HADM_ID": hadm_id, "sentence": sentence})
    # Name the columns explicitly: an empty record list would otherwise
    # produce a frame with no columns at all.
    return pd.DataFrame(sentence_data, columns=["HADM_ID", "sentence"])

# Split every note in filtered_notes into sentences (one row per sentence,
# keyed by HADM_ID) and print the resulting frame.
sentences_df = split_into_sentences_spacy(filtered_notes)
print(sentences_df)


     HADM_ID                                           sentence
0   20297618  This is a simulated discharge note for subject...
1   22741225  This is a simulated discharge note for subject...
2   26703331  This is a simulated discharge note for subject...
3   26013492  This is a simulated discharge note for subject...
4   21607814  This is a simulated discharge note for subject...
..       ...                                                ...
95  23720373  This is a simulated discharge note for subject...
96  22595853  This is a simulated discharge note for subject...
97  26467376  This is a simulated discharge note for subject...
98  26275841  This is a simulated discharge note for subject...
99  25797028  This is a simulated discharge note for subject...

[100 rows x 2 columns]


In [10]:
# Build {hadm_id: [icd_code, ...]} from the diagnoses table.
icd_codes_per_admission = diagnoses_df.groupby("hadm_id")["icd_code"].apply(list)
hadm_to_icd = icd_codes_per_admission.to_dict()

# Attach the code list to every sentence, discard sentences whose admission
# has no diagnoses, then expand to one (sentence, ICD code) pair per row.
# NOTE(review): after the explode, the same sentence appears once per code with
# a different label each time — confirm single-label training on these rows is intended.
sentences_df = (
    sentences_df
    .assign(labels=sentences_df["HADM_ID"].map(hadm_to_icd))
    .dropna(subset=["labels"])
    .explode("labels")
    .reset_index(drop=True)
)


In [11]:
class SentenceDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item on access.

    Args:
        dataframe: DataFrame with a ``sentence`` column (text) and a ``labels``
            column (one label per row).
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` and returning a mapping of
            tensors. Assumed to return a leading batch axis of size 1 for a
            single text — TODO confirm for non-Hugging Face tokenizers.
        label2idx: Mapping from label value to integer class index. A label
            missing from this mapping raises ``KeyError`` at construction time.
        max_length: Length every sentence is padded/truncated to (default 128,
            the previously hard-coded value).
    """

    def __init__(self, dataframe, tokenizer, label2idx, max_length=128):
        self.sentences = dataframe["sentence"].tolist()
        self.labels = [label2idx[label] for label in dataframe["labels"]]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for sentence ``idx`` plus its ``label`` tensor."""
        text = self.sentences[idx]
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse any other size-1 axis (e.g. when max_length == 1).
        item = {key: val.squeeze(0) for key, val in inputs.items()}
        # Class indices are stored as int64 tensors.
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [12]:
class ClinicalBertClassifier(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        num_labels: Number of output classes (width of the linear layer).
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            Defaults to the previously hard-coded
            ``"emilyalsentzer/Bio_ClinicalBERT"``.
    """

    def __init__(self, num_labels, model_name="emilyalsentzer/Bio_ClinicalBERT"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # The classifier input width follows the encoder's configured hidden size.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for each sequence in the batch.

        Both arguments are forwarded to the encoder by keyword. The linear
        layer is applied to position 0 along the second axis of the encoder's
        ``last_hidden_state`` (the [CLS] token).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return self.classifier(cls_output)


In [23]:
# Restrict the task to the 20 most frequent ICD codes.
label_frequencies = sentences_df["labels"].value_counts()
top_labels = label_frequencies.nlargest(20).index

# Drop every row whose code is outside that set.
is_top_label = sentences_df["labels"].isin(top_labels)
sentences_df = sentences_df[is_top_label].reset_index(drop=True)

# Index the remaining codes in sorted order, and build the reverse lookup.
label2idx = {}
for class_index, icd_code in enumerate(sorted(sentences_df["labels"].unique())):
    label2idx[icd_code] = class_index
idx2label = {class_index: icd_code for icd_code, class_index in label2idx.items()}


tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


In [24]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the ICD code, with a fixed random_state.
train_df, test_df = train_test_split(
    sentences_df,
    stratify=sentences_df["labels"],
    test_size=0.3,
    random_state=42,
)

# Wrap each split in a tokenizing dataset that shares the tokenizer and label index.
train_dataset, test_dataset = (
    SentenceDataset(split_df, tokenizer, label2idx)
    for split_df in (train_df, test_df)
)

# Only the training loader shuffles; both use batches of 8.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)


In [25]:
from sklearn.utils.class_weight import compute_class_weight

# `device` is used by this cell and by the evaluation cell, so define it here:
# use the GPU when one is available, otherwise fall back to the CPU. Without
# this the cell raises NameError on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "balanced" class weights computed from the training labels, listed in
# class-index order (the values of label2idx).
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(list(label2idx.values())),
    y=[label2idx[label] for label in train_df["labels"]]
)

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


# Initialize model, optimizer, loss
# NOTE(review): no RNG seed is set in the visible cells, so classifier
# initialisation and batch order differ from run to run.
model = ClinicalBertClassifier(num_labels=len(label2idx)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Training loop
for epoch in range(3):  # Keep small for demo
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The reported value is the sum of per-batch losses for the epoch, not the mean.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 76.6981
Epoch 2, Loss: 75.3089
Epoch 3, Loss: 75.1957


In [26]:
# Switch to inference mode and collect true / predicted class indices.
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].cpu().numpy()

        # The predicted class is the index of the largest logit in each row.
        logits = model(input_ids, attention_mask).cpu().numpy()
        preds = logits.argmax(axis=1)

        y_true.extend(labels)
        y_pred.extend(preds)

# Report only class indices that occur among the true or predicted labels.
eval_labels = sorted(set(y_true + y_pred))
target_names = [idx2label[class_index] for class_index in eval_labels]

report = classification_report(
    y_true,
    y_pred,
    labels=eval_labels,
    target_names=target_names,
    zero_division=0,
)
print(report)


              precision    recall  f1-score   support

       25000       1.00      0.25      0.40         4
        2724       0.05      0.17      0.07         6
        2859       0.00      0.00      0.00         3
         311       0.00      0.00      0.00         5
        4019       0.67      0.22      0.33         9
       42731       0.00      0.00      0.00         3
        4280       0.00      0.00      0.00         3
        E039       0.00      0.00      0.00         5
        E119       0.00      0.00      0.00         3
        E785       0.00      0.00      0.00         8
        F329       0.00      0.00      0.00         4
        F419       0.00      0.00      0.00         4
       I2510       0.00      0.00      0.00         3
       I4891       0.00      0.00      0.00         3
        N179       0.00      0.00      0.00         4
       V1582       0.00      0.00      0.00         5
       V5867       0.00      0.00      0.00         3
        Z794       0.00    