In [18]:
!pip install transformers
# if you want to use Hugging Face datasets
!pip install datasets
# PyTorch is required for training
!pip install torch


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [19]:
import pandas as pd
import pickle
import torch
import transformers


In [20]:
# Load the saved pickle file
final_merged_data = pd.read_pickle('final_merged_data.pkl')


In [34]:
# Assuming 'ICD9_CODE' is your label column
unique_icd9_codes = final_merged_data['ICD9_CODE'].unique()
num_icd9_labels = len(unique_icd9_codes)

print(f"Number of unique ICD-9 labels: {num_icd9_labels}")
final_merged_data.head(5)

Number of unique ICD-9 labels: 278


Unnamed: 0,row_id_x,SUBJECT_ID,HADM_ID,seq_num_x,ICD9_CODE,ROW_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT,TEXT_LENGTH,CLEAN_TEXT,icd9_code_x,long_title,row_id_y,seq_num_y,icd9_code_y
0,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47335,1,9749
1,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47336,2,5491
2,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47337,3,3895
3,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47338,4,3995
4,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47339,5,3893


In [54]:
import pandas as pd

# Loading D_ICD_PROCEDURES.csv (assuming this contains 'icd9_code' and 'long_title' columns)
icd9_procedures = pd.read_csv('data/D_ICD_PROCEDURES.csv')

# Extracting the relevant columns from final_merged_data
# Assuming 'ICD9_CODE' is the ICD-9 code in final_merged_data
# Adjusting the column names as per our actual DataFrame
# Ensure both columns are the same type (convert to string)
final_merged_data['ICD9_CODE'] = final_merged_data['ICD9_CODE'].astype(str)
icd9_procedures['icd9_code'] = icd9_procedures['icd9_code'].astype(str)

icd9_data = final_merged_data[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'CLEAN_TEXT']]  # Select relevant columns

# Merge selected_data with the descriptions from icd9_procedures on the ICD-9 code
merged_icd9_description = pd.merge(
    icd9_data, 
    icd9_procedures[['icd9_code', 'long_title']],  # Selecting ICD-9 code and its description
    left_on='ICD9_CODE', 
    right_on='icd9_code', 
    how='left'  # Use 'left' to keep all rows from selected_data
)

# Reset the index to avoid issues with missing column names
# merged_icd9_description.reset_index(drop=True, inplace=True)
merged_icd9_description.index.name = 'Record_ID'

# Remove rows where 'long_title' (ICD-9 description) is NaN
merged_icd9_description= merged_icd9_description.dropna(subset=['long_title'])
# Display the new DataFrame with ICD-9 descriptions
merged_icd9_description.head()

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICD9_CODE,CLEAN_TEXT,icd9_code,long_title
Record_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
56,10006,142345,5672,npn pt awaiting transfer floor floor bed becom...,5672,Revision of ureterointestinal anastomosis
57,10006,142345,5672,npn pt awaiting transfer floor floor bed becom...,5672,Revision of ureterointestinal anastomosis
58,10006,142345,5672,npn pt awaiting transfer floor floor bed becom...,5672,Revision of ureterointestinal anastomosis
59,10006,142345,5672,npn pt awaiting transfer floor floor bed becom...,5672,Revision of ureterointestinal anastomosis
60,10006,142345,5672,npn pt awaiting transfer floor floor bed becom...,5672,Revision of ureterointestinal anastomosis


In [55]:
print(merged_icd9_description.columns)
merged_icd9_description.info()

Index(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'CLEAN_TEXT', 'icd9_code',
       'long_title'],
      dtype='object')


In [None]:
from transformers import pipeline

# Use BERT base model instead of BioBERT
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Apply NER to text
texts = merged_icd9_description['CLEAN_TEXT'][:1000].tolist()
ner_labels = ner_pipeline(texts)

# Add the NER labels back to the DataFrame
# merged_icd9_description['NER_LABELS'] = ner_labels


In [None]:
# Saving the DataFrame to a CSV file
merged_icd9_description.to_csv('fmerged_icd9_description.csv', index=False)

# we can also save it as a pickle file (smaller size, faster load time)
merged_icd9_description.to_pickle('merged_icd9_description.pkl')


In [73]:
import os
import pandas as pd
from transformers import pipeline

# Load BioBERT NER pipeline
ner_pipeline = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", tokenizer="dmis-lab/biobert-base-cased-v1.1", aggregation_strategy="simple")

# Function to apply NER and handle truncation manually
def generate_ner_labels(text, max_length=512):
    tokens = ner_pipeline.tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)["input_ids"][0]
    
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    entities = []
    
    for chunk in chunks:
        chunk_text = ner_pipeline.tokenizer.decode(chunk, skip_special_tokens=True)
        chunk_entities = ner_pipeline(chunk_text)
        entities.extend(chunk_entities)
        
    return entities

# File to save the progress in .pkl format
checkpoint_file = "ner_labels_checkpoint.pkl"

# Check if the checkpoint file already exists
if os.path.exists(checkpoint_file):
    print(f"Resuming from {checkpoint_file}")
    ner_data = pd.read_pickle(checkpoint_file)
    last_processed = len(ner_data.dropna(subset=['NER_LABELS']))
else:
    # Create a new DataFrame for NER labels
    ner_data = pd.DataFrame(columns=['SUBJECT_ID', 'HADM_ID', 'CLEAN_TEXT', 'NER_LABELS'])
    last_processed = 0

# Apply NER to each text in your dataset in batches
batch_size = 100
texts = merged_icd9_description['CLEAN_TEXT'].tolist()

# Process texts in batches starting from last processed point
for i in range(last_processed, len(texts), batch_size):
    print(f"Processing batch {i} to {i + batch_size}")
    batch = texts[i:i + batch_size]
    batch_labels = [generate_ner_labels(text) for text in batch]

    # Ensure the lengths of arrays match
    if len(batch_labels) == len(batch):
        # Collecting corresponding SUBJECT_ID, HADM_ID, and CLEAN_TEXT for the batch
        batch_df = pd.DataFrame({
            'SUBJECT_ID': merged_icd9_description.loc[i:i+batch_size-1, 'SUBJECT_ID'].values,
            'HADM_ID': merged_icd9_description.loc[i:i+batch_size-1, 'HADM_ID'].values,
            'CLEAN_TEXT': merged_icd9_description.loc[i:i+batch_size-1, 'CLEAN_TEXT'].values,
            'NER_LABELS': batch_labels
        })

        # Append the batch to ner_data
        ner_data = pd.concat([ner_data, batch_df], ignore_index=True)

        # Save progress every 1000 rows (or choose your own frequency)
        if (i % 1000 == 0) or (i + batch_size >= len(texts)):
            print(f"Saving progress at batch {i} to {i + batch_size}")
            ner_data.to_pickle(checkpoint_file)
    else:
        print(f"Length mismatch in batch {i} to {i + batch_size}. Skipping this batch.")

# Final save after all processing in .pkl format
ner_data.to_pickle("final_ner_labels.pkl")

# View the results
ner_data[['SUBJECT_ID', 'HADM_ID', 'CLEAN_TEXT', 'NER_LABELS']].head()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing batch 0 to 100


ValueError: All arrays must be of the same length

In [74]:
batch_size = 100  # Adjust as necessary
total_batches = len(texts) // batch_size
print_interval = 100  # How frequently to print status

for i in range(0, total_batches):
    batch_texts = texts[i*batch_size:(i+1)*batch_size]
    
    # Apply NER to the batch (you might have your own NER pipeline call)
    ner_labels = ner_pipeline(batch_texts)
    
    # Save or process the results as needed here

    if i % print_interval == 0:
        print(f"Processing batch {i} out of {total_batches} batches")


Processing batch 0 out of 326 batches


RuntimeError: The size of tensor a (532) must match the size of tensor b (512) at non-singleton dimension 1

In [75]:
import time

batch_size = 100
total_batches = len(texts) // batch_size
print_interval = 100  # How often to print progress
start_time = time.time()

for i in range(0, total_batches):
    batch_texts = texts[i*batch_size:(i+1)*batch_size]
    
    # Apply NER to the batch
    ner_labels = ner_pipeline(batch_texts)
    
    # Save or process results here

    # Print progress and time estimation
    if i % print_interval == 0:
        elapsed_time = time.time() - start_time
        avg_time_per_batch = elapsed_time / (i + 1)
        remaining_time = avg_time_per_batch * (total_batches - i - 1)
        
        print(f"Processing batch {i} of {total_batches}")
        print(f"Time elapsed: {elapsed_time:.2f}s, Estimated time remaining: {remaining_time:.2f}s")


Processing batch 0 of 326
Time elapsed: 26.88s, Estimated time remaining: 8735.17s


RuntimeError: The size of tensor a (532) must match the size of tensor b (512) at non-singleton dimension 1

In [69]:
from transformers import pipeline

# Load BioBERT NER pipeline
ner_pipeline = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", tokenizer="dmis-lab/biobert-base-cased-v1.1", aggregation_strategy="simple")

# Function to apply NER and handle truncation manually
def generate_ner_labels(text, max_length=512):
    tokens = ner_pipeline.tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)["input_ids"][0]
    
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    entities = []
    
    for chunk in chunks:
        chunk_text = ner_pipeline.tokenizer.decode(chunk, skip_special_tokens=True)
        chunk_entities = ner_pipeline(chunk_text)
        entities.extend(chunk_entities)
        
    return entities

# Apply NER to each text in your dataset in batches
batch_size = 100
ner_labels = []
texts = merged_icd9_description['CLEAN_TEXT'].tolist()

# Process texts in batches to avoid memory overload
for i in range(0, len(texts), batch_size):
    print(f"Processing batch {i} to {i + batch_size}")
    batch = texts[i:i + batch_size]
    batch_labels = [generate_ner_labels(text) for text in batch]
    ner_labels.extend(batch_labels)

# Add the NER labels back to the DataFrame
merged_icd9_description['NER_LABELS'] = ner_labels

# View the results
merged_icd9_description[['CLEAN_TEXT', 'NER_LABELS']].head()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing batch 0 to 100
Processing batch 100 to 200
Processing batch 200 to 300
Processing batch 300 to 400
Processing batch 400 to 500
Processing batch 500 to 600
Processing batch 600 to 700
Processing batch 700 to 800
Processing batch 800 to 900
Processing batch 900 to 1000
Processing batch 1000 to 1100
Processing batch 1100 to 1200
Processing batch 1200 to 1300
Processing batch 1300 to 1400
Processing batch 1400 to 1500
Processing batch 1500 to 1600
Processing batch 1600 to 1700
Processing batch 1700 to 1800
Processing batch 1800 to 1900
Processing batch 1900 to 2000
Processing batch 2000 to 2100
Processing batch 2100 to 2200
Processing batch 2200 to 2300
Processing batch 2300 to 2400
Processing batch 2400 to 2500
Processing batch 2500 to 2600
Processing batch 2600 to 2700
Processing batch 2700 to 2800
Processing batch 2800 to 2900
Processing batch 2900 to 3000
Processing batch 3000 to 3100
Processing batch 3100 to 3200
Processing batch 3200 to 3300
Processing batch 3300 to 3400
P

KeyboardInterrupt: 

In [None]:
from transformers import pipeline
import torch

# Load the BioBERT model for NER
ner_pipeline = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", tokenizer="dmis-lab/biobert-base-cased-v1.1", aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

# Function to batch NER
def generate_ner_labels_batch(texts, max_length=512, batch_size=16):
    all_entities = []
    # Process texts in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Tokenize each batch
        tokenized_batch = [ner_pipeline.tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length, padding=True) for text in batch_texts]
        # Apply NER to the tokenized batch
        for tokenized_text in tokenized_batch:
            chunk_entities = ner_pipeline(ner_pipeline.tokenizer.decode(tokenized_text['input_ids'][0], skip_special_tokens=True))
            all_entities.append(chunk_entities)
    return all_entities

# Apply NER in batches to the 'TEXT' column
texts = merged_icd9_description['CLEAN_TEXT'].tolist()
ner_labels = generate_ner_labels_batch(texts[:1000])

# Adding the NER labels back to the DataFrame
merged_icd9_description['NER_LABELS'] = ner_labels

# Viewing the generated NER labels
merged_icd9_description[['CLEAN_TEXT', 'NER_LABELS']].head()


In [30]:
# Assuming 'final_merged_data' has a column 'CLEAN_TEXT' and labels (NER labels)
texts = final_merged_data['CLEAN_TEXT'].tolist()
labels = final_merged_data['NER_LABELS'].tolist()  # Assuming we have token-level labels


KeyError: 'NER_LABELS'

In [29]:
from transformers import BertTokenizerFast

# Load the BERT tokenizer (this can be replaced with BioBERT's tokenizer)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize the text and align labels with tokens
def tokenize_and_align_labels(texts, labels):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, is_split_into_words=True)
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their word ids
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword token
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)
    
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

tokenized_inputs = tokenize_and_align_labels(texts, labels)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number

### Train a BERT Model

In [27]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

# Load the BERT model (can be replaced with BioBERT)
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_inputs,  # Your training dataset
    eval_dataset=tokenized_inputs,   # Your evaluation dataset
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


NameError: name 'label_list' is not defined

In [26]:
from transformers import pipeline

# Load BioBERT model for NER
ner_pipeline = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", tokenizer="dmis-lab/biobert-base-cased-v1.1", aggregation_strategy="simple")

# Function to apply NER on text and handle tokenization within the 512 token limit
def ner_on_long_text(text, max_length=512):
    # Tokenize the text and split into chunks if necessary
    tokens = ner_pipeline.tokenizer(text, return_tensors="pt", truncation=False)["input_ids"]
    chunks = [tokens[0][i:i+max_length] for i in range(0, tokens.shape[1], max_length)]
    
    entities = []
    for chunk in chunks:
        # Convert chunk back to text
        chunk_text = ner_pipeline.tokenizer.decode(chunk, skip_special_tokens=True)
        # Apply NER to the chunk
        entities.extend(ner_pipeline(chunk_text))
    
    return entities

# Apply NER to the 'CLEAN_TEXT' column in the dataset
final_merged_data['NER_ENTITIES'] = final_merged_data['CLEAN_TEXT'].apply(ner_on_long_text)

# Display the first few rows with extracted NER entities
final_merged_data[['CLEAN_TEXT', 'NER_ENTITIES']].head()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1

In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load BERT or BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"  # BioBERT model
# For BERT, you can use 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_icd9_labels)  # Define the number of labels based on your task


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def tokenize_data(texts, labels, tokenizer):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    labels = torch.tensor(labels)  # Convert labels to tensor
    return inputs, labels

# Assuming 'final_merged_data' contains the text data and the corresponding ICD-9 codes as labels
texts = final_merged_data['CLEAN_TEXT'].tolist()  # Clinical text
labels = final_merged_data['ICD9_CODE'].astype('category').cat.codes.tolist()  # Labels for classification (convert to categorical)

# Tokenize the data
inputs, labels = tokenize_data(texts, labels, tokenizer)


In [25]:
# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(inputs['input_ids'], labels, test_size=0.2, random_state=42)
