In [5]:
import pandas as pd

In [6]:
# Read the previously saved merged frame back from disk.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
merged_pickle_path = 'final_merged_data.pkl'
final_merged_data = pd.read_pickle(merged_pickle_path)

# List the available columns, then preview the first five rows.
print(final_merged_data.columns)

final_merged_data.head(n=5)


Index(['row_id_x', 'SUBJECT_ID', 'HADM_ID', 'seq_num_x', 'ICD9_CODE', 'ROW_ID',
       'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION',
       'CGID', 'TEXT', 'TEXT_LENGTH', 'CLEAN_TEXT', 'icd9_code_x',
       'long_title', 'row_id_y', 'seq_num_y', 'icd9_code_y'],
      dtype='object')


Unnamed: 0,row_id_x,SUBJECT_ID,HADM_ID,seq_num_x,ICD9_CODE,ROW_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,TEXT,TEXT_LENGTH,CLEAN_TEXT,icd9_code_x,long_title,row_id_y,seq_num_y,icd9_code_y
0,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47335,1,9749
1,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47336,2,5491
2,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47337,3,3895
3,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47338,4,3995
4,112344,10006,142345,1,99591,1394273,2164-10-25,2164-10-25 07:16:00,2164-10-25 07:23:00,Nursing/other,Report,19150,NPN 1900-0700\nPt awaiting transfer to floor w...,493,npn pt awaiting transfer floor floor bed becom...,,,47339,5,3893


In [7]:
from transformers import BertTokenizer
import torch

# Load the pretrained 'bert-base-uncased' tokenizer. It is kept as a notebook-level
# global: tokenize_function reads it, and later cells save it and reuse it for
# feature extraction.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_function(text, max_length=128):
    """Tokenize ``text`` with the notebook-level BERT ``tokenizer``.

    Args:
        text: Input forwarded unchanged to the tokenizer (a single string, or a
            list of strings for batched tokenization).
        max_length: Length every sequence is truncated and padded to. Defaults
            to 128, the value that used to be hard-coded here.

    Returns:
        The tokenizer output holding PyTorch tensors, including ``input_ids``
        and ``attention_mask``.
    """
    return tokenizer(
        text,
        add_special_tokens=True,     # Add [CLS] and [SEP]
        max_length=max_length,       # Truncation / padding length
        truncation=True,
        padding='max_length',        # Always pad to max_length so rows can be stacked
        return_attention_mask=True,  # Mask distinguishes real tokens from padding
        return_tensors='pt'          # Return PyTorch tensors
    )

# Collect the cleaned note text, one entry per row of final_merged_data.
# Missing values are replaced with '' (and everything is cast to str) so the
# tokenizer never receives a float NaN; the row count is preserved so the
# tensors stay aligned with the labels built from the same frame.
texts = final_merged_data['CLEAN_TEXT'].fillna('').astype(str).tolist()

# Tokenize all texts in one batched call instead of one call per row.
# Every row is padded/truncated to the same length by tokenize_function,
# so the result is already a (num_rows, max_length) tensor and no
# torch.cat over thousands of single-row tensors is needed.
tokenized_inputs = tokenize_function(texts)

# Extract input_ids and attention_masks for model input
input_ids = tokenized_inputs['input_ids']
attention_masks = tokenized_inputs['attention_mask']




In [8]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct ICD9_CODE value to an integer class id.
icd9_codes = final_merged_data['ICD9_CODE']
label_encoder = LabelEncoder()
label_encoder.fit(icd9_codes)

# Encode the column with the fitted encoder and wrap the ids in a torch tensor.
labels = torch.tensor(label_encoder.transform(icd9_codes))


In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create a TensorDataset and DataLoader
batch_size = 16
split_seed = 42  # fixed seed so the train/validation partition is identical on every run
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split the dataset into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# NOTE(review): the preview of final_merged_data shows the same note text repeated
# across several rows. A row-level random split can therefore put copies of one
# note in both sets - consider de-duplicating or splitting by HADM_ID upstream.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create DataLoader for training (shuffled) and validation (fixed order)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


In [10]:
from transformers import BertForSequenceClassification, AdamW

# Load pre-trained BERT with a classification head sized to the encoded labels.
# The classifier weights are newly initialised (see the warning printed below),
# so the model must be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),  # One output per distinct label class
    output_attentions=False,
    output_hidden_states=False
)

# Move model to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# is passed explicitly because torch defaults to 0.01, whereas the transformers
# implementation defaulted to 0.0 - this keeps the optimisation unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.nn import CrossEntropyLoss
import torch

# Fine-tune the classifier for a fixed number of passes over the training set
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')

    # Switch to training mode and reset the running loss for this epoch
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move the three batch tensors to the target device; labels are cast to Long
        batch_input_ids, batch_attention_masks, batch_labels = [t.to(device) for t in batch]
        batch_labels = batch_labels.long()

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Labels are passed in so the output carries the loss
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )
        loss = outputs.loss

        # Backpropagate, clip the gradient norm to 1.0, then update the weights
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # The scalar loss value is unaffected by backward()/step(), so it can be read here
        total_loss += loss.item()

    # Mean loss per batch over the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')


Epoch 1/3


In [None]:
# Evaluate the fine-tuned classifier on the validation split.
from sklearn.metrics import classification_report

model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)

        # Forward pass (no labels: only the logits are needed here)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits

        # Predicted class = index of the largest logit; collect as flat Python lists
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        true_labels.extend(batch_labels.cpu().tolist())

# Evaluate with accuracy, precision, recall, F1-score.
# labels= lists every encoded class id explicitly: without it the report infers the
# classes from the validation data and raises when that count differs from
# target_names (classes missing from the validation split). Names are stringified
# because the report formats them as text.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(code) for code in label_encoder.classes_]
print(classification_report(
    true_labels,
    predictions,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,  # classes with no predictions/support score 0 without warnings
))


In [None]:
# Persist the fine-tuned model and its tokenizer, each to its own location.
for artifact, target_dir in (
    (model, 'path_to_save_model'),
    (tokenizer, 'path_to_save_tokenizer'),
):
    artifact.save_pretrained(target_dir)


In [None]:
from transformers import BertModel

# Load pre-trained BERT for feature extraction
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
bert_model.eval()  # inference only: make sure dropout is disabled

# Extract one mean-pooled embedding per text.
# Previously `embeddings` was overwritten on every iteration, so only the vector
# of the last text survived; the per-batch results are now collected and stacked.
embedding_batch_size = 32
embedding_chunks = []

with torch.no_grad():
    for start in range(0, len(texts), embedding_batch_size):
        batch_texts = texts[start:start + embedding_batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        outputs = bert_model(**inputs)

        # Average of token embeddings, weighted by the attention mask so that the
        # padding added for batching does not contribute to the mean.
        token_mask = inputs['attention_mask'].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
        summed = (outputs.last_hidden_state * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)

        # Keep results on the CPU so they do not accumulate in GPU memory
        embedding_chunks.append((summed / token_counts).cpu())

# One row per entry of `texts` and one column per hidden unit; empty if there are no texts
if embedding_chunks:
    embeddings = torch.cat(embedding_chunks, dim=0)
else:
    embeddings = torch.empty(0, bert_model.config.hidden_size)
