In [1]:
from transformers import BertModel, BertTokenizer
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt

# Custom imports
from stse.bytes import bit_vect
import bert

In [2]:
# Define globals
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CONFIG = 'bert-large-uncased'

# Define BERT model
bert_model = BertModel.from_pretrained(CONFIG).to(DEVICE)

# Import data
notes_df = pd.read_csv('data/patient_notes.csv')
train_df = pd.read_csv('data/train.csv')
features_df = pd.read_csv('data/features.csv')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# One-hot encode features
features_df['feature_vect'] = [bit_vect(len(features_df) + 1, i) for i in range(len(features_df['feature_text']))]
none_row = pd.DataFrame({
        'feature_num': [-1],
        'case_num': [-1],
        'feature_text': ['NONE'],
        'feature_vect': [bit_vect(len(features_df) + 1, len(features_df))]
    }, index=[len(features_df)])
features_df = pd.concat((features_df, none_row))  # Add NONE value as a feature

# APPEND AND CLEAN DATA
data = train_df[train_df['annotation'] != '[]']  # Drop blank annotations ('[]')
data['annotation'] = [i.translate(i.maketrans('', '', '[]\'')).split(' ') for i in data['annotation']]
data = data.merge(features_df[['feature_num', 'feature_text', 'feature_vect']], on='feature_num')  # Add features
data = data.merge(notes_df[['pn_num', 'pn_history']], on='pn_num')  # Add notes
# seps = [' ', ',', ';', ':', '.', '!', '?', '-', '_', '\n']  # WORRY ABOUT THIS LATER
word_lists = data['pn_history'].apply(lambda x: np.array(x.split(' '))).to_numpy()  # Convert notes to lists of words
data = data.dropna().reset_index(drop=True)  # Drop and reindex any leftover trouble-makers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['annotation'] = [i.translate(i.maketrans('', '', '[]\'')).split(' ') for i in data['annotation']]


In [38]:
total_labels = []
for i, note in enumerate(word_lists):
    word_labels = []
    for word in note:
        if word in data['annotation'].iloc[i]:
            word_labels.append(data['feature_vect'].iloc[i])
        else:
            word_labels.append(features_df['feature_vect'].iloc[-1])  # Value for NONE
    total_labels.append(np.array(word_labels))

In [39]:
# Tokenize word lists
tokenizer = BertTokenizer.from_pretrained(CONFIG)
encoded_word_lists = [tokenizer.encode(x.tolist()) for x in word_lists]

# Cast features and labels to tensors
X = [torch.cuda.IntTensor(np.array(x).reshape(1, -1)) for x in encoded_word_lists][0:5]
# y = [torch.cuda.ByteTensor(x) for x in data['feature_vect'].to_numpy()]
y = [torch.cuda.ByteTensor(x) for x in total_labels][0:5]

In [43]:
y[0][0].shape, X[0].shape


(torch.Size([144]), torch.Size([1, 144]))

In [49]:
dataset = bert.BertDataset(X, y)
x , y = dataset.__getitem__(0)

In [61]:
x
len(word_lists[0])
print(encoded_word_lists[0])
tokenizer.decode(encoded_word_lists[0])
# torch.cuda.IntTensor(np.array(word_lists[0]).reshape(1, -1))

[101, 100, 100, 100, 7534, 2007, 100, 100, 4311, 100, 2706, 1997, 23852, 4178, 1997, 100, 100, 2041, 1997, 2026, 100, 1016, 2420, 3283, 2076, 1037, 4715, 2208, 2018, 2019, 100, 2021, 2023, 2051, 2018, 3108, 3778, 1998, 2371, 2004, 2065, 2002, 2020, 2183, 2000, 3413, 2041, 100, 2025, 4558, 100, 100, 3602, 5776, 100, 100, 100, 3952, 2000, 2817, 100, 2335, 2566, 100, 100, 3522, 4715, 100, 2165, 100, 2305, 2077, 1998, 2851, 1997, 100, 100, 100, 1997, 100, 100, 100, 100, 100, 100, 3431, 1999, 100, 3431, 1999, 100, 21419, 100, 3431, 1999, 100, 2030, 100, 100, 100, 100, 3594, 2814, 100, 3566, 2007, 100, 100, 3611, 2007, 3522, 2540, 100, 100, 2039, 2000, 100, 100, 1999, 100, 100, 100, 8974, 1017, 6385, 1013, 2733, 100, 100, 23439, 100, 100, 2667, 100, 100, 3161, 2007, 6513, 1060, 1015, 100, 3594, 29094, 102]


'[CLS] [UNK] [UNK] [UNK] presents with [UNK] [UNK] reports [UNK] months of intermittent episodes of [UNK] [UNK] out of my [UNK] 2 days ago during a soccer game had an [UNK] but this time had chest pressure and felt as if he were going to pass out [UNK] not lose [UNK] [UNK] note patient [UNK] [UNK] [UNK] primarily to study [UNK] times per [UNK] [UNK] recent soccer [UNK] took [UNK] night before and morning of [UNK] [UNK] [UNK] of [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] changes in [UNK] changes in [UNK] abdominal [UNK] changes in [UNK] or [UNK] [UNK] [UNK] [UNK] uses friends [UNK] mom with [UNK] [UNK] dad with recent heart [UNK] [UNK] up to [UNK] [UNK] in [UNK] [UNK] [UNK] drinks 3 nights / week [UNK] [UNK] denies [UNK] [UNK] trying [UNK] [UNK] active with girlfriend x 1 [UNK] uses condoms [SEP]'

In [18]:
# Model params
LEARNING_RATE = 0.001
EPOCHS = 10

# Loss function, model, and optimizer
criterion = nn.CrossEntropyLoss()
model = bert.BertBased(num_classes=144, bert_config=CONFIG).to(DEVICE)
optimizer = torch.optim.Adam(lr=LEARNING_RATE, params=model.parameters())

# Loss history over epochs for plotting
train_loss_history = []
val_loss_history = []

for epoch in range(EPOCHS):
    print(f'EPOCH: {epoch}')
    
    # Initialize single-epoch loss
    epoch_train_loss = []
    epoch_val_loss = []
    
    for note, target in zip(X, y):
        # Zero out gradient every batch
        model.zero_grad()
        
        # Make predictions
        pred = model(note)
        print(pred.shape)
        print(target.shape)
        
        print(pred)
        print(target)
        # Calculate loss
        loss = criterion(pred, target)
        
        # Take train step
        loss.backward()
        optimizer.step()
        
        # Compile loss
        epoch_train_loss.append(loss.item())
    
    # Append average loss over epoch to history
    train_loss_history.append(sum(epoch_train_loss) / len(epoch_train_loss))

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EPOCH: 0
torch.Size([1, 144, 144])
torch.Size([142, 144])
tensor([[[0.0027, 0.0078, 0.0053,  ..., 0.0064, 0.0063, 0.0073],
         [0.0074, 0.0074, 0.0101,  ..., 0.0061, 0.0073, 0.0075],
         [0.0078, 0.0076, 0.0075,  ..., 0.0069, 0.0083, 0.0073],
         ...,
         [0.0050, 0.0085, 0.0049,  ..., 0.0061, 0.0063, 0.0062],
         [0.0059, 0.0077, 0.0057,  ..., 0.0080, 0.0065, 0.0061],
         [0.0076, 0.0062, 0.0044,  ..., 0.0077, 0.0048, 0.0130]]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1]], device='cuda:0', dtype=torch.uint8)


ValueError: Expected input batch_size (1) to match target batch_size (142).

In [None]:
# Plot loss over epochs
plt.figure()
plt.title('Cross Entropy Loss')
plt.plot(range(EPOCHS), train_loss_history, label='Train loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

Batch = 1 note
Sample = 1 word

each word needs its own vector (MOST WILL BE NONE)