In [1]:
from transformers import BertModel, BertTokenizer
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt

# Custom imports
from stse.bytes import bit_vect
import bert

In [2]:
# Define globals
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CONFIG = 'bert-large-uncased'

# Define BERT model
bert_model = BertModel.from_pretrained(CONFIG).to(DEVICE)

# Import data
notes_df = pd.read_csv('data/patient_notes.csv')
train_df = pd.read_csv('data/train.csv')
features_df = pd.read_csv('data/features.csv')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# One-hot encode features
features_df['feature_vect'] = [bit_vect(len(features_df) + 1, i) for i in range(len(features_df['feature_text']))]
none_row = pd.DataFrame({
        'feature_num': [-1],
        'case_num': [-1],
        'feature_text': ['NONE'],
        'feature_vect': [bit_vect(len(features_df) + 1, len(features_df))]
    }, index=[len(features_df)])
features_df = pd.concat((features_df, none_row))  # Add NONE value as a feature

# APPEND AND CLEAN DATA
data = train_df[train_df['annotation'] != '[]']  # Drop blank annotations ('[]')
data['annotation'] = [i.translate(i.maketrans('', '', '[]\'')).split(' ') for i in data['annotation']]
data = data.merge(features_df[['feature_num', 'feature_text', 'feature_vect']], on='feature_num')  # Add features
data = data.merge(notes_df[['pn_num', 'pn_history']], on='pn_num')  # Add notes
# seps = [' ', ',', ';', ':', '.', '!', '?', '-', '_', '\n']  # WORRY ABOUT THIS LATER
word_lists = data['pn_history'].apply(lambda x: np.array(x.split(' '))).to_numpy()  # Convert notes to lists of words
data = data.dropna().reset_index(drop=True)  # Drop and reindex any leftover trouble-makers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['annotation'] = [i.translate(i.maketrans('', '', '[]\'')).split(' ') for i in data['annotation']]


In [4]:
none_vect = features_df['feature_vect'].iloc[-1]  # Vector value for NONE
total_labels = []
for i, note in enumerate(word_lists):
    word_labels = [none_vect]  # Pad first with NONE bc start token [CLS] added
    for word in note:
        if word in data['annotation'].iloc[i]:
            word_labels.append(data['feature_vect'].iloc[i])
        else:
            word_labels.append(none_vect)
    word_labels.append(none_vect)  # Pad last with NONE bc end token [SEP] added
    total_labels.append(np.array(word_labels))

In [7]:
new_feature = range(len(features_df))
new_feature
features_df['new_feature'] = new_feature

In [10]:
features_df

Unnamed: 0,feature_num,case_num,feature_text,feature_vect,new_feature
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,1,0,Family-history-of-thyroid-disorder,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,2,0,Chest-pressure,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
3,3,0,Intermittent-symptoms,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
4,4,0,Lightheaded,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",4
...,...,...,...,...,...
139,913,9,Female,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",139
140,914,9,Photophobia,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",140
141,915,9,No-known-illness-contacts,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",141
142,916,9,Subjective-fever,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",142


In [None]:
# Tokenize word lists
tokenizer = BertTokenizer.from_pretrained(CONFIG)
encoded_word_lists = [tokenizer.encode(x.tolist()) for x in word_lists]

# Cast features and labels to tensors
X = [torch.cuda.IntTensor(np.array(x).reshape(1, -1)) for x in encoded_word_lists][0:5]
# y = [torch.cuda.ByteTensor(x) for x in data['feature_vect'].to_numpy()]
y = [torch.cuda.ByteTensor(x) for x in total_labels][0:5]

In [None]:
y[0].shape, X[0].shape


In [None]:
import bert

# Model params
LEARNING_RATE = 0.001
EPOCHS = 10

# Loss function, model, and optimizer
criterion = nn.CrossEntropyLoss()
model = bert.BertBased(num_classes=144, bert_config=CONFIG).to(DEVICE)
optimizer = torch.optim.Adam(lr=LEARNING_RATE, params=model.parameters())

# Loss history over epochs for plotting
train_loss_history = []
val_loss_history = []

for epoch in range(EPOCHS):
    print(f'EPOCH: {epoch}')
    
    # Initialize single-epoch loss
    epoch_train_loss = []
    epoch_val_loss = []
    
    for note, target in zip(X, y):
        # Zero out gradient every batch
        model.zero_grad()
        
        # Make predictions
        pred = model(note)
        print(pred.shape)
        print(target.shape)
        
        print(pred)
        print(target)
        # Calculate loss
        loss = criterion(pred, target)
        
        # Take train step
        loss.backward()
        optimizer.step()
        
        # Compile loss
        epoch_train_loss.append(loss.item())
    
    # Append average loss over epoch to history
    train_loss_history.append(sum(epoch_train_loss) / len(epoch_train_loss))

In [None]:
# Plot loss over epochs
plt.figure()
plt.title('Cross Entropy Loss')
plt.plot(range(EPOCHS), train_loss_history, label='Train loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

Batch = 1 note
Sample = 1 word

each word needs its own vector (MOST WILL BE NONE)