# Installs

In [None]:
!pip install transformers
!pip install tensorflow_addons
!pip install keras-crf

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [3]:
# BERT imports
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

In [4]:
import random

In [5]:
import tensorflow_addons as tfa
from keras_crf import CRFModel

# Data Set Up

In [6]:
# Unzip folder
!unzip /content/drive/MyDrive/266/Data/claim_dataset.zip

unzip:  cannot find or open /claim_dataset.zip, /claim_dataset.zip.zip or /claim_dataset.zip.ZIP.


In [7]:
def random_undersampler(df, percent, label='target'):
  '''Undersample class 0 to match percent subset of class 1'''
  class_1 = df[df[label] == 1]
  class_1_sample = class_1.sample(frac=percent, replace=False)
  class_1_count = len(class_1_sample)
  # Overrepresented class
  class_0 = df[df[label] == 0]
  class_0_sample = class_0.sample(class_1_count)
  full_sample = pd.concat([class_0_sample, class_1_sample], axis=0)
  return full_sample.sample(frac=1, replace=False).reset_index(drop=True)

In [8]:
def text_label_formatter(df, x_columns=['abstract_text'], label='target'):
  '''Split dataframe into features/x and labels'''
  x = pd.DataFrame()
  for col in x_columns:
    x[col] = df[col]
  labels = df[label]
  return x, labels

In [9]:
def explode_df(df):
  '''Convert claim dataset into one row per sentence format'''
  # Add sentence id to indicate order within abstract
  sentence_ids = []
  for label_list in df.labels:
    sentence_ids.append(list(range(len(label_list))))
  df['sentence_ids'] = sentence_ids

  # Explode labels, sentences, and ids
  df2 = df.explode(list(('labels','sentences', 'sentence_ids')),
                   ignore_index=True)
  df2['labels'] = df2['labels'].astype('int')
  df2['sentence_ids'] = df2['sentence_ids'].astype('int')
  return df2

In [11]:
# Load full json files
cval_df_raw = pd.read_json('/content/claim_dataset/validation_labels.json', lines=True)
ctrain_df_raw = pd.read_json('/content/claim_dataset/train_labels.json', lines=True)
ctest_df_raw = pd.read_json('/content/claim_dataset/test_labels.json', lines=True)

# Convert to single row per abstract format
cval_df = explode_df(cval_df_raw)
ctrain_df = explode_df(ctrain_df_raw)
ctest_df = explode_df(ctest_df_raw)

## Balance training dataset -> need full abstracts
# ctrain_balanced = random_undersampler(ctrain_df, 1, 'labels')

# Prepare data for embedding dataloader
ctrain_texts, ctrain_labels = text_label_formatter(ctrain_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')
cval_texts, cval_labels = text_label_formatter(cval_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')
ctest_texts, ctest_labels = text_label_formatter(ctest_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')

# Model Set Up

## For BERT Embedding

In [10]:
# Define model ID
model_id = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [11]:
# Set Random seeds
seed_val = 17
random.seed(seed_val) ## Is this the only time I use random
np.random.seed(seed_val)
torch.manual_seed(seed_val) ## Do I need these, this is for pytorch
torch.cuda.manual_seed_all(seed_val)

### Dataloading

In [14]:
# Initialize encoder
max_length = 256
batch_size = 3
tokenizer = BertTokenizer.from_pretrained(model_id, 
                                          do_lower_case=True)

In [15]:
## Load training data
# Encode
encoded_ctrain = tokenizer.batch_encode_plus(
    ctrain_texts['sentences'].values,
    add_special_tokens=True, 
    return_attention_mask=True,
    truncation=True, 
    padding='max_length', 
    max_length=max_length, 
    return_tensors='pt'
)

# Split inputs into tensors
input_ids_ctrain = encoded_ctrain['input_ids']
attention_masks_ctrain = encoded_ctrain['attention_mask']
paper_id_ctrain = torch.tensor(ctrain_texts['paper_id'])
sentence_id_ctrain = torch.tensor(ctrain_texts['sentence_ids'])
labels_ctrain = torch.tensor(ctrain_labels.values)

# Make dataset
dataset_ctrain = TensorDataset(input_ids_ctrain, attention_masks_ctrain,
                               labels_ctrain, paper_id_ctrain,
                               sentence_id_ctrain)

# Make dataloader
dataloader_ctrain = DataLoader(dataset_ctrain, 
                              sampler=RandomSampler(dataset_ctrain), 
                              batch_size=batch_size)

In [16]:
## Load testing data
# Encode
encoded_cval = tokenizer.batch_encode_plus(
    cval_texts['sentences'].values,
    add_special_tokens=True, 
    return_attention_mask=True,
    truncation=True, 
    padding='max_length', 
    max_length=max_length, 
    return_tensors='pt'
)

# Split inputs into tensors
input_ids_cval = encoded_cval['input_ids']
attention_masks_cval = encoded_cval['attention_mask']
paper_id_cval = torch.tensor(cval_texts['paper_id'])
sentence_id_cval = torch.tensor(cval_texts['sentence_ids'])
labels_cval = torch.tensor(cval_labels.values)

# Make dataset
dataset_cval = TensorDataset(input_ids_cval, attention_masks_cval, labels_cval,
                             paper_id_cval, sentence_id_cval)

# Make dataloader
dataloader_cval = DataLoader(dataset_cval, 
                                   sampler=SequentialSampler(dataset_cval), 
                                   batch_size=batch_size)

In [17]:
## Load testing data
# Encode
encoded_ctest = tokenizer.batch_encode_plus(
    ctest_texts['sentences'].values,
    add_special_tokens=True, 
    return_attention_mask=True,
    truncation=True, 
    padding='max_length', 
    max_length=max_length, 
    return_tensors='pt'
)

# Make tensors
input_ids_ctest = encoded_ctest['input_ids']
attention_masks_ctest = encoded_ctest['attention_mask']
paper_id_ctest = torch.tensor(ctest_texts['paper_id'])
sentence_id_ctest = torch.tensor(ctest_texts['sentence_ids'])
labels_ctest = torch.tensor(ctest_labels.values)

# Make dataset
dataset_ctest = TensorDataset(input_ids_ctest, attention_masks_ctest,
                              labels_ctest, paper_id_ctest, sentence_id_ctest)
# Make dataloader
dataloader_ctest = DataLoader(dataset_ctest, 
                              sampler=RandomSampler(dataset_ctest), 
                              batch_size=batch_size)

### Embedding

In [18]:
## Definitions for embeddings section

def get_embeddings(dataloader):
  '''Returns Sentence Ids, Paper Ids, and Associated Hidden States'''
  bert_model.eval()

  last_hidden_states, paper_ids, sentence_ids, true_vals, preds = [],[],[],[],[]
  for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)

    # Store abstract paper id
    paper_id = batch[3].cpu().numpy()
    paper_ids.append(paper_id)

    # Store sentence IDs
    sentence_id = batch[4].cpu().numpy()
    sentence_ids.append(sentence_id)
    
    # Store BERT inputs
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'labels':         batch[2],
              }

    # Store labels
    label_ids = inputs['labels'].cpu().numpy()
    true_vals.append(label_ids)

    # Get BERT outputs
    with torch.no_grad():        
        outputs = bert_model(**inputs)

    # Store hidden states
    hidden_states = outputs[2]
    pooled_output = torch.cat(tuple([hidden_states[i] for i in [-4, -3, -2, 1]]), dim=-1)
    pooled_output = pooled_output[:,0,:]
    last_hidden_state = pooled_output.detach().cpu().numpy()
    last_hidden_states.append(last_hidden_state)

    # Store logits
    logits = outputs[1].detach().cpu().numpy()
    preds.append(logits)

  sentence_ids = np.concatenate(sentence_ids, axis=0)
  paper_ids = np.concatenate(paper_ids, axis=0)
  true_vals = np.concatenate(true_vals, axis=0)
  last_hidden_states = np.concatenate(last_hidden_states, axis=0)
  predictions = np.concatenate(preds, axis=0)

  return paper_ids, sentence_ids, last_hidden_states, true_vals, predictions

# Convert embeddings to dataframe for sorting
def convert_embeddings_df(p_id, s_id, hidden_states, labels, preds):
  '''Take embedding outputs from Dataloader/BERT, convert to dataframe and sort
     Arrange abstracts into order, regardless of dataloading shuffling'''
  a = []
  for (p_id, s_id, hidden, label, pred) in zip(p_id, s_id, hidden_states, labels, preds):
    a.append([p_id, s_id, hidden, label, pred])
  df = pd.DataFrame(a, columns=['paper_ids', 'sentence_ids', 'hidden_states',
                                'labels', 'predictions'])
  df.sort_values(by=['paper_ids', 'sentence_ids'], inplace=True)
  return df

# Convert embedding dataframe into lists
def get_hiddenstate_tensor(df, unique_p_ids):
  '''Get hidden state tensors from dataframe grouped by paper, in order of sentence
     Final tensor should be in shape (batch size, sequence length, hidden_state size)'''
  hidden_states = []
  for paper_id in unique_p_ids:
    batch_hidden_state = []
    # Get Hidden States
    for hidden_state in df[df['paper_ids'] == paper_id]['hidden_states']:
      batch_hidden_state.append(hidden_state)
    # Pad to Seq Length 10
    for i in range(10):
      try:
        batch_hidden_state[i]
      except:
        batch_hidden_state.append(np.zeros(batch_hidden_state[i-1].size)) ## Check this
    hidden_states.append(batch_hidden_state)
  return tf.convert_to_tensor(hidden_states)

def get_prediction_tensor(df, unique_p_ids):
  '''Get prediction logits from BERT output, grouped as a list by paper in order of sentence
     Final tensor should be shape (batch size, sequence length, num_classes)'''
  predictions = []
  for paper_id in unique_p_ids:
    batch_prediction = []
    for prediction in df[df['paper_ids'] == paper_id]['predictions']:
      batch_prediction.append(prediction)
    # Pad to Seq Length 10
    for i in range(10):
      try:
        batch_prediction[i]
      except:
        batch_prediction.append(np.array([0,0]))
    predictions.append(batch_prediction)
  return tf.convert_to_tensor(predictions)

def get_label_tensor(df, unique_p_ids):
  '''Get true labels from dataloader, grouped as a list by paper in order of sentence
    Final tensor should be shape (batch size, sequence length)'''
  labels = []
  for paper_id in unique_p_ids:
    batch_label = []
    for label in df[df['paper_ids'] == paper_id]['labels']:
      batch_label.append(label)
    # Pad to Seq Length 10 with mask indicator -1
    batch_label = (batch_label + 10*[-1])[:10]
    labels.append(batch_label)
  labels = tf.convert_to_tensor(labels)
  return tf.cast(labels, tf.float32)

def get_mask_tensor(df, unique_p_ids):
  '''Get mask from dataloader for each abstract, list of 1 (not masked) and 0s (masked)
    Final tensor should be shape (batch size, sequence length)'''
  masks = []
  for paper_id in unique_p_ids:
    batch_mask = []
    for label in df[df['paper_ids'] == paper_id]['labels']:
      batch_mask.append(1)
    batch_mask = (batch_mask + 10*[0])[:10]
    masks.append(batch_mask)
  return tf.convert_to_tensor(masks)


In [19]:
# Set BERT parameters
num_labels = 2

# Initialize Bert Model
bert_model = BertForSequenceClassification.from_pretrained(model_id,
                                                      num_labels=num_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=True)

# Load fine tuned model weights
bert_model.load_state_dict(torch.load(
                          '/content/drive/MyDrive/266/BERT_Fine_Tuning/claim_F1886.model',
                          map_location=torch.device('cpu')))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
# Training Embeddings
df_ctrain = convert_embeddings_df(*get_embeddings(dataloader_ctrain))

ctrain_unique = df_ctrain['paper_ids'].unique()
ctrain_x = get_hiddenstate_tensor(df_ctrain, ctrain_unique)
print("Feature shape: ", ctrain_x.shape)
ctrain_predictions = get_prediction_tensor(df_ctrain, ctrain_unique)
print("BERT Predictions shape: ", ctrain_predictions.shape)
ctrain_y = get_label_tensor(df_ctrain, ctrain_unique)
print("Label shape: ", ctrain_y.shape)
ctrain_masks = get_mask_tensor(df_ctrain, ctrain_unique)
print("Mask shape: ", ctrain_masks.shape)

Feature shape:  (750, 10, 3072)
BERT Predictions shape:  (750, 10, 2)
Label shape:  (750, 10)
Mask shape:  (750, 10)


In [21]:
# Validation Embeddings
df_cval = convert_embeddings_df(*get_embeddings(dataloader_cval))

cval_unique = df_cval['paper_ids'].unique()
cval_x = get_hiddenstate_tensor(df_cval, cval_unique)
print("Feature shape: ", cval_x.shape)
cval_predictions = get_prediction_tensor(df_cval, cval_unique)
print("BERT Predictions shape: ", cval_predictions.shape)
cval_y = get_label_tensor(df_cval, cval_unique)
print("Label shape: ", cval_y.shape)
cval_masks = get_mask_tensor(df_cval, cval_unique)
print("Mask shape: ", cval_masks.shape)

Feature shape:  (375, 10, 3072)
BERT Predictions shape:  (375, 10, 2)
Label shape:  (375, 10)
Mask shape:  (375, 10)


In [22]:
# Test Embeddings
df_ctest = convert_embeddings_df(*get_embeddings(dataloader_ctest))

ctest_unique = df_ctest['paper_ids'].unique()
ctest_x = get_hiddenstate_tensor(df_ctest, ctest_unique)
print("Feature shape: ", ctest_x.shape)
ctest_predictions = get_prediction_tensor(df_ctest, ctest_unique)
print("BERT Predictions shape: ", ctest_predictions.shape)
ctest_y = get_label_tensor(df_ctest, ctest_unique)
print("Label shape: ", ctest_y.shape)
ctest_masks = get_mask_tensor(df_ctest, ctest_unique)
print("Mask shape: ", ctest_masks.shape)

Feature shape:  (375, 10, 3072)
BERT Predictions shape:  (375, 10, 2)
Label shape:  (375, 10)
Mask shape:  (375, 10)


## Create CRF Model



In [169]:
# Define function to determine accuracy from masked inputs
def crf_accuracy(y_true, y_pred):
  '''Calculate accuracy without masked inputs'''
  correct = np.sum(y_pred[0] == y_true)
  total = np.sum(y_true != -1)
  return correct / total

def crf_precision(y_true, y_pred):
  '''Calculate precision without masked inputs'''
  tp = np.sum((y_pred[0]==1) & (y_true==1))
  fp = np.sum((y_pred[0]==1) & (y_true==0))
  return tp / (fp + tp)

def crf_recall(y_true, y_pred):
  tp = np.sum((y_pred[0]==1) & (y_true==1))
  fn = np.sum((y_pred[0]==0) & (y_true==1))
  return tp / (tp+fn)

def crf_confusion_matrix(y_true, y_pred):
  tp = np.sum((y_pred[0]==1) & (y_true==1))
  fp = np.sum((y_pred[0]==1) & (y_true==0))
  tn = np.sum((y_pred[0]==0) & (y_true==0))
  fn = np.sum((y_pred[0]==0) & (y_true==1))
  return [[tp, fp], [fn, tn]]

In [139]:
# Define Base model
inputs = tf.keras.Input(shape=(None, 3072))
mask = mask = tf.keras.layers.Masking(mask_value=0)(inputs)
x = tf.keras.layers.Dense(786, activation='relu')(mask)
dropout = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(786, activation='relu')(dropout)
base_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Wrap base model to add CRF layer and 
crf_model = CRFModel(base_model, 2)

In [140]:
# Compile model
lr = 1e-4
crf_model.compile(
    optimizer=tf.keras.optimizers.Adam(lr))
crf_model.summary()

Model: "crf_model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_14 (InputLayer)          [(None, None, 3072)  0           []                               
                                ]                                                                 
                                                                                                  
 masking_9 (Masking)            (None, None, 3072)   0           ['input_14[0][0]']               
                                                                                                  
 dense_37 (Dense)               (None, None, 786)    2415378     ['masking_9[0][0]']              
                                                                                                  
 dropout_11 (Dropout)           (None, None, 786)    0           ['dense_37[0][0]']    

In [141]:
# Train model
crf_model.fit(x=ctrain_x, y=ctrain_y, validation_data=(cval_x, cval_y),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9a177473a0>

In [170]:
# Get validation scores
val_scores = crf_model.predict(cval_x)
val_acc = crf_accuracy(cval_y, val_scores)
print("Accuracy is: ",val_acc)
val_precision = crf_precision(cval_y, val_scores)
print("Precision is: ",val_precision)
val_recall = crf_recall(cval_y, val_scores)
print("Recall is: ",val_recall)
val_cm = crf_confusion_matrix(cval_y, val_scores)
print("CM is: ",val_cm)

Accuracy is:  0.9327902240325866
Precision is:  0.856353591160221
Recall is:  0.7948717948717948
CM is:  [[465, 78], [120, 2283]]


In [171]:
# Get Test Scores
test_scores = crf_model.predict(ctest_x)
test_acc = crf_accuracy(ctest_y, test_scores)
print("Accuracy is: ",test_acc)
test_precision = crf_precision(ctest_y, test_scores)
print("Precision is: ",test_precision)
test_recall = crf_recall(ctest_y, test_scores)
print("Recall is: ",test_recall)
test_cm = crf_confusion_matrix(ctest_y, test_scores)
print("CM is: ",test_cm)

Accuracy is:  0.9249914763041255
Precision is:  0.8574338085539714
Recall is:  0.7373029772329247
CM is:  [[421, 70], [150, 2292]]


In [173]:
# Select value from tensor at index
def get_value_at_index(tensor, index):
    return tensor[index].item()

(375, 10)

In [12]:
# Make list of length 100 of random 0 and 1
labels = [random.randint(0,1) for i in range(100)]
preds = [random.randint(0,1) for i in range(100)]
position = [random.randint(0,9) for i in range(100)]

In [13]:
probability_dict = {
    0:    0.002667,
    1:    0.017333,
    2:    0.049333,
    3:    0.069333,
    4:    0.150667,
    5:    0.247110,
    6:    0.386986,
    7:    0.530374,
    8:    0.698473,
    9: 0.897196
}

In [15]:
# Make list of length 100 of random 0 and 1s with probability of 1s based on position
# Position determined from position list
preds = [random.choices([0,1], weights=[1-probability_dict[i], probability_dict[i]])[0] for i in position]

In [None]:
# Calculate accuracy of preds compared to labels
accuracy = sum([1 for i in range(len(labels)) if labels[i] == preds[i]]) / len(labels)
print("Accuracy is: ", accuracy)

# Calculate precision of preds compared to labels
tp = sum([1 for i in range(len(labels)) if labels[i] == 1 and preds[i] == 1])
fp = sum([1 for i in range(len(labels)) if labels[i] == 0 and preds[i] == 1])
precision = tp / (tp + fp)
print("Precision is: ", precision)

# Calculate recall of preds compared to labels
tp = sum([1 for i in range(len(labels)) if labels[i] == 1 and preds[i] == 1])
fn = sum([1 for i in range(len(labels)) if labels[i] == 1 and preds[i] == 0])
recall = tp / (tp + fn)
print("Recall is: ", recall)

In [None]:
# Find accuracy of preds compared to labels, separeted by value in position list
for i in range(10):
    print("Position: ", i)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    accuracy = sum([1 for j in range(len(labels_i)) if labels_i[j] == preds_i[j]]) / len(labels_i)
    print("Accuracy is: ", accuracy)

# Find precision of preds compared to labels, separeted by value in position list
for i in range(10):
    print("Position: ", i)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    tp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 1])
    fp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 0 and preds_i[j] == 1])
    precision = tp / (tp + fp)
    print("Precision is: ", precision)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    tp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 1])
    fn = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 0])
    recall = tp / (tp + fn)
    print("Recall is: ", recall)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    accuracy = sum([1 for j in range(len(labels_i)) if labels_i[j] == preds_i[j]]) / len(labels_i)
    print("Accuracy is: ", accuracy)


# Find recall of preds compared to labels, separeted by value in position list
for i in range(10):
    print("Position: ", i)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    tp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 1])
    fn = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 0])
    recall = tp / (tp + fn)
    print("Recall is: ", recall)

# Find F1 score of preds compared to labels, separeted by value in position list
for i in range(10):
    print("Position: ", i)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    tp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 1])
    fp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 0 and preds_i[j] == 1])
    precision = tp / (tp + fp)
    print("Precision is: ", precision)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    tp = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 1])
    fn = sum([1 for j in range(len(labels_i)) if labels_i[j] == 1 and preds_i[j] == 0])
    recall = tp / (tp + fn)
    print("Recall is: ", recall)
    labels_i = [labels[j] for j in range(len(labels)) if position[j] == i]
    preds_i = [preds[j] for j in range(len(preds)) if position[j] == i]
    accuracy = sum([1 for j in range(len(labels_i)) if labels_i[j] == preds_i[j]]) / len(labels_i)
    print("Accuracy is: ", accuracy)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("F1 is: ", f1)




In [None]:
position_df = pd.DataFrame(rows, columns=['position', 'n', 'positives',
                                          'crf_accuracy', 'bert_accuracy',
                                          'crf_precision', 'bert_precision',
                                          'crf_recall', 'bert_recall',
                                          'crf_cm', 'bert_cm'])

In [None]:
# Add F1 score to dataframe
position_df['crf_f1'] = 2 * (position_df['crf_precision'] * position_df['crf_recall']) / (position_df['crf_precision'] + position_df['crf_recall'])
position_df['bert_f1'] = 2 * (position_df['bert_precision'] * position_df['bert_recall']) / (position_df['bert_precision'] + position_df['bert_recall'])


In [None]:
# Build model that takes in array of (batchsize, sequence size, embedding_size)
# And applies attention over the sequence size dimension
class Attention(keras.layers.Layer):
    def __init__(self, return_attention=False, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.return_attention = return_attention

    def build(self, input_shape):
        self.w = self.add_weight(name='att_weight', shape=(input_shape[-1], 1), initializer='normal')
        self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1), initializer='zeros')
        super(Attention, self).build(input_shape)

    def call(self, x):
        et = K.squeeze(K.tanh(K.dot(x, self.w) + self.b), axis=-1)
        at = K.softmax(et)
        at = K.expand_dims(at, axis=-1)
        output = x * at
        output = K.sum(output, axis=1)
        if self.return_attention:
            return [output, at]
        return output

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])]
        return input_shape[0], input_shape[-1]

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'return_attention': self.return_attention,
        })
        return config

# Create model
def create_model():
    input_layer = keras.layers.Input((None, 3072))
    lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(input_layer)
    attention_layer = Attention()(lstm_layer)
    output_layer = keras.layers.Dense(1, activation='sigmoid')(attention_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Compile model
model = create_model()
model.summary()

# Train model
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

# Evaluate model
model.evaluate(X_test, y_test)

In [None]:
# Build Keras biLSTM attention model
def create_model():
    input_layer = keras.layers.Input((None, 3072))
    lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(input_layer)
    attention_layer = Attention()(lstm_layer)
    output_layer = keras.layers.Dense(1, activation='sigmoid')(attention_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Compile model
model = create_model()
model.summary()

# Train model
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)


In [None]:
# Embedding dimension of 3072
# Sequence of 10
# Batch size of 32

# Accept input of shape (sequence size, embedding size)
input_layer = tf.keras.layers.Input((10, 3072))

# Make empty embeddings layer in the same size as the input
embedding_layer = tf.keras.layers.Embedding(1, 3072, input_length=10)

# create keys layer
keys = embedding_layer(input_layer)

# Make attention matrices
one_vector = tf.Variable(tf.ones((1,1,1)))
batch_of_ones = tf.tile(one_vector, (tf.shape(input_layer)[0], 1, 1))
query_layer = tf.keras.layers.Dense(3072, use_bias=False)
query = query_layer(batch_of_ones)

# Calculate attention
attention_output = tf.keras.layers.Attention()([query, keys], return_attention_scores=True)

# Add dense layer
dense_layer = tf.keras.layers.Dense(1, activation='sigmoid')(attention_output[0])

# Create model
model = tf.keras.models.Model(inputs=input_layer, outputs=dense_layer)

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

New section

In [None]:
# Bert Model Params
num_labels = 2
lr = 1e-5
eps = 1e-8
epochs = 12
num_warmup_steps = 100
fine_tuned_weights = None
#'/content/drive/MyDrive/266/BERT_Fine_Tuning/finetuned_BERTlarge_epoch_2.model'

# Initialize Bert Model
bert_model = BertForSequenceClassification.from_pretrained(model_id,
                                                      num_labels=num_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=True)
if fine_tuned_weights:
  bert_model.load_state_dict(torch.load(fine_tuned_weights,
                                        map_location=torch.device('cpu')))
optimizer = AdamW(bert_model.parameters(),
                  lr=lr, 
                  eps=eps)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=num_warmup_steps,
                                            num_training_steps=len(dataloader_ctrain)*epochs)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
print(device)

In [None]:
# Define function for BERT evaluation
def evaluate_bert(dataloader_val):

    bert_model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = bert_model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
# Train BERT model
for epoch in tqdm(range(1, epochs+1)):

    bert_model.train()

    loss_train_total = 0
    best_f1 = 0

    progress_bar = tqdm(dataloader_ctrain, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        bert_model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = bert_model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_ctrain)            
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate_bert(dataloader_cval)
    val_f1 = f1_score_func(predictions, true_vals)
    accuracy = overall_accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {accuracy}')

    # Save best model
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch
        torch.save(bert_model.state_dict(),
                   f'/content/drive/MyDrive/266/BERT_Fine_Tuning/BERTdiscourse_f1_{round(val_f1,4)}.model')

In [None]:
# Load best model
best_model_path = '/content/drive/MyDrive/266/BERT_Fine_Tuning/BERTdiscourse_f1_0.9219.model'
bert_model.load_state_dict(torch.load(best_model_path, map_location=torch.device('cpu')))
bert_model.to(device)

# Evaluate best model
_, predictions, true_vals = evaluate_bert(dataloader_cval)
val_f1 = f1_score_func(predictions, true_vals)

from sklearn.metrics import classification_report
print(classification_report(true_vals, np.argmax(predictions, axis=1), target_names=['0', '1']))

# Evaluate best model on test set
_, predictions, true_vals = evaluate_bert(dataloader_ctest)
val_f1 = f1_score_func(predictions, true_vals)



In [None]:
# Get accuracy, precision, recall, and f1 score from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate model on test set
_, predictions, true_vals = evaluate_bert(dataloader_ctest)
recall = recall_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
precision = precision_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
f1 = f1_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
accuracy = accuracy_score(true_vals, np.argmax(predictions, axis=1))

# Print results
print('Accuracy: {:.2f}'.format(accuracy))
print('Precision: {:.2f}'.format(precision))
print('Recall: {:.2f}'.format(recall))
print('F1: {:.2f}'.format(f1))


In [None]:
def random_undersampler(df, percent, label='target'):
  '''Undersample class 0 to match percent subset of class 1'''
  class_1 = df[df[label] == 1]
  class_1_sample = class_1.sample(frac=percent, replace=False)
  class_1_count = len(class_1_sample)
  # Overrepresented class
  class_0 = df[df[label] == 0]
  class_0_sample = class_0.sample(class_1_count)
  full_sample = pd.concat([class_0_sample, class_1_sample], axis=0)
  return full_sample.sample(frac=1, replace=False).reset_index(drop=True)

def random_oversampler(df, percent, label='target'):
  '''Oversample class 1 to match percent subset of class 0'''
    class_0 = df[df[label] == 0]
    class_0_sample = class_0.sample(frac=percent, replace=True)
    class_0_count = len(class_0_sample)
    # Underrepresented class
    class_1 = df[df[label] == 1]
    class_1_sample = class_1.sample(class_0_count, replace=True)
    full_sample = pd.concat([class_0_sample, class_1_sample], axis=0)
    return full_sample.sample(frac=1, replace=False).reset_index(drop=True)

In [16]:
# Make dataframe with one column, consisting of lists of dimension 2 of random floats
df = pd.DataFrame(np.random.rand(100,2))

In [18]:
# Combine columns into a third column list
df['combined'] = df.apply(lambda x: [x[0], x[1]], axis=1)

In [19]:
df

Unnamed: 0,0,1,combined
0,0.294665,0.530587,"[0.2946650026871097, 0.5305867556052941]"
1,0.191521,0.067900,"[0.19152078694749486, 0.06790035819129137]"
2,0.786985,0.656334,"[0.7869854599999133, 0.6563335217758555]"
3,0.637521,0.575603,"[0.6375208960436358, 0.575602893753034]"
4,0.039063,0.357814,"[0.03906291618886648, 0.35781360448354893]"
...,...,...,...
95,0.913303,0.590220,"[0.9133029199647259, 0.5902203718852678]"
96,0.171194,0.714474,"[0.17119434728109106, 0.714474477740643]"
97,0.238446,0.921805,"[0.23844605886430326, 0.9218051432493948]"
98,0.721274,0.676923,"[0.7212738667495618, 0.6769229790876689]"


In [20]:
# Select the index of the largest value in the combined column
df['max_index'] = df['combined'].apply(lambda x: np.argmax(x))

In [21]:
df

Unnamed: 0,0,1,combined,max_index
0,0.294665,0.530587,"[0.2946650026871097, 0.5305867556052941]",1
1,0.191521,0.067900,"[0.19152078694749486, 0.06790035819129137]",0
2,0.786985,0.656334,"[0.7869854599999133, 0.6563335217758555]",0
3,0.637521,0.575603,"[0.6375208960436358, 0.575602893753034]",0
4,0.039063,0.357814,"[0.03906291618886648, 0.35781360448354893]",1
...,...,...,...,...
95,0.913303,0.590220,"[0.9133029199647259, 0.5902203718852678]",0
96,0.171194,0.714474,"[0.17119434728109106, 0.714474477740643]",1
97,0.238446,0.921805,"[0.23844605886430326, 0.9218051432493948]",1
98,0.721274,0.676923,"[0.7212738667495618, 0.6769229790876689]",0
