# Installs and Imports

In [85]:
# Install the Hugging Face transformers package into the Colab runtime.
# NOTE(review): the version is unpinned, so a fresh run installs whatever release is
# current. This notebook imports AdamW from transformers, which upstream has
# deprecated in favour of torch.optim.AdamW - TODO confirm and pin a known-good version.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import random

In [87]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Data Set Up

In [88]:
# Mount Google Drive at /content/drive. Later cells read the dataset archive and
# read/write model checkpoints under /content/drive/MyDrive/266.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
# Unzip folder
!unzip /content/drive/MyDrive/266/Data/claim_dataset.zip

KeyboardInterrupt: ignored

In [None]:
def random_undersampler(df, percent, label='target', random_state=None):
  '''Undersample class 0 to match a `percent` subset of class 1.

  Args:
    df: DataFrame with a binary `label` column (values 0 and 1).
    percent: Fraction of the class 1 rows to keep (forwarded as `frac`).
    label: Name of the label column.
    random_state: Optional seed forwarded to every `DataFrame.sample` call so
      the result is reproducible. `None` (default) keeps the unseeded behaviour.

  Returns:
    A shuffled DataFrame with a fresh 0..n-1 index holding the sampled class 1
    rows plus an equal number of class 0 rows.

  Raises:
    ValueError: Raised by pandas when class 0 has fewer rows than the class 1
      sample, because class 0 is drawn without replacement.
  '''
  class_1 = df[df[label] == 1]
  class_1_sample = class_1.sample(frac=percent, replace=False,
                                  random_state=random_state)
  # Overrepresented class: draw exactly as many rows as were kept from class 1
  class_0 = df[df[label] == 0]
  class_0_sample = class_0.sample(len(class_1_sample), random_state=random_state)
  full_sample = pd.concat([class_0_sample, class_1_sample], axis=0)
  # Shuffle so the two classes are interleaved, then renumber the rows
  shuffled = full_sample.sample(frac=1, replace=False, random_state=random_state)
  return shuffled.reset_index(drop=True)

In [None]:
def random_oversampler(df, percent, label='target', random_state=None):
  '''Oversample class 1 to match a `percent` subset of class 0.

  Class 0 is only drawn with replacement when `percent > 1` (more rows are
  requested than exist). For `percent <= 1` it is drawn without replacement,
  so `percent=1` keeps every class 0 row exactly once instead of a bootstrap
  sample that silently drops some rows and duplicates others.

  Args:
    df: DataFrame with a binary `label` column (values 0 and 1).
    percent: Fraction of the class 0 rows to keep (forwarded as `frac`).
    label: Name of the label column.
    random_state: Optional seed forwarded to every `DataFrame.sample` call so
      the result is reproducible. `None` (default) leaves sampling unseeded.

  Returns:
    A shuffled DataFrame with a fresh 0..n-1 index holding the class 0 sample
    plus the same number of class 1 rows drawn with replacement.
  '''
  class_0 = df[df[label] == 0]
  class_0_sample = class_0.sample(frac=percent, replace=bool(percent > 1),
                                  random_state=random_state)
  class_0_count = len(class_0_sample)
  # Underrepresented class: always with replacement so it can reach class_0_count
  class_1 = df[df[label] == 1]
  class_1_sample = class_1.sample(class_0_count, replace=True,
                                  random_state=random_state)
  full_sample = pd.concat([class_0_sample, class_1_sample], axis=0)
  # Shuffle so the two classes are interleaved, then renumber the rows
  shuffled = full_sample.sample(frac=1, replace=False, random_state=random_state)
  return shuffled.reset_index(drop=True)

In [None]:
def text_label_formatter(df, x_columns=['abstract_text'], label='target'):
  '''Return `(features, labels)` taken from `df`.

  `features` is a new DataFrame holding the `x_columns` of `df` in the given
  order; `labels` is the `label` column of `df`.
  '''
  features = pd.DataFrame({name: df[name] for name in x_columns})
  return features, df[label]

In [None]:
def explode_df(df):
  '''Convert claim dataset into one row per sentence format.

  Args:
    df: DataFrame with list-valued `labels` and `sentences` columns, where
      each row's two lists have the same length.

  Returns:
    A new DataFrame with one row per list element and a fresh 0..n-1 index.
    `labels` is cast to int and a `sentence_ids` int column gives each
    element's position within its original row. The input `df` is left
    unmodified.
  '''
  # Work on a copy so the caller's frame does not silently gain a column
  per_abstract = df.copy()

  # Add sentence id to indicate order within abstract
  per_abstract['sentence_ids'] = [list(range(len(label_list)))
                                  for label_list in per_abstract['labels']]

  # Explode labels, sentences, and ids together so they stay aligned
  per_sentence = per_abstract.explode(['labels', 'sentences', 'sentence_ids'],
                                      ignore_index=True)
  # explode() leaves object dtype behind; restore integer columns
  per_sentence['labels'] = per_sentence['labels'].astype('int')
  per_sentence['sentence_ids'] = per_sentence['sentence_ids'].astype('int')
  return per_sentence

In [90]:
# Load the JSON-lines label files for each split from the unzipped dataset folder
cval_df_raw = pd.read_json('/content/claim_dataset/validation_labels.json', lines=True)
ctrain_df_raw = pd.read_json('/content/claim_dataset/train_labels.json', lines=True)
ctest_df_raw = pd.read_json('/content/claim_dataset/test_labels.json', lines=True)

# Convert to one row per sentence (explode_df adds a per-row sentence_ids column)
cval_df = explode_df(cval_df_raw)
ctrain_df = explode_df(ctrain_df_raw)
ctest_df = explode_df(ctest_df_raw)

## Balance training dataset - Undersample
# ctrain_balanced = random_undersampler(ctrain_df, 1, 'labels')

## Balance training dataset - Oversample
# NOTE(review): ctrain_balanced is not used anywhere below in this cell; the training
# features are built from the unbalanced ctrain_df. Confirm whether that is intended.
ctrain_balanced = random_oversampler(ctrain_df, 1, 'labels')

# Split each frame into the feature columns (sentence text plus the paper/sentence
# ids used for bookkeeping) and the 'labels' column
ctrain_texts, ctrain_labels = text_label_formatter(ctrain_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')
cval_texts, cval_labels = text_label_formatter(cval_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')
ctest_texts, ctest_labels = text_label_formatter(ctest_df,
                                                 ['sentences', 'paper_id', 'sentence_ids'], 'labels')

# Model Set Up

## For BERT Embedding

In [91]:
# Model identifier passed to BertTokenizer.from_pretrained and
# BertForSequenceClassification.from_pretrained in later cells
model_id = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [92]:
# Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices) with one value
seed_val = 17
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

### Dataloading

In [93]:
# Initialize encoder
# max_length: token length every sentence is padded/truncated to by the encode cells
# batch_size: number of samples per DataLoader batch
max_length = 256
batch_size = 10
# Tokenizer is loaded from the same model_id as the classifier
tokenizer = BertTokenizer.from_pretrained(model_id, 
                                          do_lower_case=True)

In [94]:
## Load training data
# Tokenize every training sentence, padding/truncating each to max_length tokens
encoded_ctrain = tokenizer.batch_encode_plus(
    ctrain_texts['sentences'].values,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Model inputs come from the encoding; ids and labels are tensorized from the frames
input_ids_ctrain, attention_masks_ctrain = (encoded_ctrain['input_ids'],
                                            encoded_ctrain['attention_mask'])
paper_id_ctrain, sentence_id_ctrain = (torch.tensor(ctrain_texts[column])
                                       for column in ('paper_id', 'sentence_ids'))
labels_ctrain = torch.tensor(ctrain_labels.values)

# Per-item order: input ids, attention mask, label, paper id, sentence id
dataset_ctrain = TensorDataset(
    input_ids_ctrain,
    attention_masks_ctrain,
    labels_ctrain,
    paper_id_ctrain,
    sentence_id_ctrain,
)

# Training batches are drawn in random order
dataloader_ctrain = DataLoader(
    dataset_ctrain,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_ctrain),
)

In [95]:
## Load validation data
# Tokenize every validation sentence, padding/truncating each to max_length tokens
encoded_cval = tokenizer.batch_encode_plus(
    cval_texts['sentences'].values,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Model inputs come from the encoding; ids and labels are tensorized from the frames
input_ids_cval, attention_masks_cval = (encoded_cval['input_ids'],
                                        encoded_cval['attention_mask'])
paper_id_cval, sentence_id_cval = (torch.tensor(cval_texts[column])
                                   for column in ('paper_id', 'sentence_ids'))
labels_cval = torch.tensor(cval_labels.values)

# Per-item order: input ids, attention mask, label, paper id, sentence id
dataset_cval = TensorDataset(
    input_ids_cval,
    attention_masks_cval,
    labels_cval,
    paper_id_cval,
    sentence_id_cval,
)

# Validation batches are read in dataset order
dataloader_cval = DataLoader(
    dataset_cval,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_cval),
)

In [96]:
## Load testing data
# Tokenize every test sentence, padding/truncating each to max_length tokens
encoded_ctest = tokenizer.batch_encode_plus(
    ctest_texts['sentences'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# Make tensors
input_ids_ctest = encoded_ctest['input_ids']
attention_masks_ctest = encoded_ctest['attention_mask']
paper_id_ctest = torch.tensor(ctest_texts['paper_id'])
sentence_id_ctest = torch.tensor(ctest_texts['sentence_ids'])
labels_ctest = torch.tensor(ctest_labels.values)

# Make dataset; per-item order: input ids, attention mask, label, paper id, sentence id
dataset_ctest = TensorDataset(input_ids_ctest, attention_masks_ctest,
                              labels_ctest, paper_id_ctest, sentence_id_ctest)

# Make dataloader. Evaluation data is read in dataset order (as the validation
# loader does) so predictions line up with the rows of ctest_texts and repeated
# runs see the samples in the same order; shuffling only matters for training.
dataloader_ctest = DataLoader(dataset_ctest,
                              sampler=SequentialSampler(dataset_ctest),
                              batch_size=batch_size)

# BERT Model

## Model Initialization

In [97]:
# Fine-tuning hyperparameters
num_labels = 2
lr = 1e-5
eps = 1e-8
epochs = 12
num_warmup_steps = 100
# Optional path to previously fine-tuned weights; None starts from the pretrained checkpoint
fine_tuned_weights = None
# '/content/drive/MyDrive/266/BERT_Fine_Tuning/finetuned_BERTlarge_epoch_3.model'

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Sequence-classification model built from the pretrained checkpoint
bert_model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=True,
)
if fine_tuned_weights:
    # Load onto CPU first; the model is moved to `device` right after
    bert_model.load_state_dict(
        torch.load(fine_tuned_weights, map_location=torch.device('cpu'))
    )
bert_model.to(device)

# Optimizer plus a linear warmup/decay schedule spanning every training step
optimizer = AdamW(bert_model.parameters(), lr=lr, eps=eps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=len(dataloader_ctrain)*epochs,
)
print(device)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

cuda




## Helper Functions

In [98]:
def f1_score_func(preds, labels):
    """Weighted F1 between the argmax of `preds` along axis 1 and `labels`."""
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def overall_accuracy(preds, labels):
    """Fraction of samples whose argmax prediction (axis 1) equals the label."""
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def accuracy_per_class(preds, labels, label_dict=None):
    """Print, for each class present in `labels`, how many samples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the argmax
            along axis 1.
        labels: Array of true class ids.
        label_dict: Optional mapping of class name -> class id used to print
            readable names. When omitted, a notebook-level `label_dict` is used
            if one exists; otherwise the raw class id is printed.
    """
    if label_dict is None:
        # Keep working for callers that rely on a global `label_dict`, but do not
        # raise NameError when none has been defined.
        label_dict = globals().get('label_dict') or {}
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        # Fall back to the class id itself when no name is known for it
        print(f'Class: {label_dict_inverse.get(label, label)}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [99]:
# Define function for BERT evaluation
def evaluate_bert(dataloader_val):
    """Run the global `bert_model` in eval mode over every batch of `dataloader_val`.

    The first three tensors of each batch are fed to the model as input ids,
    attention mask and labels.

    Returns:
        A tuple `(avg_loss, logits, labels)`: the summed batch loss divided by
        the number of batches, and the per-batch logits and label ids
        concatenated along axis 0 as NumPy arrays.
    """
    bert_model.eval()

    running_loss = 0
    batch_logits = []
    batch_labels = []

    for raw_batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        model_inputs = dict(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = bert_model(**model_inputs)

        # outputs[0] is the loss, outputs[1] the logits
        running_loss += outputs[0].item()
        batch_logits.append(outputs[1].detach().cpu().numpy())
        batch_labels.append(model_inputs['labels'].cpu().numpy())

    return (running_loss/len(dataloader_val),
            np.concatenate(batch_logits, axis=0),
            np.concatenate(batch_labels, axis=0))

## Train Model

In [100]:
# Train BERT model
# The best validation F1 is tracked across ALL epochs, so it must be initialised
# once here. Resetting it inside the epoch loop made every epoch look like a new
# best and saved a checkpoint each time.
best_f1 = 0
best_epoch = None

for epoch in tqdm(range(1, epochs+1)):

    bert_model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_ctrain, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        bert_model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        # Only the first three tensors are model inputs; the paper/sentence ids
        # carried in the batch are bookkeeping.
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = bert_model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is: len(batch) is the number of tensors in the
        # batch tuple, not the number of samples, so dividing by it only
        # distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_ctrain)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate_bert(dataloader_cval)
    val_f1 = f1_score_func(predictions, true_vals)
    accuracy = overall_accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {accuracy}')

    # Save best model: only when this epoch beats every previous epoch's weighted F1
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch
        torch.save(bert_model.state_dict(),
                   f'/content/drive/MyDrive/266/BERT_Fine_Tuning/BERTdiscourse_f1_{round(val_f1,4)}.model')

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/583 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.31236457989246014
Validation loss: 0.19283446722774436
F1 Score (Weighted): 0.9254136043782379
Accuracy: 0.9260013577732519


Epoch 2:   0%|          | 0/583 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.2066445637483942
Validation loss: 0.3108991678512134
F1 Score (Weighted): 0.9205023104166251
Accuracy: 0.9215885947046843


Epoch 3:   0%|          | 0/583 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.15049228331774608
Validation loss: 0.3757892957912221
F1 Score (Weighted): 0.9099671420813736
Accuracy: 0.9154786150712831


Epoch 4:   0%|          | 0/583 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.09924745400405072
Validation loss: 0.4271264529946828
F1 Score (Weighted): 0.9195541316279666
Accuracy: 0.9205702647657841


Epoch 5:   0%|          | 0/583 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.05635920471962966
Validation loss: 0.46604810861117896
F1 Score (Weighted): 0.9198669714591556
Accuracy: 0.9192124915139172


Epoch 6:   0%|          | 0/583 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [102]:
# Restore the checkpoint selected as best (loaded onto CPU, then moved to `device`)
best_model_path = '/content/drive/MyDrive/266/BERT_Fine_Tuning/BERTdiscourse_f1_0.9192_notransfer_unbalance.model'
bert_model.load_state_dict(torch.load(best_model_path, map_location=torch.device('cpu')))
bert_model.to(device)

# Score the validation split; the predicted class is the argmax over the logits
_, predictions, true_vals = evaluate_bert(dataloader_cval)
val_pred_classes = np.argmax(predictions, axis=1)
accuracy = accuracy_score(true_vals, val_pred_classes)
precision = precision_score(true_vals, val_pred_classes, average='weighted')
recall = recall_score(true_vals, val_pred_classes, average='weighted')
f1 = f1_score(true_vals, val_pred_classes, average='weighted')

# Summary metrics followed by the per-class breakdown
print('Accuracy: {:.4f}'.format(accuracy))
print('Precision: {:.4f}'.format(precision))
print('Recall: {:.4f}'.format(recall))
print('F1: {:.4f}'.format(f1))
print(classification_report(true_vals, val_pred_classes, target_names=['0', '1'], digits=4))

Accuracy: 0.9202
Precision: 0.9200
Recall: 0.9202
F1: 0.9201
              precision    recall  f1-score   support

           0     0.9493    0.9513    0.9503      2361
           1     0.8017    0.7949    0.7983       585

    accuracy                         0.9202      2946
   macro avg     0.8755    0.8731    0.8743      2946
weighted avg     0.9200    0.9202    0.9201      2946



In [103]:
# Score the test split with the weights currently held by bert_model;
# the predicted class is the argmax over the logits
_, predictions, true_vals = evaluate_bert(dataloader_ctest)
test_pred_classes = np.argmax(predictions, axis=1)
accuracy = accuracy_score(true_vals, test_pred_classes)
precision = precision_score(true_vals, test_pred_classes, average='weighted')
recall = recall_score(true_vals, test_pred_classes, average='weighted')
f1 = f1_score(true_vals, test_pred_classes, average='weighted')

# Summary metrics followed by the per-class breakdown
print('Accuracy: {:.4f}'.format(accuracy))
print('Precision: {:.4f}'.format(precision))
print('Recall: {:.4f}'.format(recall))
print('F1: {:.4f}'.format(f1))
print(classification_report(true_vals, test_pred_classes, target_names=['0', '1'], digits=4))

Accuracy: 0.9182
Precision: 0.9170
Recall: 0.9182
F1: 0.9175
              precision    recall  f1-score   support

           0     0.9447    0.9543    0.9495      2362
           1     0.8026    0.7688    0.7853       571

    accuracy                         0.9182      2933
   macro avg     0.8736    0.8616    0.8674      2933
weighted avg     0.9170    0.9182    0.9175      2933

