# Installs


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# BERT imports
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
from sklearn.metrics import f1_score

In [None]:
import random

# Data Set Up

### Dataset:
* PubMed_20k_RCT_numbers_replaced_with_at_sign

### Formatting:
* abstract_id: 8-digit numeric code assigned to each abstract
* line_id: `<abstract_id>_<line_number>_<total_lines>`, e.g. `24293578_0_12` (identifies each line's location within its abstract)
* abstract_text: sentence in abstract
* line_number: zero-based position of the sentence within its abstract
* total_lines: length of abstract
* target: Label [BACKGROUND, RESULTS, CONCLUSIONS, METHODS, OBJECTIVE]

### Files
* dev.csv: validation
* test.csv: test
* train.csv: train

In [None]:
### Define functions for data handling

# Define function to subset data randomly to balance classes
def random_undersampler(df, percent, label='target', random_state=None):
  '''Build a shuffled, class-balanced subset of a binary-labelled dataframe.

  A `percent` fraction of the class-1 rows is sampled, then the same number
  of class-0 rows is drawn so both classes are equally represented.

  Args:
    df: dataframe whose `label` column holds 0/1 class ids.
    percent: fraction (0-1) of the class-1 rows to keep.
    label: name of the column holding the class ids.
    random_state: forwarded to every `DataFrame.sample` call; pass an int for
      a reproducible subset. The default (None) keeps the previous behaviour
      of drawing from NumPy's global random state.

  Returns:
    A new dataframe with equal numbers of class-0 and class-1 rows, in
    random order, with a fresh 0..n-1 index.

  Raises:
    ValueError: raised by pandas when class 0 has fewer rows than the class-1
      sample, because sampling is done without replacement.
  '''
  minority = df[df[label] == 1]
  minority_sample = minority.sample(frac=percent, replace=False,
                                    random_state=random_state)
  # Overrepresented class: draw exactly as many rows as were kept of class 1
  majority = df[df[label] == 0]
  majority_sample = majority.sample(n=len(minority_sample),
                                    random_state=random_state)
  balanced = pd.concat([majority_sample, minority_sample], axis=0)
  # Shuffle so the two classes are interleaved, then drop the old index
  shuffled = balanced.sample(frac=1, replace=False, random_state=random_state)
  return shuffled.reset_index(drop=True)

# Define function to split dataframe
def text_label_formatter(df, features=['abstract_text'], label='target'):
  '''Separate a dataframe into a feature frame and a label series.

  Returns a tuple `(vals, labels)`: `vals` is a new dataframe holding only
  the columns named in `features` (in that order), `labels` is `df[label]`.
  '''
  feature_frame = pd.DataFrame({name: df[name] for name in features})
  return feature_frame, df[label]

In [None]:
# Get Data
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be read.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime;
# running this notebook elsewhere needs a different way to reach the data.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the three CSV splits
dataset_dir = "/content/drive/MyDrive/266/Data/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# Read the splits in order: dev.csv -> validation, test.csv -> test, train.csv -> train
val_df, test_df, train_df = (
    pd.read_csv(f"{dataset_dir}{csv_name}")
    for csv_name in ("dev.csv", "test.csv", "train.csv")
)

# Keep the splits together so later cells can loop over them
df_list = [val_df, test_df, train_df]
train_df.head()

Unnamed: 0,abstract_id,line_id,abstract_text,line_number,total_lines,target
0,24293578,24293578_0_12,To investigate the efficacy of @ weeks of dail...,0,12,OBJECTIVE
1,24293578,24293578_1_12,A total of @ patients with primary knee OA wer...,1,12,METHODS
2,24293578,24293578_2_12,Outcome measures included pain reduction and i...,2,12,METHODS
3,24293578,24293578_3_12,Pain was assessed using the visual analog pain...,3,12,METHODS
4,24293578,24293578_4_12,Secondary outcome measures included the Wester...,4,12,METHODS


In [None]:
### Convert classes to binary data

# Show the label vocabulary and the per-split class counts before remapping
print(val_df['target'].unique())
for df in df_list:
  print(df['target'].value_counts())

# Binary mapping: CONCLUSIONS -> 1, every other section label -> 0
section_labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
target_map = {name: int(name == 'CONCLUSIONS') for name in section_labels}
# Human-readable names for the two resulting class ids
label_dict = {'Other':0, 'Conclusions':1}

# Remap every split in place, printing the new counts as confirmation
for df in df_list:
  df.replace({"target":target_map}, inplace=True)
  print(df['target'].value_counts())
val_df.head()

['BACKGROUND' 'OBJECTIVE' 'METHODS' 'RESULTS' 'CONCLUSIONS']
METHODS        9964
RESULTS        9841
CONCLUSIONS    4582
BACKGROUND     3449
OBJECTIVE      2376
Name: target, dtype: int64
METHODS        9897
RESULTS        9713
CONCLUSIONS    4571
BACKGROUND     3621
OBJECTIVE      2333
Name: target, dtype: int64
METHODS        59353
RESULTS        57953
CONCLUSIONS    27168
BACKGROUND     21727
OBJECTIVE      13839
Name: target, dtype: int64
0    25630
1     4582
Name: target, dtype: int64
0    25564
1     4571
Name: target, dtype: int64
0    152872
1     27168
Name: target, dtype: int64


Unnamed: 0,abstract_id,line_id,abstract_text,line_number,total_lines,target
0,24290286,24290286_0_10,IgE sensitization to Aspergillus fumigatus and...,0,10,0
1,24290286,24290286_1_10,It is not clear whether these patients would b...,1,10,0
2,24290286,24290286_2_10,We sought to determine whether a @-month cours...,2,10,0
3,24290286,24290286_3_10,Asthmatic patients who were IgE sensitized to ...,3,10,0
4,24290286,24290286_4_10,Primary outcomes were improvement in quality o...,4,10,0


In [None]:
### Balance classes, select portion of data for more achievable training times
# Fraction of the class-1 rows (train) / of all rows (test, validation) to keep
fraction = 0.01

# Seed NumPy's global RNG before sampling. pandas' .sample() draws from it when
# no random_state is given, and the notebook-wide seeds are only set in a later
# cell, so without this line the subsets differ on every run.
subset_seed = 17
np.random.seed(subset_seed)

# Train: keep `fraction` of class 1 plus an equal number of class-0 rows (balanced)
train_subset = random_undersampler(train_df, fraction)
# Test / validation: plain random subsets that keep the natural class imbalance
test_subset = test_df.sample(frac=fraction)
val_subset = val_df.sample(frac=fraction)

# Confirm subsetting success
df_list = [train_subset, test_subset, val_subset]
for df in df_list:
    print(df['target'].value_counts())

0    272
1    272
Name: target, dtype: int64
0    269
1     32
Name: target, dtype: int64
0    266
1     36
Name: target, dtype: int64


In [None]:
### Prepare data for tokenization, etc

# Separate each subset into its text features and its labels.
# With the default arguments, the *_texts frames hold the 'abstract_text'
# column and the *_labels series hold the 'target' column.
train_texts, train_labels = text_label_formatter(train_subset)
val_texts, val_labels = text_label_formatter(val_subset)
test_texts, test_labels = text_label_formatter(test_subset)

# Model Set Up

## BERT model Parameters

In [None]:
# Define model ID
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and
# the classification model, so the vocabulary and weights come from the same source.
model_id = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [None]:
# Set random seeds for each RNG the notebook may draw from.
# NOTE(review): sampling done in earlier cells is not governed by the seeds set here.
seed_val = 17
random.seed(seed_val) # Python's built-in RNG; no direct `random.` call appears in the cells shown here
np.random.seed(seed_val) # NumPy's global RNG
torch.manual_seed(seed_val) # PyTorch RNG; needed because the model and the DataLoaders/samplers are PyTorch objects
torch.cuda.manual_seed_all(seed_val) # PyTorch RNGs on every CUDA device (relevant when training on GPU)

## DataLoading

In [None]:
# Tokenization / batching settings shared by the train, validation and test loaders
batch_size = 10    # examples per DataLoader batch
max_length = 256   # length every encoded sequence is padded / truncated to

# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = BertTokenizer.from_pretrained(model_id, do_lower_case=True)

In [None]:
# Encode training data
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
# every sequence is still padded / truncated to exactly `max_length` tokens.
# The texts are passed as a plain list of strings, which is what the
# tokenizer's call interface expects.
encoded_data_train = tokenizer(
    train_texts['abstract_text'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# Prep data: token ids, padding masks and labels as tensors
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_labels.values)

# Form into dataset; each item is (input_ids, attention_mask, label)
dataset_train = TensorDataset(input_ids_train, attention_masks_train,
                              labels_train)

# Put into dataloader; RandomSampler reshuffles the training examples
dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)



In [None]:
# Encode validation data
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
# every sequence is still padded / truncated to exactly `max_length` tokens.
# The texts are passed as a plain list of strings, which is what the
# tokenizer's call interface expects.
encoded_data_val = tokenizer(
    val_texts['abstract_text'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# Prep data: token ids, padding masks and labels as tensors
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_labels.values)

# Form into dataset; each item is (input_ids, attention_mask, label)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Put into dataloader; SequentialSampler keeps evaluation order fixed
dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [None]:
# Encode test data
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
# every sequence is still padded / truncated to exactly `max_length` tokens.
# The texts are passed as a plain list of strings, which is what the
# tokenizer's call interface expects.
encoded_data_test = tokenizer(
    test_texts['abstract_text'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# Prep data: token ids, padding masks and labels as tensors
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_labels.values)

# Form into dataset; each item is (input_ids, attention_mask, label)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Put into dataloader; SequentialSampler keeps evaluation order fixed
dataloader_test = DataLoader(dataset_test,
                             sampler=SequentialSampler(dataset_test),
                             batch_size=batch_size)

## BERT model initialization

In [None]:
# Hyperparameters for fine-tuning
num_labels = 2            # two classes: Other vs. Conclusions
lr, eps = 1e-5, 1e-8      # AdamW learning rate and epsilon
epochs = 12
num_warmup_steps = 100    # scheduler steps spent ramping the learning rate up

# Load the pretrained encoder with a sequence-classification head on top
bert_model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=True,
)

# Optimizer over all model parameters
optimizer = AdamW(bert_model.parameters(), lr=lr, eps=eps)

# The scheduler is stepped once per training batch, so the schedule length
# is (batches per epoch) * (number of epochs)
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_training_steps,
)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
print(device)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

cpu




# Training

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    `preds` holds one row of class scores per example; the predicted class is
    the index of the largest score in each row. `labels` holds the true ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def overall_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max prediction equals the label.

    `preds` holds one row of class scores per example; `labels` holds the
    true class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, how many of its examples were predicted correctly.

    Class names are looked up in the module-level `label_dict` (name -> id).
    Output is printed as 'Class: <name>' followed by 'Accuracy: <hits>/<total>'.
    """
    # Invert name -> id into id -> name for display
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()

    for class_id in np.unique(expected):
        # Restrict to the examples whose true label is this class
        in_class = expected == class_id
        class_preds = predicted[in_class]
        n_total = len(expected[in_class])
        n_hits = len(class_preds[class_preds == class_id])
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_hits}/{n_total}\n')

In [None]:
# Define function for BERT evaluation
def evaluate_bert(dataloader_val):
    """Evaluate the module-level `bert_model` on every batch of a dataloader.

    Each batch is expected to be (input_ids, attention_mask, labels). The
    model is switched to eval mode and the forward pass runs without
    gradient tracking.

    Returns:
        (average loss per batch, stacked logits array, stacked label array)
    """
    bert_model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:

        # Move the batch tensors to the device the model runs on
        on_device = tuple(tensor.to(device) for tensor in batch)
        model_kwargs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        # Inference only: skip autograd bookkeeping for the forward pass
        with torch.no_grad():
            outputs = bert_model(**model_kwargs)

        # outputs[0] is taken as the loss, outputs[1] as the logits
        running_loss += outputs[0].item()
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (
        mean_loss,
        np.concatenate(logits_per_batch, axis=0),
        np.concatenate(labels_per_batch, axis=0),
    )

In [None]:
# Train BERT model
# Each epoch: one pass over the training batches, then a validation pass;
# the weights are checkpointed whenever the validation F1 improves.

# Best validation F1 seen so far. Initialised once, BEFORE the epoch loop:
# resetting it inside the loop made every epoch look like a new best and
# wrote a checkpoint each time.
best_f1 = 0
best_epoch = None

for epoch in tqdm(range(1, epochs+1)):

    bert_model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step
        bert_model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = bert_model(**inputs)

        # With labels supplied, the first output is the loss
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the optimizer update
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Report the batch loss as-is. `len(batch)` is the number of tensors in
        # the tuple (3), not the number of examples, so dividing by it only
        # distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate_bert(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    accuracy = overall_accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {accuracy}')

    # Save best model: only when this epoch beats every earlier epoch
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch
        torch.save(bert_model.state_dict(),
                   f'/content/drive/MyDrive/266/BERT_Fine_Tuning/BERTdiscourse_f1_{round(val_f1,4)}.model')

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/55 [00:00<?, ?it/s]

KeyboardInterrupt: ignored