<a href="https://colab.research.google.com/github/hkayesh/causal-qa/blob/master/Causal_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install necessary packages

In [1]:
! pip install transformers



In [2]:
! pip install torch



### Import packages

In [3]:
%tensorflow_version 2.x
import tensorflow as tf

import re
import json
import torch
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from google.colab import drive 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from transformers import AlbertForSequenceClassification, AlbertTokenizer

from smart_open import smart_open
from gensim.summarization.textcleaner import split_sentences
from sklearn.model_selection import train_test_split
from transformers.optimization import AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, matthews_corrcoef



TensorFlow 2.x selected.


### Mount Google Drive 
(from shamolbit@gmail.com)

In [4]:
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Define Constant Values

In [0]:
RANDOM_SEED = 1
BATCH_SIZE = 32
MAX_SEQ_LEN = 128
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
NUM_EPOCHS = 4
DATASET_DIR_PATH = 'gdrive/My Drive/Research Data/CausalQA/datasets/'
# NEWS_ARTICLES_FILE_PATH = 'gdrive/My Drive/Research Data/CausalQA/signalmedia-1m.jsonl'
NEWS_ARTICLES_FILE_PATH = 'gdrive/My Drive/Research Data/CausalQA/causal_pairs_10k_articles.csv'

#### Set random seeds for reproducibility 

In [6]:
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f8378d03710>

### Load Pretraiend model and tokenizer

In [0]:
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# model = RobertaForSequenceClassification.from_pretrained("roberta-base")
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# model = CamembertForSequenceClassification.from_pretrained()
# tokenizer = CamembertTokenizer.from_pretrained()

# model = AlbertForSequenceClassification.from_pretrained()
# tokenizer = AlbertTokenizer.from_pretrained()


### Load Training Data

In [8]:
causal_pairs_df = pd.read_csv(NEWS_ARTICLES_FILE_PATH, lineterminator='\n', error_bad_lines=False)
print(causal_pairs_df.shape)
causal_pairs_df.head()



(7230, 3)


Unnamed: 0,sentence,sequence_a,sequence_b
0,the postal order was chosen because of its cen...,its central location and its proximity to the ...,the postal order was chosen
1,"posted on sep 22, 2015\rif you were a basketba...",you were a basketball fan who was born in the 80s,you were lucky enough to witness the beauty th...
2,individual dreams and visions have been aborte...,words of death spoken over them,individual dreams and visions have been aborted
3,homes and marriages have been destroyed becaus...,poisonous words,homes and marriages have been destroyed
4,companies and businesses have fallen because o...,words of envy and jealousy,companies and businesses have fallen


In [9]:
train_dataset_causal_df = causal_pairs_df[['sequence_a', 'sequence_b']]
train_dataset_causal_df['label'] = 'causal'
train_dataset_causal_df.head()
train_dataset_causal_df.shape

train_dataset_not_causal_df = pd.DataFrame()
train_dataset_not_causal_df['sequence_a'] = train_dataset_causal_df['sequence_a'].sample(
    train_dataset_causal_df.shape[0], random_state=1).tolist()
train_dataset_not_causal_df['sequence_b'] = train_dataset_causal_df['sequence_b'].sample(
    train_dataset_causal_df.shape[0], random_state=2).tolist()
train_dataset_not_causal_df['label'] = 'not_ causal'

train_dataset = pd.concat([train_dataset_causal_df, train_dataset_not_causal_df])
print(train_dataset.shape)
train_dataset.head()



(14460, 3)


Unnamed: 0,sequence_a,sequence_b,label
0,its central location and its proximity to the ...,the postal order was chosen,causal
1,you were a basketball fan who was born in the 80s,you were lucky enough to witness the beauty th...,causal
2,words of death spoken over them,individual dreams and visions have been aborted,causal
3,poisonous words,homes and marriages have been destroyed,causal
4,words of envy and jealousy,companies and businesses have fallen,causal


#### Prepare training dataset

In [0]:
seq_a_list = train_dataset['sequence_a'].tolist()
seq_b_list = train_dataset['sequence_b'].tolist()
labels = [1 if label=='causal' else 0 for label in train_dataset['label'].tolist()]

input_ids = []
token_type_ids = []
attention_masks = []

for seq_a, seq_b in zip(seq_a_list, seq_b_list):
  encoded_data = tokenizer.encode_plus(text=seq_a, text_pair=seq_b, 
                                       max_length=MAX_SEQ_LEN, pad_to_max_length=True)
  input_ids.append(encoded_data['input_ids'])
  token_type_ids.append(encoded_data['token_type_ids'])
  attention_masks.append(encoded_data['attention_mask'])

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=RANDOM_SEED, test_size=0.25)

# # Use train_test_split to split our data into train and validation sets for training
# train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(train_val_inputs, train_val_labels, 
#                                                             random_state=1, test_size=0.1)

train_token_types, validation_token_types, _, _ = train_test_split(token_type_ids, input_ids,
                                             random_state=1, test_size=0.25)

train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=RANDOM_SEED, test_size=.25)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_token_types = torch.tensor(train_token_types)
validation_token_types = torch.tensor(validation_token_types)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

# Select a batch size for training. 
batch_size = BATCH_SIZE

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(train_inputs, train_token_types, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_token_types, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


### Fine-tuning model and validation


In [11]:
# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)


# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(params=optimizer_grouped_parameters, lr=2e-5, weight_decay=0.01)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
  
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs 
epochs = NUM_EPOCHS

model.cuda()

# BERT training loop
for epoch in range(epochs):  
  print('Epoch {}/{}'.format(epoch+1, epochs))
  
  ## TRAINING
  
  # Set our model to training mode
  model.train()  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss, _ = model(input_ids = b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask, labels=b_labels)
    # loss, _ = model(input_ids = b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {:.4f}".format(tr_loss/nb_tr_steps))
       
  ## VALIDATION

  # Put model in evaluation mode
  model.eval()
  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask) 
      # logits = model(b_input_ids, attention_mask=b_input_mask)    
    # Move logits and labels to CPU
    logits = np.array(logits[0].cpu())
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  print("Validation Accuracy: {:.4f}\n".format(eval_accuracy/nb_eval_steps))

# plot training performance
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

Epoch 1/4
Train loss: 0.4876
Validation Accuracy: 0.7840

Epoch 2/4


KeyboardInterrupt: ignored

### Evaluate Model

In [0]:
def evaluate(model, test_inputs, token_types, test_masks, test_labels):
  # Convert all of our data into torch tensors, the required datatype for our model
  prediction_inputs = torch.tensor(test_inputs)
  prediction_token_types = torch.tensor(token_types)
  prediction_masks = torch.tensor(test_masks)
  prediction_labels = torch.tensor(test_labels)

  # Select a batch size for training. 
  batch_size = 32

  # Create an iterator of our data with torch DataLoader 
  prediction_data = TensorDataset(prediction_inputs, prediction_token_types, prediction_masks, prediction_labels)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  ## Prediction on test set
  # Put model in evaluation mode
  model.eval()
  # Tracking variables 
  predictions , true_labels = [], []
  # Predict 
  for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
      # logits = model(b_input_ids, attention_mask=b_input_mask)
    # Move logits and labels to CPU
    logits = logits[0].cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()  
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    
  matthews_set = []
  for i in range(len(true_labels)):
    matthews = matthews_corrcoef(true_labels[i],
                  np.argmax(predictions[i], axis=1).flatten())
    matthews_set.append(matthews)
    
  # Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
  flat_predictions = [item for sublist in predictions for item in sublist]
  flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
  flat_true_labels = [item for sublist in true_labels for item in sublist]

  scores = {
    'matthews_corrcoef_acc': matthews_corrcoef(flat_true_labels, flat_predictions),
    'precision': precision_score(flat_true_labels, flat_predictions),
    'recall': recall_score(flat_true_labels, flat_predictions),
    'f1_score': f1_score(flat_true_labels, flat_predictions),
    'accuracy': accuracy_score(flat_true_labels, flat_predictions)
  }

  return scores



In [0]:
semeval_file_path = DATASET_DIR_PATH + 'semeval-benchmark-v1.csv'
risk_file_path = DATASET_DIR_PATH + 'risk-models-benchmark-v1.csv'
nato_sfa_file_path = DATASET_DIR_PATH + 'nato-sfa-benchmark-v1.csv'
ce_me_file_path = DATASET_DIR_PATH + 'ce-me-benchmark-v1.csv'

In [0]:
sem_eval_df = pd.read_csv(semeval_file_path, header=None)
sem_eval_df.head()

In [0]:
risk_df = pd.read_csv(risk_file_path, header=None)
risk_df.head()

In [0]:

nato_sfa_df = pd.read_csv(nato_sfa_file_path, header=None)
nato_sfa_df.head()


In [0]:
ce_me_df = pd.read_csv(ce_me_file_path, header=None)
ce_me_df.head()

In [0]:

warnings.filterwarnings("ignore", category=RuntimeWarning) 

dataset_dfs = [sem_eval_df, nato_sfa_df, risk_df, ce_me_df]
dataset_names = ['SemEval', 'NATO-SFA', 'Risk Models', 'CE Pairs']
print('Dataset, accuracy, Precision, Recall, F1-score')

for dataset_df, dataset_name in zip(dataset_dfs, dataset_names):
  seq_a_list = dataset_df[0].tolist()
  seq_b_list = dataset_df[1].tolist()
  labels = [1 if label=='causal' else 0 for label in dataset_df[2].tolist()]

  input_ids = []
  token_type_ids = []
  attention_masks = []

  for seq_a, seq_b in zip(seq_a_list, seq_b_list):
    encoded_data = tokenizer.encode_plus(text=seq_a, text_pair=seq_b, max_length=10, pad_to_max_length=True)
    input_ids.append(encoded_data['input_ids'])
    token_type_ids.append(encoded_data['token_type_ids'])
    attention_masks.append(encoded_data['attention_mask'])

  scores = evaluate(model, input_ids, token_type_ids, attention_masks, labels)
  print('{}, {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(dataset_name, scores['accuracy'], scores['precision'], scores['recall'], scores['f1_score']))

  # print('Classification accuracy of dataset {0} is {1:0.2%}'.format(dataset_name, scores['matthews_corrcoef_acc']))
