## Claim verification Training

In [1]:
import random
import math
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
# from transformers import RobertaForSequenceClassification, RobertaTokenizerFast, BertConfig
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
import time
import warnings
import numpy as np
from collections import defaultdict
warnings.simplefilter(action='ignore', category=FutureWarning)

TRAIN_PATH = "./data/train-claims.json"
EVI_PATH = "./data/evidence.json"
PAD = "[PAD]"
CLS = '[CLS]'
SEP = '[SEP]'
PAD_LEN = 50
RANDOM_SEED = 42
SUPPORTS = "SUPPORTS"
REFUTES = "REFUTES"
NOT_ENOUGH_INFO = "NOT_ENOUGH_INFO"
DISPUTED = "DISPUTED"
RELATED = 1
MAX_EVI = 3
NOT_RELATED = 0
num_class = 4
label_trans = {SUPPORTS: 0, REFUTES: 1, NOT_ENOUGH_INFO: 2, DISPUTED: 3}

In [2]:

def relation_sep(claim, evidence, max_evi):
    ## Match the input into Bert input formate, including adding [CLS] and [SEP]
    ## inputs are tokens
    output = [CLS] + claim + [SEP]
    seg_li = [0 for _ in range(len(claim)+2)]
    cur_seg = 1
    for evi in evidence:
        output += evi + [SEP]
        seg_li += [cur_seg for _ in range(len(evi)+1)]
        # cur_seg += 1
    for i in range(len(evidence), max_evi):
        output += [PAD for _ in range(len(evi))] + [SEP]
        seg_li += [cur_seg for _ in range(len(evi)+1)]
        # cur_seg += 1
    return output, seg_li

## read file
claim_df = pd.read_json(TRAIN_PATH, orient="index")
evi_df = pd.read_json(EVI_PATH, orient="index").rename({0: "evi_text"}, axis=1)
veri_df = claim_df.rename({"claim_text": "claim", "claim_label": "label", "evidences": "evidence"}, axis=1).reset_index(drop=True)
veri_df["evidence"] = veri_df.apply(lambda x: [evi_df.loc[i].values[0] for i in x["evidence"]], axis=1)
veri_df["evidence"] = veri_df.apply(lambda x: x.evidence if len(x.evidence) <= MAX_EVI else x.evidence[:MAX_EVI], axis=1)

## Resample with different order of evidence
sample_pos = 0.4
new_sample_df = pd.DataFrame(columns = ["claim", "label", "evidence"])
for cur_record in veri_df.values:
    if random.random() < sample_pos and len(cur_record[2])>1:
        new_sample = random.sample(cur_record[2], len(cur_record[2]))
        while(new_sample != cur_record[2]):
            new_sample = random.sample(cur_record[2], len(cur_record[2]))
        new_sample_df = new_sample_df.append({"claim": cur_record[0], "evidence": new_sample, "label": cur_record[1]}, ignore_index=True)


veri_df = pd.concat([veri_df, new_sample_df]).reset_index()
print(veri_df.head(1))


   index                                              claim     label  \
0      0  Not only is there no scientific evidence that ...  DISPUTED   

                                            evidence  
0  [At very high concentrations (100 times atmosp...  


In [3]:

def pad_sen(input, max_len):
    ## Add padding to sequence
    if len(input) > max_len: return input[:max_len]
    return input+[PAD for _ in range(max_len-len(input))]

def saperate_evi(claim, evidence):
    ## seperate and concatenate claim and evidence
    output = [CLS] + claim + [SEP] + evidence + [SEP]
    seg_li = [0 for _ in range(len(claim)+2)] + [1 for _ in range(len(evidence)+1)]
    return output, seg_li

class RelClassifier(nn.Module):
    def __init__(self, num_class, dropout = 0.1):
        super(RelClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification")
        
        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        self.linear = nn.Linear(768, num_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.num_class = num_class

    def forward(self, seq, attn_masks, seg_li):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''
        
        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_li, output_hidden_states=True)
        cont_reps = outputs.hidden_states[-1][:,0]
        dropout_output = self.dropout(cont_reps)
        linear_output = self.linear(dropout_output)
        return linear_output
    
def get_accuracy(output, labels):
    # return accuracy of the prediction based on class score
    return (output.argmax(dim=1) == labels.argmax(dim=1)).sum().item() / len(labels)

def evaluate(b_model, criterion, dataloader, gpu):
    ## evaluate accuracy and loss
    b_model.eval()
    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, seg_li, labels in dataloader:
            seq, attn_masks, seg_li, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu), labels.cuda(gpu)
            output = b_model(seq, attn_masks, seg_li).cuda(gpu)
            mean_loss += criterion(output, labels)
            mean_acc += get_accuracy(output, labels)
            count += 1

    return mean_acc / count, mean_loss / count

def train(b_model, criterion, opti, train_loader, dev_loader, max_eps, gpu, file_pre = "model_"):
    ## Train model
    best_acc = 0
    best_loss = 99
    st = time.time()
    for ep in range(max_eps):
        
        b_model.train()
        for it, (seq, attn_masks, seg_li, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, seg_li, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            output = b_model(seq, attn_masks, seg_li)
            
            #Computing loss
            loss = criterion(output, labels)

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
            
            ## print
            if it % 30 == 0:
                acc = get_accuracy(output, labels)
                print("O: ", output[:3])
                print("L: ", labels[:3])
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()
        
        ## save best model
        dev_acc, dev_loss = evaluate(b_model, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc and dev_loss <= best_loss:
            print("Accuracy improved from {} to {}, Loss improved from {} to {}, saving model...".format(best_acc, dev_acc, best_loss, dev_loss))
            print("Saved: " + file_pre + 'model.dat')
            best_acc = dev_acc
            best_loss = dev_loss
            torch.save(b_model.state_dict(), file_pre + 'model.dat')



In [4]:
class ValDataset(Dataset):
    def __init__(self, input_df, max_len_claim, max_len_evi, max_evi, num_class):
        self.df = input_df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len_claim = max_len_claim
        self.max_len_evi = max_len_evi
        self.max_evi = max_evi
        self.num_class = num_class
            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        ## claim_text, claim_label, evidences
        #Selecting the sentence and label at the specified index in the data frame
        claim = self.df.loc[index, 'claim']
        label = self.df.loc[index, 'label']
        evidence = self.df.loc[index, 'evidence']

        # Preprocessing the claim, tokenization, padding, getting attention tag, getting category tag, put to GPU
        claim = self.tokenizer.tokenize(claim)
        evidence = [self.tokenizer.tokenize(i) for i in evidence]
        claim = pad_sen(claim, self.max_len_claim)
        evidence = [pad_sen(i, self.max_len_evi) for i in evidence]
        input_token, seg_li = relation_sep(claim, evidence, self.max_evi)
        attn_mask = [1 if token != PAD else 0 for token in input_token]
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        input_id = torch.tensor(input_token) #Converting the list to a pytorch tensor
        attn_mask = torch.tensor(attn_mask)
        seg_li = torch.tensor(seg_li)
        labels = np.zeros(self.num_class)
        np.put(labels, label, 1)
        label = torch.tensor(labels)

        
        return input_id, attn_mask, seg_li, label


### Preprocess and Training

In [5]:
TEST_FRA = 0.2

# read data
train_veri_df, test_veri_df = train_test_split(veri_df, test_size = TEST_FRA, shuffle=True, random_state=RANDOM_SEED)
train_veri_df["label"] = train_veri_df["label"].apply(lambda x: label_trans[x])
test_veri_df["label"] = test_veri_df["label"].apply(lambda x: label_trans[x])
train_veri_df = train_veri_df.reset_index(drop=True)
test_veri_df = test_veri_df.reset_index(drop=True)


In [6]:

## read trained data
PAD_CLAIM = 64
PAD_EVIDENCE = 64
BATCH_SIZE = 10
NO_WORKER = 4
MAX_EVI = 3
num_class_val = 4


# prepare data loader
train_veri_ds = ValDataset(train_veri_df, PAD_CLAIM, PAD_EVIDENCE, MAX_EVI, num_class)
train_veri_dl = DataLoader(train_veri_ds, batch_size = BATCH_SIZE, num_workers = NO_WORKER)

test_veri_ds = ValDataset(test_veri_df, PAD_CLAIM, PAD_EVIDENCE, MAX_EVI, num_class)
test_veri_dl = DataLoader(test_veri_ds, batch_size = BATCH_SIZE, num_workers = NO_WORKER)


In [7]:
num_epoch = 8
labels = range(num_class_val)

# train model
class_weights = compute_class_weight("balanced", classes=labels, y=train_veri_df.label.values)
class_weights = torch.tensor(class_weights, dtype=torch.float).cuda()
b_model = RelClassifier(num_class_val, dropout=0.5)
opti = optim.Adam(b_model.parameters(), lr = 2e-5)
criterion = nn.CrossEntropyLoss(weight = class_weights)
gpu = 0

# b_model.load_state_dict(torch.load("sstcls_0.dat"))
b_model.cuda(gpu) #Enable gpu support for the model
train(b_model, criterion, opti, train_veri_dl, test_veri_dl, num_epoch, gpu, file_pre = "val_")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

O:  tensor([[-0.6654,  0.2820, -0.3985,  0.7442],
        [ 0.2701,  0.2534, -0.1323,  0.0957],
        [ 0.2245, -0.3873, -0.1946,  0.6253]], device='cuda:0',
       grad_fn=<SliceBackward0>)
L:  tensor([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], device='cuda:0', dtype=torch.float64)
Iteration 0 of epoch 0 complete. Loss: 1.2129192155815567; Accuracy: 0.3; Time taken (s): 2.466339111328125
O:  tensor([[-0.0678, -0.4956, -0.2541,  0.4276],
        [ 0.9738, -0.4358,  0.0339, -0.2770],
        [ 0.2607, -0.4874, -0.0370,  0.2732]], device='cuda:0',
       grad_fn=<SliceBackward0>)
L:  tensor([[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.]], device='cuda:0', dtype=torch.float64)
Iteration 30 of epoch 0 complete. Loss: 1.5902034730444428; Accuracy: 0.4; Time taken (s): 11.029784679412842
O:  tensor([[ 0.9879, -1.5796,  0.1103, -0.1894],
        [ 0.5925, -0.6061,  0.2398, -0.0989],
        [-0.9534, -0.3553,  1.0800, -0.7018]], device='cud

### Evaluating

In [8]:
from sklearn.metrics import confusion_matrix

## define prediction
def predict_val(b_model, dataloader, gpu):
    b_model.eval()
    pred = []
    with torch.no_grad():
        for seq, attn_masks, seg_li, _ in dataloader:
            seq, attn_masks, seg_li = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu)
            output = b_model(seq, attn_masks, seg_li)

            ## match label
            pred += torch.argmax(output, dim=1).tolist()

    return pred

## train
gpu = 0 
true_label = test_veri_df.label.values.tolist()
pred_label = predict_val(b_model, test_veri_dl, gpu)

## Print out confusion matrix
print(pred_label)
print(true_label)
print(confusion_matrix(true_label, pred_label))
print(np.mean(np.array(true_label) == np.array(pred_label)))

[0, 0, 1, 0, 0, 1, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 1, 0, 2, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 0, 3, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 2, 3, 2, 1, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 1, 2, 2, 0, 0, 0, 3, 1, 0, 2, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1, 2, 0, 1, 0, 1, 2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0, 1, 2, 0, 2, 3, 0, 2, 0, 2, 1, 1, 2, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 0, 0, 2, 0, 0, 2, 3, 0, 0, 2, 0, 3, 0, 0, 2, 3, 3, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 3, 2, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 3, 0, 2, 0, 0, 3, 1, 2, 2, 0, 1, 2, 1, 0, 1, 2, 2, 1, 0, 2, 0, 2, 3, 0, 2, 0, 2, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 1, 0, 0, 1, 0, 1, 2, 0, 3, 2, 0, 2, 3, 0, 0, 2, 0, 0, 2, 1, 2, 0, 0, 0, 2, 3, 1, 2, 1, 3, 0, 2, 0, 1, 3, 2, 2, 1, 0, 0, 1, 2, 0, 2, 0, 1, 3, 2, 1, 0, 2, 0, 2, 1, 0, 0, 2, 