## BM25 + NER + BERT

In [1]:
from rank_bm25 import BM25Okapi
import pandas as pd
import string
import json
import spacy
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from collections import defaultdict
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



# TRAIN_PATH = "./data/train-claims.json"
# TRAIN_PATH = "./data/dev-claims.json"
TRAIN_PATH = "./data/test-claims-unlabelled.json"
EVI_PATH = "./data/evidence.json"
NER_PATH = "./data/spacy_ner_tot.json" ## MUST
VAL_MODEL_PATH = "./val_model.dat" ## MUST
REL_MODEL_PATH = "./rela_model.dat" ## MUST
BM25_PATH = "./data/bm_claim_test.pkl" ## Producable
VERI_DF_PATH = './data/train_filt.pkl'

PAD_LEN = 50
RANDOM_SEED = 42
SUPPORTS = "SUPPORTS"
REFUTES = "REFUTES"
NOT_ENOUGH_INFO = "NOT_ENOUGH_INFO"
DISPUTED = "DISPUTED"
RELATED = 1
NOT_RELATED = 0

spacy_nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
puncu_remove = str.maketrans('', '', string.punctuation)



def preprocess(input):
    # tokenize
    tokens = word_tokenize(input.lower().translate(puncu_remove))
    # remove stop words, lemmatize
    tokens = [lemmatizer.lemmatize(i) for i in tokens if i not in stop_words]
    
    return tokens

def get_n_bm25(bm25, text, n):
    # Retrive and select top n bm25 score
    token = preprocess(text)
    scores = bm25.get_scores(token)
    scores = np.array([i for i in enumerate(scores)])
    top_n_indices = [[int(i[0]), float(i[1])] for i in scores[scores[:, 1].argsort()][-n:]]
    print([i[0] for i in top_n_indices])
    return [i[0] for i in top_n_indices]

2023-05-14 23:50:16.437049: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-14 23:50:16.660449: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-14 23:50:19.411804: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-14 23:50:19.437684: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bu

In [2]:
claim_id_df = pd.read_json(TRAIN_PATH, orient="index")
evi_id_df = pd.read_json(EVI_PATH, orient="index").rename({0: "evi_text"}, axis=1)
claim_df = claim_id_df.reset_index(names="claim")
evi_df = evi_id_df.reset_index(names="evidence")

## MB25

In [3]:
# ## 43 min
# max_evi = 10

# ## preparing MB25 ranking
# with tf.device('/GPU:0'):
#     document = evi_df["evi_text"].values
#     document = [preprocess(i) for i in document]
#     bm25 = BM25Okapi(document)

# ## rank and find top n for each claim
# with tf.device('/GPU:0'):
#     pred_df = claim_df[["claim", "claim_text"]]
#     # pred_df = claim_df[["claim", "claim_text", "claim_label", "evidences"]]
#     pred_df["pred_evidence"] = pred_df.apply(lambda x: [evi_df.loc[i]["evidence"] for i in get_n_bm25(bm25, x["claim_text"], max_evi)], axis=1)
#     print(pred_df.head(1))

# pred_df.to_pickle(BM25_PATH)


MB25 Evaluation

In [4]:
pred_df = pd.read_pickle(BM25_PATH)

# tot_cr = 0
# tot_np = 0
# tot_more = 0
# for true_e, pred in pred_df[["evidences", "pred_evidence"]].values:
#     cur_cr = 0
#     cur_more = 0
#     for evi in pred:
#         if evi in true_e:
#             cur_cr += 1
#         else:
#             cur_more += 1
#     cur_np = len(true_e) - cur_cr
#     tot_cr += cur_cr
#     tot_np += cur_np
#     tot_more += cur_more

# print("True Positive:", tot_cr)
# print("False Positive:", tot_more)
# print("False Negative:", tot_np)
# print("Percentage Retrieval: ", tot_cr / (tot_cr + tot_np))


## NER matching

In [5]:
## read premade NER file
with open(NER_PATH, "r") as cur_file:
        ner_dict = json.loads(cur_file.read())

In [6]:
max_evi = 0
ner_hurdle = 500


ner_df = pd.DataFrame(columns=["claim", "evidence"])

## Matching NER in claim with NER in evidence
for cur_id, cur_text in claim_df[["claim", "claim_text"]].values[:100]:
    doc = spacy_nlp(cur_text)
    for ent in doc.ents:
        if ent.label_ in ner_dict and ent.text.lower().translate(puncu_remove) in ner_dict[ent.label_]:
            if len(ner_dict[ent.label_][ent.text.lower().translate(puncu_remove)]) > ner_hurdle: continue
            for evi in ner_dict[ent.label_][ent.text.lower().translate(puncu_remove)]:
                ner_df = ner_df.append({"claim": cur_id, "evidence":evi}, ignore_index=True)
            
print(ner_df.head(1))


        claim         evidence
0  claim-1003  evidence-561136


NER Evaluation

In [7]:

# ner_dict = defaultdict(list)
# ner_ev_df = pd.DataFrame(columns=["claim", "evidences", "pred_evidence"])

# for record in ner_df.values:
#     ner_dict[record[0]].append(record[1])

# for record in claim_df.values:
#     if record[0] in ner_dict:
#         ner_ev_df = ner_ev_df.append({"claim": record[0], "pred_evidence": ner_dict[record[0]], "evidences": record[3]}, ignore_index = True)
#     else:
#         ner_ev_df = ner_ev_df.append({"claim": record[0], "pred_evidence": [], "evidences": record[3]}, ignore_index = True)

# tot_cr = 0
# tot_np = 0
# tot_more = 0
# for true_e, pred in ner_ev_df[["evidences", "pred_evidence"]].values:
#     cur_cr = 0
#     cur_more = 0
#     for evi in pred:
#         if evi in true_e:
#             cur_cr += 1
#         else:
#             cur_more += 1
#     cur_np = len(true_e) - cur_cr
#     tot_cr += cur_cr
#     tot_np += cur_np
#     tot_more += cur_more

# print("True Positive:", tot_cr)
# print("False Positive:", tot_more)
# print("False Negative:", tot_np)
# print("Percentage Retrieval: ", tot_cr / (tot_cr + tot_np))

preprocessing for bert model, concatenanting 

In [8]:
rel_df = pd.DataFrame(columns=["claim", "evidence"])
for claim, evidence in pred_df[["claim", "pred_evidence"]].values:
    for evi in evidence:
        rel_df = rel_df.append({"claim": claim, "evidence":evi}, ignore_index=True)
rel_df = pd.concat([rel_df, ner_df])
rel_df = rel_df.reset_index(drop=True).rename({"claim": "claim_id", "evidence": "evidence_id"}, axis=1)
print(rel_df)


        claim_id       evidence_id
0     claim-1001   evidence-334722
1     claim-1001   evidence-570367
2     claim-1001   evidence-445405
3     claim-1001    evidence-67154
4     claim-1001   evidence-360246
...          ...               ...
8878  claim-2840  evidence-1087946
8879  claim-2840  evidence-1116441
8880  claim-2840  evidence-1144991
8881  claim-2840  evidence-1160433
8882  claim-2840  evidence-1170600

[8883 rows x 2 columns]


In [9]:
rel_df['claim'] = rel_df.apply(lambda x: claim_id_df.loc[x["claim_id"]]["claim_text"], axis=1)
rel_df['evidence'] = rel_df.apply(lambda x: evi_id_df.loc[x["evidence_id"]]["evi_text"], axis=1)
rel_df["label"] = 0

print(rel_df.columns)

Index(['claim_id', 'evidence_id', 'claim', 'evidence', 'label'], dtype='object')


## Relevance modeling

In [10]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
import torch.nn as nn
import torch
import time


PAD = "[PAD]"
CLS = '[CLS]'
SEP = '[SEP]'
PAD_LEN = 50
RANDOM_SEED = 42
gpu = 0
SUPPORTS = "SUPPORTS"
REFUTES = "REFUTES"
NOT_ENOUGH_INFO = "NOT_ENOUGH_INFO"
DISPUTED = "DISPUTED"
RELATED = 1
NOT_RELATED = 0
label_trans = {SUPPORTS: 0, REFUTES: 1, NOT_ENOUGH_INFO: 2, DISPUTED: 3}

In [11]:

class RelDataset(Dataset):
    def __init__(self, input_df, max_len_claim, max_len_evi, num_class):
        self.df = input_df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len_claim = max_len_claim
        self.max_len_evi = max_len_evi
        self.num_class = num_class
            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        ## claim_text, claim_label, evidences
        #Selecting the sentence and label at the specified index in the data frame
        claim = self.df.loc[index, 'claim']
        label = self.df.loc[index, 'label']
        evidence = self.df.loc[index, 'evidence']

        #Preprocessing the claim
        claim = self.tokenizer.tokenize(claim)
        evidence = self.tokenizer.tokenize(evidence)
        claim = pad_sen(claim, self.max_len_claim)
        evidence = pad_sen(evidence, self.max_len_evi)
        input_token, seg_li = saperate_evi(claim, evidence)
        attn_mask = [1 if token != PAD else 0 for token in input_token]
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        input_id = torch.tensor(input_token) #Converting the list to a pytorch tensor
        attn_mask = torch.tensor(attn_mask)
        seg_li = torch.tensor(seg_li)
        labels = np.zeros(self.num_class)
        np.put(labels, label, 1)
        label = torch.tensor(labels)
        
        # print(input_token, attn_mask, seg_li)
        
        return input_id, attn_mask, seg_li, label

def pad_sen(input, max_len):
    ## Add padding to sequence
    if len(input) > max_len: return input[:max_len]
    return input+[PAD for _ in range(max_len-len(input))]

def saperate_evi(claim, evidence):
    ## seperate and concatenate claim and evidence
    output = [CLS] + claim + [SEP] + evidence + [SEP]
    seg_li = [0 for _ in range(len(claim)+2)] + [1 for _ in range(len(evidence)+1)]
    return output, seg_li

class RelClassifier(nn.Module):
    def __init__(self, num_class, dropout = 0.1):
        super(RelClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification")
        
        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        self.linear = nn.Linear(768, num_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.num_class = num_class

    def forward(self, seq, attn_masks, seg_li):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''
        
        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_li, output_hidden_states=True)
        cont_reps = outputs.hidden_states[-1][:,0]
        dropout_output = self.dropout(cont_reps)
        linear_output = self.linear(dropout_output)
        return linear_output
    
def get_accuracy(output, labels):
    # return accuracy of the prediction based on class score
    return (output.argmax(dim=1) == labels.argmax(dim=1)).sum().item() / len(labels)

def evaluate(b_model, criterion, dataloader, gpu):
    ## evaluate accuracy and loss
    b_model.eval()
    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, seg_li, labels in dataloader:
            seq, attn_masks, seg_li, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu), labels.cuda(gpu)
            output = b_model(seq, attn_masks, seg_li).cuda(gpu)
            mean_loss += criterion(output, labels)
            mean_acc += get_accuracy(output, labels)
            count += 1

    return mean_acc / count, mean_loss / count

def train(b_model, criterion, opti, train_loader, dev_loader, max_eps, gpu, file_pre = "model_"):
    ## Train model
    best_acc = 0
    best_loss = 99
    st = time.time()
    for ep in range(max_eps):
        
        b_model.train()
        for it, (seq, attn_masks, seg_li, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, seg_li, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            output = b_model(seq, attn_masks, seg_li)
            
            #Computing loss
            loss = criterion(output, labels)

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
            
            ## print
            if it % 30 == 0:
                acc = get_accuracy(output, labels)
                print("O: ", output[:3])
                print("L: ", labels[:3])
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()
        
        ## save best model
        dev_acc, dev_loss = evaluate(b_model, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc and dev_loss <= best_loss:
            print("Accuracy improved from {} to {}, Loss improved from {} to {}, saving model...".format(best_acc, dev_acc, best_loss, dev_loss))
            print("Saved: " + file_pre + 'model.dat')
            best_acc = dev_acc
            best_loss = dev_loss
            torch.save(b_model.state_dict(), file_pre + 'model.dat')



In [12]:
PAD_CLAIM = 64
PAD_EVIDENCE = 64
BATCH_SIZE = 32
NO_WORKER = 4
num_class_rel = 2
gpu = 0 #gpu ID


## load dataset and model
rel_ds = RelDataset(rel_df[['claim', 'evidence', 'label']], PAD_CLAIM, PAD_EVIDENCE, num_class_rel)
rel_dl = DataLoader(rel_ds, batch_size = BATCH_SIZE, num_workers = NO_WORKER)
rel_mod = RelClassifier(num_class_rel)
rel_mod.load_state_dict(torch.load(REL_MODEL_PATH))
rel_mod.cuda(gpu) #Enable gpu support for the model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

RelClassifier(
  (bert_layer): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=

### Predicting

In [13]:
def predict_rel(net, dataloader, gpu):
    net.eval()
    pred = []
    max_score = []
    with torch.no_grad():
        for seq, attn_masks, seg_li, _ in dataloader:
            seq, attn_masks, seg_li = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu)
            output = net(seq, attn_masks, seg_li)

            for record in output:
                pred.append(record.argmax().item())
                max_score.append(record.max().item())

    return pred, max_score

gpu = 0 #gpu ID

In [14]:
pred_label, log_score = predict_rel(rel_mod, rel_dl, gpu)


rel_df["score"] = log_score
rel_df["pred_label"] = pred_label

### Reformate for next training

In [49]:
SCORE_HEARDLE = 1
MAX_EVI = 3

## Selet evidence selection
train_df = rel_df.loc[rel_df["score"] > SCORE_HEARDLE]
train_df = train_df.loc[train_df["pred_label"] == RELATED]
train_df = train_df.drop("pred_label", axis = 1)
top_dict = defaultdict(list)

## Sep dataframe into dictionary and merge
for record in train_df.values:
    top_dict[record[0]].append((record[5], record[1]))

for key in top_dict.keys():
    candi = sorted(top_dict[key], reverse = True)
    candi = candi[:MAX_EVI] if len(candi) > MAX_EVI else candi
    candi = [i[-1] for i in candi]
    top_dict[key] = candi

print(top_dict)



defaultdict(<class 'list'>, {'claim-1001': ['evidence-895046', 'evidence-277435'], 'claim-1003': ['evidence-829081', 'evidence-430670', 'evidence-18442'], 'claim-1009': ['evidence-482300', 'evidence-649967', 'evidence-74163'], 'claim-1020': ['evidence-382341', 'evidence-754568', 'evidence-195294'], 'claim-1028': ['evidence-18609', 'evidence-154614', 'evidence-644769'], 'claim-1034': ['evidence-572246', 'evidence-23786', 'evidence-164796'], 'claim-1048': ['evidence-970539', 'evidence-508793', 'evidence-735431'], 'claim-109': ['evidence-4318', 'evidence-508668', 'evidence-384007'], 'claim-1135': ['evidence-1198124', 'evidence-647256', 'evidence-265967'], 'claim-1141': ['evidence-1139877', 'evidence-591257', 'evidence-591257'], 'claim-1156': ['evidence-1142116', 'evidence-78739', 'evidence-409638'], 'claim-1173': ['evidence-987493', 'evidence-666596', 'evidence-127142'], 'claim-1202': ['evidence-107116'], 'claim-1212': ['evidence-697698', 'evidence-594802', 'evidence-86066'], 'claim-1230'

In [50]:

## Convert dictionary to dataframe
# veri_df = pd.DataFrame(columns = ["claim", "evidence", "label", "evidence_id", "claim_id"])
veri_df = pd.DataFrame(columns = ["claim", "evidence", "evidence_id", "claim_id"])

for key in top_dict.keys():
    claim = claim_id_df.loc[key].values[0]
    # label = claim_id_df.loc[key].values[1]
    evidence = [evi_id_df.loc[i].values[0] for i in top_dict[key]]
    # veri_df = veri_df.append({"claim": claim, "evidence": evidence, "label": label, "evidence_id": top_dict[key], "claim_id": key}, ignore_index=True)
    veri_df = veri_df.append({"claim": claim, "evidence": evidence, "evidence_id": top_dict[key], "claim_id": key}, ignore_index=True)

veri_df.to_pickle(VERI_DF_PATH)

In [53]:
veri_df

Unnamed: 0,claim,evidence,evidence_id,claim_id
0,‘This study goes beyond statistical correlatio...,[Correlations have been identified between hig...,"[evidence-895046, evidence-277435]",claim-1001
1,"A recent study in Nature Geoscience, for insta...","[Since the last glacial maximum about 20,000 y...","[evidence-829081, evidence-430670, evidence-18...",claim-1003
2,‘Arctic ice conditions have been tracking at r...,[Arctic sea ice extent ice hit an all-time low...,"[evidence-482300, evidence-649967, evidence-74...",claim-1009
3,“The global reef crisis does not necessarily m...,[With widespread degradation of highly biodive...,"[evidence-382341, evidence-754568, evidence-19...",claim-1020
4,A second coat of paint has much less of an eff...,"[A dog 's coat may be a double coat, made up o...","[evidence-18609, evidence-154614, evidence-644...",claim-1028
...,...,...,...,...
132,The Alaskan tundra is warming so quickly it ha...,[Recent warming is followed by carbon dioxide ...,"[evidence-705706, evidence-855683, evidence-28...",claim-952
133,“Arctic land stores about twice as much carbon...,[Both the decay and the burning of wood releas...,"[evidence-22100, evidence-450316, evidence-620...",claim-972
134,“Warm weather worsened the most recent five-ye...,"[Between 2011 and 2014, California experienced...","[evidence-957389, evidence-434312, evidence-17...",claim-979
135,Each year sees the disappearance of thousands ...,[In August 2002 a flood caused by over a week ...,"[evidence-1116814, evidence-1117775, evidence-...",claim-1425


retrieval bert relevance evaluation

In [18]:

# rel_bert_dict = defaultdict(list)
# bert_ev_df = pd.DataFrame(columns=["claim", "evidences", "pred_evidence"])

# for record in veri_df.values:
#     rel_bert_dict[record[4]] = record[3]

# for record in claim_df.values:
#     if record[0] in rel_bert_dict:
#         bert_ev_df = bert_ev_df.append({"claim": record[0], "pred_evidence": rel_bert_dict[record[0]], "evidences": record[3]}, ignore_index = True)
#     else:
#         bert_ev_df = bert_ev_df.append({"claim": record[0], "pred_evidence": [], "evidences": record[3]}, ignore_index = True)

# tot_cr = 0
# tot_np = 0
# tot_more = 0
# for true_e, pred in bert_ev_df[["evidences", "pred_evidence"]].values:
#     cur_cr = 0
#     cur_more = 0
#     for evi in pred:
#         if evi in true_e:
#             cur_cr += 1
#         else:
#             cur_more += 1
#     cur_np = len(true_e) - cur_cr
#     tot_cr += cur_cr
#     tot_np += cur_np
#     tot_more += cur_more

# print("True Positive:", tot_cr)
# print("False Positive:", tot_more)
# print("False Negative:", tot_np)
# print("Percentage Retrieval: ", tot_cr / (tot_cr + tot_np))



# Claim verification

In [54]:
def relation_sep(claim, evidence, max_evi):
    ## inputs are tokens
    output = [CLS] + claim + [SEP]
    seg_li = [0 for _ in range(len(claim)+2)]
    cur_seg = 1
    for evi in evidence:
        output += evi + [SEP]
        seg_li += [cur_seg for _ in range(len(evi)+1)]
        # cur_seg += 1
    for i in range(len(evidence), max_evi):
        output += [PAD for _ in range(len(evi))] + [SEP]
        seg_li += [cur_seg for _ in range(len(evi)+1)]
        # cur_seg += 1
    return output, seg_li

class ValDataset(Dataset):
    def __init__(self, input_df, max_len_claim, max_len_evi, max_evi, num_class):
        self.df = input_df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len_claim = max_len_claim
        self.max_len_evi = max_len_evi
        self.max_evi = max_evi
        self.num_class = num_class
            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        ## claim_text, claim_label, evidences
        #Selecting the sentence and label at the specified index in the data frame
        claim = self.df.loc[index, 'claim']
        label = self.df.loc[index, 'label']
        evidence = self.df.loc[index, 'evidence']

        #Preprocessing the claim
        claim = self.tokenizer.tokenize(claim)
        evidence = [self.tokenizer.tokenize(i) for i in evidence]
        claim = pad_sen(claim, self.max_len_claim)
        evidence = [pad_sen(i, self.max_len_evi) for i in evidence]
        input_token, seg_li = relation_sep(claim, evidence, self.max_evi)
        attn_mask = [1 if token != PAD else 0 for token in input_token]
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        input_id = torch.tensor(input_token) #Converting the list to a pytorch tensor
        attn_mask = torch.tensor(attn_mask)
        seg_li = torch.tensor(seg_li)
        labels = np.zeros(self.num_class)
        np.put(labels, label, 1)
        label = torch.tensor(labels)
        return input_id, attn_mask, seg_li, label

def predict_val(net, dataloader, gpu):
    net.eval()
    pred = []
    with torch.no_grad():
        for seq, attn_masks, seg_li, _ in dataloader:
            seq, attn_masks, seg_li = seq.cuda(gpu), attn_masks.cuda(gpu), seg_li.cuda(gpu)
            ## HERE
            output = net(seq, attn_masks, seg_li)
            # output = np.array(output.tolist())
            for score in output:
                if score[0].item() < 0.6 and score[1].item() < 0.6:
                    pred.append(label_trans[NOT_ENOUGH_INFO])
                    
                elif np.absolute(score[0].item() - score[1].item()) < 0.05:
                    pred.append(label_trans[DISPUTED])
                else:
                    pred.append(torch.argmax(score).item())
    return pred

In [55]:
veri_df = pd.read_pickle(VERI_DF_PATH)

val_df = veri_df[["claim", "evidence", "evidence_id", "claim_id"]]
val_df["label"] = 0

# read data
# val_df = veri_df[["claim", "evidence", "label", "evidence_id", "claim_id"]]
# val_df["label"] = veri_df.apply(lambda x: label_trans[x["label"]], axis = 1)


In [56]:
# read data / model and set configuration
MAX_EVI = 3
num_class_val = 4
val_mod = RelClassifier(num_class_val)
val_mod.load_state_dict(torch.load(VAL_MODEL_PATH))
val_mod.cuda(gpu) #Enable gpu support for the model

val_ds = ValDataset(val_df[['claim', 'evidence', 'label']], PAD_CLAIM, PAD_EVIDENCE, MAX_EVI, num_class_val)
val_dl = DataLoader(val_ds, batch_size = BATCH_SIZE, num_workers = NO_WORKER)


gpu = 0 #gpu ID
pred_label = predict_val(val_mod, val_dl, gpu)
print(pred_label)
val_df["pred_label"] = pred_label


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 1, 0, 2, 0, 2, 2, 0, 3, 1, 1, 1, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 3, 2, 0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 1, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 0, 1, 0, 3, 0, 0, 0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 1, 1, 0, 2, 2, 2, 0, 1, 0, 2, 2, 1, 3, 2, 2, 1, 1, 0, 1, 2, 2, 0, 2, 3, 2, 2, 2, 0, 1, 2]


Classification Bert evaluation

In [22]:
# print(confusion_matrix(val_df.label.values.tolist(), pred_label))
# print(pred_label)
# print(val_df.label.values.tolist())
# print(np.mean(np.array(val_df.label.values.tolist()) == np.array(pred_label)))

## Make output

In [57]:
rev_label_trans = {0: SUPPORTS, 1: REFUTES, 2: NOT_ENOUGH_INFO, 3: DISPUTED}

red_df = val_df[["claim", "pred_label", "evidence_id" ,"claim_id"]].rename({"pred_label": "label"}, axis=1)
red_df = red_df.set_index("claim_id")
known_claim = red_df.index.values
drop_df = pd.DataFrame(columns=["claim", "claim_text", "claim_label", "evidences"])
for record in claim_df.values:
    if record[0] in known_claim:
        drop_df = drop_df.append({"claim": record[0], "claim_text": red_df.loc[record[0]]["claim"], "claim_label": rev_label_trans[red_df.loc[record[0]]["label"]], "evidences": red_df.loc[record[0]]["evidence_id"]}, ignore_index=True)
    else:
        drop_df = drop_df.append({"claim": record[0], "claim_text": record[1], "claim_label": NOT_ENOUGH_INFO, "evidences": []}, ignore_index=True)
drop_df = drop_df.set_index("claim")
print(drop_df)
drop_df.to_json("test-claims-predictions.json", orient="index")



                                                   claim_text  \
claim                                                           
claim-1001  ‘This study goes beyond statistical correlatio...   
claim-1003  A recent study in Nature Geoscience, for insta...   
claim-1009  ‘Arctic ice conditions have been tracking at r...   
claim-1020  “The global reef crisis does not necessarily m...   
claim-1028  A second coat of paint has much less of an eff...   
...                                                       ...   
claim-910   The cement, iron and steel, and petroleum refi...   
claim-942   ‘We could be decades too fast, or decades too ...   
claim-952   The Alaskan tundra is warming so quickly it ha...   
claim-972   “Arctic land stores about twice as much carbon...   
claim-979   “Warm weather worsened the most recent five-ye...   

                claim_label                                          evidences  
claim                                                                    

In [58]:
count_dict  = defaultdict(int)
for record in drop_df.values:
    count_dict[record[1]] += 1
    continue
print(record)

print(count_dict)

['“Warm weather worsened the most recent five-year drought, which included the driest four-year period on record in terms of statewide precipitation.'
 'SUPPORTS'
 list(['evidence-957389', 'evidence-434312', 'evidence-178433'])]
defaultdict(<class 'int'>, {'SUPPORTS': 48, 'NOT_ENOUGH_INFO': 79, 'REFUTES': 21, 'DISPUTED': 5})
