In [12]:
import torch
import os
import json
import pickle

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
with open("/kaggle/input/quantemp/test_claims_quantemp.json", 'rb') as f:
    test_data = json.load(f)
        
test_data[-1]

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


{'crawled_date': '2022-09-24',
 'country_of_origin': 'india',
 'label': 'False',
 'url': 'https://www.indiatoday.in/fact-check/story/fact-check-woman-seen-with-rahul-gandhi-is-not-amulya-leona-student-who-chanted-pak-zindabad-2004338-2022-09-24',
 'lang': 'en',
 'claim': "During the Bharat Jodo Yatra, Rahul Gandhi was seen with Amulya Leona Noronha, the woman who was arrested for saying 'Pakistan Zindabad' during an anti-CAA rally in 2020.",
 'doc': 'AFWA\'s investigation found that the woman seen in the viral photo is not Amulya Leona Noronha. Rahul Gandhi and members of the Congress party, as part of the Bharat Jodo Yatra, have met locals and leaders alike as they move from city to city on foot. Of the many, many photos from the road, one shows Gandhi posing with a young woman. The Congress leader was giving her a side-hug. This photo has been widely shared with some questionable claims. They alleged that the woman in the photo is Amulya Leona Noronha, a student who chanted “Pakistan

# Test Data

In [13]:
model_type = "TEMPORAL" # "TOP5" "DECOMP" "TEMPORAL" "ORACLE"

In [14]:
if model_type == "DECOMP":
    with open("/kaggle/input/quantemp-decomp-data/nli_test_decomposed_reranked_top1.json", 'rb') as f:
        test_top5 = json.load(f)
elif model_type == "TEMPORAL":
    with open("/kaggle/input/quantemp-temporal-rerank/nli_input_test_reranktop5_temporal.json", 'rb') as f:
        test_top5 = json.load(f)
elif model_type == "TOP5":
    with open("/kaggle/input/quantemp-evidence-snippets/test/nli_input_test_reranktop5.json", 'rb') as f:
        test_top5 = json.load(f)
elif model_type == "ORACLE":
    with open("/kaggle/input/quantemp/test_claims_quantemp.json", 'rb') as f:
        test_top5 = json.load(f)

assert len(test_data) == len(test_top5)

# Model Loading

In [15]:
from torch import nn
from transformers import AutoModel, T5ForSequenceClassification
from transformers import T5ForSequenceClassification, T5Tokenizer

class MultiClassClassifier(nn.Module):
  def __init__(self, model_path, labels_count, hidden_dim=512, mlp_dim=256, dropout=0.1, freeze_model=False):
    super().__init__()

    # automodel gives T5Model
    self.model = AutoModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
    self.mlp = nn.Sequential(
        nn.Linear(hidden_dim, mlp_dim),
        nn.ReLU(),
        nn.Linear(mlp_dim, labels_count)
    )

    if freeze_model:
      print("freezing layers")
      for param in self.model.parameters():
          param.requires_grad = False

  def forward(self, input_ids, attention_mask):
    model_out = self.model.encoder.forward(input_ids=input_ids, attention_mask=attention_mask)
    hs = model_out.last_hidden_state
    batch_size, _seq_length, hidden_size = hs.shape
    # take the first token of the last hidden state, because it captures all the context
    x = hs.view(batch_size, -1, hidden_size)[:, 0, :]
    x = self.mlp(x)
    
    # x = self.dropout(x)
    # x = self.mlp(x.float())
    return x

if model_type == "DECOMP":
    tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/fact-checker-nt5/pytorch/claim-decomp-v2/1/model_roberta_large_oracle")
    model = MultiClassClassifier("nielsr/nt5-small-rc1", hidden_dim=512, mlp_dim=256, labels_count=3, freeze_model=False)
    checkpoint = torch.load("/kaggle/input/fact-checker-nt5/pytorch/claim-decomp-v2/1/checkpoint.pt")
    
elif model_type == "TOP5":
    tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/fact-checker-nt5/pytorch/rerank-top5/1/model_roberta_large_oracle")
    model = MultiClassClassifier("nielsr/nt5-small-rc1", hidden_dim=512, mlp_dim=256, labels_count=3, freeze_model=False)
    checkpoint = torch.load("/kaggle/input/fact-checker-nt5/pytorch/rerank-top5/1/checkpoint.pt")
    
elif model_type == "ORACLE":
    tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/fact-checker-nt5/pytorch/no-preprocess/2/nt5_trained_nocos2-20240521T084838Z-001/nt5_trained_nocos2")
    model = MultiClassClassifier("nielsr/nt5-small-rc1", hidden_dim=512, mlp_dim=256, labels_count=3, freeze_model=False)
    checkpoint = torch.load("/kaggle/input/fact-checker-nt5/pytorch/no-preprocess/2/checkpoint.pt")
    
elif model_type == "TEMPORAL":
    tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/fact-checker-nt5/pytorch/temporal-rerank/1/model_roberta_large_oracle")
    model = MultiClassClassifier("nielsr/nt5-small-rc1", hidden_dim=512, mlp_dim=256, labels_count=3, freeze_model=False)
    checkpoint = torch.load("/kaggle/input/fact-checker-nt5/pytorch/temporal-rerank/1/checkpoint.pt")
    
model.load_state_dict(checkpoint)
model.to(device)
print("loaded")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded


In [16]:
taxonomy_labels = {}
labels = {}
for data in [test_data]:
  for d in data:
    d['taxonomy_label'] = d['taxonomy_label'].strip()
    l = d['taxonomy_label']
    if l in taxonomy_labels:
      taxonomy_labels[l] += 1
    else:
      taxonomy_labels[l] = 1

    l = d['label']
    if l in labels:
      labels[l] += 1
    else:
      labels[l] = 1

print(taxonomy_labels)
print(labels)

{'statistical': 1210, 'temporal': 683, 'comparison': 255, 'interval': 347}
{'Conflicting': 598, 'True': 474, 'False': 1423}


In [17]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
LE.fit(list(labels.keys()))

LE_taxonomy = LabelEncoder()
LE_taxonomy.fit(list(taxonomy_labels.keys()))

test_labels = torch.tensor(LE.transform([fact["label"] for fact in test_data]))
test_taxonomy_labels = torch.tensor(LE_taxonomy.transform([fact["taxonomy_label"] for fact in test_data]))

print(list(labels.keys()))
print(test_labels[:20])

print(list(taxonomy_labels.keys()))
print(test_taxonomy_labels[:20])

['Conflicting', 'True', 'False']
tensor([0, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1])
['statistical', 'temporal', 'comparison', 'interval']
tensor([2, 2, 3, 2, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 3, 2, 3, 0])


In [18]:
def get_features_top5_decomp(data):
  features = []
  evidences = []

  for index, fact in enumerate(data):
    claim = fact["claim"]

    feature = "[Claim]:"+claim+"[Questions]:"+fact['decomposed_questions']+"[Evidences]:"+fact["evidence"]
    features.append(feature)
  return features

def get_features_oracle(data):
  features = []
  evidences = []

  for index, fact in enumerate(data):
    claim = fact["claim"]

    feature = "[Claim]:"+claim+"[Evidences]:"+fact["doc"]
    features.append(feature)
  return features

def get_features(data):
  features = []
  evidences = []

  for index, fact in enumerate(data):
    claim = fact["claim"]

    feature = "[Claim]:"+claim+"[Evidences]:"+fact["evidence"]
    features.append(feature)
  return features


if model_type == "DECOMP":
    test_features = get_features_top5_decomp(test_top5)
elif model_type == "ORACLE":
    test_features = get_features_oracle(test_top5)
elif model_type == "TEMPORAL" or model_type == "TOP5":
    test_features = get_features(test_top5)

In [19]:
from tqdm import tqdm

def encode(features):
  input_ids = []
  attention_masks = []
  for sent in tqdm(features):
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 256,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation=True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )

      # Add the encoded sentence to the list.
      input_ids.append(encoded_dict['input_ids'])

      # And its attention mask (simply differentiates padding from non-padding).
      attention_masks.append(encoded_dict['attention_mask'])
  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  return input_ids, attention_masks

test_input_ids, test_attention_masks = encode(test_features)

# Print sentence 0, now as a list of IDs.
print('Original: ', test_features[0])
print('Token IDs:', test_input_ids[0])

100%|██████████| 2495/2495 [00:07<00:00, 312.19it/s]

Original:  [Claim]:"The non-partisan Congressional Budget Office concluded ObamaCare will cost the U.S. more than 800,000 jobs."[Evidences]:this non-partisan congressional budget office report confirms that obamacare will cost america 2 million jobs ... budget analysts said in the most detailed ... ... earlier this year, the head of the congressional budget office testified before congress that implementation of obamacare would cost 800,000 jobs.  ... jun 14, 2011  ... congressional budget office testified before congress that implementation of obamacare would cost 800,000 jobs." bachmann made nearly the ... the director of the congressional budget office testified last year that obamacare will destroy 800,000 jobs and this summer the u.s. chamber of commerce ... 2014-02-24  the non-partisan congressional budget office recently reported that obamacare will shrink the economy by the equivalent of 2.5 million full-time ...
Token IDs: tensor([  784,   254,   521,   603,   908,    10,   12




In [20]:
# create DataLoaders
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler


test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels, test_taxonomy_labels)

BATCH_SIZE = 16
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset),
            batch_size = BATCH_SIZE
        )

In [23]:
# ========================================
#               Testing
# ========================================
from collections import Counter
import time
import datetime
from torch.optim import AdamW
from torch import nn
import numpy as np

loss_func = nn.CrossEntropyLoss()

def test(model, dataloader):
    print("")
    print("Running Test set...")

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    correct_counts = Counter()
    total_counts = Counter()
    
    results = {}

    # Evaluate data for one epoch
    for batch in tqdm(dataloader):

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)

        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        b_taxonomy_labels = batch[3]

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            logits = model(b_input_ids,b_input_mask)

        # Accumulate the validation loss.
        loss = loss_func(logits, b_labels)
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        
        # Populate the dictionary
        predictions = np.argmax(logits, axis=1)
        categories = LE_taxonomy.inverse_transform(b_taxonomy_labels)
        for pred, label, category in zip(predictions, label_ids, categories):   
            if category not in results:
                results[category] = {'predictions': [], 'labels': []}
            results[category]['predictions'].append(pred)
            results[category]['labels'].append(label)


    for category in results:
        for nested in results[category]:
            results[category][nested] = np.array(results[category][nested])

    return results

    

In [24]:
res = test(model, test_dataloader)


Running Test set...


100%|██████████| 156/156 [00:05<00:00, 28.06it/s]


In [None]:
import numpy as np


from sklearn.metrics import precision_score, recall_score, f1_score

# average = macro/weighted/micro
def compute_metrics(pred_flat, labels_flat):
    accuracy = np.sum(pred_flat.flatten() == labels_flat.flatten()) / len(labels_flat.flatten())
    
    p_m = precision_score(labels_flat, pred_flat, average='macro')
    p_w = precision_score(labels_flat, pred_flat, average='weighted')
    r_m = recall_score(labels_flat, pred_flat, average='macro')
    r_w = recall_score(labels_flat, pred_flat, average='weighted')

    
    f1_m = f1_score(labels_flat, pred_flat, average='macro')
    f1_w = f1_score(labels_flat, pred_flat, average='weighted')
    
    return accuracy, p_m, p_w, r_m, r_w, f1_m, f1_w

In [None]:
total_preds = np.array([])
total_labels = np.array([])

print(f"# {model_type}")

# per taxonomy label
for category in res.keys():
    pred = res[category]['predictions']
    labels = res[category]['labels']
    acc, p_m, p_w, r_m, r_w, f1_m, f1_w = compute_metrics(pred, labels)
    
    print("")
    print(f"""[{category}]:
    M-P : {p_m*100:.2f}
    W-P : {p_w*100:.2f}
    M-R : {r_m*100:.2f}
    W-R : {r_w*100:.2f}
    M-F1: {f1_m*100:.2f}
    W-F1: {f1_w*100:.2f}""")
    
    total_preds = np.concatenate((total_preds, pred))
    total_labels = np.concatenate((total_labels, labels))


scores = f1_score(total_labels, total_preds, average=None)
for i, score in enumerate(scores):
    l = LE.inverse_transform([i])[0]
    print("")
    print(f"""[{l}]:
    F1: {score*100:.2f}""")

acc, p_m, p_w, r_m, r_w, f1_m, f1_w = compute_metrics(total_preds, total_labels)
print(f"""

Total:
    M-P : {p_m*100:.2f}
    W-P : {p_w*100:.2f}
    M-R : {r_m*100:.2f}
    W-R : {r_w*100:.2f}
    M-F1: {f1_m*100:.2f}
    W-F1: {f1_w*100:.2f}""")