# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention it here.*

*If you plan to implement the program in an object-oriented style, please put those class definitions at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm")

In [2]:
import pandas as pd

# Load the evidence corpus and the labelled claim sets. With orient='index' each
# top-level JSON key becomes a row label of the resulting DataFrame.
evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

# Move the JSON keys out of the index into an ordinary column and give the
# columns descriptive names.
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
# Row labels of the evidence frame after reset_index (the default integer index).
evidence_idx = evidences.index.tolist()

# Lookup table: evidence id -> integer row index in `evidences`.
evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
train_claim_labels = train_claims['claim_label']
# Map each evidence_id to its corresponding integer row index for faster processing.
# Raises KeyError if a claim references an evidence id missing from the corpus.
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_claim_labels = dev_claims['claim_label']
dev_evidence_ids = dev_claims['evidences']
# Same id -> row-index mapping for the development claims.
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

In [None]:
test_claims = pd.read_json('/content/drive/MyDrive/nlp/data/test-claims-unlabelled.json', orient='index')
test_claims.reset_index(inplace=True)
test_claims.columns = ['claim_id', 'claim_text']
test_claims_text = test_claims['claim_text']
test_claims_id = test_claims['claim_id']

In [None]:
dev_claim_ids = dev_claims['claim_id']

In [3]:
import json

# Load the pre-computed, reranked evidence indices for the dev and test claims.
# Context managers make sure both file handles are closed once parsing finishes.
# presumably one list of evidence row indices per claim, in claim order — TODO confirm
with open("/content/drive/MyDrive/nlp/data/reranked_indices.json", "r", encoding="utf-8") as dev_file:
    dev_evidence_indices = json.load(dev_file)
with open("/content/drive/MyDrive/nlp/data/test_reranked_indices.json", "r", encoding="utf-8") as test_file:
    test_evidence_indices = json.load(test_file)

In [4]:
# Give every claim exactly five evidence slots: longer candidate lists are cut to
# their first five entries, shorter ones are right-filled with None placeholders.
dev_k_indices = [(candidates + [None] * 5)[:5] for candidates in dev_evidence_indices]
test_k_indices = [(candidates + [None] * 5)[:5] for candidates in test_evidence_indices]

In [5]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_data(text):
    """Lower-case ``text`` and split it into tokens with the module-level tokenizer ``tt``."""
    lowered = text.lower()
    return tt.tokenize(lowered)

train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_processed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
test_claims_text_processed = test_claims_text.apply(preprocess_data)

In [None]:
test_claims_text_processed

0      [‘, this, study, goes, beyond, statistical, co...
1      [a, recent, study, in, nature, geoscience, ,, ...
2      [‘, arctic, ice, conditions, have, been, track...
3      [“, the, global, reef, crisis, does, not, nece...
4      [a, second, coat, of, paint, has, much, less, ...
                             ...                        
148    [the, cement, ,, iron, and, steel, ,, and, pet...
149    [‘, we, could, be, decades, too, fast, ,, or, ...
150    [the, alaskan, tundra, is, warming, so, quickl...
151    [“, arctic, land, stores, about, twice, as, mu...
152    [“, warm, weather, worsened, the, most, recent...
Name: claim_text, Length: 153, dtype: object

In [6]:
from collections import Counter, OrderedDict
def build_vocab(texts, min_freq=3):
    """Build a token -> id mapping from tokenised texts.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<unk>``, ``<sos>``
    and ``<eos>``. Every token seen at least ``min_freq`` times across ``texts``
    then receives the next free id, in order of first appearance.

    Args:
        texts: Iterable of token sequences.
        min_freq: Minimum total count a token needs to enter the vocabulary.

    Returns:
        An ``OrderedDict`` mapping tokens to integer ids.
    """
    specials = ("<pad>", "<unk>", "<sos>", "<eos>")

    # Tally token occurrences over the whole collection.
    counts = Counter()
    for tokens in texts:
        counts.update(tokens)

    vocab = OrderedDict((token, position) for position, token in enumerate(specials))

    # Regular tokens start right after the reserved special-token ids.
    frequent = (token for token, count in counts.items() if count >= min_freq)
    for token_id, token in enumerate(frequent, start=len(specials)):
        vocab[token] = token_id

    return vocab

# Build vocabulary using only evidence texts and applying the frequency threshold
vocab = build_vocab(evidence_text_processed, min_freq=3)

In [None]:
# NOTE(review): `numericalize` is not defined in the visible part of this notebook, so
# this cell raises NameError unless it is defined elsewhere — confirm, or switch to the
# `text_to_indices(text, vocab)` helper this notebook defines in a later cell.
test_claims_numerical = test_claims_text_processed.apply(lambda x: numericalize(x, vocab))

In [7]:
# Integer class ids for the four claim labels.
label_map = {
    "REFUTES": 0,
    "SUPPORTS": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3
}
# Encode the string labels as integer ids. A label missing from `label_map`
# would become NaN under Series.map.
train_claim_labels = train_claims['claim_label'].map(label_map)
dev_claim_labels = dev_claims['claim_label'].map(label_map)

In [13]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import Dataset, DataLoader
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def text_to_indices(text, vocab):
    """Translate a token sequence into vocabulary ids.

    Tokens that are not keys of ``vocab`` are mapped to the id of ``"<unk>"``.
    """
    indices = []
    for token in text:
        indices.append(vocab.get(token, vocab["<unk>"]))
    return indices

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

class BinaryClaimEvidenceDataset(Dataset):
    """Claim/evidence pairs for binary SUPPORTS-vs-REFUTES classification.

    Each claim labelled 0 (REFUTES) or 1 (SUPPORTS) contributes one
    ``(claim, evidence, label)`` pair per evidence it references; claims with
    any other label are dropped.

    Args:
        claims: Sequence of tokenised claims.
        evidence_indices: Per-claim lists of keys into ``evidences``.
        evidences: Indexable collection of tokenised evidence passages.
        claim_labels: Per-claim integer labels, aligned with ``claims``.
        vocab: Token -> id mapping containing ``"<sos>"`` and ``"<eos>"``.
    """

    def __init__(self, claims, evidence_indices, evidences, claim_labels, vocab):
        self.claims = claims
        self.evidence_indices = evidence_indices
        self.evidences = evidences
        self.claim_labels = claim_labels
        self.vocab = vocab
        self.pairs = self.create_pairs()

    def create_pairs(self):
        """Return the list of ``(claim, evidence, label)`` training pairs."""
        pairs = []
        # Walk the three collections in lockstep by position. Indexing them with
        # the enumerate counter would be a label lookup on a pandas Series and
        # fails (or misaligns) when the Series index is not 0..n-1.
        for claim, evidence_ids, label in zip(self.claims, self.evidence_indices, self.claim_labels):
            if label not in (0, 1):  # Only consider REFUTES (0) and SUPPORTS (1)
                continue
            pairs.extend((claim, self.evidences[i], label) for i in evidence_ids)
        return pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(claim_ids, evidence_ids, label)`` with both id lists wrapped in <sos>/<eos>."""
        claim, evidence, label = self.pairs[idx]
        sos, eos = self.vocab["<sos>"], self.vocab["<eos>"]
        claim_indices = [sos] + text_to_indices(claim, self.vocab) + [eos]
        evidence_indices = [sos] + text_to_indices(evidence, self.vocab) + [eos]
        return claim_indices, evidence_indices, label

def custom_collate_fn(batch):
    """Collate ``(claim_ids, evidence_ids, label)`` samples into padded batch tensors.

    Claims and evidences are each right-padded to the longest sequence in the
    batch using the id of ``"<pad>"`` from the module-level ``vocab``. Labels
    become a float tensor. All three tensors are moved to the module-level
    ``device``.
    """
    claim_seqs, evidence_seqs, targets = zip(*batch)
    pad_id = vocab["<pad>"]

    def _pad_batch(sequences):
        as_tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        return pad_sequence(as_tensors, batch_first=True, padding_value=pad_id).to(device)

    claims_tensor = _pad_batch(claim_seqs)
    evidences_tensor = _pad_batch(evidence_seqs)
    labels_tensor = torch.tensor(targets, dtype=torch.float).to(device)
    return claims_tensor, evidences_tensor, labels_tensor


# Prepare the data
# Build the claim/evidence pair datasets and wrap them in DataLoaders.
# Training batches hold 16 pairs and are reshuffled each epoch.
train_dataset = BinaryClaimEvidenceDataset(train_claims_text_processed, train_evidence_idxs, evidence_text_processed, train_claim_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
# The development set is read in a fixed order with larger batches of 128.
dev_dataset = BinaryClaimEvidenceDataset(dev_claims_text_processed, dev_evidence_idxs, evidence_text_processed, dev_claim_labels, vocab)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)


(tensor([[    2,  1802,  1756, 15963,  7915,  9555,   112,   534, 14083,     7,
         14102,     7,    10, 52150,    31,     3,  3399,     7,  4881, 59233,
          6915, 12663,     7, 12754,     7,  1807,     7, 52150,     7, 11756,
            10,  2057,  5862, 14083,   279,  1392,    25,     1,  2060,  7387,
         12662,   126,  5401,  1544,    10,  8777, 12697,  1267,    19,   172,
            31,     3]], device='cuda:0'), tensor([1.], device='cuda:0'))


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 Bi-Directional GRU with Attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# class ClaimEvidenceModel(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
#         super(ClaimEvidenceModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
#         self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim * 2, output_dim)
#         self.attention = nn.Linear(hidden_dim * 2, 1, bias=False)
#         self.pad_idx = pad_idx

#     def forward(self, claims, evidence_lists):
#         # Embed and encode claims
#         embedded_claims = self.embedding(claims)
#         claims_mask = (claims != self.pad_idx).unsqueeze(2).float()
#         embedded_claims *= claims_mask

#         _, hidden_claims = self.gru(embedded_claims)
#         hidden_claims = torch.cat((hidden_claims[-2,:,:], hidden_claims[-1,:,:]), dim=1)

#         batch_size = claims.size(0)
#         max_logits = torch.full((batch_size, self.fc.out_features), float('-inf'), device=claims.device)

#         for i in range(batch_size):
#             evidences = evidence_lists[i]
#             embedded_evidences = self.embedding(evidences)
#             evidences_mask = (evidences != self.pad_idx).unsqueeze(2).float()
#             embedded_evidences *= evidences_mask

#             _, hidden_evidences = self.gru(embedded_evidences)
#             hidden_evidences = torch.cat((hidden_evidences[-2,:,:], hidden_evidences[-1,:,:]), dim=1)

#             logits = self.fc(hidden_evidences)
#             attention_weights = F.softmax(self.attention(hidden_evidences), dim=0)
#             evidence_representation = torch.sum(attention_weights * logits, dim=0, keepdim=True)

#             max_logits[i, :] = torch.max(logits, dim=0).values

#         return F.log_softmax(max_logits, dim=1)

class ClaimEvidenceModel(nn.Module):
    """Bi-directional GRU classifier over a (claim, evidence) pair.

    Claim and evidence are embedded and encoded by the same embedding table and
    the same GRU. The two sentence vectors are summed and projected to
    ``output_dim`` logits. ``self.attention`` is created but not used by
    ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = nn.Linear(hidden_dim * 2, 1, bias=False)
        self.pad_idx = pad_idx

    def _encode(self, token_ids):
        """Encode a batch of id sequences into one vector of size ``2 * hidden_dim`` each."""
        embedded = self.embedding(token_ids)
        # Zero the embeddings at padding positions.
        keep = (token_ids != self.pad_idx).unsqueeze(2).float()
        embedded = embedded * keep

        _, final_hidden = self.gru(embedded)
        # Concatenate the last two final-state slices (the two GRU directions).
        return torch.cat((final_hidden[-2, :, :], final_hidden[-1, :, :]), dim=1)

    def forward(self, claims, evidences):
        """Return logits of shape ``(batch, output_dim)`` for each claim/evidence pair."""
        claim_vec = self._encode(claims)
        evidence_vec = self._encode(evidences)
        return self.fc(claim_vec + evidence_vec)




# Model instantiation
model = ClaimEvidenceModel(vocab_size=len(vocab), embedding_dim=100, hidden_dim=256, output_dim=len(label_map), pad_idx=vocab['<pad>'])


## 2.2 Transformer with self-attention

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from torch import Tensor
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings, then apply dropout.

    Args:
        emb_size: Embedding width. Both even and odd sizes are supported.
        dropout: Dropout probability applied to the summed embeddings.
        maxlen: Number of positions precomputed in the encoding table.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so trim the cosine table to fit instead of failing on a shape mismatch.
        pos_embedding[:, 1::2] = torch.cos(angles)[:, : emb_size // 2]
        pos_embedding = pos_embedding.unsqueeze(0)

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: moves with the module but is not a trainable parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        """Return ``dropout(token_embedding + positions)``; the sequence axis is dim 1."""
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        """Look up ``tokens`` (cast to int64) and multiply the vectors by ``sqrt(emb_size)``."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale


class TransformerEncoderLayer(nn.Module):
    """Single post-norm Transformer encoder layer.

    Multi-head self-attention followed by a two-layer ReLU feed-forward network.
    Each sub-layer is wrapped as residual add -> dropout -> LayerNorm.
    """

    def __init__(self, emb_size, num_heads, dim_feedforward, dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(emb_size, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(emb_size, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, emb_size)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = F.relu

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None, src_key_padding_mask: torch.Tensor = None):
        """Encode ``src``; the masks are forwarded unchanged to the attention module."""
        # Self-attention sub-layer.
        attn_out, _ = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )
        src = self.norm1(src + self.dropout1(attn_out))

        # Position-wise feed-forward sub-layer.
        hidden = self.activation(self.linear1(src))
        ff_out = self.linear2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(ff_out))


class AttentionPooling(nn.Module):
    """Pool a sequence into one vector using learned, mask-aware attention weights."""

    def __init__(self, emb_size):
        super().__init__()
        self.attention = nn.Linear(emb_size, 1)

    def forward(self, x, mask):
        """Return the attention-weighted sum of ``x`` over the sequence axis.

        Args:
            x: ``[batch_size, seq_len, emb_size]`` token representations.
            mask: ``[batch_size, seq_len]``; positions equal to 0 are excluded.

        Returns:
            ``[batch_size, emb_size]`` pooled representations.
        """
        raw_scores = self.attention(x)  # [batch_size, seq_len, 1]
        # Excluded positions get -inf so softmax assigns them zero weight.
        excluded = mask.unsqueeze(-1) == 0
        weights = F.softmax(raw_scores.masked_fill(excluded, float('-inf')), dim=1)
        return (weights * x).sum(dim=1)  # [batch_size, emb_size]


class ClaimVerificationTransformer(nn.Module):
    """Transformer encoder pair producing one logit per (claim, evidence) pair.

    Claims and evidences share the token embedding, positional encoding and
    attention-pooling modules, but each passes through its own stack of
    ``num_layers`` encoder layers. The pooled vectors are summed and projected
    to a single logit for binary classification.
    """

    def __init__(self, num_layers: int, emb_size: int, nhead: int, vocab_size: int, dim_feedforward: int, dropout: float = 0.1, maxlen: int = 5000, pad_idx: int = 0):
        super().__init__()

        def _make_stack():
            return nn.ModuleList(
                [TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)]
            )

        self.embedding = TokenEmbedding(vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout, maxlen)
        self.transformer_claim = _make_stack()
        self.transformer_evidence = _make_stack()
        self.attention_pooling = AttentionPooling(emb_size)
        self.fc = nn.Linear(emb_size, 1)  # Single output for binary classification
        self.pad_idx = pad_idx

    def _encode(self, tokens: torch.Tensor, layers: nn.ModuleList):
        """Embed ``tokens``, run them through ``layers`` and attention-pool the result."""
        states = self.positional_encoding(self.embedding(tokens))
        not_pad = tokens != self.pad_idx
        for layer in layers:
            # The attention module expects True at positions to ignore, hence the inversion.
            states = layer(states, src_key_padding_mask=~not_pad)
        return self.attention_pooling(states, not_pad)

    def forward(self, claims: torch.Tensor, evidences: torch.Tensor):
        """Return logits (last dimension of size 1) for padded id batches of claims and evidences."""
        claims_encoded = self._encode(claims, self.transformer_claim)
        evidences_encoded = self._encode(evidences, self.transformer_evidence)

        # Combine claim and evidence representations by element-wise sum.
        return self.fc(claims_encoded + evidences_encoded)

# class ClaimVerificationTransformer(nn.Module):
#     def __init__(self, num_layers: int, emb_size: int, nhead: int, vocab_size: int, dim_feedforward: int, num_classes: int, dropout: float = 0.1, maxlen: int = 5000, pad_idx: int = 0):
#         super(ClaimVerificationTransformer, self).__init__()
#         self.embedding = TokenEmbedding(vocab_size, emb_size)
#         self.positional_encoding = PositionalEncoding(emb_size, dropout, maxlen)
#         self.transformer_claim = nn.ModuleList([
#             TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout)
#             for _ in range(num_layers)
#         ])
#         self.transformer_evidence = nn.ModuleList([
#             TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout)
#             for _ in range(num_layers)
#         ])
#         self.word_attention = nn.Linear(emb_size, 1, bias=False)
#         self.sentence_attention = nn.Linear(emb_size, 1, bias=False)
#         self.fc = nn.Linear(emb_size, num_classes)
#         self.pad_idx = pad_idx

#     def forward(self, claims: Tensor, evidence_lists: Tensor):
#         # Embed and encode claims
#         embedded_claims = self.embedding(claims)
#         embedded_claims = self.positional_encoding(embedded_claims)
#         claims_mask = (claims == self.pad_idx)

#         for layer in self.transformer_claim:
#             embedded_claims = layer(embedded_claims, src_key_padding_mask=claims_mask)
#         claims_encoded, _ = torch.max(embedded_claims, dim=1)  # Max pooling

#         batch_size = claims.size(0)
#         evidence_representations = []

#         for i in range(batch_size):
#             evidences = evidence_lists[i]
#             embedded_evidences = self.embedding(evidences)
#             embedded_evidences = self.positional_encoding(embedded_evidences)
#             evidences_mask = (evidences == self.pad_idx)

#             for layer in self.transformer_evidence:
#                 embedded_evidences = layer(embedded_evidences, src_key_padding_mask=evidences_mask)
#             evidences_encoded, _ = torch.max(embedded_evidences, dim=1)  # Max pooling

#             # Word-level attention mechanism
#             word_attention_weights = F.softmax(self.word_attention(evidences_encoded), dim=0)
#             evidence_representation = torch.sum(word_attention_weights * evidences_encoded, dim=0)
#             evidence_representations.append(evidence_representation)

#         evidence_representations = torch.stack(evidence_representations)

#         # Sentence-level attention mechanism
#         sentence_attention_weights = F.softmax(self.sentence_attention(evidence_representations), dim=0)
#         sentence_representation = torch.sum(sentence_attention_weights * evidence_representations, dim=0)

#         combined_representation = sentence_representation + claims_encoded
#         logits = self.fc(combined_representation)
#         return F.log_softmax(logits, dim=1)




In [40]:
# Number of training claims labelled REFUTES (id 0) and SUPPORTS (id 1).
num_refutes = (train_claim_labels == 0).sum()
num_supports = (train_claim_labels == 1).sum()
# Negative-to-positive class ratio, used further down as the `pos_weight` of BCEWithLogitsLoss.
# NOTE(review): counts are per claim, whereas training samples are claim/evidence pairs — confirm this ratio is the intended one.
pos_weight = num_refutes / (num_supports)

pos_weight

0.3834296724470135

In [42]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, classification_report

def train_model(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs=10, grad_clip=1.0, threshold=0.82):
    """Train a binary claim/evidence classifier and report dev metrics every epoch.

    Args:
        model: Module called as ``model(claims, evidences)`` that returns one
            logit per pair with a trailing dimension of size 1.
        train_loader: Iterable of ``(claims, evidences, labels)`` training batches.
        dev_loader: Iterable of ``(claims, evidences, labels)`` validation batches.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over ``model.parameters()``.
        device: Device the model, criterion and batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        grad_clip: Maximum gradient norm used for clipping.
        threshold: Sigmoid probability above which a pair is predicted as class 1.

    Returns:
        None. Losses and metrics are printed.
    """
    model = model.to(device)  # Ensure the model is on the right device
    criterion = criterion.to(device)  # Also move the criterion to the GPU if available

    # Reduce the learning rate by 10x once the validation loss stops improving.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for claims, evidences, labels in train_loader:
            claims = claims.to(device)
            evidences = evidences.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Drop only the trailing logit dimension. A bare squeeze() would also
            # remove the batch dimension of a size-1 batch and break the loss.
            logits = model(claims, evidences).squeeze(-1)
            loss = criterion(logits, labels)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss}')

        # Evaluate on the development set
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for claims, evidences, labels in dev_loader:
                claims = claims.to(device)
                evidences = evidences.to(device)
                labels = labels.to(device)

                # Keep the batch dimension so predictions stay 1-D and iterable.
                logits = model(claims, evidences).squeeze(-1)
                val_loss = criterion(logits, labels)
                total_val_loss += val_loss.item()

                probs = torch.sigmoid(logits)  # Convert logits to probabilities
                preds = torch.where(probs > threshold, 1, 0).cpu().numpy()  # Convert probabilities to 0 and 1

                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(dev_loader)

        # Calculate evaluation metrics
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS'], zero_division=0, output_dict=True)['macro avg']['f1-score']

        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Loss: {avg_val_loss}')
        print(f'Accuracy: {accuracy}')
        print(f'F1 Score: {f1}')
        print(classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS'], zero_division=0))

        # Step the learning rate scheduler on the validation loss
        scheduler.step(avg_val_loss)

# Training the model
# Instantiate the Transformer classifier, a class-weighted binary loss and the optimizer.
# NOTE(review): dropout=0.7 is very high, and the logged epochs show every dev pair
# predicted as REFUTES — confirm whether this value (and train_model's default threshold) is intended.
model = ClaimVerificationTransformer(num_layers=3, emb_size=512, nhead=8, vocab_size=len(vocab), dim_feedforward=512, dropout=0.7, pad_idx=vocab["<pad>"])
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Assuming `device` is defined (e.g., `device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`)
train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, num_epochs=15)







Epoch 1/15 - Training Loss: 0.3870027944841216
Epoch 1/15 - Validation Loss: 0.43992309272289276
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 2/15 - Training Loss: 0.3769318875485817
Epoch 2/15 - Validation Loss: 0.3574479669332504
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 3/15 - Training Loss: 0.37365244816889803
Epoch 3/15 - Validation Loss: 0.355329349637031

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [57]:
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

def evaluate_model(model, claims, evidence_idxs, evidences, claim_labels, vocab, pad_idx, device, support_threshold=0.5, refute_threshold=0.005):
    """Classify each claim by scoring it against its evidences and aggregating the votes.

    Every (claim, evidence) pair is scored by ``model``. The sigmoid probability
    is mapped to SUPPORTS (1) above ``support_threshold``, REFUTES (0) below
    ``refute_threshold`` and NOT ENOUGH INFO (2) otherwise. The per-evidence
    votes are combined by ``aggregate_predictions``.

    Args:
        model: Module called as ``model(claim_batch, evidence_batch)`` returning one logit.
        claims: Iterable of tokenised claims.
        evidence_idxs: Per-claim lists of keys into ``evidences``. ``None``
            entries (the placeholders used to pad top-k lists) are skipped.
        evidences: Indexable collection of tokenised evidence passages.
        claim_labels: Per-claim gold label ids.
        vocab: Token -> id mapping containing ``"<sos>"`` and ``"<eos>"``.
        pad_idx: Id used to pad evidence sequences.
        device: Device the input tensors are moved to.
        support_threshold: Probability above which an evidence votes SUPPORTS.
        refute_threshold: Probability below which an evidence votes REFUTES.

    Returns:
        ``(accuracy, report, all_preds, all_labels, all_evidence_predictions,
        all_evidence_probs)``.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_evidence_predictions = []
    all_evidence_probs = []

    sos, eos = vocab["<sos>"], vocab["<eos>"]

    for claim_tokens, evidence_idx_list, true_label in zip(claims, evidence_idxs, claim_labels):
        # Numericalize the claim tokens and add a batch dimension.
        claim_indices = [sos] + text_to_indices(claim_tokens, vocab) + [eos]
        claim_tensor = torch.tensor(claim_indices, dtype=torch.long).unsqueeze(0).to(device)

        # Numericalize the evidences, ignoring None placeholders.
        evidence_tensors = [
            torch.tensor([sos] + text_to_indices(evidences[idx], vocab) + [eos], dtype=torch.long)
            for idx in evidence_idx_list
            if idx is not None
        ]

        evidence_predictions = []
        evidence_probs = []
        # pad_sequence rejects an empty list. A claim without usable evidence keeps
        # empty vote lists, which aggregate to NOT ENOUGH INFO.
        if evidence_tensors:
            # Pad evidence tensors to the same length
            evidence_tensors_padded = pad_sequence(evidence_tensors, batch_first=True, padding_value=pad_idx).to(device)
            with torch.no_grad():
                for evidence_tensor in evidence_tensors_padded:
                    evidence_tensor = evidence_tensor.unsqueeze(0)  # Add batch dimension
                    logits = model(claim_tensor, evidence_tensor).squeeze()
                    prob = torch.sigmoid(logits).item()
                    evidence_probs.append(prob)
                    if prob > support_threshold:
                        evidence_predictions.append(1)  # SUPPORTS
                    elif prob < refute_threshold:
                        evidence_predictions.append(0)  # REFUTES
                    else:
                        evidence_predictions.append(2)  # NOT ENOUGH INFO

        aggregated_prediction = aggregate_predictions(evidence_predictions)
        all_preds.append(aggregated_prediction)
        all_labels.append(true_label)
        all_evidence_predictions.append(evidence_predictions)
        all_evidence_probs.append(evidence_probs)

    accuracy = accuracy_score(all_labels, all_preds)
    # Passing the label ids explicitly keeps the report working when one of the
    # four classes appears in neither the gold labels nor the predictions.
    report = classification_report(
        all_labels,
        all_preds,
        labels=[0, 1, 2, 3],
        target_names=['REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'DISPUTED'],
        zero_division=0,
    )
    return accuracy, report, all_preds, all_labels, all_evidence_predictions, all_evidence_probs

# def aggregate_predictions(evidence_predictions):
#     counter = Counter(evidence_predictions)
#     num_supports = counter[1]
#     num_refutes = counter[0]
#     num_not_enough_info = counter[2]

#     # Determine the majority class with a focus on conflict handling
#     if num_supports > 0 and num_refutes > 0:
#         if num_supports > num_refutes and num_supports > num_not_enough_info:
#             return 1  # SUPPORTS
#         elif num_refutes > num_supports and num_refutes > num_not_enough_info:
#             return 0  # REFUTES
#         else:
#             return 3  # DISPUTED if the majority is not clear
#     elif num_supports > num_refutes:
#         return 1  # SUPPORTS
#     elif num_refutes > num_supports:
#         return 0  # REFUTES
#     else:
#         return 2  # NOT ENOUGH INFO
def aggregate_predictions(evidence_predictions):
    """Combine per-evidence votes into a single claim label.

    Votes are 0 (REFUTES), 1 (SUPPORTS) or 2 (NOT ENOUGH INFO).

    Returns:
        3 (DISPUTED) when both SUPPORTS and REFUTES votes occur, 1 when only
        SUPPORTS occurs, 0 when only REFUTES occurs, and 2 otherwise.
    """
    tally = Counter(evidence_predictions)
    has_support = tally[1] > 0
    has_refute = tally[0] > 0

    if has_support and has_refute:
        return 3  # conflicting evidence -> DISPUTED
    if has_support:
        return 1  # SUPPORTS
    if has_refute:
        return 0  # REFUTES
    return 2  # neither SUPPORTS nor REFUTES seen -> NOT ENOUGH INFO


# Evaluate the model on the training set
accuracy, report, all_preds, all_labels, all_evidence_predictions,_ = evaluate_model(model, train_claims_text_processed, train_evidence_idxs, evidence_text_processed, train_claim_labels, vocab, vocab["<pad>"], device)
print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.4584690553745928
                 precision    recall  f1-score   support

        REFUTES       0.47      0.42      0.44       199
       SUPPORTS       0.52      0.74      0.61       519
NOT ENOUGH INFO       0.31      0.24      0.27       386
       DISPUTED       0.00      0.00      0.00       124

       accuracy                           0.46      1228
      macro avg       0.32      0.35      0.33      1228
   weighted avg       0.39      0.46      0.42      1228



In [56]:
all_evidence_predictions

[[2, 2, 2],
 [0, 0],
 [1, 1],
 [2, 2, 2, 2, 0],
 [0, 2, 2, 2, 2],
 [1, 1, 1, 1, 1],
 [1, 1],
 [1, 2, 2],
 [1, 1, 1, 1, 1],
 [1],
 [1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0],
 [1],
 [1],
 [0, 0, 0],
 [1, 1, 1, 1, 1],
 [1],
 [1, 2, 1, 1],
 [1, 1],
 [2, 2],
 [2, 2, 2, 2],
 [1, 1, 1],
 [2, 2, 2, 2],
 [2, 2],
 [1],
 [1, 1, 1, 1, 1],
 [2],
 [2],
 [2],
 [2, 2, 2, 2, 2],
 [1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1],
 [2, 2, 2, 2, 2],
 [0, 0],
 [1],
 [2, 2],
 [2, 2, 2],
 [2, 2, 2, 2, 2],
 [0, 0, 0],
 [1, 1, 1, 2, 2],
 [1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [0, 0, 0],
 [1, 1],
 [1, 1, 1, 1, 1],
 [0, 0],
 [2, 2],
 [1],
 [1, 1, 1, 1],
 [0],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1],
 [2, 2],
 [1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0],
 [1],
 [0],
 [1, 1],
 [0, 0, 0, 0, 0],
 [2, 2, 2, 2],
 [2, 2],
 [1, 1],
 [0, 2, 2, 2],
 [2, 0, 2, 2, 2],
 [1, 1, 1, 1],
 [2, 1, 1],
 [1, 1],
 [1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1],
 [1, 1, 1],
 [2],
 [2, 1, 2, 2, 2],
 [2, 2, 2, 2,

In [None]:
import torch
from sklearn.metrics import accuracy_score, classification_report

def train_model(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs=10, grad_clip=1.0):
    """Train ``model`` and report loss and metrics on the dev set after every epoch.

    Args:
        model: Module called as ``model(claims, evidences)``; its output is
            passed to ``criterion`` together with the batch labels.
        train_loader: Iterable yielding ``(claims, evidences, labels)`` batches,
            where ``evidences`` is an iterable of tensors.
        dev_loader: Same batch format as ``train_loader``; used for validation.
        criterion: Loss module; moved to ``device``.
        optimizer: Optimizer that updates the model parameters.
        device: Device the model, criterion and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        grad_clip: Maximum gradient norm applied before each optimizer step.

    Returns:
        None. Losses, accuracy and the classification report are printed.
    """
    model = model.to(device)
    criterion = criterion.to(device)

    # Shrink the learning rate by 10x when the validation loss plateaus.
    # The scheduler's `verbose` flag is deprecated in recent PyTorch releases,
    # so learning-rate changes are reported explicitly after each step below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for claims, evidences, labels in train_loader:
            claims = claims.to(device)
            evidences = [e.to(device) for e in evidences]
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(claims, evidences)
            loss = criterion(outputs, labels)
            loss.backward()

            # Clip the global gradient norm to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss}')

        # ---- validation pass ----
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for claims, evidences, labels in dev_loader:
                claims = claims.to(device)
                evidences = [e.to(device) for e in evidences]
                labels = labels.to(device)

                outputs = model(claims, evidences)
                total_val_loss += criterion(outputs, labels).item()

                # Highest-scoring class per example; converted to plain ints for sklearn.
                all_preds.extend(outputs.argmax(dim=1).tolist())
                all_labels.extend(labels.tolist())

        avg_val_loss = total_val_loss / len(dev_loader)
        accuracy = accuracy_score(all_labels, all_preds)

        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Loss: {avg_val_loss}')
        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Accuracy: {accuracy}')
        print(classification_report(all_labels, all_preds, zero_division=0))

        # Step the scheduler on the validation loss and report any reduction.
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(avg_val_loss)
        lrs_after = [group['lr'] for group in optimizer.param_groups]
        if lrs_after != lrs_before:
            print(f'Epoch {epoch + 1}/{num_epochs} - Learning rate changed: {lrs_before} -> {lrs_after}')



# Pick the device once; the weights, model and batches below all use it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-class sample counts ordered by class id, so that position i of the weight
# tensor is the weight of class i. Labels are translated through label_map when
# they are still label strings and left untouched when they are already ids.
# NOTE(review): assumes label_map maps label string -> contiguous ids 0..C-1 — confirm.
num_classes = len(label_map)
train_label_counts = (
    train_claim_labels
    .map(lambda label: label_map.get(label, label))
    .value_counts()
    .reindex(range(num_classes), fill_value=0)
)
train_label_counts_tensor = torch.tensor(train_label_counts.values, dtype=torch.float)

# Weights inversely proportional to class frequency. Counts are clamped to 1 so
# a class that is absent from the training labels cannot yield an infinite weight.
class_weights = 1.0 / train_label_counts_tensor.clamp(min=1.0)

# Normalize so that the smallest weight (most frequent class) is 1.0.
class_weights = class_weights / class_weights.min()
class_weights = class_weights.to(device)

# Model instantiation
model = ClaimVerificationTransformer(
    num_layers=2,
    emb_size=360,
    nhead=4,
    vocab_size=len(vocab),
    dim_feedforward=512,
    num_classes=num_classes,
    dropout=0.5,
    maxlen=5000,
    pad_idx=vocab['<pad>'],
)
optimizer = torch.optim.Adam(model.parameters())
# NOTE(review): NLLLoss expects log-probabilities as input — confirm the model
# ends in a log-softmax.
criterion = nn.NLLLoss(weight=class_weights)

# Load data. Training pairs each claim with train_evidence_idxs, while the dev
# set uses dev_k_indices.
train_dataset = ClaimEvidenceDataset(train_claims_numerical, train_claim_labels, train_evidence_idxs, evidence_numerical)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = ClaimEvidenceDataset(dev_claims_numerical, dev_claim_labels, dev_k_indices, evidence_numerical)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

# Training and evaluating the model
train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, num_epochs=20)





Epoch 1/20 - Training Loss: 2.172918736934662
Epoch 1/20 - Validation Loss: 1.4890036582946777
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.18      1.00      0.30        27
           2       0.00      0.00      0.00        68
           3       0.00      0.00      0.00        41

    accuracy                           0.18       154
   macro avg       0.04      0.25      0.08       154
weighted avg       0.03      0.18      0.05       154

Epoch 2/20 - Training Loss: 1.4130306601524354
Epoch 2/20 - Validation Loss: 1.6923393607139587
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.18      0.96      0.30        27
           2       0.44      0.06      0.10        68
           3       0.00      0.00      0.00        41

    accuracy                           0.19       154
   macro avg       0.16      0.26      0.10     

In [None]:
from sklearn.metrics import accuracy_score

# NOTE(review): this dataset feeds dev_evidence_idxs to the model, whereas the
# dev loader built for training uses dev_k_indices — confirm which is intended.
dev_dataset = ClaimEvidenceDataset(dev_claims_numerical, dev_claim_labels, dev_evidence_idxs, evidence_numerical)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    # The batch names deliberately avoid `evidences`: at notebook top level that
    # name is the evidence DataFrame loaded in the first cell, and reusing it as
    # a loop variable would overwrite it.
    for claim_batch, evidence_batch, label_batch in dev_loader:
        claim_batch = claim_batch.to(device)
        evidence_batch = [e.to(device) for e in evidence_batch]
        outputs = model(claim_batch, evidence_batch)
        # Highest-scoring class per claim, stored as plain Python ints.
        all_preds.extend(outputs.argmax(dim=1).tolist())
        all_labels.extend(label_batch.tolist())

# Calculate accuracy and report it (it was previously computed but never shown).
accuracy = accuracy_score(all_labels, all_preds)
print(f'Dev accuracy: {accuracy}')

In [None]:
# Inverse lookups: evidence row index -> evidence id, and class id -> label string.
# The comprehension variables avoid the builtin name `id`.
evidence_idx_to_id_dict = {idx: evidence_key for evidence_key, idx in evidence_id_dict.items()}
label_map_inverse = {class_id: label for label, class_id in label_map.items()}

# Decode the model output and the evidence indices into their string forms.
# NOTE(review): all_preds comes from the dev inference cell, which feeds
# dev_evidence_idxs to the model, while the evidences exported here come from
# dev_k_indices — confirm that pairing is intended.
dev_label_predictions = [label_map_inverse[pred] for pred in all_preds]
dev_converted_evidence_ids = [
    [evidence_idx_to_id_dict[idx] for idx in indices]
    for indices in dev_k_indices
]

# One entry per dev claim, keyed by claim id.
results = {
    claim_id: {
        "claim_text": dev_claims_text[i],
        "claim_label": dev_label_predictions[i],
        "evidences": dev_converted_evidence_ids[i],
    }
    for i, claim_id in enumerate(dev_claim_ids)
}


In [None]:
# Persist the dev-set predictions as JSON on Drive.
dev_predictions_path = '/content/drive/MyDrive/nlp/data/dev_predictions.json'
with open(dev_predictions_path, 'w') as out_handle:
    json.dump(results, out_handle)

In [None]:
# The test split has no gold labels, so None is passed in the label slot.
test_dataset = ClaimEvidenceDataset(test_claims_numerical, None, test_k_indices, evidence_numerical)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds = []
with torch.no_grad():
    # The batch names deliberately avoid `evidences`: at notebook top level that
    # name is the evidence DataFrame loaded in the first cell, and reusing it as
    # a loop variable would overwrite it. The third batch item is ignored.
    for claim_batch, evidence_batch, _ in test_loader:
        claim_batch = claim_batch.to(device)
        evidence_batch = [e.to(device) for e in evidence_batch]
        outputs = model(claim_batch, evidence_batch)
        # Highest-scoring class per claim, stored as plain Python ints.
        all_preds.extend(outputs.argmax(dim=1).tolist())

# `all_preds` now contains the predicted class ids for the test dataset

# `all_preds` now contains the predicted labels for your test dataset


In [None]:
# Flip label_map so that class ids look up their label strings.
label_map_inverse = dict(zip(label_map.values(), label_map.keys()))
# Decode each predicted class id into its label string.
label_predictions = [label_map_inverse[class_id] for class_id in all_preds]


In [None]:
# evidence_id_dict maps evidence id -> row index; invert it to map row index -> evidence id.
# The comprehension variable avoids the builtin name `id`.
evidence_idx_to_id_dict = {idx: evidence_key for evidence_key, idx in evidence_id_dict.items()}

# Translate the evidence indices selected for each test claim back into evidence ids.
converted_evidence_ids = [
    [evidence_idx_to_id_dict[idx] for idx in indices]
    for indices in test_k_indices
]

In [None]:
# Spot-check: evidence ids selected for the first test claim.
converted_evidence_ids[0]

['evidence-55562',
 'evidence-1032935',
 'evidence-60163',
 'evidence-225665',
 'evidence-377026']

In [None]:
# Scratch cell naming the four pieces that make up the test submission.
# Each line is a bare expression; by default a notebook echoes only the last one.
test_claims_id
test_claims_text
label_predictions
converted_evidence_ids


In [None]:
# Write the predicted test labels to Drive. The context manager closes the file
# handle, which the bare open() call previously left open.
with open("/content/drive/MyDrive/nlp/data/test_label_predictions.json", "w") as label_file:
    json.dump(label_predictions, label_file)

In [None]:
# Assemble the submission: one record per test claim, keyed by claim id.
results = {
    claim_id: {
        "claim_text": test_claims_text[position],
        "claim_label": label_predictions[position],
        "evidences": converted_evidence_ids[position],
    }
    for position, claim_id in enumerate(test_claims_id)
}

In [None]:
import json

# Pretty-print the predictions for a quick visual check.
json_output = json.dumps(results, indent=4)
print(json_output)

# Store the same predictions on Drive (compact form, no indentation).
test_predictions_path = '/content/drive/MyDrive/nlp/data/test_predictions.json'
with open(test_predictions_path, 'w') as out_handle:
    json.dump(results, out_handle)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*