# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program in an object-oriented programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# prompt: mount drive

# Mount Google Drive at /content/drive; the data files read below live under
# /content/drive/MyDrive. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm")

In [None]:
import pandas as pd

# Load the evidence corpus and the labelled claim sets. With orient='index' every
# top-level JSON key becomes one row label.
evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

# Move the JSON keys out of the index into a proper id column.
evidences = evidences.reset_index()
evidences.columns = ['evidence_id', 'evidence_text']
train_claims = train_claims.reset_index().rename(columns={'index': 'claim_id'})
dev_claims = dev_claims.reset_index().rename(columns={'index': 'claim_id'})

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
evidence_idx = evidences.index.tolist()

# evidence id -> integer row index of that evidence in `evidences`
evidence_id_dict = dict(zip(evidence_id, evidence_idx))


def _ids_to_indices(ids):
    """Map a list of evidence ids to their corresponding row indices (faster to process than ids)."""
    return [evidence_id_dict[one_id] for one_id in ids]


train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
train_claim_labels = train_claims['claim_label']
train_evidence_idxs = train_evidence_ids.apply(_ids_to_indices)

dev_claims_text = dev_claims['claim_text']
dev_claim_labels = dev_claims['claim_label']
dev_evidence_ids = dev_claims['evidences']
dev_evidence_idxs = dev_evidence_ids.apply(_ids_to_indices)

In [None]:
# Unlabelled test claims: after moving the JSON keys into a column the frame has
# exactly two columns, the claim id and the claim text.
test_claims = pd.read_json(
    '/content/drive/MyDrive/nlp/data/test-claims-unlabelled.json', orient='index'
).reset_index()
test_claims.columns = ['claim_id', 'claim_text']
test_claims_id = test_claims['claim_id']
test_claims_text = test_claims['claim_text']

In [None]:
dev_claim_ids = dev_claims['claim_id']

In [None]:
import json

# Pre-computed reranked evidence indices for the dev and test claims; the cells below
# consume them as one list of evidence indices per claim.
# The context managers guarantee the file handles are closed after reading.
with open("/content/drive/MyDrive/nlp/data/reranked_indices.json", "r") as f:
    dev_evidence_indices = json.load(f)
with open("/content/drive/MyDrive/nlp/data/test_reranked_indices.json", "r") as f:
    test_evidence_indices = json.load(f)

In [None]:
def _top_k(indices_per_claim, k=5):
    """Return the first ``k`` indices of every per-claim list.

    Lists shorter than ``k`` are kept as they are. They are deliberately not padded
    with ``None``: a ``None`` entry is neither a valid evidence row index nor a key of
    the index -> evidence-id mapping, so it would fail in the cells that consume
    these lists.
    """
    return [list(indices[:k]) for indices in indices_per_claim]


dev_k_indices = _top_k(dev_evidence_indices)
test_k_indices = _top_k(test_evidence_indices)

In [None]:
def evaluate_evidence_retrieval(predicted_indices_list, actual_indices_list, k=5):
    """Score top-k evidence retrieval with averaged recall, precision and F-score.

    For every claim the first ``k`` predicted indices are compared with the gold
    indices. Recall, precision and F-score are computed per claim and then averaged
    over all claims.

    Args:
        predicted_indices_list: One ranked sequence of predicted evidence indices per
            claim. Entries may be plain ints or scalar objects exposing ``.item()``
            (for example zero-dimensional tensors).
        actual_indices_list: One collection of gold evidence indices per claim.
        k: Number of top predictions considered. Precision is always divided by ``k``,
            even when fewer than ``k`` predictions are available.

    Returns:
        dict with the keys ``average_recall``, ``average_precision`` and
        ``average_fscore``. All three are 0.0 when there are no claims.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(predicted_indices_list) == len(actual_indices_list), "Both inputs must have the same length."

    num_claims = len(predicted_indices_list)
    total_recall = 0.0
    total_precision = 0.0
    total_fscore = 0.0

    for predicted_indices, actual_indices in zip(predicted_indices_list, actual_indices_list):
        # Unwrap tensor-like scalars so they hash and compare like plain ints. Duck
        # typing on ``.item()`` keeps this function usable before torch is imported.
        predicted = [index.item() if hasattr(index, "item") else index for index in predicted_indices]

        top_k_predicted = set(predicted[:k])
        actual_indices_set = set(actual_indices)
        correct_predictions = len(top_k_predicted & actual_indices_set)

        # A claim without any hit contributes 0 to all three sums.
        if correct_predictions == 0:
            continue

        # With at least one hit both ratios are strictly positive, so the harmonic
        # mean below can never divide by zero.
        recall = correct_predictions / len(actual_indices_set)
        precision = correct_predictions / k
        total_recall += recall
        total_precision += precision
        total_fscore += 2 * (precision * recall) / (precision + recall)

    # Guard the averages against an empty evaluation set.
    divisor = max(num_claims, 1)
    return {
        "average_recall": total_recall / divisor,
        "average_precision": total_precision / divisor,
        "average_fscore": total_fscore / divisor
    }

results = evaluate_evidence_retrieval(dev_evidence_indices, dev_evidence_idxs)

In [None]:
print(results)

{'average_recall': 0.131060606060606, 'average_precision': 0.07142857142857141, 'average_fscore': 0.08493609565038136}


In [None]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_data(text):
    """Lower-case ``text`` and split it into tokens with the module-level tokenizer ``tt``."""
    lowered = text.lower()
    return tt.tokenize(lowered)

# Tokenise every claim and evidence text once up front.
# NOTE: "precessed" in dev_claims_text_precessed is a typo for "processed"; the name
# is left unchanged because the numericalisation cell further down reads it.
train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_precessed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
test_claims_text_processed = test_claims_text.apply(preprocess_data)

In [None]:
test_claims_text_processed

0      [‘, this, study, goes, beyond, statistical, co...
1      [a, recent, study, in, nature, geoscience, ,, ...
2      [‘, arctic, ice, conditions, have, been, track...
3      [“, the, global, reef, crisis, does, not, nece...
4      [a, second, coat, of, paint, has, much, less, ...
                             ...                        
148    [the, cement, ,, iron, and, steel, ,, and, pet...
149    [‘, we, could, be, decades, too, fast, ,, or, ...
150    [the, alaskan, tundra, is, warming, so, quickl...
151    [“, arctic, land, stores, about, twice, as, mu...
152    [“, warm, weather, worsened, the, most, recent...
Name: claim_text, Length: 153, dtype: object

In [None]:
from collections import Counter, OrderedDict
def build_vocab(texts, min_freq=3):
    """Build a token -> index vocabulary from tokenised texts.

    Args:
        texts: Iterable of token sequences.
        min_freq: Minimum number of occurrences a token needs to get its own index.

    Returns:
        OrderedDict that maps the special tokens ``<pad>``, ``<unk>``, ``<sos>`` and
        ``<eos>`` to 0-3, followed by every sufficiently frequent token in first-seen
        order. Indices are contiguous, so ``len(vocab)`` is a valid embedding size.
    """
    # Count all the words
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    # Indices 0-3 are reserved for the special tokens.
    vocab = OrderedDict(
        (token, position) for position, token in enumerate(("<pad>", "<unk>", "<sos>", "<eos>"))
    )

    for word, freq in word_freq.items():
        # A corpus token that happens to spell a special token must not take over the
        # reserved index: that would also leave a gap in the numbering and make
        # len(vocab) smaller than the largest index.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

# Build vocabulary using only evidence texts and applying the frequency threshold
vocab = build_vocab(evidence_text_processed, min_freq=3)

In [None]:
def numericalize(text, vocab):
    """Turn a token sequence into vocabulary ids wrapped in ``<sos>`` ... ``<eos>``.

    Tokens missing from ``vocab`` are replaced by the id of ``<unk>``.
    """
    ids = [vocab["<sos>"]]
    for word in text:
        ids.append(vocab.get(word, vocab["<unk>"]))
    ids.append(vocab["<eos>"])
    return ids

train_claims_numerical = train_claims_text_processed.apply(lambda x: numericalize(x, vocab))
dev_claims_numerical = dev_claims_text_precessed.apply(lambda x: numericalize(x, vocab))
evidence_numerical = evidence_text_processed.apply(lambda x: numericalize(x, vocab))


In [None]:
test_claims_numerical = test_claims_text_processed.apply(lambda x: numericalize(x, vocab))

In [None]:
# Integer class id for every claim label string.
label_map = {
    "DISPUTED": 0,
    "REFUTES": 1,
    "SUPPORTS": 2,
    "NOT_ENOUGH_INFO": 3
}
# Replace the string labels with their class ids; this overrides the raw label
# Series assigned to the same names when the claims were loaded.
train_claim_labels = train_claims['claim_label'].map(label_map)
dev_claim_labels = dev_claims['claim_label'].map(label_map)

In [None]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import Dataset, DataLoader
import torch

class ClaimEvidenceDataset(Dataset):
    """Dataset that pairs every claim with its evidence passages and, if known, its label."""

    def __init__(self, claims, claim_labels, evidence_indices, evidences):
        """
        claims: List of numericalized claims.
        claim_labels: List of labels for the claims, or None for unlabelled data.
        evidence_indices: List of lists, where each sublist contains indices of evidences associated with the claim.
        evidences: List of all numericalized evidences.
        """
        self.claims = claims
        self.claim_labels = claim_labels
        self.evidence_indices = evidence_indices
        self.evidences = evidences

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        """Return ``(claim, evidences, label)`` for sample ``idx``; the label is -1 when no labels were given."""
        claim = self.claims[idx]
        # None entries carry no evidence position and would fail the lookup, so they
        # are skipped instead of being passed to self.evidences.
        evidences = [self.evidences[i] for i in self.evidence_indices[idx] if i is not None]
        # -1 is a dummy label that keeps the sample shape uniform for unlabelled data.
        label = self.claim_labels[idx] if self.claim_labels is not None else -1
        return claim, evidences, label

def custom_collate_fn(batch):
    """Collate ``(claim, evidences, label)`` samples into padded tensors.

    Returns:
        A tuple ``(claims_padded, evidence_padded, labels_tensor)``: the claims padded
        to the longest claim of the batch, a list holding one padded evidence tensor
        per claim, and the labels as a long tensor. Padding uses ``vocab['<pad>']``.
    """
    claims, evidence_lists, labels = zip(*batch)

    # Claims share one tensor, padded to the longest claim in the batch.
    claim_tensors = [torch.tensor(claim) for claim in claims]
    claims_padded = pad_sequence(claim_tensors, batch_first=True, padding_value=vocab['<pad>'])

    # Evidence is padded separately for each claim, so every claim keeps its own tensor.
    evidence_padded = []
    for evidences in evidence_lists:
        evidence_tensors = [torch.tensor(evidence) for evidence in evidences]
        evidence_padded.append(
            pad_sequence(evidence_tensors, batch_first=True, padding_value=vocab['<pad>'])
        )

    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return claims_padded, evidence_padded, labels_tensor

train_dataset = ClaimEvidenceDataset(train_claims_numerical, train_claim_labels, train_evidence_idxs, evidence_numerical)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_fn)





# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# class ClaimEvidenceModel(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
#         super(ClaimEvidenceModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
#         self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim * 2, output_dim)
#         self.attention = nn.Linear(hidden_dim * 2, 1, bias=False)

#     def forward(self, claims, evidence_lists):
#         # Embed and encode claims
#         embedded_claims = self.embedding(claims)
#         _, hidden_claims = self.gru(embedded_claims)  # [num_layers * num_directions, batch_size, hidden_dim]
#         hidden_claims = torch.cat((hidden_claims[-2,:,:], hidden_claims[-1,:,:]), dim=1)

#         batch_size = claims.size(0)
#         # Initialize tensor to store max logits from each evidence set
#         max_logits = torch.full((batch_size, self.fc.out_features), float('-inf'), device=claims.device)

#         for i in range(batch_size):
#             evidences = evidence_lists[i]
#             embedded_evidences = self.embedding(evidences)
#             _, hidden_evidences = self.gru(embedded_evidences)
#             hidden_evidences = torch.cat((hidden_evidences[-2,:,:], hidden_evidences[-1,:,:]), dim=1)

#             # Calculate logits for each evidence
#             logits = self.fc(hidden_evidences)  # Output shape: [num_evidences, output_dim]

#             # Apply softmax to get probabilities for attention (alternative approach)
#             attention_weights = F.softmax(self.attention(hidden_evidences), dim=0)
#             evidence_representation = torch.sum(attention_weights * logits, dim=0, keepdim=True)

#             # Use max pooling across the logits
#             max_logits[i, :] = torch.max(logits, dim=0).values

#         return F.log_softmax(max_logits, dim=1)

class ClaimEvidenceModel(nn.Module):
    """Bidirectional-GRU classifier that scores a claim from its evidence passages.

    Every evidence passage is embedded, encoded by the GRU and projected to class
    logits; the logits are max-pooled over the passages of a claim and turned into
    log-probabilities.

    NOTE(review): the prediction depends on the evidence only. ``claims`` supplies the
    batch size and the device, and ``self.attention`` does not contribute to the
    output. Earlier revisions of ``forward`` additionally computed a claim encoding
    and an attention-weighted evidence summary and then discarded both; that dead
    work is removed here. The ``attention`` layer is kept so the parameter set of the
    module stays the same.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super(ClaimEvidenceModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = nn.Linear(hidden_dim * 2, 1, bias=False)
        self.pad_idx = pad_idx

    def _max_evidence_logits(self, evidences):
        """Return the class logits max-pooled over all evidence passages of one claim."""
        embedded = self.embedding(evidences)
        # Zero the embeddings at padded positions.
        mask = (evidences != self.pad_idx).unsqueeze(2).float()
        embedded = embedded * mask

        _, hidden = self.gru(embedded)
        # Concatenate the final forward and backward hidden states.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        logits = self.fc(hidden)
        return torch.max(logits, dim=0).values

    def forward(self, claims, evidence_lists):
        """Return log-probabilities with one row per claim.

        Args:
            claims: Tensor whose first dimension is the batch size.
            evidence_lists: Sequence with one padded evidence id tensor per claim.
        """
        batch_size = claims.size(0)
        max_logits = torch.full((batch_size, self.fc.out_features), float('-inf'), device=claims.device)

        for i in range(batch_size):
            max_logits[i, :] = self._max_evidence_logits(evidence_lists[i])

        return F.log_softmax(max_logits, dim=1)






# Model instantiation
model = ClaimEvidenceModel(vocab_size=len(vocab), embedding_dim=100, hidden_dim=256, output_dim=len(label_map), pad_idx=vocab['<pad>'])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from torch import Tensor
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings, then apply dropout.

    The encoding table is registered as a ``(maxlen, 1, emb_size)`` buffer and sliced
    by the size of the input's first dimension, i.e. the first dimension of the input
    is treated as the sequence axis.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super().__init__()
        # One frequency per sin/cos pair of embedding channels.
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(maxlen).unsqueeze(1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)  # even channels
        table[:, 1::2] = torch.cos(angles)  # odd channels

        self.dropout = nn.Dropout(dropout)
        # A buffer moves with the module across devices but is not a trainable parameter.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        length = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:length, :])


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        scale = math.sqrt(self.emb_size)
        # The embedding layer needs integer ids, so cast before the lookup.
        vectors = self.embedding(tokens.long())
        return vectors * scale

class TransformerEncoderLayer(nn.Module):
    """Post-norm transformer encoder layer: self-attention and a feed-forward block,
    each followed by a residual connection and layer normalisation."""

    def __init__(self, emb_size, num_heads, dim_feedforward, dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(emb_size, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(emb_size, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, emb_size)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = F.relu

    def forward(self, src: Tensor, src_mask: Tensor = None, src_key_padding_mask: Tensor = None):
        # Self-attention sub-layer with residual connection.
        attended, _ = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + self.dropout1(attended))

        # Position-wise feed-forward sub-layer with residual connection.
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(projected))



class ClaimVerificationTransformer(nn.Module):
    """Transformer classifier over a claim and its evidence passages.

    The claim and every evidence passage are embedded with a shared token embedding
    and positional encoding and encoded by two separate transformer encoders. The
    claim is max-pooled over its tokens, each evidence passage is mean-pooled over its
    tokens, and the evidence vectors of a claim are merged with learned attention
    weights. The claim vector and the merged evidence vector are summed and projected
    to class log-probabilities.

    Padded positions are excluded from both pooling steps, so the pooled vectors do
    not depend on how much padding a batch happens to contain.
    """

    def __init__(self, num_layers: int, emb_size: int, nhead: int, vocab_size: int, dim_feedforward: int, num_classes: int, dropout: float = 0.1, maxlen: int = 5000, pad_idx: int = 0):
        super(ClaimVerificationTransformer, self).__init__()
        self.embedding = TokenEmbedding(vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout, maxlen)
        # nn.TransformerEncoder deep-copies the layer it receives, so passing the same
        # layer object twice still yields two independently trained encoders.
        encoder_layers = nn.TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout, batch_first=True)
        self.transformer_claim = nn.TransformerEncoder(encoder_layers, num_layers)
        self.transformer_evidence = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(emb_size, num_classes)
        self.attention = nn.Linear(emb_size, 1)
        self.pad_idx = pad_idx

    def _embed(self, tokens: Tensor) -> Tensor:
        """Embed batch-first token ids and add position information along the sequence axis."""
        embedded = self.embedding(tokens)
        # PositionalEncoding selects positions by dim 0, while the tensors here are
        # (batch, seq, emb). Route through the sequence-first layout so that every
        # token receives the encoding of its own position instead of its batch index.
        return self.positional_encoding(embedded.transpose(0, 1)).transpose(0, 1)

    def forward(self, claims: Tensor, evidence_lists):
        """Return log-probabilities of shape ``(batch, num_classes)``.

        Args:
            claims: ``(batch, claim_len)`` token ids padded with ``pad_idx``.
            evidence_lists: Sequence with one ``(num_evidences, evidence_len)`` tensor
                of padded token ids per claim.
        """
        # Embed and encode claims
        claims_mask = (claims == self.pad_idx)
        claims_encoded = self.transformer_claim(self._embed(claims), src_key_padding_mask=claims_mask)
        # Max pooling over real tokens only: padded positions are pushed to -inf.
        claims_encoded = claims_encoded.masked_fill(claims_mask.unsqueeze(-1), float('-inf'))
        claims_encoded, _ = torch.max(claims_encoded, dim=1)

        batch_size = claims.size(0)
        evidence_representations = []

        for i in range(batch_size):
            evidences = evidence_lists[i]
            evidences_mask = (evidences == self.pad_idx)
            evidences_encoded = self.transformer_evidence(self._embed(evidences), src_key_padding_mask=evidences_mask)

            # Average pooling over real tokens only.
            keep = (~evidences_mask).unsqueeze(-1).to(evidences_encoded.dtype)
            evidences_encoded = (evidences_encoded * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)

            # Attention mechanism: softmax over the evidence passages of this claim.
            attention_weights = F.softmax(self.attention(evidences_encoded), dim=0)
            evidence_representation = torch.sum(attention_weights * evidences_encoded, dim=0)
            evidence_representations.append(evidence_representation)

        evidence_representations = torch.stack(evidence_representations)
        combined_representation = evidence_representations + claims_encoded
        logits = self.fc(combined_representation)
        return F.log_softmax(logits, dim=1)

# class ClaimVerificationTransformer(nn.Module):
#     def __init__(self, num_layers: int, emb_size: int, nhead: int, vocab_size: int, dim_feedforward: int, num_classes: int, dropout: float = 0.1, maxlen: int = 5000, pad_idx: int = 0):
#         super(ClaimVerificationTransformer, self).__init__()
#         self.embedding = TokenEmbedding(vocab_size, emb_size)
#         self.positional_encoding = PositionalEncoding(emb_size, dropout, maxlen)
#         self.transformer_claim = nn.ModuleList([
#             TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout)
#             for _ in range(num_layers)
#         ])
#         self.transformer_evidence = nn.ModuleList([
#             TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout)
#             for _ in range(num_layers)
#         ])
#         self.word_attention = nn.Linear(emb_size, 1, bias=False)
#         self.sentence_attention = nn.Linear(emb_size, 1, bias=False)
#         self.fc = nn.Linear(emb_size, num_classes)
#         self.pad_idx = pad_idx

#     def forward(self, claims: Tensor, evidence_lists: Tensor):
#         # Embed and encode claims
#         embedded_claims = self.embedding(claims)
#         embedded_claims = self.positional_encoding(embedded_claims)
#         claims_mask = (claims == self.pad_idx)

#         for layer in self.transformer_claim:
#             embedded_claims = layer(embedded_claims, src_key_padding_mask=claims_mask)
#         claims_encoded, _ = torch.max(embedded_claims, dim=1)  # Max pooling

#         batch_size = claims.size(0)
#         evidence_representations = []

#         for i in range(batch_size):
#             evidences = evidence_lists[i]
#             embedded_evidences = self.embedding(evidences)
#             embedded_evidences = self.positional_encoding(embedded_evidences)
#             evidences_mask = (evidences == self.pad_idx)

#             for layer in self.transformer_evidence:
#                 embedded_evidences = layer(embedded_evidences, src_key_padding_mask=evidences_mask)
#             evidences_encoded, _ = torch.max(embedded_evidences, dim=1)  # Max pooling

#             # Word-level attention mechanism
#             word_attention_weights = F.softmax(self.word_attention(evidences_encoded), dim=0)
#             evidence_representation = torch.sum(word_attention_weights * evidences_encoded, dim=0)
#             evidence_representations.append(evidence_representation)

#         evidence_representations = torch.stack(evidence_representations)

#         # Sentence-level attention mechanism
#         sentence_attention_weights = F.softmax(self.sentence_attention(evidence_representations), dim=0)
#         sentence_representation = torch.sum(sentence_attention_weights * evidence_representations, dim=0)

#         combined_representation = sentence_representation + claims_encoded
#         logits = self.fc(combined_representation)
#         return F.log_softmax(logits, dim=1)




# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import torch
from sklearn.metrics import accuracy_score, classification_report

def _run_training_epoch(model, train_loader, criterion, optimizer, device, grad_clip):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss."""
    model.train()
    total_loss = 0
    for claims, evidences, labels in train_loader:
        claims = claims.to(device)
        evidences = [e.to(device) for e in evidences]  # one evidence tensor per claim
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(claims, evidences), labels)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)


def _run_validation_epoch(model, dev_loader, criterion, device):
    """Evaluate on ``dev_loader``; return ``(mean batch loss, predictions, labels)``."""
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for claims, evidences, labels in dev_loader:
            claims = claims.to(device)
            evidences = [e.to(device) for e in evidences]
            labels = labels.to(device)

            outputs = model(claims, evidences)
            total_val_loss += criterion(outputs, labels).item()

            _, predicted = torch.max(outputs, 1)
            # Move predictions back to CPU for scoring
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_val_loss / len(dev_loader), all_preds, all_labels


def train_model(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs=10, grad_clip=1.0):
    """Train ``model`` and print the loss and a dev-set classification report every epoch.

    Args:
        model: Module called as ``model(claims, evidences)``.
        train_loader: Loader yielding ``(claims, evidences, labels)`` training batches.
        dev_loader: Loader yielding ``(claims, evidences, labels)`` validation batches.
        criterion: Loss applied to the model output and the labels.
        optimizer: Optimizer over the model parameters.
        device: Device the model, the loss and all batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        grad_clip: Maximum gradient norm used for clipping.

    The learning rate is reduced by a factor of 10 when the validation loss has not
    improved for 5 epochs. The function returns nothing; progress is printed.
    """
    model = model.to(device)  # Ensure the model is on the right device
    criterion = criterion.to(device)  # Also move the criterion to the GPU if available

    # Learning rate scheduler
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
    # confirm against the installed version before upgrading.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    for epoch in range(num_epochs):
        avg_train_loss = _run_training_epoch(model, train_loader, criterion, optimizer, device, grad_clip)
        print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss}')

        # Evaluate on the development set
        avg_val_loss, all_preds, all_labels = _run_validation_epoch(model, dev_loader, criterion, device)
        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Loss: {avg_val_loss}')
        # The report already contains the accuracy, so it is not computed separately.
        print(classification_report(all_labels, all_preds, zero_division=0))

        # Step the learning rate scheduler
        scheduler.step(avg_val_loss)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class weights for the loss: inverse class frequency, rescaled so that the smallest
# weight is 1.0. sort_index() orders the counts by label id so that position i of the
# tensor belongs to class i (this relies on every label id occurring in the training labels).
train_label_counts = train_claim_labels.value_counts()
train_label_counts_tensor = torch.tensor(train_label_counts.sort_index().values, dtype=torch.float)
inverse_frequency = 1.0 / train_label_counts_tensor
class_weights = (inverse_frequency / inverse_frequency.min()).to(device)

# Model instantiation
#model = ClaimEvidenceModel(vocab_size=len(vocab), embedding_dim=100, hidden_dim=128, output_dim=len(label_map), pad_idx=vocab['<pad>'])
model = ClaimVerificationTransformer(
    num_layers=2,
    emb_size=360,
    nhead=4,
    vocab_size=len(vocab),
    dim_feedforward=512,
    num_classes=len(label_map),
    dropout=0.5,
    maxlen=5000,
    pad_idx=vocab['<pad>']
)
optimizer = torch.optim.Adam(model.parameters())
# The model returns log-probabilities, hence NLLLoss rather than CrossEntropyLoss.
# criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=vocab['<pad>'])
criterion = nn.NLLLoss(weight=class_weights)

# Load data: training uses the gold evidence of every claim, validation uses the
# top-k retrieved evidence.
train_dataset = ClaimEvidenceDataset(train_claims_numerical, train_claim_labels, train_evidence_idxs, evidence_numerical)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = ClaimEvidenceDataset(dev_claims_numerical, dev_claim_labels, dev_k_indices, evidence_numerical)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

# Training and evaluating the model
train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, num_epochs=20)





Epoch 1/20 - Training Loss: 2.172918736934662
Epoch 1/20 - Validation Loss: 1.4890036582946777
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.18      1.00      0.30        27
           2       0.00      0.00      0.00        68
           3       0.00      0.00      0.00        41

    accuracy                           0.18       154
   macro avg       0.04      0.25      0.08       154
weighted avg       0.03      0.18      0.05       154

Epoch 2/20 - Training Loss: 1.4130306601524354
Epoch 2/20 - Validation Loss: 1.6923393607139587
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.18      0.96      0.30        27
           2       0.44      0.06      0.10        68
           3       0.00      0.00      0.00        41

    accuracy                           0.19       154
   macro avg       0.16      0.26      0.10     

In [None]:
from sklearn.metrics import accuracy_score
# Score the trained model on the dev claims.
# NOTE(review): this loader feeds the gold evidence (dev_evidence_idxs), not the
# retrieved top-k evidence (dev_k_indices) — confirm that this is intended for the
# predictions that are later built from all_preds.
dev_dataset = ClaimEvidenceDataset(dev_claims_numerical, dev_claim_labels, dev_evidence_idxs, evidence_numerical)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for claims, evidences, labels in dev_loader:
        claims, labels = claims.to(device), labels.to(device)
        evidences = [e.to(device) for e in evidences]  # one evidence tensor per claim
        outputs = model(claims, evidences)
        predicted = torch.max(outputs, 1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)

In [None]:
# Reverse lookups: evidence row index -> evidence id, and class id -> label string.
# (The loop variable is not called `id`, which would shadow the builtin.)
evidence_idx_to_id_dict = {idx: evidence_key for evidence_key, idx in evidence_id_dict.items()}
label_map_inverse = {v: k for k, v in label_map.items()}

dev_label_predictions = [label_map_inverse[pred] for pred in all_preds]
# None entries have no evidence id and would raise KeyError, so they are dropped.
dev_converted_evidence_ids = [
    [evidence_idx_to_id_dict[idx] for idx in indices if idx is not None]
    for indices in dev_k_indices
]

# claim id -> claim text, predicted label and retrieved evidence ids
results = {}
for i, claim_id in enumerate(dev_claim_ids):
    results[claim_id] = {
        "claim_text": dev_claims_text[i],
        "claim_label": dev_label_predictions[i],
        "evidences": dev_converted_evidence_ids[i]
    }


In [None]:
# Persist the dev-set predictions held in `results` as JSON on Drive.
with open('/content/drive/MyDrive/nlp/data/dev_predictions.json', 'w') as file:
    json.dump(results, file)

In [None]:
# Predict labels for the unlabelled test claims from their top-k retrieved evidence.
# Passing None as the labels makes the dataset return the dummy label -1.
test_dataset = ClaimEvidenceDataset(test_claims_numerical, None, test_k_indices, evidence_numerical)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds = []
with torch.no_grad():
    for claims, evidences, _ in test_loader:  # the dummy labels are ignored
        claims = claims.to(device)
        evidences = [e.to(device) for e in evidences]  # one evidence tensor per claim
        outputs = model(claims, evidences)
        predicted = torch.max(outputs, 1)[1]
        all_preds.extend(predicted.cpu().numpy())

# `all_preds` now contains the predicted labels for your test dataset

# `all_preds` now contains the predicted labels for your test dataset


In [None]:
# Inverse mapping for label_map
label_map_inverse = {v: k for k, v in label_map.items()}
# Convert numerical predictions to label strings
label_predictions = [label_map_inverse[pred] for pred in all_preds]


In [None]:
# evidence_id_dict maps evidence ids to row indices; invert it to translate indices
# back to ids. (The loop variable is not called `id`, which would shadow the builtin.)
evidence_idx_to_id_dict = {idx: evidence_key for evidence_key, idx in evidence_id_dict.items()}

# Translate the top-k indices of every test claim into evidence ids. None entries
# have no evidence id and would raise KeyError, so they are dropped.
converted_evidence_ids = [
    [evidence_idx_to_id_dict[idx] for idx in indices if idx is not None]
    for indices in test_k_indices
]

In [None]:
converted_evidence_ids[0]

['evidence-55562',
 'evidence-1032935',
 'evidence-60163',
 'evidence-225665',
 'evidence-377026']

In [None]:
test_claims_id
test_claims_text
label_predictions
converted_evidence_ids


In [None]:
# Write through a context manager so the file is flushed and closed as soon as the
# dump is complete instead of whenever the handle happens to be garbage collected.
with open("/content/drive/MyDrive/nlp/data/test_label_predictions.json", "w") as file:
    json.dump(label_predictions, file)

In [None]:
# Assemble the test output: claim id -> claim text, predicted label and retrieved evidence ids.
results = {
    claim_id: {
        "claim_text": test_claims_text[i],
        "claim_label": label_predictions[i],
        "evidences": converted_evidence_ids[i],
    }
    for i, claim_id in enumerate(test_claims_id)
}

In [None]:
import json

# Convert to JSON string
# The indented string is only printed for inspection in the notebook output.
json_output = json.dumps(results, indent=4)
print(json_output)

# Save to a JSON file
# The file is written without indentation, i.e. in json.dump's default compact layout.
with open('/content/drive/MyDrive/nlp/data/test_predictions.json', 'w') as file:
    json.dump(results, file)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*