# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
## import libraries

# Standard libraries
import math
import string
import pandas as pd
import numpy as np
import json

# Machine learning and deep learning libraries
import tensorflow
import keras
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, LabelField, Example, Dataset
from torchtext.data.utils import get_tokenizer

# Natural Language Processing (NLP) libraries
import nltk
import spacy
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt

In [3]:
## reading in json files

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## relative file paths

## baseline system - will not be used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## use this for model training
trainClaimsFile='./data/train-claims.json'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/dev-claims.json'
## evidence files need to be downloaded through https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing as it is to big to be uploaded to github
evidenceFile='./data/evidence.json'


tokenizer=get_tokenizer("basic_english")


In [4]:

# Define tokenizer
tokenizer=get_tokenizer("basic_english")
stopWords=set(stopwords.words('english'))
punctuation=string.punctuation

def tokenizeAndStopwordRemoval(text):
    tokens=tokenizer(text)
    filteredTokens=[word for word in tokens if (word not in stopWords) and (word not in punctuation)]
    return filteredTokens


TEXT=torchtext.data.Field(tokenize=tokenizeAndStopwordRemoval,
                          init_token='<sos>',
                          eos_token='<sos>',
                          lower=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)

# Load datasets
def load_json_data(json_file, text_field, label_field):
    with open(json_file, 'r') as file:
        data=json.load(file)
    examples=[]
    for i,claimText in data.items():
        text=claimText['claim_text']
        label=claimText['evidences']
        examples.append(Example.fromlist([text, label], fields=[('text', text_field), ('label', label_field)]))
    return examples

train_examples=load_json_data(trainClaimsFile, TEXT, LABEL)
dev_examples=load_json_data(devClaimsFile, TEXT, LABEL)

train_dataset = Dataset(train_examples, fields=[('text', TEXT), ('label', LABEL)])
dev_dataset = Dataset(dev_examples, fields=[('text', TEXT), ('label', LABEL)])

print(dev_dataset.fields)
for i, example in enumerate(dev_dataset.examples):
    if i < 5:
        print(vars(example))

# Build vocabulary
TEXT.build_vocab(train_dataset)
LABEL.build_vocab(train_dataset)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batchify function
def batchify(dataset, bsz, text_field):
    if not dataset.examples:
        print("No examples to process in batchify function.")
        return torch.tensor([])  # Return an empty tensor if there are no examples
    data = text_field.process([getattr(x, 'text') for x in dataset.examples])
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 10

# Process the datasets
train_data = batchify(train_dataset, batch_size, TEXT)
dev_data = batchify(dev_dataset, eval_batch_size, TEXT)

bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

{'text': <torchtext.data.field.Field object at 0x000001BFF57791C0>, 'label': <torchtext.data.field.LabelField object at 0x000001BFF59AD460>}
{'text': ['[south', 'australia]', 'expensive', 'electricity', 'world'], 'label': ['evidence-67732', 'evidence-572512']}
{'text': ['3', 'per', 'cent', 'total', 'annual', 'global', 'emissions', 'carbon', 'dioxide', 'humans', 'australia', 'prod\xaduces', '1', '3', 'per', 'cent', '3', 'per', 'cent', 'amount', 'emissions', 'reductio\xadn', 'effect', 'global', 'climate'], 'label': ['evidence-996421', 'evidence-1080858', 'evidence-208053', 'evidence-699212', 'evidence-832334']}
{'text': ['means', 'world', '1c', 'warmer', 'pre-industrial', 'times'], 'label': ['evidence-889933', 'evidence-694262']}
{'text': ['“as', 'happens', 'zika', 'may', 'also', 'good', 'model', 'second', 'worrying', 'effect', '—', 'disease', 'mutation'], 'label': ['evidence-422399', 'evidence-702226', 'evidence-286834', 'evidence-472751', 'evidence-641043']}
{'text': ['greenland', 'los

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [5]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        #triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [6]:
#triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
torch.triu(torch.ones(3, 3))

tensor([[1., 1., 1.],
        [0., 1., 1.],
        [0., 0., 1.]])

In [7]:
# Masking
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

  return mask

masking()

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [8]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)




# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [9]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, dev_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  1.86s | valid loss  5.46 | valid ppl   235.84
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  1.65s | valid loss  5.49 | valid ppl   242.87
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  1.64s | valid loss  5.67 | valid ppl   289.12
-----------------------------------------------------------------------------------------


In [12]:
test_loss = evaluate(best_model, dev_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

model.eval()
output=model(dev_data)
probabilities=torch.nn.functional.softmax(output,dim=-1)
_,predictedIndex=torch.max(probabilities,dim=-1)
for i in predictedIndex:
    print(predictedIndex[i])

print(LABEL.vocab.stoi)

| End of training | test loss  5.67 | test ppl   289.12
tensor([[  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [ 73,   2,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2]])
tensor([[  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [ 73,   2,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,   2,   2,   2],
        [  2, 354,   2,   2,   2,   2,   2,

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*