# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [4]:
## import libraries

!pip3 install torchtext==0.4.0

# Standard libraries
import math
import string
import pandas as pd
import numpy as np
import json

# Machine learning and deep learning libraries
import tensorflow
import keras
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, LabelField, Example, Dataset
from torchtext.data.utils import get_tokenizer

# Natural Language Processing (NLP) libraries
import nltk
import spacy
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt



In [5]:
## reading in json files

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## relative file paths

## baseline system - will not be used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## use this for model training
trainClaimsFile='./data/train-claims.json'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/dev-claims.json'
## evidence files need to be downloaded through https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing as it is to big to be uploaded to github
evidenceFile='./data/evidence.json'

testFile='./data/test-claims-unlabelled.json'

tokenizer=get_tokenizer("basic_english")


In [6]:

# Define tokenizer
tokenizer=get_tokenizer("basic_english")
stopWords=set(stopwords.words('english'))
punctuation=string.punctuation

def tokenizeAndStopwordRemoval(text):
    tokens=tokenizer(text)
    filteredTokens=[word for word in tokens if (word not in stopWords) and (word not in punctuation)]
    return filteredTokens


TEXT=torchtext.data.Field(tokenize=tokenizeAndStopwordRemoval,
                          init_token='<sos>',
                          eos_token='<sos>',
                          lower=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)

# Load datasets
def load_json_data(json_file, text_field, label_field):
    with open(json_file, 'r') as file:
        data=json.load(file)
    examples=[]
    for i,claimText in data.items():
        text=claimText['claim_text']
        label=claimText['evidences']
        examples.append(Example.fromlist([text, label], fields=[('text', text_field), ('label', label_field)]))
    return examples

def load_json_data_test(json_file, text_field):
    with open(json_file, 'r') as file:
        data=json.load(file)
    examples=[]
    for i,claimText in data.items():
        text=claimText['claim_text']
        examples.append(Example.fromlist([text], fields=[('text', text_field)]))
    return examples

train_examples=load_json_data(trainClaimsFile, TEXT, LABEL)
dev_examples=load_json_data(devClaimsFile, TEXT, LABEL)
test_examples=load_json_data_test(testFile,TEXT)

train_dataset = Dataset(train_examples, fields=[('text', TEXT), ('label', LABEL)])
dev_dataset = Dataset(dev_examples, fields=[('text', TEXT), ('label', LABEL)])
test_dataset=Dataset(dev_examples, fields=[('text', TEXT)])

for i, example in enumerate(dev_dataset.examples):
    if i < 5:
        print(vars(example))

# Build vocabulary
TEXT.build_vocab(train_dataset)
LABEL.build_vocab(train_dataset)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batchify function
def batchify(dataset, bsz):
    data = TEXT.process([getattr(x, 'text') for x in dataset.examples])
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

TEST = Field(tokenize=tokenizeAndStopwordRemoval, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=True)
TEST.build_vocab(train_dataset)

def processTextOnly(dataset):
    data = TEST.process([getattr(x, 'text') for x in dataset.examples])
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 10

# Process the datasets

train_data = batchify(train_dataset, batch_size)
dev_data = batchify(dev_dataset, eval_batch_size)
test_data=processTextOnly(test_dataset)
dev_accuracy_data=processTextOnly(dev_dataset)

bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

{'text': ['[south', 'australia]', 'expensive', 'electricity', 'world'], 'label': ['evidence-67732', 'evidence-572512']}
{'text': ['3', 'per', 'cent', 'total', 'annual', 'global', 'emissions', 'carbon', 'dioxide', 'humans', 'australia', 'prod\xaduces', '1', '3', 'per', 'cent', '3', 'per', 'cent', 'amount', 'emissions', 'reductio\xadn', 'effect', 'global', 'climate'], 'label': ['evidence-996421', 'evidence-1080858', 'evidence-208053', 'evidence-699212', 'evidence-832334']}
{'text': ['means', 'world', '1c', 'warmer', 'pre-industrial', 'times'], 'label': ['evidence-889933', 'evidence-694262']}
{'text': ['“as', 'happens', 'zika', 'may', 'also', 'good', 'model', 'second', 'worrying', 'effect', '—', 'disease', 'mutation'], 'label': ['evidence-422399', 'evidence-702226', 'evidence-286834', 'evidence-472751', 'evidence-641043']}
{'text': ['greenland', 'lost', 'tiny', 'fraction', 'ice', 'mass'], 'label': ['evidence-52981', 'evidence-264761', 'evidence-947243', 'evidence-424102']}


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [7]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        #triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [8]:
#triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
torch.triu(torch.ones(3, 3))

tensor([[1., 1., 1.],
        [0., 1., 1.],
        [0., 0., 1.]])

In [9]:
# Masking
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

  return mask

masking()

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [10]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)




# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [13]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    total_correct = 0
    total_samples = 0
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            
            #Calculate accuracy
            _, predicted = torch.max(output_flat, 1)
            total_correct += (predicted == targets).sum().item()
            total_samples += targets.size(0)

    loss = total_loss / (len(data_source) - 1)
    accuracy = total_correct / total_samples
    return loss#, accuracy

best_val_loss = float("inf")
epochs = 10 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss= evaluate(model, dev_data)
    #val_loss,accuracy = evaluate(model, dev_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    #print('ACCURACY',accuracy)
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()


-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  2.18s | valid loss  3.92 | valid ppl    50.60
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  2.15s | valid loss  3.41 | valid ppl    30.17
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  2.05s | valid loss  3.64 | valid ppl    38.28
-----------------------------------------------------------------------------------------
| end of epoch   4 | time:  2.03s | valid loss  3.43 | valid ppl    30.87
-----------------------------------------------------------------------------------------
| end of epoch   5 | time:  2.10s | valid loss  3.52 | valid ppl    33.68
-----------------------------------------------------------------------------------------
| end of epoch   6 | time:  2.07s | valid loss  4.18 | valid ppl    65.13
----------------

In [14]:
test_loss = evaluate(best_model, dev_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

model.eval()
output=model(dev_data)
probabilities=torch.nn.functional.softmax(output,dim=-1)
_,predictedIndex=torch.max(probabilities,dim=-1)
# for i in predictedIndex:
#     print(predictedIndex[i])

# print(LABEL.vocab.stoi)
from sklearn.metrics import accuracy_score

# vocab = LABEL.vocab.stoi 
# reverseVocab = {index: word for word, index in vocab.items()} 
# def tensor_to_text(tensor, index_to_word):
#     words = [index_to_word[index.item()] for index in tensor if index in index_to_word]
#     return ' '.join(words)

# # print(reverseVocab)

# for i in predictedIndex:
#     #print(i)
#     words = list(set([reverseVocab[index.item()] for index in i]))
#     print(words)


#accuracyScore=accuracy_score()

| End of training | test loss  3.70 | test ppl    40.45


In [None]:
## import as pandas dataframe
devClaimsBaseline=pd.read_json(devClaimsBaselineFile)
trainClaims=pd.read_json(trainClaimsFile)
devClaims=pd.read_json(devClaimsFile)
evidence=pd.read_json(evidenceFile,orient='index')
evidences=pd.DataFrame(list(evidence.items()), columns=['evidence_id', 'evidence_text'])

## Separate claim_text,claim_label, and evidences from training and development sets, saved as pd dataframes
claimTextTrain=trainClaims.loc['claim_text'].to_frame()
claimLabelTrain=trainClaims.loc['claim_label'].to_frame()
evidenceTrain=trainClaims.loc['evidences'].to_frame()

claimTextDev=devClaims.loc['claim_text'].to_frame()
claimLabelDev=devClaims.loc['claim_label'].to_frame()
evidenceDev=devClaims.loc['evidences'].to_frame()


In [None]:

pred=model(test_data)
print(pred)
import torch

# use softmax
probabilities = torch.softmax(pred, dim=-1)
_, predicted_indices = torch.max(probabilities, dim=-1)
predicted_words = [[LABEL.vocab.itos[index] for index in example] for example in predicted_indices]
predicted_text={}
# Print the words
for i, sentence in enumerate(predicted_words):
    print(f"Sentence {i+1}: {' '.join(sentence)}")
    predicted_text[f'Sentence{i}']={', '.join(sentence)}


remove_tokens = ['<pad>', '<unks>', '<sos>', '<eos>']
def clean_sentence(sentence, tokens_to_remove):
    for token in tokens_to_remove:
        sentence = sentence.replace(token, '')
    return sentence.strip()
cleaned_data = {key: [clean_sentence(sentence, remove_tokens) for sentence in value] for key, value in predicted_text.items()}
print(cleaned_data)

# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(evidenceDataframe['combined_evidence'])
# print(tfidf_matrix)

unique_words_per_sentence = {}
for key, value in cleaned_data.items():
    # Split by spaces and filter out numeric values and punctuation
    words = set(word.strip('–,.') for word in value[0].split())
    unique_words_per_sentence[key] = words

print(unique_words_per_sentence)

tensor([[[-0.9033,  6.7569,  9.9667,  ..., -0.2729, -0.1895, -0.3508],
         [-1.1341,  9.6085,  1.3243,  ..., -1.9985, -1.8539, -1.9896],
         [-1.1341,  9.6085,  1.3243,  ..., -1.9985, -1.8539, -1.9896],
         ...,
         [-0.6534,  8.2914,  7.2949,  ..., -0.4133, -1.0910, -1.1494],
         [-0.6534,  8.2914,  7.2949,  ..., -0.4133, -1.0910, -1.1494],
         [-0.6534,  8.2914,  7.2949,  ..., -0.4133, -1.0910, -1.1494]],

        [[-1.0073,  6.7825, 10.5012,  ..., -0.2437, -0.0802, -0.2291],
         [-1.5344,  9.7436,  1.5725,  ..., -2.1913, -1.4148, -1.9041],
         [-1.5344,  9.7436,  1.5725,  ..., -2.1913, -1.4148, -1.9041],
         ...,
         [-0.6539,  8.2926,  7.3067,  ..., -0.4078, -1.0865, -1.1395],
         [-0.6539,  8.2926,  7.3067,  ..., -0.4078, -1.0865, -1.1395],
         [-0.6539,  8.2926,  7.3067,  ..., -0.4078, -1.0865, -1.1395]],

        [[-1.1637,  6.7204, 11.2732,  ..., -0.2036, -0.0197, -0.1024],
         [-2.2136,  9.7576,  2.0383,  ..., -2

In [None]:
predictions_df = pd.DataFrame(list(cleaned_data.items()), columns=['Sentence', 'PredictedEvidences'])

evidenceDev_reset = evidenceDev.reset_index(drop=True)
predictions_df_reset = predictions_df.reset_index(drop=True)
combined_df = pd.merge(evidenceDev_reset, predictions_df_reset['PredictedEvidences'], left_index=True, right_index=True, how='outer')

print(combined_df)


NameError: name 'pd' is not defined

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*