# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

Some things we can try:
- Ensemble methods, especially for final labelling
- Different methods of utilizing text created with the transformer (e.g., use cosine similarities to obtain evidences)
- Compare performance using the text as output vs directly obtaining IDs
- For classification, usually, transformers might overfit without the use of bigger pre-trained models

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [11]:
## import libraries

!pip3 install torchtext==0.4.0

# Standard libraries
import math
import string
import pandas as pd
import numpy as np
import json

# Machine learning and deep learning libraries
import tensorflow
import keras
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, LabelField, Example, Dataset, BucketIterator
from torchtext.data.utils import get_tokenizer

# Natural Language Processing (NLP) libraries
import nltk
import spacy
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt



In [12]:
## reading in json files

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## relative file paths

## baseline system - will not be used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## use this for model training
trainClaimsFile='./data/train-claims.json'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/dev-claims.json'
## evidence files need to be downloaded through https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing as it is to big to be uploaded to github
evidenceFile='./data/evidence.json'

testFile='./data/test-claims-unlabelled.json'

tokenizer=get_tokenizer("basic_english")


In [13]:
# Load the JSON data
with open(trainClaimsFile, 'r') as file:
    trainClaims=json.load(file)
with open(devClaimsFile, 'r') as file:
    devClaims=json.load(file)
with open(evidenceFile, 'r') as file:
    evidenceData=json.load(file)

## Preprocessing data -- lowercase, tokenize, and stopword removal
stopwords=set(nltk.corpus.stopwords.words('english'))
tokenizer=get_tokenizer('basic_english')
punctuations=string.punctuation

def preprocess(text):
    token=tokenizer(text.lower())
    cleanedTokens=[t for t in token if (t not in stopwords) and (t not in punctuations)]
    return ' '.join(cleanedTokens)

for ids, texts in evidenceData.items():
    evidenceData[ids]=preprocess(texts)

# Function to create DataFrame and merge evidence IDs with Text
def createDF(claims, evidence):
    combinedData=[]
    for claimID, claimText in claims.items():
        # Combine the ID with its corresponding evidences
        evidenceID=claimText['evidences']
        evidenceText=(evidence[i] for i in evidenceID if i in evidence)
        combinedData.append({
            'claim_id': claimID,
            'claim_text': preprocess(claimText['claim_text']),
            'evidence_id': evidenceID,
            'evidence_text': " ".join(evidenceText),
            'claim_label': claimText['claim_label']
        })
    # Create DataFrame
    return pd.DataFrame(combinedData)

# Create CSV Files
trainFullMerged=createDF(trainClaims,evidenceData)
devFullMerged=createDF(devClaims,evidenceData)
trainFullMerged.to_csv("data/trainFullMerged.csv", index=False)
devFullMerged.to_csv("data/devFullMerged.csv", index=False)

# Convert evidence into csv as well
evidenceFinal=pd.DataFrame(list(evidenceData.items()),columns=['evidence_id','evidence_text'])
evidenceFinal.to_csv('data/evidencePreprocessed.csv',index=False)

# Convert unlabelled Data into CSV as well
with open(testFile, 'r') as file:
    testData=json.load(file)
for ids, texts in testData.items():
    claim_text = texts['claim_text']
    testData[ids]=preprocess(claim_text)
testFinal=pd.DataFrame(list(testData.items()), columns=['claim_id', 'claim_text'])
testFinal.to_csv('data/testPreprocessed.csv',index=False)

In [14]:
## use this for model training
trainClaimsFile='./data/trainFullMerged.csv'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/devFullMerged.csv'
## evidence files need to be downloaded through googledrive (https://drive.google.com/file/d/1OyihwdAWfqHIOueCB4bLBkYg4hTN_OKm/view?usp=sharing)
evidenceFile='./data/evidencePreprocessed.csv'
## test unlabelled dataset
testFile='./data/testPreprocessed.csv'

trainDataframe=pd.read_csv(trainClaimsFile)
devDataframe=pd.read_csv(devClaimsFile)
evidenceDataframe=pd.read_csv(evidenceFile)
testDataframe=pd.read_csv(testFile)

trainDataframe['claim_text']=trainDataframe['claim_text']
trainDataframe['combined_evidence']=trainDataframe['evidence_id']+" "+trainDataframe['evidence_text']
trainDataframe['combined_input'] = trainDataframe['claim_text'] + " [SEP] " + trainDataframe['evidence_text']

devDataframe['claim_text']=devDataframe['claim_text']
devDataframe['combined_evidence']=devDataframe['evidence_id'] +" "+ devDataframe['evidence_text']
devDataframe['combined_input'] = devDataframe['claim_text'] + " [SEP] " + devDataframe['evidence_text']

print(devDataframe['combined_input'])

0      [south australia] expensive electricity world ...
1      3 per cent total annual global emissions carbo...
2      means world 1c warmer pre-industrial times [SE...
3      “as happens zika may also good model second wo...
4      greenland lost tiny fraction ice mass [SEP] ic...
                             ...                        
149    suddenly label co2 pollutant disservice gas pl...
150    natural orbitally driven warming atmospheric c...
151    many world’s coral reefs already barren state ...
152    recent study led lawrence livermore national l...
153    corals may save many creatures attempting movi...
Name: combined_input, Length: 154, dtype: object


In [15]:
## Preprocessing data -- lowercase, tokenize, and stopword removal
stopwords=set(nltk.corpus.stopwords.words('english'))
tokenizer=get_tokenizer('basic_english')
punctuations=set(string.punctuation)

def preprocess(text):
    tokens=tokenizer(text)
    cleanedTokens = [token for token in tokens if token not in stopwords and token not in punctuations]
    return cleanedTokens

TEXT = Field(lower=True, init_token='<sos>', eos_token='<eos>', batch_first=False)
LABEL=LabelField(sequential=False)

def createDatasetEncoderInput(dataframe,textTransform,labelTransform):
    field=[('reviewTextInput',textTransform),('evidenceTextOutput',labelTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        reviewTextInput=row['combined_input']
        evidenceTextOutput=row['claim_label']
        examples.append(Example.fromlist([reviewTextInput,evidenceTextOutput], field))
    return Dataset(examples,fields=field)

def createDatasetOutput(dataframe,labelTransform):
    field=[('evidenceTextOutput',labelTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        evidenceTextOutput=row['claim_label']
        examples.append(Example.fromlist([evidenceTextOutput], field))
    return Dataset(examples,fields=field)

def createDatasetTest(dataframe,textTransform):
    field=[('reviewTextInput',textTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        reviewTextInput=row['combined_input']
        examples.append(Example.fromlist([reviewTextInput], field))
    return Dataset(examples,fields=field)

trainTensor = createDatasetEncoderInput(trainDataframe, TEXT,LABEL)
devTensor = createDatasetEncoderInput(devDataframe, TEXT,LABEL)
#evidenceTensor=createDatasetOutput(evidenceDataframe,LABEL)
#testTensor=createDatasetTest(testDataframe,TEXT)


In [16]:
TEXT.build_vocab(trainTensor)
LABEL.build_vocab(trainTensor)

In [17]:
# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(LABEL.vocab.stoi)
print(TEXT.vocab.stoi)

defaultdict(None, {'SUPPORTS': 0, 'NOT_ENOUGH_INFO': 1, 'REFUTES': 2, 'DISPUTED': 3})


In [18]:

def batchify(dataset, bsz):
    #batchify both text and labels
    data = TEXT.process([getattr(x, 'reviewTextInput') for x in dataset.examples])
    label = LABEL.process([getattr(x, 'evidenceTextOutput') for x in dataset.examples])
    nbatch = min(data.size(0), label.size(0)) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    label = label.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    label = label.view(bsz, -1).t().contiguous()
    min_seq_len = min(data.size(0), label.size(0))
    data = data[:min_seq_len]
    label = label[:min_seq_len]
    return data.to(device), label.to(device)

## test is used for the final predictions
TEST = Field(tokenize=preprocess, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=True)
TEST.build_vocab(trainTensor)
def processTextOnly(dataset):
    data = TEST.process([getattr(x, 'reviewTextInput') for x in dataset.examples])
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 35
        
train_data,train_label = batchify(trainTensor, batch_size)
dev_data,dev_label = batchify(devTensor, eval_batch_size)
dev_accuracy_data=processTextOnly(devTensor)

bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [22]:
## this is to obtain seq2seq results
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        #triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output
    
torch.triu(torch.ones(3, 3))

# Masking
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

  return mask

masking()

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [24]:
sos_token_id = TEXT.vocab.stoi['<sos>']
eos_token_id = TEXT.vocab.stoi['<eos>']
pad_token_id = TEXT.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train() 
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, _ = get_batch(train_data, i) 
        targets, _ = get_batch(train_label, i) 
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

best_val_loss = float("inf")
epochs = 150# The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, dev_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()





-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.07s | valid loss 11.76 | valid ppl 128144.26
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.05s | valid loss 10.59 | valid ppl 39906.39
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.05s | valid loss 15.43 | valid ppl 5000840.41
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time:  0.05s | valid loss 24.84 | valid ppl 61270475174.18
-----------------------------------------------------------------

In [25]:
test_loss = evaluate(best_model, dev_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

model.eval()
output=model(dev_data)
probabilities=torch.nn.functional.softmax(output,dim=-1)
_,predictedIndex=torch.max(probabilities,dim=-1)

vocab = TEXT.vocab.stoi 
reverseVocab = {index: word for word, index in vocab.items()} 
def tensor_to_text(tensor, index_to_word):
    words = [index_to_word[index.item()] for index in tensor if index in index_to_word]
    return ' '.join(words)


| End of training | test loss 14.73 | test ppl 2485170.96


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

model.eval()
pred=model(dev_accuracy_data)
def create_id_dictionary(dataframe):
    unique_id_per_claim={}
    for claim_id in dataframe['claim_id']:
        unique_id_per_claim[claim_id]=[]
    return unique_id_per_claim

unique_evidence_id=create_id_dictionary(devDataframe)
#print(predicted_text['claim-752'])
# use softmax
probabilities = torch.softmax(pred, dim=-1)
_, predicted_indices = torch.max(probabilities, dim=-1)
predicted_words = [[LABEL.vocab.itos[index] for index in example] for example in predicted_indices]
print(predicted_indices)
predicted_single_words={}
for claim_id in unique_evidence_id:
    predicted_single_words[claim_id] = []
if len(predicted_words) == len(unique_evidence_id):
    for claim_text, sentence in zip(unique_evidence_id, predicted_words):
        formatted_sentence =[word for word in sentence if word not in ("<pad>", "<unk>", "<eos>", "<sos>")]
        predicted_single_words[claim_text].extend(formatted_sentence)

for claim_id in predicted_single_words:
    for word in predicted_single_words[claim_id]:
        if 'evidence-' in word:
            predicted_single_words[claim_id].remove(word)
            unique_evidence_id[claim_id].append(word)

predicted_whole_sentences={}
for claim_id, words in predicted_single_words.items():
    predicted_whole_sentences[claim_id] = ' '.join(words)
    
newDf=evidenceDataframe.fillna(value=' ', inplace=False)
predicted_texts_list = list(predicted_whole_sentences.values())
tfidf_vectorizer = TfidfVectorizer()
evidence_tfidf = tfidf_vectorizer.fit_transform(newDf['evidence_text'])
predicted_tfidf = tfidf_vectorizer.transform(predicted_texts_list)
similarity_matrix = cosine_similarity(predicted_tfidf, evidence_tfidf)


for i, sentence in enumerate(predicted_words):
    print(f"Sentence {i+1}: {' '.join(sentence)}")
predicted_text={}
for claim_id in unique_evidence_id:
    predicted_text[claim_id] = []
if len(predicted_words) == len(unique_evidence_id):
    for claim_text, sentence in zip(unique_evidence_id, predicted_words):
        formatted_sentence =[word for word in sentence]
        predicted_text[claim_text].extend(formatted_sentence)

tensor([[3, 3, 3,  ..., 0, 0, 0],
        [0, 3, 3,  ..., 0, 0, 0],
        [0, 3, 3,  ..., 0, 0, 0],
        ...,
        [0, 2, 2,  ..., 0, 0, 0],
        [0, 2, 2,  ..., 0, 0, 0],
        [0, 2, 2,  ..., 0, 0, 0]])
Sentence 1: DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED DISPUTED SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPORTS SUPPO

In [38]:
from collections import Counter

print(predicted_text)
final_label= {claim: Counter(labels).most_common(1)[0][0] for claim, labels in predicted_text.items()}
print(final_label)

{'claim-752': ['DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', '

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*