# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [7]:
LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    from google.colab import drive
    drive.mount('/content/drive')

In [8]:
pip install contractions

Note: you may need to restart the kernel to use updated packages.


In [9]:
#Imports
import numpy as np
import torch
import pandas as pd

In [10]:
#visualising training data
if LOCAL_DEV:
    train = pd.read_json("../data/train-claims.json") # for local dev
    
else:
    train = pd.read_json("/content/drive/MyDrive/data/train-claims.json") # on colab
train = train.transpose()
train.head()


Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [11]:
if LOCAL_DEV:
    test = pd.read_json("../data/test-claims-unlabelled.json") # for local dev
    
else:
    test = pd.read_json("/content/drive/MyDrive/data/test-claims-unlabelled.json") # on colab
test = test.transpose()
test.head()

Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...
claim-1020,“The global reef crisis does not necessarily m...
claim-2599,Small amounts of very active substances can ca...


In [12]:
#visualising evidence data
if LOCAL_DEV:
    evidence = pd.read_json("../data/evidence.json",typ='series')
else:
    evidence = pd.read_json("/content/drive/MyDrive/data/evidence.json",typ='series')

In [13]:
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [14]:
import string
import contractions
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def preprocess_data(data: pd.Series) -> pd.Series:
  preprocessed_data = {}
  stop_words = set(stopwords.words('english'))
  stop_words.remove('not')
  for id, text in data.items():
    text = text.lower()
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    preprocessed_data[id] = " ".join(lemmatized_tokens)

  return pd.Series(preprocessed_data)

processed_evidence = preprocess_data(evidence)

test_claims = test['claim_text']
processed_test = preprocess_data(test_claims)
processed_test.head()

claim-2967               contribution waste heat global climate
claim-979     warm weather worsened recent drought included ...
claim-1609                greenland lost tiny fraction ice mass
claim-1020    global reef crisis not necessarily mean extinc...
claim-2599     small amount active substance cause large effect
dtype: object

In [16]:
processed_evidence = processed_evidence[processed_evidence.str.strip().str.len() > 0]

In [17]:
SPECIAL_TOKEN = ' <SPE_TOKEN> '
def prepareTrainData(n):
    train_claims = preprocess_data(train['claim_text'])
    processed_train_claim = preprocess_data(train_claims)
    text_lst = []
    label_lst = []
    for i in range(len(train)):
        train_claim = processed_train_claim[i]
        evidences = train.iloc[i]['evidences']
        for j in evidences:
            if j in processed_evidence.index :
                text = train_claim + SPECIAL_TOKEN + processed_evidence[j]
                text_lst.append(text)
                label_lst.append('related')
        filtered_evi = processed_evidence[~processed_evidence.index.isin(evidences)]
        random_evidence = filtered_evi.sample(n)
        for k in random_evidence:
            text = train_claim + SPECIAL_TOKEN + k
            text_lst.append(text)
            label_lst.append('unrelated')
    claim_evi_label = {'text': text_lst, 'label': label_lst}
    return pd.DataFrame(claim_evi_label)

train_claims = train['claim_text']
processed_train_claim = preprocess_data(train_claims)
preparedTrain = prepareTrainData(10)
preparedTrain.head()

Unnamed: 0,text,label
0,not scientific evidence pollutant higher conce...,related
1,not scientific evidence pollutant higher conce...,related
2,not scientific evidence pollutant higher conce...,related
3,not scientific evidence pollutant higher conce...,unrelated
4,not scientific evidence pollutant higher conce...,unrelated


In [18]:
# need later versions for torchtext.transforms and special
#pip install torchtext==0.18.0 

# Two steps for the this task
# first. find all relavent evidence, either use contextual embedding or similarity scoring
# second. classify the evidents into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torchtext
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms
from torch.nn.utils.rnn import pad_sequence
# No module named 'torchtext.transforms' ?

In [20]:
# Vectorizing preprocessed text
vectorizer = TfidfVectorizer()
all_texts = pd.concat([processed_evidence, processed_train_claim])
vectorizer.fit(all_texts)

In [21]:
preparedTrain.head(10)

Unnamed: 0,text,label
0,not scientific evidence pollutant higher conce...,related
1,not scientific evidence pollutant higher conce...,related
2,not scientific evidence pollutant higher conce...,related
3,not scientific evidence pollutant higher conce...,unrelated
4,not scientific evidence pollutant higher conce...,unrelated
5,not scientific evidence pollutant higher conce...,unrelated
6,not scientific evidence pollutant higher conce...,unrelated
7,not scientific evidence pollutant higher conce...,unrelated
8,not scientific evidence pollutant higher conce...,unrelated
9,not scientific evidence pollutant higher conce...,unrelated


In [109]:
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(preparedTrain['text']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

text_pipeline = lambda x: vocab(tokenizer(x))
label_transform = torchtext.transforms.LabelToIndex({'related': 0, 'unrelated': 1})

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    text_list, label_list, offsets = [], [], [0]
    for _text, _label in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # should stack instead of concat?
    #text_list = torch.cat(text_list)
   
    # pad sequences to make them the same length 
    padded_sequences = pad_sequence(text_list, batch_first=True)

    return padded_sequences.to(device), label_list.to(device), offsets.to(device)

class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        text = row['text'] 
        label = row['label'] 
        return text, label

dataset = TextDataset(preparedTrain)
dataloader = DataLoader(dataset, batch_size=20, shuffle=False, collate_fn=collate_batch)

In [110]:
#TODO: vectorise and batchify data


In [111]:
# initialise model
# NOTE: run the OOP code first!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ntokens = len(vocab) # TODO: verify correctness of this

emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value

model = TransformerClassificationModel(ntokens, emsize, nhead, nhid, nlayers, 2, dropout).to(device)



In [112]:
# train model
import math
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
lr = 1.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# TODO: figure out what this does and decide if it's needed in train
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time

def train(train_data_loader, model):

    model.train() # Turn on the train mode
    for inputs, labels, offsets in train_data_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        clipping_value = 1 # arbitrary value of your choosing
        # training unstable? https://stackoverflow.com/questions/66625645/why-does-my-pytorch-nn-return-a-tensor-of-nan
        torch.nn.utils.clip_grad_norm_(model.parameters(), clipping_value) # https://stackoverflow.com/questions/54716377/how-to-do-gradient-clipping-in-pytorch
        optimizer.step()




def evaluate(val_data_loader, eval_model):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.

    with torch.no_grad():
       
        for inputs, labels, offsets in val_data_loader:
          
            output = eval_model(inputs)
            # output_flat = output.view(-1, ntokens) do we need to do this?
            total_loss += len(inputs) * criterion(output, labels).item()
    return total_loss / (len(val_data_loader) - 1)


best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None
#train(dataloader, model)
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    # TODO: PUT DATA HERE
    train(dataloader, model)
    #NOTE: !!!!! using training data to test here!!!! Change later when test data is processed
    val_loss = evaluate(dataloader, model) 
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()


-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 74.63s | valid loss  8.73 | valid ppl  6195.92
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 86.46s | valid loss  8.28 | valid ppl  3942.69
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 76.31s | valid loss  8.20 | valid ppl  3631.05
-----------------------------------------------------------------------------------------


In [129]:
# sanity check

print(dataset[0])
test_batch = next(iter(dataloader))
print(test_batch[0][0]) # first input
print(test_batch[1]) # labels

test_outputs = model(test_batch[0])
predicted_labels = torch.argmax(test_outputs, dim=1)

print(predicted_labels)
    

('not scientific evidence pollutant higher concentration actually help ecosystem support plant animal life <SPE_TOKEN> high concentration time atmospheric concentration greater carbon dioxide toxic animal life raising concentration ppm higher several hour eliminate pest whitefly spider mite greenhouse', 'related')
tensor([    7,    80,    58,   649,   141,   144,   371,   625,   553,   525,
          174,  1163,   177,     1,    69,   144,    25,    72,   144,   532,
           12,    29,  2939,  1163,   177,  1989,   144,   521,   141,   139,
         1404,  1400,  8618, 12852,  5081,  7116,    22,     0])
tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])
tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])


| End of training | test loss  8.20 | test ppl  3631.05


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [130]:
test_loss = evaluate(dataloader, best_model) #TODO: Replace with test data


print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)


| End of training | test loss  8.20 | test ppl  3631.05


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [94]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# model adpated from workshop 8
class TransformerClassificationModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, num_classes, dropout=0.5):
        super(TransformerClassificationModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.classification_head = nn.Linear(ninp, num_classes)  # added classification head
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.classification_head.bias.data.zero_()
        self.classification_head.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        # print("ENCODER OUTPUT")
        # print(len(output))
        output = output.mean(dim=1)  # aggregate across all tokens TODO: check if dim is correct
        # print("AGGREGATE")
        # print(len(output))
        output = self.classification_head(output) 
        # print("FINAL")
        # print(len(output))
        return output



class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)