# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    from google.colab import drive
    drive.mount('/content/drive')

In [3]:
pip install contractions

Note: you may need to restart the kernel to use updated packages.


In [4]:
#Imports
import numpy as np
import torch
import pandas as pd

In [5]:
#visualising training data
if LOCAL_DEV:
    train = pd.read_json("../data/train-claims.json") # for local dev
    
else:
    train = pd.read_json("/content/drive/MyDrive/data/train-claims.json") # on colab
train = train.transpose()
train.head()


Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [6]:
if LOCAL_DEV:
    test = pd.read_json("../data/test-claims-unlabelled.json") # for local dev
    
else:
    test = pd.read_json("/content/drive/MyDrive/data/test-claims-unlabelled.json") # on colab
test = test.transpose()
test.head()

Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...
claim-1020,“The global reef crisis does not necessarily m...
claim-2599,Small amounts of very active substances can ca...


In [7]:
#visualising evidence data
if LOCAL_DEV:
    evidence = pd.read_json("../data/evidence.json",typ='series')
else:
    evidence = pd.read_json("/content/drive/MyDrive/data/evidence.json",typ='series')

In [8]:
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [9]:
import string
import contractions
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def preprocess_data(data: pd.Series) -> pd.Series:
  preprocessed_data = {}
  stop_words = set(stopwords.words('english'))
  stop_words.remove('not')
  for id, text in data.items():
    text = text.lower()
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    preprocessed_data[id] = " ".join(lemmatized_tokens)

  return pd.Series(preprocessed_data)

processed_evidence = preprocess_data(evidence)

test_claims = test['claim_text']
processed_test = preprocess_data(test_claims)
processed_test.head()

claim-2967               contribution waste heat global climate
claim-979     warm weather worsened recent drought included ...
claim-1609                greenland lost tiny fraction ice mass
claim-1020    global reef crisis not necessarily mean extinc...
claim-2599     small amount active substance cause large effect
dtype: object

In [11]:
processed_evidence = processed_evidence[processed_evidence.str.strip().str.len() > 0]

In [12]:
def prepareTrainData(n):
    train_claims = preprocess_data(train['claim_text'])
    processed_train_claim = preprocess_data(train_claims)
    claim_lst = []
    evidence_lst = []
    label_lst = []
    for i in range(len(train)):
        train_claim = processed_train_claim[i]
        evidences = train.iloc[i]['evidences']
        for j in evidences:
            if j in processed_evidence.index :
                claim_lst.append(train_claim)
                evidence_lst.append(processed_evidence[j])
                label_lst.append('related')
        filtered_evi = processed_evidence[~processed_evidence.index.isin(evidences)]
        random_evidence = filtered_evi.sample(n)
        for k in random_evidence:
            claim_lst.append(train_claim)
            evidence_lst.append(k)
            label_lst.append('unrelated')
    claim_evi_label = {'claim_text': claim_lst, 'evidence': evidence_lst, 'label': label_lst}
    return pd.DataFrame(claim_evi_label)

train_claims = train['claim_text']
processed_train_claim = preprocess_data(train_claims)
preparedTrain = prepareTrainData(10)
preparedTrain.head()

Unnamed: 0,claim_text,evidence,label
0,not scientific evidence pollutant higher conce...,high concentration time atmospheric concentrat...,related
1,not scientific evidence pollutant higher conce...,plant grow much percent faster concentration p...,related
2,not scientific evidence pollutant higher conce...,higher carbon dioxide concentration favourably...,related
3,not scientific evidence pollutant higher conce...,mine located state jharkhand,unrelated
4,not scientific evidence pollutant higher conce...,also property brick dairy pyramidal roof domin...,unrelated


# Two steps for the this task
# first. find all relavent evidence, either use contextual embedding or similarity scoring
# second. classify the evidents into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torchtext
from torchtext.data.utils import get_tokenizer


In [14]:
# Vectorizing preprocessed text
vectorizer = TfidfVectorizer()
all_texts = pd.concat([processed_evidence, processed_train_claim])
vectorizer.fit(all_texts)

In [15]:
preparedTrain.head(10)

Unnamed: 0,claim_text,evidence,label
0,not scientific evidence pollutant higher conce...,high concentration time atmospheric concentrat...,related
1,not scientific evidence pollutant higher conce...,plant grow much percent faster concentration p...,related
2,not scientific evidence pollutant higher conce...,higher carbon dioxide concentration favourably...,related
3,not scientific evidence pollutant higher conce...,mine located state jharkhand,unrelated
4,not scientific evidence pollutant higher conce...,also property brick dairy pyramidal roof domin...,unrelated
5,not scientific evidence pollutant higher conce...,sponsored sigda association computing machiner...,unrelated
6,not scientific evidence pollutant higher conce...,july state kansa franchise failure pay state tax,unrelated
7,not scientific evidence pollutant higher conce...,named british explorer sir john franklin peris...,unrelated
8,not scientific evidence pollutant higher conce...,silver medal member french team,unrelated
9,not scientific evidence pollutant higher conce...,newport located,unrelated


In [65]:
def tokenize(data):
    token_txt = []
    for txt in data:
        tokens = word_tokenize(txt)
        token_txt.append(tokens)
    return pd.Series(token_txt)
        
train_claim_txt = tokenize(preparedTrain['claim_text'])
train_evidence_txt = tokenize(preparedTrain['evidence'])
train_label = preparedTrain['label']

print(train_claim_txt)
print(train_evidence_txt)
print(train_label)

0        [not, scientific, evidence, pollutant, higher,...
1        [not, scientific, evidence, pollutant, higher,...
2        [not, scientific, evidence, pollutant, higher,...
3        [not, scientific, evidence, pollutant, higher,...
4        [not, scientific, evidence, pollutant, higher,...
                               ...                        
16396    [sending, oscillating, microwave, antenna, ins...
16397    [sending, oscillating, microwave, antenna, ins...
16398    [sending, oscillating, microwave, antenna, ins...
16399    [sending, oscillating, microwave, antenna, ins...
16400    [sending, oscillating, microwave, antenna, ins...
Length: 16401, dtype: object
0        [high, concentration, time, atmospheric, conce...
1        [plant, grow, much, percent, faster, concentra...
2        [higher, carbon, dioxide, concentration, favou...
3                        [mine, located, state, jharkhand]
4        [also, property, brick, dairy, pyramidal, roof...
                           

In [19]:
#TODO: vectorise and batchify data




In [67]:
bptt = 35
# TODO: modify this
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


ntokens = 0 # TODO: get num of vocabs
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerClassificationModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [61]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time

def train(train_data, model):

    model.train() # Turn on the train mode
    for batch in train_data:
        inputs, labels = batch.text, batch.label
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()




def evaluate(eval_model, data_source):
    return NotImplementedError
    #TODO: implement this

In [62]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

    

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.17s | valid loss 24.26 | valid ppl 34207299015.88
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.32s | valid loss 24.30 | valid ppl 35673099810.18
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.12s | valid loss 24.33 | valid ppl 36926992824.35
-----------------------------------------------------------------------------------------


In [64]:
test_loss = evaluate(best_model, test_data)


print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)


print(best_model(example))

| End of training | test loss 24.55 | test ppl 45957962903.59
tensor([[[ 2.5528e+01,  4.8138e-01, -9.5534e-01,  ..., -8.1160e-01,
          -1.5472e+00, -1.2553e-01],
         [ 2.4752e+01, -5.8802e-01, -1.0957e+00,  ..., -6.6444e-01,
          -4.8647e-01, -1.5684e-01],
         [ 2.4639e+01, -4.5157e-01, -1.7419e+00,  ..., -6.2768e-01,
          -1.0963e+00, -8.3802e-01],
         ...,
         [ 2.5265e+01,  2.4933e-01, -9.0049e-01,  ..., -8.4550e-01,
          -2.0898e+00,  8.6930e-02],
         [ 2.4738e+01, -1.5027e-01, -5.6473e-01,  ..., -6.1347e-01,
          -1.5001e+00,  1.8570e-03],
         [ 2.5788e+01, -2.9468e-01, -1.4228e+00,  ..., -2.7725e-01,
          -1.2456e+00, -7.7090e-02]],

        [[ 2.6233e+01,  5.0844e-01, -1.9147e+00,  ..., -8.2978e-01,
          -1.0774e+00,  3.2543e-01],
         [ 2.3195e+01,  1.4552e-01, -1.2921e+00,  ..., -3.7468e-01,
          -1.2571e+00, -2.6772e-01],
         [ 2.5951e+01, -1.2119e-01, -1.2862e+00,  ..., -9.3127e-01,
          -7.7

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [24]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# model adpated from workshop 8
class TransformerClassificationModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, num_classes, dropout=0.5):
        super(TransformerClassificationModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.classification_head = nn.Linear(ninp, num_classes)  # added classification head
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.classification_head.bias.data.zero_()
        self.classification_head.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = output.mean(dim=0)  # aggregate across all tokens
        output = self.classification_head(output) 
        return output



class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)