# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [21]:
LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    from google.colab import drive
    drive.mount('/content/drive')

In [22]:
#Imports
import numpy as np
import torch
import pandas as pd

In [23]:
#visualising training data
if LOCAL_DEV:
    train = pd.read_json("../data/train-claims.json") # for local dev
    
else:
    train = pd.read_json("/content/drive/MyDrive/data/train-claims.json") # on colab
train = train.transpose()
train.head()

Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [24]:
#visualising evidence data
#visualising evidence data
if LOCAL_DEV:
    evidence = pd.read_json("../data/evidence.json",typ='series')
else:
    evidence = pd.read_json("/content/drive/MyDrive/data/evidence.json",typ='series')

In [25]:
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [26]:
if LOCAL_DEV:
    test = pd.read_json("../data/test-claims-unlabelled.json")
else:
    test = pd.read_json("/content/drive/MyDrive/data/test-claims-unlabelled.json")
test = test.transpose()
test.head()


Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...
claim-1020,“The global reef crisis does not necessarily m...
claim-2599,Small amounts of very active substances can ca...


In [27]:
#preprocessing
# punctuations should be removed, common words such as the, is, are, should be removed. all words also should be lemmentised and stemmed.


In [28]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [29]:
import string
import contractions
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:


def preprocess_data(data: pd.Series, limit=10000) -> pd.Series:
  preprocessed_data = {}
  stop_words = set(stopwords.words('english'))
  stop_words.remove('not')
  count = 0
  for id, text in data.items():
    text = text.lower()
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    preprocessed_data[id] = " ".join(lemmatized_tokens)
    count += 1
    if count >= limit:
      break

  return pd.Series(preprocessed_data)

processed_evidence = preprocess_data(evidence)

test_claims = test['claim_text']
train_claims = train['claim_text']
processed_test = preprocess_data(test_claims)
processed_test.head()
processed_train = preprocess_data(train_claims)
processed_train.head()


claim-1937    not scientific evidence pollutant higher conce...
claim-126     el niño drove record high global temperature s...
claim-2510                              pdo switched cool phase
claim-2021    weather channel john coleman provided evidence...
claim-2449    january capped month period global temperature...
dtype: object

In [31]:
processed_evidence = processed_evidence[processed_evidence.str.strip().str.len() > 0]
processed_evidence.head()

evidence-0    john bennet lawes english entrepreneur agricul...
evidence-1    lindberg began professional career age eventua...
evidence-2                boston lady cambridge vampire weekend
evidence-3    gerald francis goyer born october professional...
evidence-4    detected abnormality oxytocinergic function sc...
dtype: object

# Two steps for the this task
# first. find all relavent evidence, either use contextual embedding or similarity scoring
# second. classify the evidents into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential, load_model
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Flatten, Dense

In [45]:
#Vectorizing preprocessed text
vectorizer = TfidfVectorizer()
all_texts = pd.concat([processed_evidence, processed_train])
vectorizer.fit(all_texts)
evidence_tfidf = vectorizer.transform(processed_evidence)
test_tfidf = vectorizer.transform(processed_test)
train_tfidf = vectorizer.transform(processed_train)

print(vectorizer.get_feature_names_out()) # why does this contain non-alphabetic?
print(test_tfidf[0])
#print(test_tfidf.shape)




['aa' 'aaa' 'aabis' ... '楊璟翊' '盧翰' '민주국민당']
  (0, 24182)	0.5894022428982739
  (0, 9912)	0.43153385604356104
  (0, 9096)	0.3217964341887153
  (0, 4988)	0.5097918621830975
  (0, 4330)	0.32084706535977053


In [34]:
similarity_matrix = cosine_similarity(test_tfidf, evidence_tfidf)

def getTopN(similarity_matrix, test, evidence, n):
  test = test.to_frame(name='claim_text')
  top_indices = np.argsort(-similarity_matrix, axis = 1)[:, :n]
  top_evidence = [[str(evidence.index[i]) for i in row] for row in top_indices]
  test['evidences'] = top_evidence
  return test

test_with_evi = getTopN(similarity_matrix, processed_test, processed_evidence, 5)
test_with_evi.head()


Unnamed: 0,claim_text,evidences
claim-2967,contribution waste heat global climate,"[evidence-8950, evidence-4903, evidence-1294, ..."
claim-979,warm weather worsened recent drought included ...,"[evidence-2760, evidence-8828, evidence-5911, ..."
claim-1609,greenland lost tiny fraction ice mass,"[evidence-5928, evidence-4202, evidence-3680, ..."
claim-1020,global reef crisis not necessarily mean extinc...,"[evidence-3210, evidence-8721, evidence-2739, ..."
claim-2599,small amount active substance cause large effect,"[evidence-8207, evidence-8000, evidence-320, e..."


In [41]:
!pip3 install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0


In [47]:

# using a classification model to test relevance of each evidence

# assuming train data text formatted as follows:
# trainX = [claim_text + SEPARATION_TOKEN + evidence_text, .....]
# trainY = [RELEVANT, NOT_RELEVANT, ...... ]

# WORKSHOP 8 ------
import torch
import torchtext
from torchtext.data.utils import get_tokenizer

# Define tokenizer
tokenizer = get_tokenizer("basic_english")

# Define Field for text data
TEXT = torchtext.data.Field(tokenize=tokenizer,
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)

# Define Field for label data
LABEL = torchtext.data.LabelField(dtype=torch.float)

# Load IMDb dataset
train_txt, test_txt = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# Split train_txt into train and validation sets
train_txt, valid_txt = train_txt.split(split_ratio=0.8)

# Build vocabulary
TEXT.build_vocab(train_txt)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batchify function
def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 10

# Process the datasets
train_data = batchify(train_txt, batch_size)
val_data = batchify(valid_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)




downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:18<00:00, 4.62MB/s]


In [68]:
print(valid_txt[0].text)
print(valid_txt[0].label)

print(valid_txt[1].text)
print(valid_txt[1].label)

# ours can be:
# text =  claim_tokens + [SEP_TOKEN] + evidence_tokens
# label <- one of {'rel', 'irr'} (relevant, irrelevant)

# and use the same batchify method ?

print(val_data)

['how', 'can', 'such', 'good', 'actors', 'like', 'jean', 'rochefort', 'and', 'carole', 'bouquet', 'could', 'have', 'been', 'involved', 'in', 'such', 'a', '.', '.', '.', 'a', '.', '.', '.', 'well', ',', 'such', 'a', 'thing', '?', 'i', 'can', "'", 't', 'get', 'it', '.', 'it', 'was', 'awful', ',', 'very', 'baldy', 'played', '(', 'but', 'some', 'of', 'the', 'few', 'leading', 'roles', ')', ',', 'the', 'jokes', 'are', 'dumb', 'and', 'absolutely', 'not', 'funny', '.', '.', '.', 'i', 'won', "'", 't', 'talk', 'more', 'about', 'this', 'movie', ',', 'except', 'for', 'one', 'little', 'piece', 'of', 'advice', 'do', 'not', 'go', 'see', 'it', ',', 'it', 'will', 'be', 'a', 'waste', 'of', 'time', 'and', 'money', '.']
neg
['wow', ',', 'finally', 'jim', 'carrey', 'has', 'returned', 'from', 'the', 'died', '.', 'this', 'movie', 'had', 'me', 'laughing', 'and', 'crying', '.', 'it', 'also', 'sends', 'a', 'message', 'that', 'we', 'should', 'all', 'know', 'and', 'learn', 'from', '.', 'jeniffer', 'aniston', 'was

In [48]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)



In [49]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():

    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [50]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.43s | valid loss 10.19 | valid ppl 26557.67
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.16s | valid loss  9.88 | valid ppl 19441.85
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.17s | valid loss 11.90 | valid ppl 146983.00
-----------------------------------------------------------------------------------------


In [51]:
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss 11.18 | test ppl 71820.47


In [36]:
#print(train.head())
#print(test_with_evi.head())
# claim: contribution waste heat global climate
print(evidence['evidence-308923'])
print(evidence['evidence-213569'])

#greenland lost tiny fraction ice mass	
print(evidence['evidence-962481'])
print(evidence['evidence-1200633'])



Global forcing from waste heat was 0.028 W/m2 in 2005.
Thus, the waste heat engine may be one of the least expensive components of a complete waste heat recovery system.
Only a tiny fraction of the original chemical energy is used for work:
Land ice sheets in both Antarctica and Greenland have been losing mass since 2002 and have seen an acceleration of ice mass loss since 2009.


In [37]:
from torch import nn, optim

# define hypermeter
sequence_len = 28
input_len = 28
hidden_size = 128
num_layers = 2
num_classes = 4
num_epchos = 5
learning_rate = 0.01


In [38]:
class LSTM(nn.Module):
  def __init__(self, input_length, hidden_size, num_classes, num_layers):
    super(LSTM, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(input_len, hidden_size, num_layers, num_classes, batch_first=True)
    self.output_layer = nn.Linear(hidden_size, num_classes)

  def forward(self, X):
    hidden_states = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
    cell_states = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
    out, _ = self.lstm(X, (hidden_states, cell_states))
    out = self.output_layer(out[:, -1, :])
    return out


In [39]:
model = LSTM(input_len, hidden_size, num_classes, num_layers)
print(model)

LSTM(
  (lstm): LSTM(28, 128, num_layers=2, bias=4, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=4, bias=True)
)


In [40]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [42]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# model adpated from workshop 8
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output



class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)