# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Read Files to get raw data

In [2]:
import json

def readJsFile(filename) :
    return json.load(open(filename,'r'))
    

## preprocess the raw data

In [3]:
''' define functions to preprocess raw texts with tokenization, lemmatization, and remove stopwords'''
import nltk,re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')

tt = TweetTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = set(stopwords.words('english'))     # note: stopwords are all in lowercase
stopwords.remove('not')                         # not, no are not meaningless
stopwords.remove('no')

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma
    
def preprocess_text(text):
    # Tokenization and lemmatization
    tokens = tt.tokenize(text.lower())
    tokens = [lemmatize(t) for t in tokens]
    
    # remove any word that does not contain any English letters,
    # including punctuation
    valid_tokens = []
    for t in tokens:
        if re.search(r"[a-z]",t):
            valid_tokens.append(t)

    # remove stop words
    tokens = []
    for t in valid_tokens:
        if t not in stopwords:
            tokens.append(t)

    # resemble processed tokens back to text
    return ' '.join(tokens)

'not' in stopwords and 'no' in stopwords

False

## Split data into **claim-texts, cliam-ids**, & **claim-labels** & **evidences list**

to run code below, change the block from **raw block** to **code block**

In [4]:
train_data= readJsFile('data/train-claims.json')
dev_data = readJsFile('data/dev-claims.json')
test_data = readJsFile('data/test-claims-unlabelled.json')
all_evidences = readJsFile('data/evidence.json')

In [5]:
def split_data(data, isTest=False, isEvd=False):
    ids = []
    text = []
    labels = []
    evidences = []

    # split test data
    if isTest and not isEvd:
        for claim_id, data in data.items():
            ids.append(claim_id)
            text.append(preprocess_text(data['claim_text']))
    # split evidences
    elif isEvd and not isTest:
        for evidence_id, data in data.items():
            evidences.append(preprocess_text(data))
    # split train data
    elif not isTest and not isEvd:
        for claim_id, data in data.items():
            ids.append(claim_id)
            text.append(preprocess_text(data['claim_text']))
            labels.append(data['claim_label'])
            evidences.append(data['evidences'])
    else:
        print('Wrong Mode, please check your arguments: isTest and isEvd')
        
    return ids, text, labels, evidences

In [6]:
# load saved data
# please comment the code out when submmiting
import json

def load_data(filename):
    return json.load(open(filename, "r"))

train_ids = load_data("temp_data/train_ids.json")
train_texts = load_data("temp_data/train_texts.json")
train_labels = load_data("temp_data/train_labels.json")
train_evidences = load_data("temp_data/train_evidences.json")

dev_ids = load_data("temp_data/dev_ids.json")
dev_texts = load_data("temp_data/dev_texts.json")
dev_labels = load_data("temp_data/dev_labels.json")
dev_evidences = load_data("temp_data/dev_evidences.json")

test_ids = load_data("temp_data/test_ids.json")
test_texts = load_data("temp_data/test_texts.json")

evidences_lst = load_data("temp_data/evidences_lst.json")

In [7]:
# verifying correctness
print(train_texts[0])
evidences_lst[0]

not no scientific evidence co2 pollutant higher co2 concentration actually help ecosystem support plant animal life


'john bennet lawes english entrepreneur agricultural scientist'

## tfidf word embeding

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = evidences_lst + train_texts              # text documents
vectorizer = TfidfVectorizer(max_features=500000) # initialization

# Fit the vectorizer to the data and transform the documents into TF-IDF vectors
X = vectorizer.fit_transform(corpus)
print(X.shape)

# To see the feature names (terms)
#feature_names = vectorizer.get_feature_names_out()
#print(feature_names)

(1210055, 500000)


In [9]:
# transform data into tf-idf vectors
train_tfidf = vectorizer.transform(train_texts)
print(train_tfidf.shape)

dev_tfidf = vectorizer.transform(dev_texts)
print(dev_tfidf.shape)

test_tfidf = vectorizer.transform(test_texts)
print(test_tfidf.shape)

evidence_tfidf = vectorizer.transform(evidences_lst)
print(evidence_tfidf.shape)

(1228, 500000)
(154, 500000)
(153, 500000)
(1208827, 500000)


In [10]:
# calculate cosine similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_cos_sims = cosine_similarity(train_tfidf, evidence_tfidf) # needs large memory

In [11]:
print(train_cos_sims.shape)
test_cos_sims = cosine_similarity(test_tfidf, evidence_tfidf)
print(test_cos_sims.shape)
dev_cos_sims = cosine_similarity(dev_tfidf, evidence_tfidf)
print(dev_cos_sims.shape)

(1228, 1208827)
(153, 1208827)
(154, 1208827)


In [12]:
# since we cannot use 1208827 intances in our training process
# we have to evaluate and decide how many information will be disposed

import numpy as np

# def top_n_indices(ls, n):
#     ''' Returns indices of the 20 highest numbers.'''
#     sorted_indices = sorted(range(len(ls)), key=lambda i: ls[i], reverse=True)
#     return sorted_indices[:n]

def top_n_similarity_recall(n, cos_sims, evidences):
    '''
    Calculates the recall of correct evidences within the top N evidences with the highest similarity scores.

    Parameters:
        n (int): Represents the number of top similarity score items to select.
        cos_sims (2D-list): A list where each item contains similarity score of a train text with all evidence text.
        evidences (2D-list): A list wheere each item contains all correct evidences for each train text .

    Returns:
        The proportion of correct evidences among the top N evidences, relative to the total number of correct evidences.
    '''
    res = []
    for i in range(cos_sims.shape[0]):
        # retreive evidence_id of top n scores
        scores = cos_sims[i]
        # topn_idx = top_n_indices(scores, n)
        topn_idx = np.argpartition(scores, -n)[-n:] # more efficient, find indecies of topn elements in the list
        
        total = 0
        recall = 0
        # count number of evidences we find in topn evidences
        for e in evidences[i]:
            e_id = int(e.split('-')[1]) # e = evidence-xxxx
            if e_id in topn_idx:
                recall += 1
            total += 1
        res.append(recall / total)
    return sum(res) / len(res)


In [13]:
# this step is time-consuming
# topns = [100, 200, 500, 1000]
# for topn in topns:
#     train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
#     dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
#     print(topn,train_recall, dev_recall)

topn = 100
train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
print(f'topn:{topn}, train: {train_recall}, dev: {dev_recall}')

topn:100, train: 0.39066232356134567, dev: 0.44058441558441563


In [14]:
topn = 200
train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
print(topn,train_recall, dev_recall)

200 0.4706026058631922 0.5366883116883118


In [43]:
# topn = 500
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(topn,train_recall, dev_recall)

In [44]:
# topn = 1000
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(topn,train_recall, dev_recall)

## nagetive sampling

In [17]:
''' find correct evidences from parts of evidences rather than all evidences, for example, first 1000 evidences instead of 120887
    transform the multi-class classification problem to binary classification'''
def nagetive_sampling(cos_sims, texts, evidences, all_evidence, topn, isTrain=False):
    samples = []
    data = []
    label = []

    for i in range(cos_sims.shape[0]):  # text similarity with all evidence texts
        # samples
        if isTrain:  
            # use all positive trainning data during training process
            for e in evidences[i]:
                e_id = int(e.split('-')[1]) # e = evidence-xxxx
                data.append('<cls> ' + texts[i] + '<sep> ' + all_evidence[e_id])  # combine Query and Document, like BERT
                label.append(1)
            # topn_ids = np.argsort(-cos_sims[i])[:topn].tolist()
            topn_ids = np.argsort(-cos_sims[i])[25:topn+25].tolist() # dispose first 25 items with highest sim-score
        else: # we take only topn samples in test sets
            topn_ids = np.argpartition(cos_sims[i], -topn)[-topn:].tolist()
        samples.append(topn_ids)

        # labels & data
        for j in topn_ids:
            data.append('<cls>' + texts[i] + '<sep>' + all_evidence[j])
            # if current evidence is one of the correct ones, label it true
            j = 'evidence-'+str(j)
            if evidences is not None: # some test data have no relative evidences
                if j in evidences[i]:
                    label.append(1)
                else:
                    label.append(0)

    return samples, data, label
            

In [18]:
topn = 200
train_ns_samples, train_ns_data, train_ns_label = nagetive_sampling(
    train_cos_sims, train_texts, train_evidences, evidences_lst, topn, True)

In [19]:
dev_ns_samples, dev_ns_data, dev_ns_label = nagetive_sampling(
    dev_cos_sims, dev_texts, dev_evidences, evidences_lst, topn, False)

In [20]:
test_ns_samples, test_ns_data, _ = nagetive_sampling(
    test_cos_sims, test_texts, None, evidences_lst, topn, False)

In [21]:
# data inspection
from collections import Counter

train_ns_label = np.array(train_ns_label)
dev_ns_label = np.array(dev_ns_label)

print(Counter(train_ns_label))
print(Counter(dev_ns_label))
print('ratio:',train_ns_label[0] / train_ns_label[1])

Counter({0: 244637, 1: 5085})
Counter({0: 30555, 1: 245})
ratio: 1.0


In [22]:
#TODO: imbalanced data set
# 1.resampling
# 

In [23]:
244637 / 5085

48.10953785644051

In [24]:
245499 / (245499+4223)

0.9830891951850458

In [25]:
30555 / (30555+237)

0.99230319563523

In [26]:
len(train_ns_data)

249722

In [27]:
train_ns_data[0]

'<cls> not no scientific evidence co2 pollutant higher co2 concentration actually help ecosystem support plant animal life<sep> high concentration time atmospheric concentration greater carbon dioxide toxic animal life raise concentration ppm higher several hour eliminate pest whitefly spider mite greenhouse'

In [28]:
train_ns_label[0]

1

In [29]:
len(dev_ns_data)

30800

In [30]:
len(test_ns_data)

30600

## Keras

In [31]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_ns_data) 

vocab_size = len(tokenizer.word_index) + 1  # padding
print(vocab_size)

38547


In [32]:
#tokenise the input into word sequences

train_seq = tokenizer.texts_to_sequences(train_ns_data)
dev_seq = tokenizer.texts_to_sequences(dev_ns_data)
test_seq = tokenizer.texts_to_sequences(test_ns_data)

In [33]:
len(train_seq)

249722

In [34]:
len(train_seq[0])

41

In [35]:
len(dev_seq[0])

11

In [36]:
# padding matrix to the same length
max_i = 0
for i in train_seq:
    max_i = max(max_i, len(i))
max_t = max_i
print(max_t)

max_i = 0
for i in dev_seq:
    max_i = max(max_i, len(i))
max_v = max_i
print(max_v)

maxlen = max_t if max_v <= max_t else max_v
print('maxlen:',maxlen)

207
168
maxlen: 207


In [37]:
from keras.preprocessing.sequence import pad_sequences

train_seq = pad_sequences(train_seq, padding='post', maxlen=maxlen)
dev_seq = pad_sequences(dev_seq, padding='post', maxlen=maxlen)
test_seq = pad_sequences(test_seq, padding='post', maxlen=maxlen)

In [38]:
len(dev_seq[0])

207

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [39]:
import tensorflow as tf
from keras.layers import LSTM
import keras
from keras.models import Sequential
from keras import layers

# TODO: fine-tunning
embedding_dim = 60
hidden_dim = 100

# model
model = Sequential(name="LSTM_G6")

# embedding layer
model.add(layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))

##
## experiments here to verify what technology is effective
## for reports
##

# one direction
# model.add(layers.Dropout(0.1)) # increase robustness using dropout
# model.add(LSTM(hidden_dim, return_sequences=True, dropout=0.1)) # double layer
# model.add(LSTM(hidden_dim, dropout=0.1))                        # single layer - baseline

# bidirectional
model.add(layers.Dropout(0.1))
model.add(layers.Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.1)))
model.add(layers.Bidirectional(LSTM(hidden_dim, dropout=0.1)))

# output layer
model.add(layers.Dropout(0.1))
model.add(layers.Dense(hidden_dim // 2, activation='tanh'))
model.add(layers.Dense(1, activation='sigmoid'))
## finish the model construction

## Learning Rates: keras.cosineDecay
## TODO
decay_steps = 3000
learning_rate = 1e-2
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    learning_rate, decay_steps
)

# optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# binary cross entropy loss for binary classification problem
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras.metrics.Recall()])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='adam')
model.compile(loss='binary_crossentropy', optimizer=optimizer)

model.summary()

Model: "LSTM_G6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 207, 60)           2312820   
                                                                 
 dropout (Dropout)           (None, 207, 60)           0         
                                                                 
 bidirectional (Bidirection  (None, 207, 200)          128800    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 200)               240800    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 50)                1005

In [40]:
len(train_seq)

249722

In [41]:
len(train_ns_label)

249722

In [45]:
gpus = tf.config.list_physical_devices('GPU')
cpus = tf.config.list_physical_devices('CPU')

print(gpus,cpus)

[] [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [None]:
# trianning
model.fit(train_seq, train_ns_label, 
          epochs=3, verbose=True, 
          validation_data=(dev_seq, dev_ns_label), 
          batch_size=1000, class_weight={0: 1, 1: 120})

Epoch 1/3

In [None]:
!pip install torch

import torch

In [60]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*