# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put that code at the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Read Files to get raw data

In [1]:
import json

def readJsFile(filename) :
    """Load and return the JSON document stored in *filename*.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes instead of being left open until garbage collection.
    JSON is read as UTF-8 regardless of the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as fh:
        return json.load(fh)

def save_data(data, filename):
    """Serialise *data* as JSON into *filename*, replacing any existing file.

    Writing inside a ``with`` block guarantees the file is flushed and closed
    before the function returns, so the data is fully on disk for later reads.
    """
    with open(filename, "w", encoding="utf-8") as fh:
        json.dump(data, fh)

def load_data(filename):
    """Load and return the JSON document stored in *filename*.

    Counterpart of ``save_data``; the file handle is closed by the ``with``
    block once parsing is done.
    """
    with open(filename, "r", encoding="utf-8") as fh:
        return json.load(fh)

### preprocess the raw data

In [2]:
''' define functions to preprocess raw texts with tokenization, lemmatization, and remove stopwords'''
import nltk,re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')

# Module-level NLP helpers shared by lemmatize() and preprocess_text().
tt = TweetTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object to a
# plain set, so executing this line a second time without re-running the import
# above fails (a set has no `.words`).
stopwords = set(stopwords.words('english'))     # note: stopwords are all in lowercase

def lemmatize(word):
    """Return the lemma of *word*, trying the verb reading before the noun one.

    If lemmatizing as a verb changes the word, that result is returned;
    otherwise the noun lemma is returned.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')
    
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content lemmas.

    Steps: lowercase and tokenize with the shared TweetTokenizer, lemmatize
    every token, drop tokens that contain no letter a-z (punctuation, pure
    numbers, ...), drop stop words, and join what is left with single spaces.
    """
    lemmas = (lemmatize(tok) for tok in tt.tokenize(text.lower()))
    kept = [
        lemma
        for lemma in lemmas
        # keep only tokens with at least one English letter that are not stop words
        if re.search(r"[a-z]", lemma) and lemma not in stopwords
    ]
    return ' '.join(kept)

'not' in stopwords and 'no' in stopwords

True

### Split raw data into **claim-texts, claim-ids**, & **claim-labels** & **evidences list**

In [3]:
def split_data(data, isTest=False, isEvd=False):
    """Split a loaded JSON dict into parallel lists.

    Modes (selected by the two flags):
        * default (both False): labelled claims -> ids, preprocessed claim
          texts, labels and gold evidence-id lists are all filled.
        * isTest=True: unlabelled claims -> only ids and texts are filled.
        * isEvd=True: evidence dict -> only the evidences list is filled,
          with one preprocessed passage per entry, in dict order.
        * both True: invalid; a message is printed and four empty lists
          are returned.

    Returns:
        (ids, text, labels, evidences) -- lists not relevant to the chosen
        mode are left empty.
    """
    ids, text, labels, evidences = [], [], [], []

    if isTest and isEvd:
        print('Wrong Mode, please check your arguments: isTest and isEvd')
    elif isTest:
        # unlabelled claims: ids and texts only
        for claim_id, claim in data.items():
            ids.append(claim_id)
            text.append(preprocess_text(claim['claim_text']))
    elif isEvd:
        # evidence passages: values are raw strings
        for _, passage in data.items():
            evidences.append(preprocess_text(passage))
    else:
        # labelled claims: everything is available
        for claim_id, claim in data.items():
            ids.append(claim_id)
            text.append(preprocess_text(claim['claim_text']))
            labels.append(claim['claim_label'])
            evidences.append(claim['evidences'])

    return ids, text, labels, evidences

## 1.1 Prepare raw data

In [4]:
# read raw data
train_data= readJsFile('data/train-claims.json')
dev_data = readJsFile('data/dev-claims.json')
test_data = readJsFile('data/test-claims-unlabelled.json')
all_evidences = readJsFile('data/evidence.json')

In [5]:
# abstract, transform data to lists
# split_data always returns (ids, texts, labels, evidences); slots that a mode
# does not fill come back as empty lists and are discarded with `_`.
train_ids, train_texts, train_labels, train_evidences = split_data(train_data)
test_ids, test_texts, _, _ = split_data(test_data, isTest=True)
dev_ids, dev_texts, dev_labels, dev_evidences = split_data(dev_data)
# NOTE(review): the rest of the notebook treats position k of evidences_lst as
# 'evidence-<k>'; this presumably relies on evidence.json listing its keys in
# that order -- confirm.
_,_,_, evidences_lst = split_data(all_evidences, isEvd=True) # this step is time-consuming

In [6]:
# verify data correctness
# print(train_texts[0])
# evidences_lst[0]

## 1.2 TF-IDF word embedding

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vocabulary and IDF weights are fitted on evidence passages plus training
# claims only; dev and test claims are not part of the fit.
corpus = evidences_lst + train_texts              # text documents
vectorizer = TfidfVectorizer(max_features=500000) # initialization

# Fit the vectorizer to the data and transform the documents into TF-IDF vectors
# (X is only used here to print its shape; the name is rebound later for training data)
X = vectorizer.fit_transform(corpus)
print(X.shape)

# To see the feature names (terms)
#feature_names = vectorizer.get_feature_names_out()
#print(feature_names)

(1210055, 500000)


In [8]:
# transform data into tf-idf vectors

train_tfidf = vectorizer.transform(train_texts)
print(train_tfidf.shape)

dev_tfidf = vectorizer.transform(dev_texts)
print(dev_tfidf.shape)

test_tfidf = vectorizer.transform(test_texts)
print(test_tfidf.shape)

evidence_tfidf = vectorizer.transform(evidences_lst)
print(evidence_tfidf.shape)

(1228, 500000)
(154, 500000)
(153, 500000)
(1208827, 500000)


In [9]:
# calculate cosine similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Each result holds one row per claim and one column per evidence passage
# (see the printed shapes), i.e. the similarity of every claim to every passage.
train_cos_sims = cosine_similarity(train_tfidf, evidence_tfidf) # needs large memory
print(train_cos_sims.shape)

test_cos_sims = cosine_similarity(test_tfidf, evidence_tfidf)
print(test_cos_sims.shape)

dev_cos_sims = cosine_similarity(dev_tfidf, evidence_tfidf)
print(dev_cos_sims.shape)

(1228, 1208827)
(153, 1208827)
(154, 1208827)


In [10]:
# since we cannot use all 1208827 instances in our training process
# we have to evaluate and decide how much information will be discarded

import numpy as np

# def top_n_indices(ls, n):
#     ''' Returns indices of the 20 highest numbers.'''
#     sorted_indices = sorted(range(len(ls)), key=lambda i: ls[i], reverse=True)
#     return sorted_indices[:n]

def top_n_similarity_recall(n, cos_sims, evidences):
    '''
    Calculates the mean recall of the correct evidences within the top N evidences
    with the highest similarity scores.

    Parameters:
        n (int): Number of top similarity score items to select per claim. Values larger
            than the number of evidences are clamped; values <= 0 select nothing.
        cos_sims (2D-array): Row i holds the similarity scores of claim i with all evidence texts.
        evidences (2D-list): Item i holds the ids ('evidence-<index>') of all correct
            evidences for claim i.

    Returns:
        The recall (found correct evidences / all correct evidences) averaged over the
        claims that have at least one correct evidence; 0.0 if there is no such claim.
    '''
    recalls = []
    for i in range(cos_sims.shape[0]):
        gold = evidences[i]
        if not gold:
            # recall is undefined without any correct evidence; skip instead of dividing by zero
            continue

        scores = cos_sims[i]
        k = min(n, scores.shape[0])  # argpartition rejects k larger than the row
        if k <= 0:
            top_ids = set()
        else:
            # argpartition finds the indices of the k largest scores without a full sort;
            # a set makes the membership test below O(1)
            top_ids = set(np.argpartition(scores, -k)[-k:].tolist())

        # e = 'evidence-xxxx' -> integer column index xxxx
        found = sum(1 for e in gold if int(e.split('-')[1]) in top_ids)
        recalls.append(found / len(gold))

    if not recalls:
        return 0.0
    return sum(recalls) / len(recalls)


In [11]:
# this step is time-consuming
# topns = [100, 200, 500, 1000]
# for topn in topns:
#     train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
#     dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
#     print(topn,train_recall, dev_recall)

In [12]:
# topn = 100
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(f'topn:{topn}, train: {train_recall}, dev: {dev_recall}')

In [13]:
# topn = 200
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(topn,train_recall, dev_recall)

In [14]:
# topn = 500
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(topn,train_recall, dev_recall)

In [15]:
# topn = 1000
# train_recall = top_n_similarity_recall(topn,train_cos_sims, train_evidences)
# dev_recall   = top_n_similarity_recall(topn,dev_cos_sims,dev_evidences)
# print(topn,train_recall, dev_recall)

## 1.3 Negative sampling

In [16]:
''' find correct evidences from a part of the evidences rather than all evidences, for example, the first 1000 evidences instead of all 1208827;
    this transforms the multi-class classification problem into a binary classification problem'''

def nagetive_sampling(cos_sims, texts, evidences, all_evidence, topn, isTrain=False, skip_top=25):
    """Build (claim, evidence) text pairs and binary labels from similarity scores.

    Parameters:
        cos_sims (2D-array): Row i holds the similarity of claim i with every evidence.
        texts (list[str]): Preprocessed claim texts, aligned with the rows of cos_sims.
        evidences (2D-list or None): Item i holds the correct evidence ids
            ('evidence-<index>') of claim i. Pass None when no gold evidences exist
            (test data); no labels are produced in that case.
        all_evidence (list[str]): Preprocessed evidence texts, indexed by evidence number.
        topn (int): Number of candidate evidences sampled per claim.
        isTrain (bool): If True, every correct evidence is added as a positive pair and
            the candidates are the topn evidences ranked after the `skip_top` most
            similar ones. If False, the candidates are the topn most similar evidences
            (in unspecified order).
        skip_top (int): Training only. Number of highest-scoring evidences left out of
            the candidate window. Defaults to 25, the value previously hard-coded.

    Returns:
        (samples, data, label): samples[i] is the list of candidate evidence indices of
        claim i; data is the flat list of pair texts; label is the flat list of 0/1
        labels aligned with data (empty when evidences is None).

    Raises:
        ValueError: If skip_top is negative.
    """
    if skip_top < 0:
        raise ValueError("skip_top must be non-negative")

    samples = []
    data = []
    label = []

    for i in range(cos_sims.shape[0]):  # text similarity with all evidence texts
        # samples
        if isTrain:
            # use all positive training data during the training process
            for e in evidences[i]:
                e_id = int(e.split('-')[1]) # e = evidence-xxxx
                data.append('<cls> ' + texts[i] + '<sep> ' + all_evidence[e_id])  # combine Query and Document, like BERT
                label.append(1)
            # rank all evidences by descending similarity and drop the first
            # `skip_top` items with the highest sim-score
            topn_ids = np.argsort(-cos_sims[i])[skip_top:topn + skip_top].tolist()
        else: # we take only topn samples in dev/test sets
            topn_ids = np.argpartition(cos_sims[i], -topn)[-topn:].tolist()
        samples.append(topn_ids)

        # labels & data
        for j in topn_ids:
            data.append('<cls>' + texts[i] + '<sep>' + all_evidence[j])
            if evidences is None: # test data have no gold evidences -> no labels
                continue
            # if current evidence is one of the correct ones, label it true
            label.append(1 if 'evidence-' + str(j) in evidences[i] else 0)

    return samples, data, label
            

In [17]:
topn = 200
train_ns_samples, train_ns_data, train_ns_label = nagetive_sampling(
    train_cos_sims, train_texts, train_evidences, evidences_lst, topn, True)

In [18]:
dev_ns_samples, dev_ns_data, dev_ns_label = nagetive_sampling(
    dev_cos_sims, dev_texts, dev_evidences, evidences_lst, topn, False)

In [19]:
test_ns_samples, test_ns_data, _ = nagetive_sampling(
    test_cos_sims, test_texts, None, evidences_lst, topn, False)

In [20]:
# data inspection

from collections import Counter

# Convert the label lists to arrays and count how many 0 / 1 labels each split has.
train_ns_label = np.array(train_ns_label)
dev_ns_label = np.array(dev_ns_label)

t_ct = Counter(train_ns_label)
d_ct = Counter(dev_ns_label)

# NOTE: the ratios below divide by the number of positive (label 1) samples and
# raise ZeroDivisionError if a split contains none.
print("Train:",t_ct)
print("percentage of label 0:",t_ct[0] / (t_ct[0] + t_ct[1]))
print('training set set label ratio:',t_ct[0] / t_ct[1])
print()
print("Dev:",d_ct)
print("percentage of label 0:",d_ct[0] / (d_ct[0] + d_ct[1]))
print('development set set label ratio:',d_ct[0] / d_ct[1])
print()
# negatives per positive in the training pairs
ratio = t_ct[0] / t_ct[1]
print('ratio:',ratio)

Train: Counter({0: 244630, 1: 5092})
percentage of label 0: 0.9796093255700339
training set set label ratio: 48.04202670856245

Dev: Counter({0: 30555, 1: 245})
percentage of label 0: 0.9920454545454546
development set set label ratio: 124.71428571428571

ratio: 48.04202670856245


## 1.4 Keras transformation

In [21]:
from keras.preprocessing.text import Tokenizer

# The word index is built from the training pairs only.
# NOTE(review): presumably words unseen here are mapped to the "<UNK>" token at
# texts_to_sequences time, and the Tokenizer's default filters strip '<' and '>'
# so the '<cls>' / '<sep>' markers become plain 'cls' / 'sep' tokens -- confirm
# against the Keras Tokenizer documentation.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_ns_data) 

vocab_size = len(tokenizer.word_index) + 1  # padding
print(vocab_size)

2024-05-20 17:18:58.309723: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


38652


In [22]:
#tokenise the input into word sequences

train_seq = tokenizer.texts_to_sequences(train_ns_data)
dev_seq = tokenizer.texts_to_sequences(dev_ns_data)
test_seq = tokenizer.texts_to_sequences(test_ns_data)

In [23]:
print(len(train_seq),len(train_seq[0]),len(dev_seq[0]))

249722 39 11


In [24]:
# padding matrix to the same length

# Longest tokenised sequence in the training pairs; default=0 keeps the result
# of the previous manual loop for an empty split.
max_t = max((len(seq) for seq in train_seq), default=0)
print(max_t)

# Longest tokenised sequence in the development pairs.
max_v = max((len(seq) for seq in dev_seq), default=0)
print(max_v)

# Pad to the longer of the two so no train or dev sequence is cut off.
maxlen = max(max_t, max_v)
print('maxlen:',maxlen)

207
167
maxlen: 207


In [25]:
from keras_preprocessing.sequence import pad_sequences

# Bring every sequence to length maxlen; padding='post' puts the padding after the tokens.
# NOTE(review): maxlen was derived from train and dev only, so longer test
# sequences would presumably be truncated by pad_sequences -- confirm.
train_seq = pad_sequences(train_seq, padding='post', maxlen=maxlen)
dev_seq = pad_sequences(dev_seq, padding='post', maxlen=maxlen)
test_seq = pad_sequences(test_seq, padding='post', maxlen=maxlen)

In [26]:
len(dev_seq[0])

207

In [27]:
len(train_seq)

249722

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [39]:
import tensorflow as tf
from keras.layers import LSTM
import keras
from keras.models import Sequential
from keras import layers

def LSTM_Model():
    """Build, compile and return the evidence-relevance classifier.

    Architecture (in order): Embedding(vocab_size -> 60, input_length=maxlen),
    Dropout(0.1), two stacked bidirectional LSTMs with 100 units per direction
    (the first returns the full sequence), Dropout(0.1), Dense(50, tanh) and a
    single sigmoid output unit.

    The model is compiled with binary cross-entropy and Adam, whose learning
    rate starts at 1e-2 and is multiplied by 0.96 every 1000 steps (staircase
    exponential decay). No metrics are registered.

    Reads the module-level ``vocab_size`` and ``maxlen``; prints the model
    summary as a side effect.
    """
    # TODO: fine-tuning
    embedding_dim = 60
    hidden_dim = 100

    # model
    model = Sequential(name="LSTM_G6")

    # embedding layer
    model.add(layers.Embedding(
        input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))

    ##
    ## experiments here to verify what technology is effective
    ##

    # one direction
    # model.add(layers.Dropout(0.1)) # increase robustness using dropout
    # model.add(LSTM(hidden_dim, return_sequences=True, dropout=0.1)) # double layer
    # model.add(LSTM(hidden_dim, dropout=0.1))                        # single layer - baseline

    # bidirectional
    model.add(layers.Dropout(0.1))
    # first layer returns the whole sequence so the second LSTM can consume it
    model.add(layers.Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.1)))
    model.add(layers.Bidirectional(LSTM(hidden_dim, dropout=0.1)))

    # output layer
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(hidden_dim // 2, activation='tanh'))
    model.add(layers.Dense(1, activation='sigmoid'))  # probability that the pair is relevant
    ## finish the model construction

    # Exponential Decay Learning Rate Scheduler
    initial_learning_rate = 1e-2
    decay_steps = 1000
    decay_rate = 0.96
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate, decay_steps, decay_rate, staircase=True
    )

    # optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    # binary cross entropy loss for binary classification problem
    # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras.metrics.Recall()])
    # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.compile(loss='binary_crossentropy', optimizer='adam')
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    model.summary()
    
    return model



In [40]:
# implement the LSTM Model
model = LSTM_Model()

Model: "LSTM_G6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 207, 60)           2319120   
                                                                 
 dropout_4 (Dropout)         (None, 207, 60)           0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 207, 200)         128800    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 200)               0         
                                                                 
 dense_4 (Dense)             (None, 50)                1005

## 2.1 Imbalanced data handling

In [30]:
def resampling(sampler,train_seq,train_ns_label,dev_seq,dev_ns_label):
    """Resample the training and development sets with the given sampler.

    ``sampler`` is any object exposing ``fit_resample(X, y)``. It is applied
    to the training pair first and then, independently, to the development
    pair. The resulting class counts of both splits and the training
    negative/positive ratio are printed.

    Returns:
        (train_X, train_y, dev_X, dev_y) after resampling.
    """
    train_X, train_y = sampler.fit_resample(train_seq, train_ns_label)

    train_counts = Counter(train_y)
    class_ratio = train_counts[0] / train_counts[1]  # negatives per positive after resampling
    print(train_counts)

    dev_X, dev_y = sampler.fit_resample(dev_seq, dev_ns_label)
    print(Counter(dev_y))
    print('ratio:',class_ratio)

    return train_X, train_y, dev_X, dev_y

### 2.1.1 Oversampling

In [32]:
from imblearn.over_sampling import SMOTE, ADASYN

# Oversampler candidate; fixed seed for reproducibility.
sm = SMOTE(random_state=426)
# usage: resampling(sm, train_seq, train_ns_label, dev_seq, dev_ns_label)

In [33]:
from imblearn.over_sampling import ADASYN

# Oversampler candidate; fixed seed for reproducibility.
adsyn = ADASYN(random_state=426)
# usage: resampling(adsyn, train_seq, train_ns_label, dev_seq, dev_ns_label)

### 2.1.2 Undersampling

In [34]:
from imblearn.under_sampling import RandomUnderSampler

# Undersampler candidate; fixed seed for reproducibility.
rus = RandomUnderSampler(random_state=426)
# usage: resampling(rus, train_seq, train_ns_label, dev_seq, dev_ns_label)

### 2.1.3 Oversampling and Undersampling

In [99]:
from imblearn.combine import SMOTETomek

# Combined over-/undersampler candidate; fixed seed for reproducibility.
# NOTE(review): the variable is called smote_enn but the sampler constructed is
# SMOTETomek -- confirm which combination was intended.
smote_enn = SMOTETomek(random_state=426)
# usage: resampling(smote_enn, train_seq, train_ns_label, dev_seq, dev_ns_label)

## 2.2 Start Training

In [96]:
model = LSTM_Model()

Model: "LSTM_G6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 207, 60)           2319120   
                                                                 
 dropout_20 (Dropout)        (None, 207, 60)           0         
                                                                 
 bidirectional_20 (Bidirecti  (None, 207, 200)         128800    
 onal)                                                           
                                                                 
 bidirectional_21 (Bidirecti  (None, 200)              240800    
 onal)                                                           
                                                                 
 dropout_21 (Dropout)        (None, 200)               0         
                                                                 
 dense_20 (Dense)            (None, 50)                1005

In [37]:
# without resamplinng
X = train_seq
y = train_ns_label
dev_X = dev_seq
dev_y = dev_ns_label
t_ct = Counter(train_ns_label)
ratio = t_ct[0] / t_ct[1]
print('training set set label ratio:',t_ct[0] / t_ct[1])

training set set label ratio: 48.04202670856245


In [104]:
# resampling
sampler = rus
train_seq_res, train_ns_label_res,dev_seq_res, dev_ns_label_res = resampling(
    sampler,train_seq,train_ns_label,dev_seq,dev_ns_label)

Counter({0: 5092, 1: 5092})
Counter({0: 245, 1: 245})
ratio: 1.0


In [None]:
# resampling
X = train_seq_res
y = train_ns_label_res
dev_X = dev_seq_res
dev_y = dev_ns_label_res

In [None]:
# trainning
# ratio = 1 weights both classes equally in class_weight below; it replaces the
# negative/positive ratio computed in earlier cells. Set it back to that ratio
# when training on the imbalanced (non-resampled) data.
ratio = 1

# X / y and dev_X / dev_y are whichever pair (resampled or not) was assigned last.
model.fit(X, y, 
      epochs=15, verbose=True, 
      validation_data=(dev_X, dev_y), 
      batch_size=1000, class_weight={0: 1, 1: ratio})

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Make prediction

In [None]:
# make prediction

# Predict on the full, non-resampled dev_seq / test_seq: get_evidences() slices
# the predictions into consecutive chunks of topn rows per claim, which is the
# layout nagetive_sampling() produces for non-training data.
dev_pred = model.predict(dev_seq, batch_size=1000, verbose=True)
test_pred = model.predict(test_seq, batch_size=1000, verbose=True)

In [None]:
print(dev_pred[:4], dev_pred.shape, test_pred.shape, len(dev_texts))

## Select evidences

In [None]:
def get_evidences(pred, ns_samples, select_k, topn):
    """Pick the best-scoring candidate evidences for every claim.

    Parameters:
        pred (array-like): Model scores with one value per (claim, candidate) pair,
            laid out as consecutive chunks of `topn` rows per claim. Shapes (N,) and
            (N, 1) are both accepted.
        ns_samples (2D-list): ns_samples[c][j] is the evidence index of the j-th
            candidate of claim c (same order as the rows of `pred`).
        select_k (int): Number of evidences to keep per claim. Clamped to the chunk
            size, so select_k >= topn keeps every candidate instead of raising.
        topn (int): Number of candidates per claim.

    Returns:
        A list with one entry per claim: the selected evidence ids
        ('evidence-<index>'), ordered from highest to lowest score.

    Raises:
        ValueError: If topn is not positive.
    """
    if topn <= 0:
        raise ValueError("topn must be a positive integer")

    # flatten (N, 1) predictions to (N,) without the pitfalls of squeeze() on 1-row chunks
    scores = np.asarray(pred).reshape(-1)

    pred_evidences = []
    for claim_idx, start_idx in enumerate(range(0, len(scores), topn)):
        cur_pred = scores[start_idx:start_idx + topn]
        keep = max(0, min(select_k, len(cur_pred)))
        # a stable sort of the (small) chunk gives a deterministic, score-ordered top-k
        cur_top_ids = np.argsort(-cur_pred, kind='stable')[:keep].tolist()
        candidates = ns_samples[claim_idx]
        pred_evidences.append(['evidence-' + str(candidates[i]) for i in cur_top_ids])
    return pred_evidences


In [None]:
# select evidences

select_evidence_k = 6
test_evidences_ids = get_evidences(
    test_pred, test_ns_samples, select_evidence_k, topn)

dev_evidences_ids = get_evidences(
    dev_pred, dev_ns_samples, select_evidence_k, topn)

print(dev_evidences_ids[:2])

## calculate Evidence Retrieval F-score

In [None]:
# calculate Evidence Retrieval F-score

# Per-claim F-score of the predicted evidences against the gold dev evidences.
f = []
for idx, evidence_ids in enumerate(dev_evidences_ids):

    #check retrieved evidences
    evidence_correct = 0
    evidence_recall = 0.0
    evidence_precision = 0.0
    evidence_fscore = 0.0
    
    # count predicted ids that appear among the gold evidences of this claim
    for cur_id in evidence_ids:
        if cur_id in dev_evidences[idx]:
            evidence_correct += 1
    
    # with no correct prediction the F-score stays 0.0 (also avoids 0/0 below)
    if evidence_correct > 0:
        evidence_recall = evidence_correct / len(dev_evidences[idx])
        evidence_precision = evidence_correct / len(evidence_ids)
        evidence_fscore = (2*evidence_precision*evidence_recall)/(evidence_precision+evidence_recall)
    f.append(evidence_fscore)

#compute aggregate performance
# mean over claims; falls back to 0.0 when there are no claims at all
mean_f = np.mean(f if len(f) > 0 else [0.0])
print("Evidence Retrieval F-score (F)    =", mean_f)

## Save data

In [None]:
file_sufix = "_smote_enn0.0380"

In [92]:
pred_dev_evd_file = "pred_dev_evd_retrieval_bidirectM"+file_sufix+".json"
pred_test_evd_file = "pred_test_evd_retrieval_bidirectM"+file_sufix+".json"
print(pred_dev_evd_file)

pred_dev_evd_retrieval_bidirectM_adsyn0.0380.json


In [93]:
# save evidences selected

pred_dev_claims = {}
pred_test_claims = {}

for idx, evidence_ids in enumerate(dev_evidences_ids):
    # shallow-copy the claim record so the gold 'evidences' stored in dev_data
    # are not overwritten by the predictions
    cur_data = dict(dev_data[dev_ids[idx]])
    cur_data['evidences'] = evidence_ids
    pred_dev_claims[dev_ids[idx]] = cur_data

for idx, evidence_ids in enumerate(test_evidences_ids):
    # same copy for the test claims, leaving test_data untouched
    cur_data = dict(test_data[test_ids[idx]])
    cur_data['evidences'] = evidence_ids
    pred_test_claims[test_ids[idx]] = cur_data

## save prediction data
# `with` closes (and therefore flushes) each file before the cell finishes
with open(pred_dev_evd_file, "w", encoding="utf-8") as out_file:
    json.dump(pred_dev_claims, out_file)
with open(pred_test_evd_file, "w", encoding="utf-8") as out_file:
    json.dump(pred_test_claims, out_file)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*