# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement your program in an object-oriented style, please put that code at the bottom of this ipynb file.*

**We use keras, tensorflow, nltk, scikit-learn in this project.**

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## PreProcess for evidence and claims

### read files

In [1]:

import json

def load_json(path):
    """Read the JSON document stored at `path` and return the parsed object.

    The file is opened inside a `with` block so the handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 regardless of the
    platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Classification inputs for the dev and test splits.
dev_cls_data = load_json("dev_cls_data.json")
test_cls_data = load_json("test_cls_data.json")

# Training claims plus the intermediate files stored under temp_data/.
# train_texts and train_evidences are indexed by position, in the same order
# as train_ids (see train_ids_dict below).
train_claims = load_json("data/train-claims.json")
train_texts = load_json("temp_data/train_texts.json")
train_ids = load_json("temp_data/train_ids.json")
train_evidences = load_json("temp_data/train_evidences.json")

# Unlabelled test claims and the evidence corpus with its derived lookups.
test_claims = load_json("data/test-claims-unlabelled.json")
evidences = load_json("data/evidence.json")
evidences_ids = load_json("temp_data/evidences_ids.json")
evidences_id_dict = load_json("temp_data/evidences_id_dict.json")
evidences_texts = load_json("temp_data/evidences_texts.json")


# Position of every training claim id within train_ids.
train_ids_dict = {claim_id: idx for idx, claim_id in enumerate(train_ids)}

In [2]:
# combine claim-text with evidence-text
import random

select_evidence_k = 6 # the same as retrieval

# Dedicated, seeded generator: the random padding below is reproducible on a
# fresh kernel and does not disturb the global `random` state.
SEED = 42
rng = random.Random(SEED)

train_cls_data = []

for train_id in train_ids:
    row = train_ids_dict[train_id]

    # Copy the gold evidence list. Appending to the original would silently
    # add random negatives to train_evidences itself.
    select_evidence_ids = list(train_evidences[row])

    # Pad with distinct, randomly drawn evidence indices until k are selected.
    while len(select_evidence_ids) < select_evidence_k:
        evidence_id = rng.randrange(len(evidences_texts))
        if evidence_id not in select_evidence_ids:
            select_evidence_ids.append(evidence_id)

    # Input text layout: "<cls>" + claim, then each evidence text, joined by "<sep>".
    cur_data = {"label": train_claims[train_id]["claim_label"]}
    cur_data['text'] = "<sep>".join(
        ["<cls>" + train_texts[row]] + [evidences_texts[i] for i in select_evidence_ids]
    )
    train_cls_data.append(cur_data)


In [3]:
# construct training data

def prepare_data(cls_data):
    """Split classification records into parallel text and label lists.

    Args:
        cls_data: iterable of dict-like records. Every record must provide a
            'text' entry; a 'label' entry is optional.

    Returns:
        A tuple (x_data, y_data): the 'text' of every record in input order,
        and the 'label' of each record that has one (an empty list when no
        record is labelled).
    """
    records = list(cls_data)
    x_data = [record['text'] for record in records]
    y_data = [record['label'] for record in records if 'label' in record]
    return x_data, y_data

# Split each dataset into input texts and (string) labels.
# The label list returned for the test set is discarded; presumably the test
# records carry no 'label' key, in which case it is empty anyway.
train_x, train_y = prepare_data(train_cls_data)
dev_x, dev_y = prepare_data(dev_cls_data)
test_x, _ = prepare_data(test_cls_data)


In [4]:
# data inspection

from collections import Counter
# Number of claims per label string, first for the training set and then for
# the dev set, to check how imbalanced the classes are.
print(Counter(train_y))
print(Counter(dev_y))

Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})
Counter({'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18})


In [5]:
# transform class strings to class numbers

import numpy as np
def one_hot(indices, num_classes):
    """Return an integer array of shape (len(indices), num_classes).

    Row r contains a single 1 in column indices[r] and 0 everywhere else.
    """
    indices = np.asarray(indices, dtype=int)
    encoded = np.zeros((len(indices), num_classes), dtype=int)
    encoded[np.arange(len(indices)), indices] = 1
    return encoded


# Assign every label string an integer id in order of first appearance in the
# training labels.
label_dict = {}
for label in train_y:
    label_dict.setdefault(label, len(label_dict))

# A dev label that never occurs in the training data raises KeyError here.
new_train_y = [label_dict[label] for label in train_y]
new_dev_y = [label_dict[label] for label in dev_y]
print(Counter(new_train_y))
print(Counter(new_dev_y))

# Inverse mapping: idx2label[class_id] is the label string.
idx2label_ = sorted(label_dict.items(), key=lambda x: x[1])
idx2label = [label for label, _ in idx2label_]

# NOTE: train_y / dev_y are rebound from label strings to one-hot arrays, so
# this cell must not be re-run without first re-running the cell that builds
# them with prepare_data.
train_y = one_hot(new_train_y, len(idx2label))
dev_y = one_hot(new_dev_y, len(idx2label))

Counter({2: 519, 3: 386, 1: 199, 0: 124})
Counter({2: 68, 3: 41, 1: 27, 0: 18})


In [6]:
# same as in evidence retrieval
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training texts only; "<UNK>" is registered as
# the out-of-vocabulary token for words that are not in that vocabulary.
# NOTE(review): Tokenizer's default `filters` include '<' and '>', so the
# "<cls>" / "<sep>" markers are presumably tokenised as the plain words
# "cls" / "sep" -- confirm this is intended.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_x)

vocab_size = len(tokenizer.word_index) + 1  # 0 is padding token
print(vocab_size)
    

2024-05-13 18:56:11.302796: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


17970


In [7]:
# same as in evidence retrieval
# convert every split's texts into lists of word indices using the vocabulary
# fitted on the training texts
xseq_train, xseq_dev, xseq_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_x, dev_x, test_x)
)

In [8]:
# inspect the longest tokenised sequence in the training and dev sets
# (used to pick the padding length for aligning the data dimensions)

max_i = max(map(len, xseq_train), default=0)
print(max_i)

max_i = max(map(len, xseq_dev), default=0)
print(max_i)

488
146


In [12]:
# same as in evidence retrieval
# transform texts to sequences

from keras_preprocessing.sequence import pad_sequences
# from keras.preprocessing.sequence import pad_sequences

maxlen = 350

# Every text starts with "<cls>" + claim, followed by the evidence passages.
# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# would cut the claim itself off any sequence longer than maxlen (the longest
# training sequence is 488 tokens). Truncate at the end instead so only
# trailing evidence tokens are dropped.
pad_kwargs = dict(padding='post', truncating='post', maxlen=maxlen)
xseq_train = pad_sequences(xseq_train, **pad_kwargs)
xseq_dev = pad_sequences(xseq_dev, **pad_kwargs)
xseq_test = pad_sequences(xseq_test, **pad_kwargs)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [13]:
from keras.layers import LSTM
import tensorflow as tf

import keras
from keras.models import Sequential
from keras import layers

# TODO: fine-tuning
embedding_dim = 60
hidden_dim = 100
num_classes = len(idx2label)  # one output unit per claim label

# model
model = Sequential(name="LSTM_G6")

# embedding layer: word indices (0 = padding) -> dense vectors
model.add(layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))

# two stacked bidirectional LSTMs; the first returns the full sequence so the
# second can consume it, the second returns only its final state
model.add(layers.Dropout(0.1))
model.add(layers.Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.1)))
model.add(layers.Bidirectional(LSTM(hidden_dim, dropout=0.1)))

# output layer: softmax over the claim labels
model.add(layers.Dropout(0.1))
model.add(layers.Dense(hidden_dim // 2, activation='tanh'))
model.add(layers.Dense(num_classes, activation='softmax'))

# Multi-class classification with one-hot targets, so the matching loss is
# categorical cross-entropy. The learning rate starts at `learning_rate` and
# follows a cosine decay over `decay_steps` optimizer steps.
decay_steps = 35
learning_rate = 1e-2
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    learning_rate, decay_steps
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"])

model.summary()

Model: "LSTM_G6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 350, 60)           1078200   
                                                                 
 dropout_4 (Dropout)         (None, 350, 60)           0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 350, 200)         128800    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 200)               0         
                                                                 
 dense_4 (Dense)             (None, 50)                1005

### Training

In [None]:
# Up-weight the two minority classes. The weights are keyed by label name and
# translated through label_dict, because the integer class ids depend on the
# order in which labels first appear in the training data.
label_weights = {"DISPUTED": 2, "REFUTES": 2, "SUPPORTS": 1, "NOT_ENOUGH_INFO": 1}
class_weight = {label_dict[label]: weight for label, weight in label_weights.items()}

model.fit(xseq_train, train_y, epochs=10, verbose=True, validation_data=(xseq_dev, dev_y), batch_size=500, class_weight=class_weight)

Epoch 1/10


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# Run the trained model on the padded dev and test sequences.
# NOTE: despite the "logits" names, the model's final layer is a softmax, so
# these arrays hold per-class scores after softmax, one row per input.
dev_logits = model.predict(xseq_dev, batch_size=200, verbose=True)
test_logits = model.predict(xseq_test, batch_size=200, verbose=True)

NameError: name 'model' is not defined

In [2]:
# For every example, take the index of the highest-scoring class along the
# last axis; these indices are later mapped back to label strings.
dev_classes = dev_logits.argmax(axis=-1)
test_classes = test_logits.argmax(axis=-1)

NameError: name 'dev_logits' is not defined

In [3]:
# Number of dev examples assigned to each predicted class index.
print(Counter(dev_classes))

NameError: name 'Counter' is not defined

In [4]:
# Load the retrieval-stage predictions; `with` closes each file once parsed.
with open("pred_dev_claims_retrieval.json", "r", encoding="utf-8") as f:
    pred_dev_claims = json.load(f)
with open("pred_test_claims_retrieval.json", "r", encoding="utf-8") as f:
    pred_test_claims = json.load(f)

# zip() stops at the shorter input, so make sure no record is left without a
# prediction.
assert len(dev_cls_data) == len(dev_classes), "dev records / predictions length mismatch"
assert len(test_cls_data) == len(test_classes), "test records / predictions length mismatch"

# Write the predicted label string into each claim's entry, keyed by claim id.
for record, class_idx in zip(dev_cls_data, dev_classes):
    pred_dev_claims[record['claim_id']]['claim_label'] = idx2label[class_idx]

for record, class_idx in zip(test_cls_data, test_classes):
    pred_test_claims[record['claim_id']]['claim_label'] = idx2label[class_idx]
    

NameError: name 'json' is not defined

In [5]:
## save cls data
# Use `with` so each file is flushed and closed as soon as it is written,
# instead of relying on garbage collection of the anonymous file handle.
with open("pred_dev_claims.json", "w", encoding="utf-8") as f:
    json.dump(pred_dev_claims, f)
with open("pred_test_claims.json", "w", encoding="utf-8") as f:
    json.dump(pred_test_claims, f)

NameError: name 'json' is not defined

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*

In [14]:
# python eval.py --predictions pred_dev_claims.json --groundtruth data/dev-claims.json