# **2025 COMP90042 Project**

# **Readme**
*If there is something to be noted for the marker, please mention here.*
- All original data files, including the `evidence.json` file, should be saved into `./data`. Processed data will be saved to `./data/preprocessed`. You do not need to create this directory yourself; the notebook creates it if it does not exist.

# **1. DataSet Processing**

In [None]:
import os
import pandas as pd
import json
import re
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# English stop-word set consulted by remove_stopwords().
stop_words = set(stopwords.words("english"))
path = "data/"


def load_json(filename, base_dir=path):
    """Parse one JSON file from the raw-data directory.

    The file is opened explicitly as UTF-8 so that loading does not depend
    on the platform's default text encoding.

    Args:
        filename: Name of the file inside ``base_dir``.
        base_dir: Directory prefix (with trailing slash). The default is
            captured when this function is defined, so a later reassignment
            of ``path`` does not redirect the loader.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(base_dir + filename, "r", encoding="utf-8") as f:
        return json.load(f)


# Load JSON files
train_claims = load_json("train-claims.json")
dev_claims = load_json("dev-claims.json")
dev_baseline_claims = load_json("dev-claims-baseline.json")
test_claims = load_json("test-claims-unlabelled.json")
evidence = load_json("evidence.json")

In [None]:
# Convert to DataFrames.
# The evidence mapping becomes a two-column frame: one row per (key, value) item.
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

# The claim mappings are transposed after construction so that each top-level
# key of the loaded JSON becomes a row label instead of a column.
# Each frame is built exactly once (the earlier untransposed copies were
# overwritten immediately and have been dropped).
train_df = pd.DataFrame(train_claims).transpose()
dev_df = pd.DataFrame(dev_claims).transpose()
dev_baseline_df = pd.DataFrame(dev_baseline_claims).transpose()
test_df = pd.DataFrame(test_claims).transpose()

#### **Step 1. Text Normalisation**

In [None]:
# Text normalisation: lower-case, then keep only a-z, 0-9 and whitespace.
def normalize_text(text):
    """Return ``text`` lower-cased with all other characters deleted.

    Characters outside ``a-z``, ``0-9`` and whitespace are removed outright
    (not replaced by a space), so ``"co2-free"`` becomes ``"co2free"``.
    """
    lowered = text.lower()
    return re.sub(r"[^a-z0-9\s]", "", lowered)

#### **Step 2. Stopword Removal**

In [None]:
# Stop-word filtering against the module-level ``stop_words`` set.
def remove_stopwords(text):
    """Return ``text`` without the words found in ``stop_words``.

    The text is split on whitespace, each word is compared in lower-case
    against ``stop_words``, and the surviving words (in their original
    casing) are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

#### **Step 3. Lemmatization**

In [None]:
# Single shared lemmatizer instance reused for every call.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the results are re-joined with single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

#### **Step 4. Tokenisation**

In [None]:
def tokenize_text(text):
    """Split ``text`` into a list of tokens on runs of whitespace.

    ``str.split()`` without an argument is used instead of ``split(" ")`` so
    that an empty (or whitespace-only) string yields ``[]`` rather than
    ``[""]``, and repeated or leading/trailing whitespace never produces
    empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    return text.split()

#### **Apply Preprocessing Steps**

In [None]:
# Each entry: (frame, (display label left-padded to 20 characters, text column)).
# Single quotes are used inside the f-strings because reusing the enclosing
# quote character in a replacement field is a SyntaxError before Python 3.12.
datasets = [
    (train_df, (f"{'train-claims':<20}", "claim_text")),
    (dev_df, (f"{'dev-claims':<20}", "claim_text")),
    (dev_baseline_df, (f"{'dev-claims-baseline':<20}", "claim_text")),
    (test_df, (f"{'test-claims':<20}", "claim_text")),
    (evidence_df, (f"{'evidence':<20}", "value")),
]
steps = [normalize_text, remove_stopwords, lemmatize_text, tokenize_text]
cells_per_step = 4  # progress-bar characters drawn per completed step
progress_bar = len(steps) * cells_per_step

# Processing steps for all datasets.
# NOTE: every step overwrites the column in place, so this cell must be run
# exactly once after the frames are built. On a second run the column already
# holds the output of the last step, not the raw strings the first step expects.
for ds, (df, (dataset, col)) in enumerate(datasets, start=1):
    prefix = f"[{ds}/{len(datasets)}] {dataset}: "
    # "\r" returns the cursor to the line start so the bar is redrawn in place.
    print(f"{prefix}{progress_bar * '▒'}", end="\r")
    for done, step in enumerate(steps, start=1):
        df[col] = df[col].apply(step)
        filled = done * cells_per_step
        print(f"{prefix}{filled * '█'}{(progress_bar - filled) * '▒'}", end="\r")
    print(f"{prefix}{progress_bar * '█'} Done!")


[1/5] train-claims        : ████████████████ Done!
[2/5] dev-claims          : ████████████████ Done!
[3/5] dev-claims-baseline : ████████████████ Done!
[4/5] test-claims         : ████████████████ Done!
[5/5] evidence            : ████████████████ Done!


#### **Saving Processed Data**

In [None]:
# Write every preprocessed frame to data/preprocessed/ as JSON.
path = "data/preprocessed/"
if not os.path.exists(path):
    # Create the output directory on first run and report it.
    os.makedirs(path)
    print("Created directory ./data/preprocessed/")

# (frame, output file name, JSON orientation passed to DataFrame.to_json)
outputs = [
    (train_df, "preprocessed_train.json", "index"),
    (dev_df, "preprocessed_dev.json", "index"),
    (dev_baseline_df, "preprocessed_dev_baseline.json", "index"),
    (test_df, "preprocessed_test.json", "index"),
    (evidence_df, "preprocessed_evidence.json", "records"),
]
for frame, filename, orientation in outputs:
    frame.to_json(path + filename, orient=orientation)

print("[FINISHED] Saved preprocessed data to JSON files...")

Created directory ./data/preprocessed/
[FINISHED] Saved preprocessed data to JSON files...


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*