# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement the program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# Environment switch: True reads the data files from the local ../data
# directory, False mounts Google Drive and reads them from there.
LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    # Mount Drive so the /content/drive/MyDrive/data/... paths used by the
    # data-loading cells resolve.
    from google.colab import drive
    drive.mount('/content/drive')

In [2]:
pip install contractions

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
#Imports
import numpy as np
import torch
import pandas as pd

In [4]:
# Load the labelled training claims; the location depends on the environment.
train_path = (
    "../data/train-claims.json"  # local dev
    if LOCAL_DEV
    else "/content/drive/MyDrive/data/train-claims.json"  # colab
)
# read_json yields one column per claim id, so transpose to get one row per
# claim (columns: claim_text, claim_label, evidences).
train = pd.read_json(train_path).transpose()
train.head()


Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [6]:
# Load the unlabelled test claims; the location depends on the environment.
test_path = (
    "../data/test-claims-unlabelled.json"  # local dev
    if LOCAL_DEV
    else "/content/drive/MyDrive/data/test-claims-unlabelled.json"  # colab
)
# One column per claim id after reading, so transpose to one row per claim.
test = pd.read_json(test_path).transpose()
test.head()

Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...
claim-1020,“The global reef crisis does not necessarily m...
claim-2599,Small amounts of very active substances can ca...


In [5]:
# Load the evidence corpus as a Series (evidence id -> passage text).
evidence_path = (
    "../data/evidence.json"
    if LOCAL_DEV
    else "/content/drive/MyDrive/data/evidence.json"
)
evidence = pd.read_json(evidence_path, typ='series')

In [7]:
# Report the corpus size and preview the first few evidence passages.
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [8]:
import string
import contractions
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by preprocess_data: the English stopword
# list, the 'punkt' tokenizer data (presumably what word_tokenize relies on)
# and the WordNet data (presumably what WordNetLemmatizer relies on).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danielsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielsu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
def preprocess_data(data: pd.Series) -> pd.Series:
    """Normalise raw text into space-joined, lemmatised content words.

    Each text is lower-cased, has its contractions expanded, is tokenised,
    stripped of non-alphabetic tokens and of English stopwords, and is
    finally lemmatised with WordNet. The word 'not' is deliberately kept
    (presumably to preserve negation in claims).

    Args:
        data: Series mapping an id to its raw text.

    Returns:
        Series with the same ids, in the same order, holding the cleaned
        text. A text whose tokens are all filtered out becomes ''.
    """
    stop_words = set(stopwords.words('english'))
    # discard() instead of remove(): no KeyError if the installed stopword
    # list happens not to contain 'not'.
    stop_words.discard('not')
    # Loop-invariant, so build the lemmatizer once rather than once per text.
    lemmatizer = WordNetLemmatizer()

    preprocessed_data = {}
    for text_id, text in data.items():
        tokens = word_tokenize(contractions.fix(text.lower()))
        preprocessed_data[text_id] = " ".join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in stop_words
        )

    return pd.Series(preprocessed_data)

# Clean the full evidence corpus (1,208,827 passages per the count printed
# when the corpus was loaded).
processed_evidence = preprocess_data(evidence)

# Clean the unlabelled test claims with the same pipeline.
test_claims = test['claim_text']
processed_test = preprocess_data(test_claims)
processed_test.head()

claim-2967               contribution waste heat global climate
claim-979     warm weather worsened recent drought included ...
claim-1609                greenland lost tiny fraction ice mass
claim-1020    global reef crisis not necessarily mean extinc...
claim-2599     small amount active substance cause large effect
dtype: object

In [10]:
# Drop evidence whose cleaned text is empty or whitespace-only.
processed_evidence = processed_evidence[processed_evidence.str.strip().str.len() > 0]

In [66]:
# Separator inserted between the claim text and the evidence text of a pair.
SPECIAL_TOKEN = ' <SPE_TOKEN> '


def prepareTrainData(n):
    """Build labelled claim/evidence pairs for the relevance classifier.

    For every training claim, each gold evidence passage that is still
    present in ``processed_evidence`` yields a 'related' pair, and ``n``
    passages sampled at random from the remaining corpus yield 'unrelated'
    pairs. Reads the module-level ``train`` and ``processed_evidence``.

    Args:
        n: Number of unrelated passages sampled per claim.

    Returns:
        DataFrame with columns 'text' (claim + SPECIAL_TOKEN + evidence) and
        'label' ('related' or 'unrelated'), grouped by claim in ``train``
        order.

    Raises:
        ValueError: Propagated from ``Series.sample`` when ``n`` exceeds the
            number of non-gold passages.
    """
    # Preprocess the raw claims exactly once so training claims go through
    # the same single pass as the test claims.
    processed_train_claim = preprocess_data(train['claim_text'])
    text_lst = []
    label_lst = []
    # Iterate values directly: integer-key lookups on a string-labelled
    # Series rely on a deprecated positional fallback.
    for train_claim, evidences in zip(processed_train_claim, train['evidences']):
        for evidence_id in evidences:
            # Gold evidence may have been dropped as empty after cleaning.
            if evidence_id in processed_evidence.index:
                text_lst.append(train_claim + SPECIAL_TOKEN + processed_evidence[evidence_id])
                label_lst.append('related')
        # Negatives are drawn from everything that is not gold for this claim.
        # NOTE: sampling is unseeded, so the negatives differ between runs.
        filtered_evi = processed_evidence[~processed_evidence.index.isin(evidences)]
        random_evidence = filtered_evi.sample(n)
        for negative_text in random_evidence:
            text_lst.append(train_claim + SPECIAL_TOKEN + negative_text)
            label_lst.append('unrelated')
    return pd.DataFrame({'text': text_lst, 'label': label_lst})

# Cleaned training claims; also reused when fitting the TF-IDF vectorizer.
train_claims = train['claim_text']
processed_train_claim = preprocess_data(train_claims)
# Build the classifier training set with 10 random negatives per claim.
preparedTrain = prepareTrainData(10)
preparedTrain.head()

  train_claim = processed_train_claim[i]


Unnamed: 0,text,label
0,not scientific evidence pollutant higher conce...,related
1,not scientific evidence pollutant higher conce...,related
2,not scientific evidence pollutant higher conce...,related
3,not scientific evidence pollutant higher conce...,unrelated
4,not scientific evidence pollutant higher conce...,unrelated


# Two steps for this task
# First: find all relevant evidence, using either contextual embeddings or similarity scoring.
# Second: classify the evidence into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torchtext
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms


In [79]:
# Learn the TF-IDF vocabulary and weights from the evidence corpus plus the
# training claims; the test claims are only transformed, never fitted on.
all_texts = pd.concat([processed_evidence, processed_train_claim])
vectorizer = TfidfVectorizer().fit(all_texts)

# Project test claims and evidence into the same TF-IDF space.
evidence_tfidf = vectorizer.transform(processed_evidence)
test_tfidf = vectorizer.transform(processed_test)

In [78]:
# Pairwise cosine similarity: one row per test claim, one column per
# evidence passage (same order as processed_test / processed_evidence).
similarity_matrix = cosine_similarity(test_tfidf, evidence_tfidf)

def getTopN(similarity_matrix, test, evidence, n):
    """Attach the ids of the ``n`` most similar evidence passages to each claim.

    Args:
        similarity_matrix: 2-D array of scores, rows aligned with ``test``
            and columns aligned with ``evidence``.
        test: Series of claim texts.
        evidence: Series whose index supplies the evidence ids.
        n: Number of evidence ids to keep per claim.

    Returns:
        DataFrame indexed like ``test`` with columns 'claim_text' and
        'evidences' (list of id strings, most similar first).
    """
    # Sorting the negated scores ascending ranks columns by descending score.
    ranked_columns = np.argsort(-similarity_matrix, axis=1)
    evidence_ids = evidence.index
    result = test.to_frame(name='claim_text')
    result['evidences'] = [
        [str(evidence_ids[col]) for col in ranking[:n]]
        for ranking in ranked_columns
    ]
    return result

# Keep the 10 highest-scoring evidence ids for every test claim.
test_with_evi = getTopN(similarity_matrix, processed_test, processed_evidence, 10)
test_with_evi.head()

Unnamed: 0,claim_text,evidences
claim-2967,contribution waste heat global climate,"[evidence-308923, evidence-213569, evidence-63..."
claim-979,warm weather worsened recent drought included ...,"[evidence-178433, evidence-421870, evidence-43..."
claim-1609,greenland lost tiny fraction ice mass,"[evidence-962481, evidence-1200633, evidence-7..."
claim-1020,global reef crisis not necessarily mean extinc...,"[evidence-642301, evidence-161852, evidence-67..."
claim-2599,small amount active substance cause large effect,"[evidence-834109, evidence-1175545, evidence-8..."


In [89]:
def prepareTestData():
    """Pair every test claim with each of its retrieved evidence passages.

    Reads the module-level ``test_with_evi`` frame (columns 'claim_text' and
    'evidences') and looks the evidence text up in ``processed_evidence``.

    Returns:
        DataFrame with a single 'text' column (claim + SPECIAL_TOKEN +
        evidence), one row per (claim, evidence id) pair, in claim order and
        then in the order of each claim's 'evidences' list.
    """
    text_lst = []
    # Iterate the column values directly: integer-key lookups on these
    # string-labelled Series rely on a deprecated positional fallback.
    for test_claim, evidences in zip(test_with_evi['claim_text'], test_with_evi['evidences']):
        for evidence_id in evidences:
            text_lst.append(test_claim + SPECIAL_TOKEN + processed_evidence[evidence_id])
    return pd.DataFrame({'text': text_lst})

# Build the claim/evidence pairs the classifier will score at test time.
preparedTest = prepareTestData()
preparedTest.head()

  test_claim = tfidf_claim[i]
  evidences = tfidf_evi[i]


Unnamed: 0,text
0,contribution waste heat global climate <SPE_TO...
1,contribution waste heat global climate <SPE_TO...
2,contribution waste heat global climate <SPE_TO...
3,contribution waste heat global climate <SPE_TO...
4,contribution waste heat global climate <SPE_TO...


In [72]:
# torchtext's built-in "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the token list of each text in ``data_iter``."""
    yield from map(tokenizer, data_iter)

# Vocabulary over the training pairs; unseen tokens fall back to <unk>.
vocab = build_vocab_from_iterator(yield_tokens(preparedTrain['text']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenise ``x`` and map every token to its vocabulary index."""
    return vocab(tokenizer(x))


# LabelToIndex is documented to take a list of label names and to number them
# by list position, so the intended mapping (related -> 0, unrelated -> 1) is
# stated through the order instead of through dict values.
label_transform = torchtext.transforms.LabelToIndex(['related', 'unrelated'])

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate (text, label) samples into flat tensors plus start offsets.

    Args:
        batch: Iterable of (text, label) pairs.

    Returns:
        Tuple of (token ids of all texts concatenated into one 1-D tensor,
        label indices, start position of each text inside the flat tensor),
        all moved to ``device``.
    """
    encoded_texts = []
    label_ids = []
    for raw_text, raw_label in batch:
        label_ids.append(label_transform(raw_label))
        encoded_texts.append(torch.tensor(text_pipeline(raw_text), dtype=torch.int64))

    # The leading 0 is the start of the first text; the last length is dropped
    # so the cumulative sum yields start positions rather than end positions.
    lengths = [0] + [encoded.size(0) for encoded in encoded_texts]
    start_offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)

    labels = torch.tensor(label_ids, dtype=torch.int64)
    flat_tokens = torch.cat(encoded_texts)
    return flat_tokens.to(device), labels.to(device), start_offsets.to(device)

class TextDataset(Dataset):
    """Map-style dataset over a DataFrame with 'text' and 'label' columns."""

    def __init__(self, dataframe):
        # Kept by reference; rows are read lazily in __getitem__.
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the (text, label) pair stored at positional row ``idx``."""
        record = self.dataframe.iloc[idx]
        return record['text'], record['label']

# Wrap the training pairs and batch them 20 at a time through collate_batch.
dataset = TextDataset(preparedTrain)
# NOTE(review): shuffle=False keeps the rows in preparedTrain order (grouped
# by claim) — confirm this is intended if the loader is used for training.
dataloader = DataLoader(dataset, batch_size=20, shuffle=False, collate_fn=collate_batch)

In [91]:
# Sanity check: print every collated training batch.
for texts, labels, offsets in dataloader:
    print(texts, labels, offsets)

tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1]) tensor([    7,    78,    58,   612,   142,   143,   371,   568,   620,   593,
          156,  1296,   182,     1,    76,   143,    25,    71,   143,   482,
           12,    28,  2842,  1296,   182,  2035,   143,   525,   142,   132,
         1437,  1361,  6378, 12982,  5416,  5314,    22,     7,    78,    58,
          612,   142,   143,   371,   568,   620,   593,   156,  1296,   182,
            1,   156,   689,    51,   125,   530,   143,   525,   228,   419,
         4703,   367,   460,  5504,     8,     3,  6297,  5332,     7,    78,
           58,   612,   142,   143,   371,   568,   620,   593,   156,  1296,
          182,     1,   142,    12,    28,   143, 18314,   543,   156,   438,
         2071,    32,     7,    78,    58,   612,   142,   143,   371,   568,
          620,   593,   156,  1296,   182,     1,   232, 15007,  2490,  1007,
          411,  4151,   155,  4433,  5905,   558,    64,    29,     7,   

In [90]:
class TestDataset(Dataset):
    """Map-style dataset over a DataFrame with a 'text' column (no labels)."""

    def __init__(self, dataframe):
        # Kept by reference; rows are read lazily in __getitem__.
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the text stored at positional row ``idx``."""
        return self.dataframe.iloc[idx]['text']

def collate_test_batch(batch):
    """Collate unlabelled texts into one flat token tensor plus start offsets.

    Args:
        batch: Iterable of text strings.

    Returns:
        Tuple of (token ids of all texts concatenated into one 1-D tensor,
        start position of each text inside that tensor), both moved to
        ``device``.
    """
    encoded_texts = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        for raw_text in batch
    ]
    # The leading 0 is the start of the first text; the last length is dropped
    # so the cumulative sum yields start positions rather than end positions.
    lengths = [0] + [encoded.size(0) for encoded in encoded_texts]
    start_offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_tokens = torch.cat(encoded_texts)
    return flat_tokens.to(device), start_offsets.to(device)

# Wrap the test pairs; shuffle=False keeps batches in preparedTest row order.
test_dataset = TestDataset(preparedTest)
test_dataloader = DataLoader(test_dataset, batch_size=20, shuffle=False, collate_fn=collate_test_batch)

In [93]:
# Sanity check: print every collated test batch.
for texts, offsets in test_dataloader:
    print(texts, offsets)

tensor([ 1027,  1672,    49,     4,     3,     1,     4,   506,  1672,    49,
         1027,  1672,    49,     4,     3,     1,   564,  1672,    49,  1648,
           43,    26,   356,  1823,  1612,  2063,  1672,    49,  1803,    95,
         1027,  1672,    49,     4,     3,     1,  2228,  1677,     0,  2421,
          573,   540,    49,   177,  1672,    49,    45,    49,  1027,  1672,
           49,     4,     3,     1, 12827,  1672,   204,  1672,   794,     7,
          322,   736,  2066,  1291,  8153,  1672,  1067,  1134,  1536,  1672,
         1493,  1672,  1027,  1672,    49,     4,     3,     1,   153,    50,
           60,    54,  4062,  2188, 12061,  1672,    49,   461,   291, 24286,
          579,   101,   695,    49,  1672,    16,  1027,  1672,    49,     4,
            3,     1,    57,    60,    83,   545,  3660,   279,     3,     8,
           24,   123,  1672,    49,  2618,  1027,  1672,    49,     4,     3,
            1,  2004,  1672,    49,  1648,     0,   190,  3876, 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*