# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [52]:
LOCAL_DEV = True # to switch between developing locally and on colab

if not LOCAL_DEV:
    # TODO: need to upload data files on Google Drive?
    from google.colab import drive
    drive.mount('/content/drive')

In [53]:
#Imports
import numpy as np
import torch
import pandas as pd

In [54]:
#visualising training data
if LOCAL_DEV:
    train = pd.read_json("../data/train-claims.json") # for local dev
    
else:
    train = pd.read_json("/content/drive/MyDrive/data/train-claims.json") # on colab
train = train.transpose()
train.head()


Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [55]:
#visualising evidence data
if LOCAL_DEV:
    evidence = pd.read_json("../data/evidence.json",typ='series')
else:
    evidence = pd.read_json("/content/drive/MyDrive/data/evidence.json",typ='series')

In [56]:
print(len(evidence))
evidence.head()

1208827


evidence-0    John Bennet Lawes, English entrepreneur and ag...
evidence-1    Lindberg began his professional career at the ...
evidence-2    ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3    Gerald Francis Goyer (born October 20, 1936) w...
evidence-4    He detected abnormalities of oxytocinergic fun...
dtype: object

In [57]:
#preprocessing
# punctuations should be removed, common words such as the, is, are, should be removed. all words also should be lemmentised and stemmed.
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')


def preprocess_evidence(data: pd.Series, limit:int=999999999 )-> dict: 
    processed_evidence = {}
    count = 0
    for evidence_num, text in data.items():
        processed_tokens = tokenize(text)
        processed_evidence[evidence_num] = processed_tokens
        count += 1
        if(count >= limit): # for faster testing
            break
    return processed_evidence

def tokenize(text: str) -> defaultdict:
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text)
        #print(tokens)
        # TODO: might need to reserve the order of the words too
        # could preprocess 2 versions, each for one model?
    processed_tokens = defaultdict(int) 
    for word in tokens:
            # no symbols, no punctuations, no stopwords
        if not word.isalpha() or word in stopwords.words():
            continue
            
        processed_tokens[wnl.lemmatize(word)] += 1
    return processed_tokens

processed_evidence = preprocess_evidence(evidence, limit=500)
print(processed_evidence)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyuancui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'evidence-0': defaultdict(<class 'int'>, {'John': 1, 'Bennet': 1, 'Lawes': 1, 'English': 1, 'entrepreneur': 1, 'agricultural': 1, 'scientist': 1}), 'evidence-1': defaultdict(<class 'int'>, {'Lindberg': 1, 'began': 1, 'professional': 1, 'career': 1, 'age': 1, 'eventually': 1, 'moving': 1, 'New': 1, 'York': 1, 'City': 1}), 'evidence-2': defaultdict(<class 'int'>, {'Boston': 1, 'Ladies': 1, 'Cambridge': 1, 'Vampire': 1, 'Weekend': 1}), 'evidence-3': defaultdict(<class 'int'>, {'Gerald': 1, 'Francis': 1, 'Goyer': 1, 'born': 1, 'October': 1, 'professional': 1, 'ice': 1, 'hockey': 1, 'player': 1, 'played': 1, 'game': 1, 'National': 1, 'Hockey': 1, 'League': 1}), 'evidence-4': defaultdict(<class 'int'>, {'He': 1, 'detected': 1, 'abnormality': 1, 'oxytocinergic': 1, 'function': 1, 'schizoaffective': 1, 'mania': 1, 'psychosis': 1, 'ECT': 1, 'modified': 1, 'oxytocin': 1, 'release': 1}), 'evidence-5': defaultdict(<class 'int'>, {'With': 1, 'peak': 1, 'wind': 1, 'mph': 1, 'minimum': 1, 'pressure'

# Two steps for the this task
# first. find all relavent evidence, either use contextual embedding or similarity scoring
# second. classify the evidents into 4 classes.

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [58]:
import heapq

# find most relevant evidence based on similarity scoring:
def find_most_relavent(claim: str, evidence: dict, count: int) -> list[str]:
    tokenized_claim = tokenize(claim)
    print(tokenized_claim)
    evidence_scores = {}
    for evidence_num, evidence_tokens in evidence.items():
        # implement some kind of similarity score
        new_score = similarity_score(tokenized_claim, evidence_tokens)
        evidence_scores[evidence_num] = new_score

    return top_n(evidence_scores, count)

# computes a similarity score between a claim and an evidence
# doesn't work very well, we can improve this later
def similarity_score(t1: defaultdict, t2:defaultdict) -> float:
    overlap = 0
    total = 0

    for token, count in t1.items():
        if token in t2:
            overlap += count
        total += count
    
    for token, count in t2.items():
        if token in t1:
            overlap += count
        total += count


    return overlap/total



# helper function to get top n
def top_n(dictionary, n):
    heap = [(-value, key) for key, value in dictionary.items()]
    heapq.heapify(heap)
    top_n_elements = [heapq.heappop(heap) for _ in range(min(n, len(heap)))]
    top_n_elements = [(-value, key) for value, key in top_n_elements]
    return top_n_elements

test_most_relevant = find_most_relavent("Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.",processed_evidence, 5)
print(test_most_relevant)
for evidence in test_most_relevant:
    print(processed_evidence[evidence[1]])

defaultdict(<class 'int'>, {'Not': 1, 'scientific': 1, 'evidence': 1, 'pollutant': 1, 'higher': 1, 'concentration': 1, 'ecosystem': 1, 'support': 1, 'plant': 1, 'animal': 1, 'life': 1})
[(0.21052631578947367, 'evidence-215'), (0.11764705882352941, 'evidence-213'), (0.1111111111111111, 'evidence-368'), (0.1, 'evidence-236'), (0.08333333333333333, 'evidence-17')]
defaultdict(<class 'int'>, {'This': 1, 'Earth': 1, 'surface': 1, 'warm': 1, 'liquid': 1, 'water': 1, 'support': 1, 'life': 1})
defaultdict(<class 'int'>, {'Grasslands': 1, 'lower': 1, 'elevation': 2, 'forest': 1, 'higher': 1})
defaultdict(<class 'int'>, {'The': 1, 'creeping': 1, 'willowherb': 1, 'refer': 1, 'specie': 1, 'flowering': 1, 'plant': 1})
defaultdict(<class 'int'>, {'In': 1, 'animal': 1, 'bony': 1, 'tail': 1, 'tailhead': 1, 'dock': 1, 'bird': 1, 'anatomy': 1, 'tailfan': 1})
defaultdict(<class 'int'>, {'Yoper': 1, 'Linux': 2, 'Your': 1, 'Operating': 1, 'System': 1, 'distribution': 1, 'PCs': 1, 'Pentium': 1, 'Pro': 1, 'h

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*