# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement the program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [52]:
## package imports

#!pip install pandas scikit-learn torch torchtext

## deep-learning libraries
import tensorflow
import torch
import keras

## NLP preprocessing libraries
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

##others
import pandas as pd
import numpy as np
import matplotlib as plt
import string



In [53]:
## Load the claim / evidence JSON files into pandas

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## every path below is relative to the notebook's working directory

## labelled training claims -- used for model training
trainClaimsFile='./data/train-claims.json'
## labelled development claims -- used for hyperparameter tuning and for the evaluation metrics
devClaimsFile='./data/dev-claims.json'
## predictions of the baseline system -- not used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## evidence passages; the file is too big to be uploaded to github, so download it from
## https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing
evidenceFile='./data/evidence.json'

## parse every file into a pandas dataframe
devClaimsBaseline=pd.read_json(devClaimsBaselineFile)
trainClaims=pd.read_json(trainClaimsFile)
devClaims=pd.read_json(devClaimsFile)
## orient='index' turns the top-level JSON keys into the row labels of `evidence`
evidence=pd.read_json(evidenceFile,orient='index')

## In the claim frames the fields are row labels, so .loc[field] pulls one field
## for all claims; .to_frame() turns that result back into a one-column dataframe.
_claimFields=('claim_text','claim_label','evidences')

claimTextTrain,claimLabelTrain,evidenceTrain=(
    trainClaims.loc[field].to_frame() for field in _claimFields
)

claimTextDev,claimLabelDev,evidenceDev=(
    devClaims.loc[field].to_frame() for field in _claimFields
)


In [54]:
## Look up the text of every evidence ID and add it as a new column of the evidence pd dataframes
## The dataframes are modified in place; re-running this cell is safe because it simply overwrites the evidence_text column

def getTextEvidence(evidenceList):
    """Look up each evidence ID of one claim in the global `evidence` dataframe.

    Args:
        evidenceList: iterable of evidence IDs, used as row labels of `evidence`.

    Returns:
        A list holding the result of `evidence.loc[evidenceID]` for every ID,
        in the same order as `evidenceList` (empty list for empty input).
    """
    return [evidence.loc[evidenceID] for evidenceID in evidenceList]

## Store the looked-up evidence in a new 'evidence_text' column; the 'evidences'
## column with the raw IDs is left untouched so both can be compared by eye.
for _claimEvidence in (evidenceDev,evidenceTrain):
    _claimEvidence['evidence_text']=_claimEvidence['evidences'].apply(getTextEvidence)

In [55]:
## Preprocessing data -- lowercase, tokenize, and stopword/punctuation removal
## NOTE(review): NLTK raises LookupError when its data is missing; the stopwords
## corpus and the tokenizer models must have been fetched with nltk.download(...) once.
stopwords=set(nltk.corpus.stopwords.words('english'))
punctuations=string.punctuation

def preprocess(text):
    """Lowercase and tokenize `text`, then drop stopwords and punctuation tokens.

    Args:
        text: the raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    def isPunctuation(token):
        ## `token in punctuations` is a substring test on a str, so multi-character
        ## tokens such as '``', "''" or '...' used to slip through the filter;
        ## checking character by character removes every punctuation-only token.
        return all(char in punctuations for char in token)

    tokens=nltk.word_tokenize(text.lower())
    cleanedTokens=[t for t in tokens if t not in stopwords and not isPunctuation(t)]
    return ' '.join(cleanedTokens)

## clean the claim texts (one string per claim), overwriting the raw text
claimTextDev['claim_text']=claimTextDev['claim_text'].apply(preprocess)
claimTextTrain['claim_text']=claimTextTrain['claim_text'].apply(preprocess)

## Preprocess the evidence: the structure differs from the claims because every
## 'evidence_text' cell holds a *list* of looked-up evidence entries.
## The lists are walked directly and their entries replaced in place. Indexing the
## column with an integer position (evidence_text[i]) is avoided on purpose: the
## column is indexed by label, and falling back to positions is deprecated in pandas.
for evidenceFrame in (evidenceDev,evidenceTrain):
    for evidenceEntries in evidenceFrame['evidence_text']:
        for n,entry in enumerate(evidenceEntries):
            ## assumes each entry is a pandas object (what evidence.loc[...] returned),
            ## hence .apply rather than a direct preprocess(entry) call -- TODO confirm
            evidenceEntries[n]=entry.apply(preprocess)



## evidence-only columns (one list of preprocessed evidence entries per claim)
evidenceDevText=evidenceDev['evidence_text']
evidenceTrainText=evidenceTrain['evidence_text']

In [73]:
## Vectorise the preprocessed training claims with TF-IDF
## (a term-weighting of each claim, not a learned word embedding)

## default vectorizer settings
tfidfVector=TfidfVectorizer()
## fit on the training claims and transform them in one step
x=tfidfVector.fit_transform(claimTextTrain['claim_text'])
## the output below lists the matrix as "(row, column)  weight" entries
print(x)


  (0, 2042)	0.25465864193776144
  (0, 365)	0.3357179817735733
  (0, 2584)	0.286016675105824
  (0, 3357)	0.31737470827388653
  (0, 1188)	0.286016675105824
  (0, 1691)	0.29426491672830174
  (0, 257)	0.25465864193776144
  (0, 795)	0.3043599486055108
  (0, 1707)	0.23386936307954653
  (0, 2612)	0.286016675105824
  (0, 739)	0.29443543623449897
  (0, 1302)	0.19798354634541807
  (0, 3043)	0.22723701033370075
  (1, 1221)	0.20758727315332845
  (1, 2108)	0.25488353480179354
  (1, 2123)	0.2847129492318619
  (1, 2163)	0.2512059813791188
  (1, 2970)	0.2091785370685991
  (1, 3338)	0.37394155546435687
  (1, 3418)	0.19264623027283195
  (1, 1582)	0.13904851359976036
  (1, 1711)	0.3586071279012528
  (1, 2829)	0.22087131783386782
  (1, 1164)	0.3955542172359483
  (1, 2357)	0.3046853775095814
  :	:
  (1226, 63)	0.2342925590059373
  (1226, 3149)	0.15874178134371789
  (1226, 3767)	0.25254064703344437
  (1226, 3690)	0.2394769713283662
  (1226, 1768)	0.1291513741773741
  (1226, 3483)	0.17927437646263644
  (1226

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*