# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [21]:
## package imports

#!pip install pandas scikit-learn torch torchtext

## deep-learning libraries
import tensorflow
import torch
import torch.nn as nn
import torch.nn.functional as F
import keras
from torchtext.data.utils import get_tokenizer

## NLP preprocessing libraries
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

##others
import pandas as pd
import numpy as np
import matplotlib as plt
import string



In [22]:
## reading in json files

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## relative file paths

## baseline system - will not be used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## use this for model training
trainClaimsFile='./data/train-claims.json'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/dev-claims.json'
## evidence files need to be downloaded through https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing as it is to big to be uploaded to github
evidenceFile='./data/evidence.json'

## import as pandas dataframe
devClaimsBaseline=pd.read_json(devClaimsBaselineFile)
trainClaims=pd.read_json(trainClaimsFile)
devClaims=pd.read_json(devClaimsFile)
evidence=pd.read_json(evidenceFile,orient='index')
evidences=pd.DataFrame(list(evidence.items()), columns=['evidence_id', 'evidence_text'])

## Separate claim_text,claim_label, and evidences from training and development sets, saved as pd dataframes
claimTextTrain=trainClaims.loc['claim_text'].to_frame()
claimLabelTrain=trainClaims.loc['claim_label'].to_frame()
evidenceTrain=trainClaims.loc['evidences'].to_frame()

claimTextDev=devClaims.loc['claim_text'].to_frame()
claimLabelDev=devClaims.loc['claim_label'].to_frame()
evidenceDev=devClaims.loc['evidences'].to_frame()

print(evidence)


                                                                  0
evidence-0        John Bennet Lawes, English entrepreneur and ag...
evidence-1        Lindberg began his professional career at the ...
evidence-2        ``Boston (Ladies of Cambridge)'' by Vampire We...
evidence-3        Gerald Francis Goyer (born October 20, 1936) w...
evidence-4        He detected abnormalities of oxytocinergic fun...
...                                                             ...
evidence-1208822  Also on the property is a contributing garage ...
evidence-1208823  | class = ``fn org'' | Fyrde | | | | 6110 | | ...
evidence-1208824  Dragon Storm (game), a role-playing game and c...
evidence-1208825  It states that the Zeriuani ``which is so grea...
evidence-1208826  The storyline revolves around a giant plesiosa...

[1208827 rows x 1 columns]


In [23]:
## Obtain the evidence texts and add it as a new column at the evidence pd dataframes
## This is in place transformation, but should always overwrite the evidence_text column so does not matter

def getTextEvidence(evidenceList):
    texts=[]
    for evidenceID in evidenceList:
        text=evidences['evidence_id']==evidenceID
        texts.append(text)
        print(text)
    return texts

## keeps the original df to visually check that evidences are correct
evidenceDev['evidence_text']=evidenceDev['evidences'].apply(getTextEvidence)
evidenceTrain['evidence_text']=evidenceTrain['evidences'].apply(getTextEvidence)

0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtype: bool
0    False
Name: evidence_id, dtyp

In [24]:
## Preprocessing data -- lowercase, tokenize, and stopword removal
stopwords=set(nltk.corpus.stopwords.words('english'))
punctuations=string.punctuation

def preprocess(text):
    tokens=nltk.word_tokenize(text.lower())
    cleanedTokens=[t for t in tokens if (t not in stopwords) and (t not in punctuations)]
    return ' '.join(cleanedTokens)

claimTextDev['claim_text']=claimTextDev['claim_text'].apply(preprocess)
claimTextTrain['claim_text']=claimTextTrain['claim_text'].apply(preprocess)

##used to preprocess evidence, as structure is a bit different for evidence and claim dataframes
for sentence,i in zip(evidenceDev['evidence_text'],range(len(evidenceDev['evidence_text']))):
    for n in range(len(sentence)):
        evidenceDev['evidence_text'][i][n]=evidenceDev['evidence_text'][i][n].apply(preprocess)
        
for sentence,i in zip(evidenceTrain['evidence_text'],range(len(evidenceTrain['evidence_text']))):
    for n in range(len(sentence)):
        evidenceTrain['evidence_text'][i][n]=evidenceTrain['evidence_text'][i][n].apply(preprocess)

devClaimAndEvidenceMerged= pd.merge(claimTextDev, evidenceDev, left_index=True, right_index=True)
devFullMerged=pd.merge(devClaimAndEvidenceMerged,claimLabelDev,left_index=True,right_index=True)

trainClaimAndEvidenceMerged=pd.merge(claimTextTrain,evidenceTrain,left_index=True,right_index=True)
trainFullMerged=pd.merge(trainClaimAndEvidenceMerged,claimLabelTrain,left_index=True,right_index=True)

trainFullMerged.to_csv("data/trainFullMerged.csv", index=True)
devFullMerged.to_csv("data/devFullMerged.csv", index=True)



AttributeError: 'bool' object has no attribute 'lower'

In [None]:
## Convert test-claims-unlabelled into csv 

unlabelledFile='./data/train-claims-unlabelled.json'
claimUnlabelled=trainClaims.loc['claim_text'].to_frame()
claimUnlabelled.to_csv("data/claimUnlabelled.csv",index=True)

In [None]:
## use TF-IDF word embedding

tfidfVector=TfidfVectorizer()
tfidfVector.fit(claimTextTrain['claim_text'])

tfidfTrainSet=tfidfVector.transform(claimTextTrain['claim_text'])

tfidfDevSet=tfidfVector.transform(claimTextDev['claim_text'])

features=tfidfVector.get_feature_names_out()

In [None]:
x=pd.read_csv('data/devFullMerged.csv')
print(x)

     Unnamed: 0                                         claim_text  \
0     claim-752        south australia expensive electricity world   
1     claim-375  3 per cent total annual global emissions carbo...   
2    claim-1266         means world 1c warmer pre-industrial times   
3     claim-871  “ happens zika may also good model second worr...   
4    claim-2164              greenland lost tiny fraction ice mass   
..          ...                                                ...   
149  claim-2400  'to suddenly label co2 `` pollutant '' disserv...   
150   claim-204  natural orbitally driven warming atmospheric c...   
151  claim-1426  many world ’ coral reefs already barren state ...   
152   claim-698  recent study led lawrence livermore national l...   
153  claim-1021  corals may save many creatures attempting movi...   

                                             evidences  \
0                ['evidence-67732', 'evidence-572512']   
1    ['evidence-996421', 'evidence-1080858'

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
## Simple baseline logistic regression

from sklearn.linear_model import LogisticRegression

classifier=LogisticRegression()
classifier.fit(tfidfTrainSet,claimLabelTrain)
score=classifier.score(tfidfDevSet,claimLabelDev)
print(score)

0.4090909090909091


  y = column_or_1d(y, warn=True)


In [None]:
print(len(claimLabelDev))

154


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*