# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
## import libraries

!pip3 install torchtext==0.4.0

# Standard libraries
import math
import string
import pandas as pd
import numpy as np
import json

# Machine learning and deep learning libraries
import tensorflow
import keras
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, LabelField, Example, Dataset
from torchtext.data.utils import get_tokenizer

# Natural Language Processing (NLP) libraries
import nltk
import spacy
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt



In [2]:
## reading in json files

"""
Description of json files
* [train-claims,dev-claims].json: JSON files for the labelled training and development set; 
* evidence.json: JSON file containing a large number of evidence passages (i.e. the “knowledge source”); 
* dev-claims-baseline.json: JSON file containing predictions of a baseline system on the development set;
"""

## relative file paths

## baseline system - will not be used for any training/evaluation
devClaimsBaselineFile='./data/dev-claims-baseline.json'
## use this for model training
trainClaimsFile='./data/train-claims.json'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/dev-claims.json'
## evidence files need to be downloaded through https://drive.google.com/file/d/1JlUzRufknsHzKzvrEjgw8D3n_IRpjzo6/view?usp=sharing as it is to big to be uploaded to github
evidenceFile='./data/evidence.json'

testFile='./data/test-claims-unlabelled.json'

tokenizer=get_tokenizer("basic_english")


In [3]:
# Load the JSON data
with open(trainClaimsFile, 'r') as file:
    trainClaims=json.load(file)
with open(devClaimsFile, 'r') as file:
    devClaims=json.load(file)
with open(evidenceFile, 'r') as file:
    evidenceData=json.load(file)

## Preprocessing data -- lowercase, tokenize, and stopword removal
stopwords=set(nltk.corpus.stopwords.words('english'))
tokenizer=get_tokenizer('basic_english')
punctuations=string.punctuation

def preprocess(text):
    token=tokenizer(text.lower())
    cleanedTokens=[t for t in token if (t not in stopwords) and (t not in punctuations)]
    return ' '.join(cleanedTokens)

for ids, texts in evidenceData.items():
    evidenceData[ids]=preprocess(texts)

# Function to create DataFrame and merge evidence IDs with Text
def createDF(claims, evidence):
    combinedData=[]
    for claimID, claimText in claims.items():
        # Combine the ID with its corresponding evidences
        evidenceID=claimText['evidences']
        evidenceText=(evidence[i] for i in evidenceID if i in evidence)
        combinedData.append({
            'claim_id': claimID,
            'claim_text': preprocess(claimText['claim_text']),
            'evidence_id': evidenceID,
            'evidence_text': " ".join(evidenceText),
            'claim_label': claimText['claim_label']
        })
    # Create DataFrame
    return pd.DataFrame(combinedData)

# Create CSV Files
trainFullMerged=createDF(trainClaims,evidenceData)
devFullMerged=createDF(devClaims,evidenceData)
trainFullMerged.to_csv("data/trainFullMerged.csv", index=False)
devFullMerged.to_csv("data/devFullMerged.csv", index=False)

# Convert evidence into csv as well
# evidenceFinal=pd.DataFrame(list(evidenceData.items()),columns=['evidence_id','evidence_text'])
# evidenceFinal.to_csv('data/evidencePreprocessed.csv',index=False)

# Convert unlabelled Data into CSV as well
with open(testFile, 'r') as file:
    testData=json.load(file)
for ids, texts in testData.items():
    claim_text = texts['claim_text']
    testData[ids]=preprocess(claim_text)
testFinal=pd.DataFrame(list(testData.items()), columns=['claim_id', 'claim_text'])
testFinal.to_csv('data/testPreprocessed.csv',index=False)

In [4]:
## use this for model training
trainClaimsFile='./data/trainFullMerged.csv'
## use this set for hyperparameter tuning and evaluation metric 
devClaimsFile='./data/devFullMerged.csv'
## evidence files need to be downloaded through googledrive (https://drive.google.com/file/d/1OyihwdAWfqHIOueCB4bLBkYg4hTN_OKm/view?usp=sharing)
evidenceFile='./data/evidencePreprocessed.csv'
## test unlabelled dataset
testFile='./data/testPreprocessed.csv'

trainDataframe=pd.read_csv(trainClaimsFile)
devDataframe=pd.read_csv(devClaimsFile)
evidenceDataframe=pd.read_csv(evidenceFile)
testDataframe=pd.read_csv(testFile)

trainDataframe['claim_text']=trainDataframe['claim_text']
#trainDataframe['evidence_id']=trainDataframe['evidence_id'].astype(str).str.strip('[]').str.strip("'").apply(preprocess)
#trainDataframe['combined_evidence']=trainDataframe['evidence_id']+" "+trainDataframe['evidence_text']

devDataframe['claim_text']=devDataframe['claim_text']
#devDataframe['evidence_id']=devDataframe['evidence_id'].astype(str).str.strip('[]').str.strip("'").apply(preprocess)
#devDataframe['combined_evidence']=devDataframe['evidence_id'] +" "+ devDataframe['evidence_text']

#evidenceDataframe['combined_evidence']=evidenceDataframe['evidence_id']+" "+evidenceDataframe['evidence_text']
#evidenceDataframe['combined_evidence']=evidenceDataframe['combined_evidence'].astype(str).str.strip("'").apply(preprocess)


In [39]:
## Preprocessing data -- lowercase, tokenize, and stopword removal
stopwords=set(nltk.corpus.stopwords.words('english'))
tokenizer=get_tokenizer('basic_english')
punctuations=set(string.punctuation)

def preprocess(text):
    tokens=tokenizer(text)
    cleanedTokens = [token for token in tokens if token not in stopwords and token not in punctuations]
    return cleanedTokens


## this one is used to get the prediction words directly
TEXT = Field(tokenize=preprocess, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=False)
LABEL=LabelField(batch_first=False)
def createDatasetEncoderInput(dataframe,textTransform,labelTransform):
    field=[('reviewTextInput',textTransform),('evidenceTextOutput',labelTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        reviewTextInput=row['claim_text']
        evidenceTextOutput=row['evidence_id']
        examples.append(Example.fromlist([reviewTextInput,evidenceTextOutput], field))
    return Dataset(examples,fields=field)

def createDatasetOutput(dataframe,textTransform,labelTransform):
    field=[('evidenceTextOutput',labelTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        evidenceTextOutput=row['evidence_id']
        examples.append(Example.fromlist([evidenceTextOutput], field))
    return Dataset(examples,fields=field)

def createDatasetTest(dataframe,textTransform):
    field=[('reviewTextInput',textTransform)]
    examples=[]
    for _, row in dataframe.iterrows():
        reviewTextInput=row['claim_text']
        examples.append(Example.fromlist([reviewTextInput], field))
    return Dataset(examples,fields=field)

## remove LABEL if you want to get the text as predictions instead of ids
trainTensor = createDatasetEncoderInput(trainDataframe, TEXT,LABEL)
devTensor = createDatasetEncoderInput(devDataframe, TEXT,LABEL)
evidenceTensor=createDatasetOutput(evidenceDataframe,TEXT,LABEL)
testTensor=createDatasetTest(testDataframe,TEXT)


In [40]:
TEXT.build_vocab(trainTensor)
LABEL.build_vocab(evidenceTensor)

In [41]:
for example in trainTensor.examples[:5]:
    print("Review Text:", example.reviewTextInput)
    print("Evidence Text:", example.evidenceTextOutput)
    print()
    
for example in trainTensor.examples[:5]:
    print(vars(example))

Review Text: ['scientific', 'evidence', 'co2', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'plant', 'animal', 'life']
Evidence Text: ['evidence-442946', 'evidence-1194317', 'evidence-12171']

Review Text: ['el', 'niño', 'drove', 'record', 'highs', 'global', 'temperatures', 'suggesting', 'rise', 'may', 'man-made', 'emissions']
Evidence Text: ['evidence-338219', 'evidence-1127398']

Review Text: ['1946', 'pdo', 'switched', 'cool', 'phase']
Evidence Text: ['evidence-530063', 'evidence-984887']

Review Text: ['weather', 'channel', 'co-founder', 'john', 'coleman', 'provided', 'evidence', 'convincingly', 'refutes', 'concept', 'anthropogenic', 'global', 'warming']
Evidence Text: ['evidence-1177431', 'evidence-782448', 'evidence-540069', 'evidence-352655', 'evidence-1007867']

Review Text: ['january', '2008', 'capped', '12', 'month', 'period', 'global', 'temperature', 'drops', 'major', 'well', 'respected', 'indicators']
Evidence Text: ['evidence

In [42]:
# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def preprocess_evidence_ids(evidence_id_str):
    # Strip brackets and split by comma
    evidence_ids = evidence_id_str.strip("[]").replace("'", "").split(', ')
    return evidence_ids

def batchify(dataset, bsz):
    # Process text and labels to tensors
    data = TEXT.process([getattr(x, 'reviewTextInput') for x in dataset.examples])
    label = [preprocess_evidence_ids(getattr(x, 'evidenceTextOutput')) for x in dataset.examples]
    label_temp = [item for sublist in label for item in sublist]
    label = LABEL.process(label_temp)
    # Calculate number of complete batches
    nbatch = min(data.size(0), label.size(0)) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    label = label.narrow(0, 0, nbatch * bsz)

    # Reshape data into batches
    data = data.view(bsz, -1).t().contiguous()
    label = label.view(bsz, -1).t().contiguous()
    
    # Ensure each has the same number of sequences
    min_seq_len = min(data.size(0), label.size(0))
    data = data[:min_seq_len]
    label = label[:min_seq_len]
    print(len(data))
    print(len(dataset))
    return data.to(device), label.to(device)

TEST = Field(tokenize=preprocess, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=True)
TEST.build_vocab(trainTensor)
def processTextOnly(dataset):
    data = TEST.process([getattr(x, 'reviewTextInput') for x in dataset.examples])
    return data.to(device)

# Batch sizes
batch_size = 20
eval_batch_size = 35

# Process the datasets
for example in trainTensor.examples[:5]:
    print(vars(example))
        
train_data,train_label = batchify(trainTensor, batch_size)
dev_data,dev_label = batchify(devTensor, eval_batch_size)
test_data=processTextOnly(testTensor)
dev_accuracy_data=processTextOnly(devTensor)

bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

{'reviewTextInput': ['scientific', 'evidence', 'co2', 'pollutant', 'higher', 'co2', 'concentrations', 'actually', 'help', 'ecosystems', 'support', 'plant', 'animal', 'life'], 'evidenceTextOutput': "['evidence-442946', 'evidence-1194317', 'evidence-12171']"}
{'reviewTextInput': ['el', 'niño', 'drove', 'record', 'highs', 'global', 'temperatures', 'suggesting', 'rise', 'may', 'man-made', 'emissions'], 'evidenceTextOutput': "['evidence-338219', 'evidence-1127398']"}
{'reviewTextInput': ['1946', 'pdo', 'switched', 'cool', 'phase'], 'evidenceTextOutput': "['evidence-530063', 'evidence-984887']"}
{'reviewTextInput': ['weather', 'channel', 'co-founder', 'john', 'coleman', 'provided', 'evidence', 'convincingly', 'refutes', 'concept', 'anthropogenic', 'global', 'warming'], 'evidenceTextOutput': "['evidence-1177431', 'evidence-782448', 'evidence-540069', 'evidence-352655', 'evidence-1007867']"}
{'reviewTextInput': ['january', '2008', 'capped', '12', 'month', 'period', 'global', 'temperature', 'dr

In [35]:
print(TEXT.vocab.stoi)




# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [28]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        #triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [29]:
#triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
torch.triu(torch.ones(3, 3))

tensor([[1., 1., 1.],
        [0., 1., 1.],
        [0., 0., 1.]])

In [30]:
# Masking
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

  return mask

masking()

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [31]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term) #0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)




# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [34]:
sos_token_id = TEXT.vocab.stoi['<sos>']
eos_token_id = TEXT.vocab.stoi['<eos>']
criterion = nn.CrossEntropyLoss(ignore_index=sos_token_id)
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    # The data and targets need to be loaded differently now
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        
        ## THIS ONE IS FOR EVIDENCE TEXT OUTPUT
        data, _ = get_batch(train_data, i) 
        targets, _ = get_batch(train_label, i)  # Load targets from train_label
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))  # Ensure targets are properly shaped
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

best_val_loss = float("inf")
epochs = 3# The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, dev_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()


ZeroDivisionError: float division by zero

In [None]:
test_loss = evaluate(best_model, dev_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

model.eval()
output=model(dev_data)
probabilities=torch.nn.functional.softmax(output,dim=-1)
_,predictedIndex=torch.max(probabilities,dim=-1)
# for i in predictedIndex:
#     print(predictedIndex[i])

# print(LABEL.vocab.stoi)
from sklearn.metrics import accuracy_score

vocab = TEXT.vocab.stoi 
reverseVocab = {index: word for word, index in vocab.items()} 
def tensor_to_text(tensor, index_to_word):
    words = [index_to_word[index.item()] for index in tensor if index in index_to_word]
    return ' '.join(words)

# print(reverseVocab)

# for i in predictedIndex:
#     #print(i)
#     words = list(set([reverseVocab[index.item()] for index in i]))
#     print(words)


#accuracyScore=accuracy_score()

| End of training | test loss  8.42 | test ppl  4538.57


In [None]:
pred=model(test_data)
print(pred)
import torch

# use softmax
probabilities = torch.softmax(pred, dim=-1)
_, predicted_indices = torch.max(probabilities, dim=-1)
predicted_words = [[TEXT.vocab.itos[index] for index in example] for example in predicted_indices]
predicted_text={}
# Print the words
for i, sentence in enumerate(predicted_words):
    print(f"Sentence {i+1}: {' '.join(sentence)}")
    predicted_text[f'Sentence{i}']={' '.join(sentence)}


tensor([[[ 8.1715e-01,  2.8134e+00,  3.3177e-01,  ..., -1.8889e-01,
          -6.7526e-01,  1.9182e-03],
         [-1.1501e-01,  2.6535e+00, -4.1210e-01,  ..., -7.6813e-01,
          -6.2037e-01, -2.5765e-01],
         [-1.1501e-01,  2.6535e+00, -4.1210e-01,  ..., -7.6813e-01,
          -6.2037e-01, -2.5765e-01],
         ...,
         [-4.1553e-01,  3.1095e+00, -8.4322e-01,  ..., -1.1639e-01,
          -1.6485e-01, -1.0620e+00],
         [-4.1553e-01,  3.1095e+00, -8.4322e-01,  ..., -1.1639e-01,
          -1.6485e-01, -1.0620e+00],
         [-4.1553e-01,  3.1095e+00, -8.4322e-01,  ..., -1.1639e-01,
          -1.6485e-01, -1.0620e+00]],

        [[ 9.6800e-01,  2.7248e+00,  5.2278e-01,  ..., -2.4137e-01,
          -7.0250e-01,  2.4040e-01],
         [-1.8926e-02,  2.5646e+00, -1.9435e-01,  ..., -7.7838e-01,
          -6.9473e-01, -2.2776e-02],
         [-1.8926e-02,  2.5646e+00, -1.9435e-01,  ..., -7.7838e-01,
          -6.9473e-01, -2.2776e-02],
         ...,
         [-4.0758e-01,  3

In [None]:
remove_tokens = ['<pad>', '<unks>', '<sos>', '<eos>']
def clean_sentence(sentence, tokens_to_remove):
    for token in tokens_to_remove:
        sentence = sentence.replace(token, '')
    return sentence.strip()
cleaned_data = {key: [clean_sentence(sentence, remove_tokens) for sentence in value] for key, value in predicted_text.items()}
print(cleaned_data)

# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(evidenceDataframe['combined_evidence'])
# print(tfidf_matrix)

unique_words_per_sentence = {}
for key, value in cleaned_data.items():
    # Split by spaces and filter out numeric values and punctuation
    words = set(word.strip('–,.') for word in value[0].split())
    unique_words_per_sentence[key] = words

print(unique_words_per_sentence)

{'Sentence0': ['5 °c °c °c °c °c °c °c °c 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence1': ['5 °c °c °c °c °c °c °c °c 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence2': ['5 °c °c °c °c °c °c 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence3': ['5 °c °c °c °c °c °c 1 1 1 warming 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence4': ['5 °c °c °c °c °c °c 1 warming 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence5': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence6': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence7': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 may 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence8': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 may 1 1 1'], 'Sentence9': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 may 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence10': ['5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 warming 1 1 1 1 1 1 1 1 1 1 1 1 1 1'], 'Sentence11': ['

In [None]:

# def find_evidence_ids(words):
#     matched_ids = []
#     for word in words:
#         ids = evidenceDataframe[evidenceDataframe['evidence_text'].str.contains(fr'\b{word}\b', na=False, case=False)]['evidence_id'].tolist()
#         matched_ids.extend(ids)
#     return set(matched_ids)

# evidence_ids_per_sentence = {key: find_evidence_ids(words) for key, words in unique_words_per_sentence.items()}

# print(evidence_ids_per_sentence)

In [None]:
#print(evidence_ids_per_sentence)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*