### Do preprocessing

In [1]:
import preprocess

train_df, dev_df, dev_baseline_df, test_df, evidence_df = preprocess.return_df()
original_train_df, original_dev_df, original_dev_baseline_df, original_test_df, original_evidence_df = preprocess.return_original_df()

[nltk_data] Downloading package stopwords to C:\Users\Bill
[nltk_data]     Zhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Bill
[nltk_data]     Zhu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loaded data
train_df: (1228, 3)
dev_df: (154, 3)
dev_baseline_df: (154, 3)
test_df: (153, 1)
evidence_df: (1208827, 2)

Start preprocessing...
[STEP 1/4] Normalised text!
[STEP 2/4] Removed stop words!
[STEP 3/4] Lemmatized text!
[STEP 4/4] Tokenized text!
[FINISHED] Saved preprocessed data to JSON files...

Ready for vectorization!


In [2]:
# print head of the dataframes
print(train_df.head())
print(dev_df.head())
print(dev_baseline_df.head())
print(test_df.head())
print(evidence_df.head())
print("#####################################")
print(original_train_df.head())
print(original_dev_df.head())
print(original_dev_baseline_df.head())
print(original_test_df.head())
print(original_evidence_df.head())

                                                   claim_text  \
claim-1937  [scientific, evidence, co2, pollutant, higher,...   
claim-126   [el, nio, drove, record, high, global, tempera...   
claim-2510                 [1946, pdo, switched, cool, phase]   
claim-2021  [weather, channel, cofounder, john, coleman, p...   
claim-2449  [january, 2008, capped, 12, month, period, glo...   

                claim_label                                          evidences  
claim-1937         DISPUTED  [evidence-442946, evidence-1194317, evidence-1...  
claim-126           REFUTES                [evidence-338219, evidence-1127398]  
claim-2510         SUPPORTS                 [evidence-530063, evidence-984887]  
claim-2021         DISPUTED  [evidence-1177431, evidence-782448, evidence-5...  
claim-2449  NOT_ENOUGH_INFO  [evidence-1010750, evidence-91661, evidence-72...  
                                                   claim_text  \
claim-752   [south, australia, expensive, electricity, wor

### Doc2Vec encoding

In [3]:
# Do doc2vec on the evidence_df
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import pandas as pd

train_doc = train_df["claim_text"]
dev_doc = dev_df["claim_text"]
dev_baseline_doc = dev_baseline_df["claim_text"]
test_doc = test_df["claim_text"]
evidence_doc = evidence_df["value"]
tagged_data_raw = (
    # train_df["claim_text"].tolist() +
    # dev_df["claim_text"].tolist() +
    # dev_baseline_df["claim_text"].tolist() +
    evidence_df["value"].tolist()
)

print(tagged_data_raw[0:5])

tagged_data = [
    TaggedDocument(words=doc, tags=[str(i)])
    for i, doc in enumerate(tagged_data_raw)
    if isinstance(doc, list) and all(isinstance(w, str) for w in doc)
]

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = Doc2Vec(vector_size=20, min_count=2, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


# Get vectors for claim_text
train_df["claim_vector"] = train_df["claim_text"].apply(
    lambda x: model.infer_vector(word_tokenize(str(x).lower())).tolist()
)

print("Finished vectorizing train_df claim_text")

dev_df["claim_vector"] = dev_df["claim_text"].apply(
    lambda x: model.infer_vector(word_tokenize(str(x).lower())).tolist()
)

print("Finished vectorizing dev_df claim_text")

dev_baseline_df["claim_vector"] = dev_baseline_df["claim_text"].apply(
    lambda x: model.infer_vector(word_tokenize(str(x).lower())).tolist()
)

print("Finished vectorizing dev_baseline_df claim_text")

# Get vectors for evidence values
evidence_df["evidence_vector"] = evidence_df["value"].apply(
    lambda x: model.infer_vector(word_tokenize(str(x).lower())).tolist()
)

print("Finished vectorizing evidence_df value")

print(train_df.head())





[['john', 'bennet', 'lawes', 'english', 'entrepreneur', 'agricultural', 'scientist'], ['lindberg', 'began', 'professional', 'career', 'age', '16', 'eventually', 'moving', 'new', 'york', 'city', '1977'], ['boston', 'lady', 'cambridge', 'vampire', 'weekend'], ['gerald', 'francis', 'goyer', 'born', 'october', '20', '1936', 'professional', 'ice', 'hockey', 'player', 'played', '40', 'game', 'national', 'hockey', 'league'], ['detected', 'abnormality', 'oxytocinergic', 'function', 'schizoaffective', 'mania', 'postpartum', 'psychosis', 'ect', 'modified', 'oxytocin', 'release']]


[nltk_data] Downloading package punkt to C:\Users\Bill
[nltk_data]     Zhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-05-13 10:51:34,781 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d20,n5,w5,mc2,s0.001,t3>', 'datetime': '2025-05-13T10:51:34.781091', 'gensim': '4.3.3', 'python': '3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2025-05-13 10:51:34,781 : INFO : collecting all words and their counts
2025-05-13 10:51:34,782 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-05-13 10:51:34,816 : INFO : PROGRESS: at example #10000, processed 120676 words (3594136 words/s), 27728 word types, 10000 tags
2025-05-13 10:51:34,849 : INFO : PROGRESS: at example #20000, processed 240194 words (3725542 words/s), 43858 word types, 20000 tags
2025-05-13 10:51:34,880 : INFO : PROGRESS: at example #30000, processed

Finished vectorizing train_df claim_text
Finished vectorizing dev_df claim_text
Finished vectorizing dev_baseline_df claim_text
Finished vectorizing evidence_df value
                                                   claim_text  \
claim-1937  [scientific, evidence, co2, pollutant, higher,...   
claim-126   [el, nio, drove, record, high, global, tempera...   
claim-2510                 [1946, pdo, switched, cool, phase]   
claim-2021  [weather, channel, cofounder, john, coleman, p...   
claim-2449  [january, 2008, capped, 12, month, period, glo...   

                claim_label  \
claim-1937         DISPUTED   
claim-126           REFUTES   
claim-2510         SUPPORTS   
claim-2021         DISPUTED   
claim-2449  NOT_ENOUGH_INFO   

                                                    evidences  \
claim-1937  [evidence-442946, evidence-1194317, evidence-1...   
claim-126                 [evidence-338219, evidence-1127398]   
claim-2510                 [evidence-530063, evidence-984887

### Ranking

In [4]:
### Ranking 1: cosine similarity on claim_text with evidence to produce top 100 evidence for each claim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similarity_matrix = cosine_similarity(train_df["claim_vector"].tolist(), evidence_df["evidence_vector"].tolist())

top_k = 100
top_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_k]  # sort descending, get top 100 indices

# Create new column with top evidence texts for each claim
train_df["top_100_evidence"] = [
    evidence_df.iloc[indices]["key"].tolist() for indices in top_indices
]

print(train_df[["claim_text", "top_100_evidence"]].head())

                                                   claim_text  \
claim-1937  [scientific, evidence, co2, pollutant, higher,...   
claim-126   [el, nio, drove, record, high, global, tempera...   
claim-2510                 [1946, pdo, switched, cool, phase]   
claim-2021  [weather, channel, cofounder, john, coleman, p...   
claim-2449  [january, 2008, capped, 12, month, period, glo...   

                                             top_100_evidence  
claim-1937  [evidence-143269, evidence-796217, evidence-35...  
claim-126   [evidence-743888, evidence-372910, evidence-85...  
claim-2510  [evidence-130713, evidence-779638, evidence-10...  
claim-2021  [evidence-104597, evidence-563911, evidence-37...  
claim-2449  [evidence-1205158, evidence-1203993, evidence-...  


In [None]:
print(train_df["top_100_evidence"].iloc[0])
l = ["evidence-442946","evidence-1194317","evidence-12171"]
for i in range(len(train_df)):
    if train_df["top_100_evidence"].iloc[i] in l:
        print("Found it")
        print(train_df["claim_text"].iloc[i])
        print(train_df["claim_vector"].iloc[i])
        print(train_df["top_100_evidence"].iloc[i])
    else:
        print("Not found")

['evidence-143269', 'evidence-796217', 'evidence-351732', 'evidence-284457', 'evidence-973135', 'evidence-892094', 'evidence-363147', 'evidence-6413', 'evidence-1010652', 'evidence-145808', 'evidence-53852', 'evidence-754066', 'evidence-172162', 'evidence-429076', 'evidence-707114', 'evidence-259493', 'evidence-765966', 'evidence-25131', 'evidence-1037802', 'evidence-248899', 'evidence-835637', 'evidence-1202020', 'evidence-412151', 'evidence-181754', 'evidence-190926', 'evidence-915997', 'evidence-85124', 'evidence-766720', 'evidence-254468', 'evidence-366694', 'evidence-789604', 'evidence-308217', 'evidence-878386', 'evidence-1077265', 'evidence-41044', 'evidence-407364', 'evidence-18216', 'evidence-366640', 'evidence-334132', 'evidence-288500', 'evidence-544149', 'evidence-1104136', 'evidence-585397', 'evidence-748483', 'evidence-565213', 'evidence-679107', 'evidence-685800', 'evidence-1071199', 'evidence-971926', 'evidence-366455', 'evidence-667021', 'evidence-980491', 'evidence-17