# Creating tabular FEVER test sets from preprocessed tabular data

**Henry Zelenak | Last updated: 05/12/2025**

## Imports

In [4]:
import pandas as pd
import nltk  # Make sure NLTK is installed and data downloaded (e.g., nltk.download('punkt'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import openai  # For LLM interaction
from openai import OpenAI
import numpy as np
from nltk import Tree, pos_tag, word_tokenize, ne_chunk
from nltk.corpus import stopwords
import numpy as np
from nltk import RegexpParser
import json
import tqdm
import ast

# Download the necessary NLTK data files
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('stopwords')
nltk.download('treebank')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_t

True

## Add data sources


In [43]:
# Mount google drive
from google.colab import drive
import gc

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
%cd ./drive/My Drive/SUNY_Poly_DSA598/datasets/FEVER/

[Errno 2] No such file or directory: './drive/My Drive/SUNY_Poly_DSA598/datasets/FEVER/'
/content/drive/My Drive/SUNY_Poly_DSA598/datasets/FEVER


In [45]:
!pwd

/content/drive/My Drive/SUNY_Poly_DSA598/datasets/FEVER


In [6]:
def load_jsonl(file_path, encoding='utf-8'):
    """Loads a JSON Lines file into a list of Python objects."""
    data = []
    with open(file_path, 'r', encoding=encoding) as f:  # Specify encoding for safety
        for line in f:
            data.append(json.loads(line))  # Parse each line individually
    return data

In [10]:
# Data paths (replace with your actual paths if different)
train_clf_path = f"tabular_sets/tabular_clf_paper_dev_train/v1_segmented_sentIDs_n3461_04-04_002.csv"
valid_clf_path = f"tabular_sets/tabular_clf_paper_dev_valid/v1_segmented_sentIDs_n1482_04-04_002.csv"
train_sentEx_path = f"tabular_sets/tabular_sentEx_paper_dev_train/v1_segmented_sentIDs_n3461_04-04_002.csv"
valid_sentEx_path = f"tabular_sets/tabular_sentEx_paper_dev_valid/v1_segmented_sentIDs_n1482_04-04_002.csv"
test_path = f"paper_test.jsonl"
train_path = f"paper_dev.jsonl"

# Load datasets
train_clf = pd.read_csv(train_clf_path)
valid_clf = pd.read_csv(valid_clf_path)
train_sentEx = pd.read_csv(train_sentEx_path)
valid_sentEx = pd.read_csv(valid_sentEx_path)
test_jsonl = load_jsonl(test_path)
train_jsonl = load_jsonl(train_path)

In [11]:
# Show the distribution of labels
print(f"SentEx train set label distribution:")
print(train_sentEx['label'].value_counts())
print(f"SentEx valid set label distribution:")
print(valid_sentEx['label'].value_counts())
print(f"CLF train set label distribution:")
print(train_clf['label'].value_counts())
print(f"CLF valid set label distribution:")
print(valid_clf['label'].value_counts())

# Balance the labels by reducing each to the minimum count
min_count = min(train_sentEx['label'].value_counts())
train_sentEx = train_sentEx.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
min_count = min(valid_sentEx['label'].value_counts())
valid_sentEx = valid_sentEx.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
min_count = min(train_clf['label'].value_counts())
train_clf = train_clf.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
min_count = min(valid_clf['label'].value_counts())
valid_clf = valid_clf.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Show the distribution of labels after balancing
print(f"SentEx train set label distribution after balancing:")
print(train_sentEx['label'].value_counts())
print(f"SentEx valid set label distribution after balancing:")
print(valid_sentEx['label'].value_counts())
print(f"CLF train set label distribution after balancing:")
print(train_clf['label'].value_counts())
print(f"CLF valid set label distribution after balancing:")
print(valid_clf['label'].value_counts())

SentEx train set label distribution:
label
SUPPORTS           1156
REFUTES            1156
NOT ENOUGH INFO    1149
Name: count, dtype: int64
SentEx valid set label distribution:
label
SUPPORTS           495
REFUTES            495
NOT ENOUGH INFO    488
Name: count, dtype: int64
CLF train set label distribution:
label
SUPPORTS           1156
REFUTES            1156
NOT ENOUGH INFO    1149
Name: count, dtype: int64
CLF valid set label distribution:
label
SUPPORTS           496
REFUTES            496
NOT ENOUGH INFO    486
Name: count, dtype: int64
SentEx train set label distribution after balancing:
label
NOT ENOUGH INFO    1149
REFUTES            1149
SUPPORTS           1149
Name: count, dtype: int64
SentEx valid set label distribution after balancing:
label
NOT ENOUGH INFO    488
REFUTES            488
SUPPORTS           488
Name: count, dtype: int64
CLF train set label distribution after balancing:
label
NOT ENOUGH INFO    1149
REFUTES            1149
SUPPORTS           1149
Name: cou

  train_sentEx = train_sentEx.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
  valid_sentEx = valid_sentEx.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
  train_clf = train_clf.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
  valid_clf = valid_clf.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


## Functions to assemble samples from multiple parts

In [34]:
# Entity  extraction function
def extract_entities(text):
    """
    Extracts entities from the text using NLTK's Named Entity Chunker.

    Args:
        text (str): The input text.

    Returns:
        list of str: List of extracted entities.
    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    tagged_tokens = pos_tag(tokens)
    named_entities = ne_chunk(tagged_tokens)

    entities = []
    for subtree in named_entities:
        if isinstance(subtree, Tree):
            entity = " ".join([word for word, tag in subtree.leaves()])
            entities.append(entity)

    return entities


# Keyword extraction function
def extract_keywords(text):
    """
    Extracts keywords from the text using TF-IDF.

    Args:
        text (str): The input text.

    Returns:
        list of str: List of extracted keywords.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10)  # Adjust max_features as needed
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    dense = tfidf_matrix.todense()
    denselist = dense.tolist()
    # Convert the list to a NumPy array to use argsort()

    dense_array = np.array(denselist[0])
    keywords = [feature_names[i] for i in dense_array.argsort()[-5:]]  # Get top 5 keywords

    return keywords


##### SET THIS INDEX TO 0 TO START FROM THE FIRST CLAIM AFTER SHUFFLING THE DATA #####
##### IF WE SHUFFLE THE DATA USING THE SAME SEED, WE CAN DEVELOP A CONSISTENT TEST SET #####
index = 0
def get_test_claim(df, verbose=0, debug=False):
    """
    In "test" mode, gets data for a claim by matching a row in the df (which contains the claim and the wikipedia text data) to the JSONL object
    (which contains the claim and the evidence references) by the claim, returning the claim, label, evidence sentences, documents, and evidence references.

    In 'live' mode, gets data for a claim by generating a query and retrieving documents from Wikipedia. THIS MODE IS NOT COMPATIBLE WITH FEVER SCORING.

    EXAMPLE:
    {"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, null, null]]]}
    {"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Ukrainian_Soviet_Socialist_Republic", 7]], [[298602, 290067, "Ukrainian_Soviet_Socialist_Republic", 7], [298602, 290067, "United_Nations", 0]], [[300696, 291816, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344347, 327887, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344994, 328433, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344997, 328435, "Ukrainian_Soviet_Socialist_Republic", 7]]]}
    {"id": 70041, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "2 Hearts is a musical composition by Minogue.", "evidence": [[[225394, 230056, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[317953, 306972, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[319638, 308345, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[319643, 308348, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]]]}
    {"id": 202314, "verifiable": "VERIFIABLE", "label": "REFUTES", "claim": "The New Jersey Turnpike has zero shoulders.", "evidence": [[[238335, 240393, "New_Jersey_Turnpike", 15]]]}
    """

    global index
    claim = df.iloc[index]['claim']
    documents = df.iloc[index]['full_text']
    documents = documents.split('\n')
    label = df.iloc[index]['label']
    keywords = extract_keywords(claim)  # Extract keywords from the claim
    evidence_items = df.iloc[index]['evidence_sentences']
    id = df.iloc[index]['id']

    if debug:
      print(f"DEBUG 1.1.1:")
      print(f"Evidence items: {evidence_items}")
    # Evidence items are in the format [sentence, page_title, sentence_id, entities[entity1, entity2, ...]]
    '''
    [('Despite their San Francisco Bay Area origins , they played in a Southern rock style , with lyrics about bayous , catfish , the Mississippi River , and other popular elements of Southern United States iconography , as well as political and socially-conscious lyrics about topics including the Vietnam War .', 'Creedence_Clearwater_Revival', 3, ['Vietnam War', 'Southern rock', 'San Francisco Bay Area', 'Opposition to United States involvement in the Vietnam War', 'rock', 'Mississippi River', 'rock music']), ('Creedence Clearwater Revival , often informally abbreviated to Creedence or CCR , was an American rock band active in the late 1960s and early 1970s .', 'Creedence_Clearwater_Revival', 0, ['rock', 'rock music']), ('Their musical style encompassed the roots rock , swamp rock , and blues rock genres .', 'Creedence_Clearwater_Revival', 2, ['roots rock', 'rock', 'blues rock', 'rock music', 'swamp rock'])]
    '''
    index += 1
    # Return essential information (we only need the documents, keywords, entities, claim, and label for the NOT ENOUGH INFO case, since we don't need to extract evidence sentences)
    if label == "NOT ENOUGH INFO":
        evidence_items = []
        entities = extract_entities(claim)  # Extract entities from the claim
        if verbose == 1:
          print(f"-------------------------------------------------------------")
          print(f"Claim: {claim}")
          print(f"Label: {label}")
          print(f"Evidence items: {evidence_items}")
          print(f"Documents: {documents}")
          print(f"Entities: {entities}")
          print(f"Keywords: {keywords}")
          print(f"-------------------------------------------------------------")
        return claim, label, evidence_items, documents, keywords, entities, id

    entities = []
    # Use ast.literal_eval to convert the string representation of the list to an actual list
    evidence_items = ast.literal_eval(evidence_items)
    # Extract entities from the evidence items
    for item in evidence_items:
        entities.extend(item[3])
        if debug:
          print(f"DEBUG 1.1.2:")
          print(f"\tEvidence item: {item}")
          print(f"\tEntities: {item[3]}")
          print("-_-_-_-_-_-_-_-_-_-_-_-_-_-")
    # Remove duplicates
    entities = list(set(entities))
    # Remove empty strings
    entities = [entity for entity in entities if entity]

    if verbose == 1:
        print(f"-------------------------------------------------------------")
        print(f"Claim: {claim}")
        print(f"Label: {label}")
        print(f"Evidence items: {evidence_items}")
        print(f"Documents: {documents}")
        print(f"Entities: {entities}")
        print(f"Keywords: {keywords}")
        print(f"-------------------------------------------------------------")


    return claim, label, evidence_items, documents, keywords, entities, id

## Functions to create independent, randomized, and balanced test sets

In [35]:
# Helper function to split a dataset into three datasets, one for each label (returns three dataframes)
def split_dataset(df):
    df_supports = df[df['label'] == 'SUPPORTS']
    df_refutes = df[df['label'] == 'REFUTES']
    df_not_enough_info = df[df['label'] == 'NOT ENOUGH INFO']

    return df_supports, df_refutes, df_not_enough_info

# Assemble a 99 sample balanced test subset by running get_test_claim in a loop and appending each result to a dictionary (only claim, label, evidence_items, documents_text, and entities)
def shuffle_assemble_balanced(df, verbose=0, debug=False):

  test_set = []

  df_supports, df_refutes, df_not_enough_info = split_dataset(df)

  df_supports = df_supports.sample(frac=1, random_state=2025)
  df_refutes = df_refutes.sample(frac=1, random_state=2025)
  df_not_enough_info = df_not_enough_info.sample(frac=1, random_state=2025)

  for i in range(0, 99): # 99 claims, 33 of each label
      # Cycle between all three using i%% and if statements
      if i % 3 == 0:
          claim, label, evidence_items, documents_text, keywords, entities, id = get_test_claim(df_supports, verbose=0, debug=debug)
      if i % 3 == 1:
          claim, label, evidence_items, documents_text, keywords, entities, id = get_test_claim(df_refutes, verbose=0, debug=debug)
      if i % 3 == 2:
          claim, label, evidence_items, documents_text, keywords, entities, id = get_test_claim(df_not_enough_info, verbose=0, debug=debug)


      test_set.append(
          {
              "id": id,
              "claim": claim,
              "label": label,
              "evidence_items": evidence_items,
              "documents_text": documents_text,
              "entities": entities, # Entities are part of the FEVER data (they are more robust than our nltk entity extraction from our testing)
          } # We don't include keyword as those are not in the FEVER data (we extract them in the system)
      )
      if verbose == 1:
        print(f"Appended claim {claim} with label {label} to the test set.")
        print(f"-------------------------------------------------------------\n")


  # Convert to a dataframe
  test_set = pd.DataFrame(test_set)
  return test_set

test_sub_valid_sentEx_A = shuffle_assemble_balanced(valid_sentEx, verbose=1, debug=False)
test_sub_valid_sentEx_B = shuffle_assemble_balanced(valid_sentEx, verbose=1, debug=False)
test_sub_valid_sentEx_C = shuffle_assemble_balanced(valid_sentEx, verbose=1, debug=False)



Appended claim Colin Kaepernick is a starting quarterback for the San Francisco 49ers. with label SUPPORTS to the test set.
-------------------------------------------------------------

Appended claim Private Lives is a two act comedy from 1930. with label REFUTES to the test set.
-------------------------------------------------------------

Appended claim Jennifer Lopez made a wood shed. with label NOT ENOUGH INFO to the test set.
-------------------------------------------------------------

Appended claim New Orleans Pelicans compete the southwest Division of the NBA's Western Conference. with label SUPPORTS to the test set.
-------------------------------------------------------------

Appended claim The Beach exclusively falls under the comedy genre. with label REFUTES to the test set.
-------------------------------------------------------------

Appended claim Bruce Shand was awarded the Military Cross for gallantry. with label NOT ENOUGH INFO to the test set.
----------------

In [40]:

print(test_sub_valid_sentEx_A['label'].value_counts())
print(test_sub_valid_sentEx_B['label'].value_counts())
print(test_sub_valid_sentEx_C['label'].value_counts())


label
SUPPORTS           33
REFUTES            33
NOT ENOUGH INFO    33
Name: count, dtype: int64
label
SUPPORTS           33
REFUTES            33
NOT ENOUGH INFO    33
Name: count, dtype: int64
label
SUPPORTS           33
REFUTES            33
NOT ENOUGH INFO    33
Name: count, dtype: int64


In [39]:
# Check for duplicates by id between all three dataframes
ab_overlap = len(set(test_sub_valid_sentEx_A['id']).intersection(set(test_sub_valid_sentEx_B['id'])))
ac_overlap = len(set(test_sub_valid_sentEx_A['id']).intersection(set(test_sub_valid_sentEx_C['id'])))
bc_overlap = len(set(test_sub_valid_sentEx_B['id']).intersection(set(test_sub_valid_sentEx_C['id'])))
print(f"Overlap between A and B: {ab_overlap}")
print(f"Overlap between A and C: {ac_overlap}")
print(f"Overlap between B and C: {bc_overlap}")


Overlap between A and B: 0
Overlap between A and C: 0
Overlap between B and C: 0


In [48]:
!mkdir -p tabular_sets/tabular_sentEx_paper_dev_test

In [49]:
test_sub_valid_sentEx_A.to_csv('./tabular_sets/tabular_sentEx_paper_dev_test/test_sub_valid_sentEx_A.csv', index=False)
test_sub_valid_sentEx_B.to_csv('./tabular_sets/tabular_sentEx_paper_dev_test/test_sub_valid_sentEx_B.csv', index=False)
test_sub_valid_sentEx_C.to_csv('./tabular_sets/tabular_sentEx_paper_dev_test/test_sub_valid_sentEx_C.csv', index=False)