In [119]:
from collections import Counter
import json
import sqlite3
import pandas as pd
from datasets import Dataset
from pathlib import Path
import logging
import re

import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import resample, shuffle
from sklearn.utils.class_weight import compute_class_weight
from pathlib import Path
from datetime import datetime
from typing import List
import os
from enum import Enum

import shutil
import glob

The following is just a work in progress - but this config will help for building and testing I hope.

In [None]:
class FileConfig:
    __FullRunContext = """denotes using the complete dataset of just a smaller portion of it for testing."""
    FullRun = False

    __Percentage = """If NOT a full run, what percentage of data do we use? (Think decimal values 0 < pc < 1)"""
    Percentage = 0.1
    
    __ToBuildContext = """Do we want to build another model or run without build for testing purposes."""
    ToBuild = True

    __CustomLossFnContext = """For certain cases with class imbalance we need a custom loss function."""
    CustomLossFn = False

    BaseModelName = "bert-base-uncased"

# class Labels(Enum)
LabelMap = Enum(
    'LabelMap', 
    [
        ('SUPPORTS', 0),
        ('REFUTES', 1),
        ('NOT ENOUGH INFO', 2),
    ]
)

In [89]:
# FULL_RUN denotes using all the dataset or a small bit of it for testing the process
FULL_RUN = False
MODEL_VERSION = "v0.1.3"
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Turned out the database was 52GB of Wikipedia articles, but the dataset fit OK in one file so chunking wasn't necessary...
Keeping the logic though.

## Loading Data

The data had to be processed on my Windows laptop since the wiki DB was over 50GB large and I wasn't transfering to WSL.
I was able to create Parquet files which are good at storage. 

In [90]:
def load_processed_fever(output_dir: Path, filestart: str,  max_chunks=None):
    """
    Load processed FEVER data from disk
    
    Args:
        output_dir: Directory containing processed chunks
        max_chunks: Maximum number of chunks to load (None for all)
    """
    output_path = Path(output_dir).resolve()
    assert output_path.exists()
    
    # Find all parquet files
    parquet_files = sorted(output_path.glob(f"{filestart}*.parquet"))
    
    if max_chunks:
        parquet_files = parquet_files[:max_chunks]
    
    logger.info(f"Loading {len(parquet_files)} chunks from {output_path}")
    
    # Load and combine all chunks
    dfs = []
    for file in parquet_files:
        df = pd.read_parquet(file)
        dfs.append(df)
        logger.info(f"Loaded {file.name}: {len(df)} samples")
    
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        logger.info(f"Total samples loaded: {len(combined_df)}")
        return Dataset.from_pandas(combined_df)
    else:
        logger.warning("No data files found!")
        return None
    
# processed_data_home = Path(__file__).resolve().parent / '.datasets' / 'processed'
processed_data_home = Path('.').resolve() / '.datasets' / 'processed'
pds = load_processed_fever(processed_data_home, 'fever_train_chunk')
pdft = pd.DataFrame(pds)
pds = load_processed_fever(processed_data_home, 'fever_dev_chunk')
pdfd = pd.DataFrame(pds)
pdft.head()

INFO:__main__:Loading 1 chunks from /home/ksull18/code/iu-autonomous-fact-checker/aieng/judge_model/.datasets/processed
INFO:__main__:Loaded fever_train_chunk_0000.parquet: 48205 samples
INFO:__main__:Total samples loaded: 48205
INFO:__main__:Loading 1 chunks from /home/ksull18/code/iu-autonomous-fact-checker/aieng/judge_model/.datasets/processed
INFO:__main__:Loaded fever_dev_chunk_0000.parquet: 5363 samples
INFO:__main__:Total samples loaded: 5363


Unnamed: 0,id,claim,evidence,label,challenge
0,14802,Asiatic Society of Bangladesh(housed in Nimtal...,"[The society is housed in Nimtali, walking dis...",SUPPORTS,other
1,28540,"Lindfield railway station has 3 bus routes, in...",[Lindfield Station is served by three bus rout...,SUPPORTS,other
2,71874,Mukaradeeb('Wolf's Den') is a city in Iraq nea...,['Wolf's Den') is a small village in Iraq near...,SUPPORTS,combining tables and text
3,70296,Herbivore men was coined by Maki Fukasawa and ...,[The term was coined by the author Maki Fukasa...,SUPPORTS,multi-hop reasoning
4,16578,"Shulin, a 33.1288 km (12.7911 sq mi) land loca...",['forest district') is an inner city district ...,REFUTES,other


In [91]:
print(pdft['label'].value_counts())
print(len(pdft))

label
SUPPORTS           31811
REFUTES            14610
NOT ENOUGH INFO     1784
Name: count, dtype: int64
48205


In [92]:
print(pdft['evidence'][0])
type(pdft['evidence'][0])

['The society is housed in Nimtali, walking distance from the Curzon Hall of Dhaka University, locality of Old Dhaka.', 'The Asiatic Society of Bangladesh is a non political and non profit research organisation registered under both Society Act of 1864 and NGO Bureau, Government of Bangladesh.', 'The Asiatic Society of Bangladesh was established as the Asiatic Society of East Pakistan in Dhaka in 1952 by numbers of Muslim leaders, and renamed in 1972.', 'Ahmed Hasan Dani, a noted Muslim historian and archaeologist of Pakistan played an important role in founding this society.']


list

In [93]:
# what is not enough data?
mask = pdft["label"] == "NOT ENOUGH INFO"
print(mask)
baddf = pdft[mask]

# could also go with iloc
for index, row in baddf.head(10).iterrows():
    print(f"{index + 1}.) {row['claim']}")
    print("Evidence:")
    for t in row['evidence']:
        print(f"  {t}")
    print("---"*10)

0        False
1        False
2        False
3        False
4        False
         ...  
48200    False
48201    False
48202    False
48203    False
48204    False
Name: label, Length: 48205, dtype: bool
40.) Cameroon lists three definitely endangered languages, 13 severely endangered, and 16 critically endangered from among its at least 250 languages.
Evidence:
  Cameroon is home to at least 250 languages.
------------------------------
57.) Alan Lowry only played games with his close relatives in his entire life.
Evidence:
  Running back Roosevelt Leaks also ran for more than 100 yards in that game, making it the first time Texas had two 100-yard rushers in the same bowl game.
  Born and raised in Brenham, Texas, between Houston and Austin, Leaks grew up on his family's farm, where they raised, among other things, cotton and corn.
  Alan Lowry grew up in Irving, Texas.
------------------------------
94.) A Distributed Bragg Reflector laser has diffraction grating on one or both end 

# Model Training

Going to follow pretty close to how I trained the claim extractor / detector model.
Going to finetune BERT first I think as the context might not be big enough in DistilBERT for the evidence.

## Data Preparation

I organized the data while processing the WikiDB and training and dev files.
The data is off balance but I will try to use what I have before making further altercations.

In [94]:
print("DATA IS LOADED IN FROM ABOVE: it is already split into different files as well.")

# Data Shuffle
for _ in range(3):
    pdft = pdft.sample(frac=1, replace=False, ignore_index=True)
    pdfd = pdfd.sample(frac=1, replace=False, ignore_index=True)

DATA IS LOADED IN FROM ABOVE: it is already split into different files as well.


## Loading and Setup with BERT

### Initialize Tokenizer and Model

In [95]:
model_name = "bert-base-uncased"

In [None]:
# Loading tokenizer for this model
tokenizer = AutoTokenizer.from_pretrained(FileConfig.BaseModelName)
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__bool__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_call_one',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_type_id',
 '_processor_class',
 '_save_pretrained',
 '

In [97]:
# Ran into tokenization issue - All tensors in a batch should be same length
# Some were 100 and but one was 187.
# Use padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Loading Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = AutoModelForSequenceClassification.from_pretrained(
    FileConfig.BaseModelName, 
    num_labels=3,  # Yay, Nay, Not enough info
)
model.to(device)

print(f"Model loaded: {FileConfig.BaseModelName}")
print(f"Vocabulary size: {tokenizer.vocab_size}")

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: bert-base-uncased
Vocabulary size: 30522


### Tokenize Data

Unlike the claim_extractor model, which just had the claim and a label, this model has claims and evidence, and 3 labels. 
I believe our tokenizer must handle the proper combination.

> Perhaps a point of improvement - better input and separation of claims and evidence. 

In [None]:
def data_transform(claim: str, evidence: List[str]):
    """
    This will be used later when we need to actually use this model.
    """
    if isinstance(evidence, list):
        evidence_text = " ".join(evidence)
    else:
        evidence_text = str(evidence)
    
    # Update for BERT Specific
    return f"[CLS] CLAIM: {claim} [SEP] EVIDENCE: {evidence_text} [SEP]"
    # return f"CLAIM: {claim} EVIDENCE: {evidence_text}"

def tokenize_fn(dataset):
    # Combine claim and evidence into single text input
    texts = []
    for claim, evidence in zip(dataset['claim'], dataset['evidence']):
        combined_text = data_transform(claim, evidence)
        texts.append(combined_text)

    # Due to issues around tensor length - get actual max length or default to what worked.
    max_len = getattr(tokenizer, 'model_max_length', 512)

    model_inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        # Tensor Size returned is 1021 and doesn't match the 512...
        # max_length=(2**10)
        max_length=512,
        # return_tensors=None, # not necessary once correct lenght established
    )

    label_map = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
    if isinstance(dataset['label'], list):
        # batched
        model_inputs["label"] = [label_map[label] for label in dataset['label']]
    else:
        model_inputs["label"] = [label_map[dataset['label']]]

    return model_inputs

#### Data Debugging

The training is not working because it says the labels are a list...

In [100]:
# Checking the structure of your pandas data before conversion
print("Sample rows from pdft:")
print(pdft[['claim', 'evidence', 'label']].head(2))
print("\nData types:")
print(pdft[['claim', 'evidence', 'label']].dtypes)
print("\nSample evidence type:")
print(type(pdft['evidence'].iloc[0]))
print("Sample evidence content:")
print(pdft['evidence'].iloc[0])
print("\nSample label type:")
print(type(pdft['label'].iloc[0]))
pdft[['claim', 'evidence', 'label']].head(2)

Sample rows from pdft:
                                               claim  \
0  Paulie Gilmore, born James E. Allen from Bosto...   
1  MassResistance is a political activist group w...   

                                            evidence     label  
0  [James E. Allen is an American professional wr...  SUPPORTS  
1  [MassResistance is a hate group which promotes...   REFUTES  

Data types:
claim       object
evidence    object
label       object
dtype: object

Sample evidence type:
<class 'list'>
Sample evidence content:
['James E. Allen is an American professional wrestler and promoter, best known by his ringname "Big" Paulie Gilmore or Gilmorea, who wrestled on the New England independent circuit for the Century Wrestling Alliance, the National Wrestling Alliance, the Millennium Wrestling Federation and the World Wrestling Alliance during the 1990s and early 2000s.', 'James E. Allen made his debut in 1994 and spent a brief time on the local independent circuit prior to joining

Unnamed: 0,claim,evidence,label
0,"Paulie Gilmore, born James E. Allen from Bosto...",[James E. Allen is an American professional wr...,SUPPORTS
1,MassResistance is a political activist group w...,[MassResistance is a hate group which promotes...,REFUTES


### Constant...

LABELS is for computing class weights later...

In [101]:
LABELS = pdft['label'].values

In [102]:
min(pdft['label'].value_counts())

1784

[This StackOverflow post](https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows) lists several ways to shuffle a dataframe. 
Pandas has the builtin `df = df.sample(frac=1).reset_index(drop=True)` which apparently reassigns but does not recreate.
SciKit Learn also has a "shuffle" method but that might require resetting indexes.
There are then seemingly countless ways after that as well...

Side note on sample - it now has an 'ignore_index' param to help with that reset issue.

In [103]:
# Convert pandas DataFrames to 🤗 Dataset objects
if FileConfig.FullRun:
    print("Full Run!")
    dst = Dataset.from_pandas(pdft[['claim', 'evidence', 'label']])
    dsv = Dataset.from_pandas(pdfd[['claim', 'evidence', 'label']])
else:
    # mini-datasets for running into issues... 😖
    # should probably go in the data processing section
    pc = FileConfig.Percentage
    print(f"Partial Run of {pc:.02%}")

    # there's probably an easier way...
    # training_mini = pd.concat([
    #     pdft[pdft['label'] == 'SUPPORTS'].sample(frac=pc, replace=False, ignore_index=True),
    #     pdft[pdft['label'] == 'REFUTES'].sample(frac=pc, replace=False, ignore_index=True),
    #     pdft[pdft['label'] == 'NOT ENOUGH INFO'].sample(frac=pc, replace=False, ignore_index=True),
    # ])

    # val_mini = pd.concat([
    #     pdfd[pdfd['label'] == 'SUPPORTS'].sample(frac=pc, replace=False, ignore_index=True),
    #     pdfd[pdfd['label'] == 'REFUTES'].sample(frac=pc, replace=False, ignore_index=True),
    #     pdfd[pdfd['label'] == 'NOT ENOUGH INFO'].sample(frac=pc, replace=False, ignore_index=True),
    # ])

    # NEW STRATEGY
    numr = min(pdft['label'].value_counts()) * 2
    training_mini = pd.concat([
        pdft[pdft['label'] == 'SUPPORTS'].sample(n=numr, replace=False, ignore_index=True),
        pdft[pdft['label'] == 'REFUTES'].sample(n=numr, replace=False, ignore_index=True),
        # allowing replacement here.
        pdft[pdft['label'] == 'NOT ENOUGH INFO'].sample(n=numr, replace=True, ignore_index=True),
    ])

    val_mini = pd.concat([
        pdfd[pdfd['label'] == 'SUPPORTS'].sample(frac=1, replace=False, ignore_index=True),
        pdfd[pdfd['label'] == 'REFUTES'].sample(frac=1, replace=False, ignore_index=True),
        pdfd[pdfd['label'] == 'NOT ENOUGH INFO'].sample(frac=1, replace=False, ignore_index=True),
    ])

    # Shuffling / Randomizing data
    for _ in range(5):
        training_mini = training_mini.sample(frac=1, replace=False, ignore_index=True)
        val_mini = val_mini.sample(frac=1, replace=False, ignore_index=True)

    print(training_mini['label'].value_counts())
    print(val_mini['label'].value_counts())
    dst = Dataset.from_pandas(training_mini[['claim', 'evidence', 'label']])
    dsv = Dataset.from_pandas(val_mini[['claim', 'evidence', 'label']])

    ## REDO Labels - shouldn't be too different
    LABLES = training_mini['label'].values

# Apply tokenization
train_dataset = dst.map(
    tokenize_fn, 
    batched=True,
    remove_columns=dst.column_names,
)
val_dataset = dsv.map(
    tokenize_fn, 
    batched=True,
    remove_columns=dsv.column_names,
)

print("Data tokenized successfully!")

Partial Run of 10.00%
label
REFUTES            3568
SUPPORTS           3568
NOT ENOUGH INFO    3568
Name: count, dtype: int64
label
SUPPORTS           3000
REFUTES            1947
NOT ENOUGH INFO     416
Name: count, dtype: int64


Map: 100%|██████████| 10704/10704 [00:02<00:00, 4343.91 examples/s]
Map: 100%|██████████| 5363/5363 [00:05<00:00, 1014.38 examples/s]

Data tokenized successfully!





In [118]:
train_dataset.description

''

The below helped me find a mismatch in data-types between the dataset I thought I was making and what was really created...
which the training did not like.

In [104]:
# Check the dataset structure after conversion from pandas
print("Dataset columns:", dst.column_names)
print("Dataset features:", dst.features)
print("First example:")
try:
    print(dst[0])
except Exception as e:
    print(f"Error accessing first example: {e}")

# Check if there are any problematic column names or types
for col in dst.column_names:
    print(f"Column '{col}' type: {type(dst[col])}")
    try:
        print(f"First value: {dst[col][0]}")
        print(f"First value type: {type(dst[col][0])}")
    except Exception as e:
        print(f"Error accessing column '{col}': {e}")
    print("---")

print("="*25)

print("Dataset columns:", train_dataset.column_names)
print("Dataset features:", train_dataset.features)
print("First example:")
try:
    print(train_dataset[0])
except Exception as e:
    print(f"Error accessing first example: {e}")

# Check if there are any problematic column names or types
for col in train_dataset.column_names:
    print(f"Column '{col}' type: {type(train_dataset[col])}")
    try:
        print(f"First value: {train_dataset[col][0]}")
        print(f"First value type: {type(train_dataset[col][0])}")
    except Exception as e:
        print(f"Error accessing column '{col}': {e}")
    print("---")

Dataset columns: ['claim', 'evidence', 'label']
Dataset features: {'claim': Value('string'), 'evidence': List(Value('string')), 'label': Value('string')}
First example:
{'claim': 'Bernie Sanders, former Mayor of Burlington, Vermont lost to Dolores Sandoval as the Democratic bet for the 1990 US House of Representatives election in Vermont.', 'evidence': ["He won election to the U.S. House of Representatives in 1990, representing Vermont's at-large congressional district, later co-founding the Congressional Progressive Caucus."], 'label': 'REFUTES'}
Column 'claim' type: <class 'datasets.arrow_dataset.Column'>
First value: Bernie Sanders, former Mayor of Burlington, Vermont lost to Dolores Sandoval as the Democratic bet for the 1990 US House of Representatives election in Vermont.
First value type: <class 'str'>
---
Column 'evidence' type: <class 'datasets.arrow_dataset.Column'>
First value: ["He won election to the U.S. House of Representatives in 1990, representing Vermont's at-large co

## Fine-Tune the Model

We are doing **transfer learning** with **fine-tuning**. 
BERT was pre-trained to understand language - Thank you!
We fine-tuning the model for a specific task - claim vs opinion here.
The technique = Supervised learning with backpropagation

Deep dive: BERT has millions of weights to understand language. We are adjusting these to suit our classification task. Only our final classification layer is learning from scratch. The rest of BERT is merely adapting instead of being completely retrained. 
BERT (I think) expects a "[MASK]" token to predict values. 
By fine-tuning, we add a layer like: `input text -> BERT Encoder -> Classification Head -> [Claim, Opinion] probabilities`.

### Defining Training Arguments

> I am taking most of this from the claim_extractor I made previously

In [105]:
# Set up directories for saving
datenow = datetime.now()
timestamp = datenow.strftime("%Y%m%d%H%M%S")

# TODO: Give model name at top
move_path = Path().cwd() / "trainingresults" / f'hide-bert_{timestamp}'
out_path = Path().cwd() / "trainingresults" / "latest"
metatdata_file_path = out_path / "metadata.json"

# Below is the logic for moving previous versions
if FileConfig.ToBuild:
    if metatdata_file_path.exists():
        # A model exists in latest already - move to it's timestamp

        try:
            with open(metatdata_file_path, 'r') as file:
                tmp = json.load(file)
                str_path = tmp.get('path', None)
                assert str_path is not None
                ts_path = Path(str_path)
                # Moving the old model into its timestamp directory
                out_path.rename(ts_path)
        except Exception as e:
            logger.warning(e)
            # suggests something in directory didn't finish and should probably be deleted?
        assert not out_path.exists()

    ## Open cannot make the directories after I rename them...
    out_path.mkdir(parents=True, exist_ok=True)

    with open(out_path / "metadata.json", 'w') as file:
        json.dump({"path": str(move_path), "foundation": model_name, 'timestamp': datenow.isoformat(), "version": MODEL_VERSION}, file, indent=2)

In [106]:
## OLD
# training_args = TrainingArguments(
#     output_dir=out_path, # Working directory during training for logs and checkpoints.
#     num_train_epochs=3,              # Start with 3, adjust based on results
#     ## batch size of 16 gets to 5.6/6GB of RTX 3060
#     per_device_train_batch_size=8,  # Reduce if memory issues
#     per_device_eval_batch_size=16,
#     warmup_steps=500, # gradually increase learning rate over 1000 steps | prevents huge descrutive changes early on
#     weight_decay=0.01, # Very mild 1% to prevent memorizing training data exactly. 
#     logging_dir='./logs',
#     ## had set to 10 which is a lot of overhead
#     logging_steps=20,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,
#     dataloader_pin_memory=False, # can help with GPU transfer speed
#     fp16=True, # mixed precision can speedup training if supported
#     dataloader_num_workers=4, # parallel data loading
# )

# New'ish
# I increased Epochs and changed the saving and eval strategies to Steps
training_args = TrainingArguments(
    output_dir=out_path, # Working directory during training for logs and checkpoints.
    num_train_epochs=5,              # Start with 3, adjust based on results
    ## batch size of 16 gets to 5.6/6GB of RTX 3060
    per_device_train_batch_size=8,  # Reduce if memory issues
    per_device_eval_batch_size=8,
    warmup_steps=1000, # gradually increase learning rate over 1000 steps | prevents huge descrutive changes early on
    weight_decay=0.01, # Very mild 1% to prevent memorizing training data exactly. 
    logging_dir='./logs',
    ## had set to 10 which is a lot of overhead
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1", # Changing for balanced data
    greater_is_better=True, # updated
    dataloader_pin_memory=False, # can help with GPU transfer speed
    fp16=True, # mixed precision can speedup training if supported
    dataloader_num_workers=4, # parallel data loading
)

### Define Evaluation Metrics

SciKit Learn has some handy prebuilt functions.

In [107]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Custome Weights

The data is quite unbalanced.
There is a total of 48,205 entries:
- supports = 31,811
- refutes = 14,610
- na = 1,784

Rebalancing to the tiny amount is not desireable.
The `Trainer` can take in a loss function. The signature is just 'Callable' so inside the class it takes in `(outputs, labels, num_items_in_batch)`.

There are several loss functions to consider:
- 


[SciKit-Learn's `compute_class_weight`](https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html) function follows the inverse frequency approach.

In [108]:
for I,J in enumerate(LabelMap):
    print(I)
    print(str(J))

LabelMap['SUPPORTS'].name

0
LabelMap.SUPPORTS
1
LabelMap.REFUTES
2
LabelMap.NOT ENOUGH INFO


'SUPPORTS'

In [109]:
def create_weights(labels):
    unique_labels = np.unique(labels)
    weights_balanced = compute_class_weight('balanced', classes=unique_labels, y=labels)
    weight_tensor = torch.zeros(3)

    for index, en in enumerate(LabelMap):
        label_inx = np.where(unique_labels == en.name)[0][0]
        weight_tensor[index] = weights_balanced[label_inx]
    
    return weight_tensor

class_weights = create_weights(LABELS)
class_weights = class_weights.to(model.device)
print(class_weights)

# This needs work, I don't think weights should change nor be calucated during each batch...
def custom_weighted_loss_fn(outputs, labels, num_items_in_batch=None):
    logits = outputs.logits # Model's raw predictions [batch_size, 3]

    # Method 1: Balanced - Inverse Frequency
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    # This is impure function for now
    loss = loss_fn(logits, labels)
    return loss
    

tensor([0.5051, 1.0998, 9.0069], device='cuda:0')


### Initialize and Train!

In [110]:
print(len(train_dataset))
print(len(val_dataset))

10704
5363


In [None]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    compute_loss_func=custom_weighted_loss_fn if FileConfig.CustomLossFn else None,
)

# Start training
print("Starting training...")
if FileConfig.ToBuild:
    trainer.train()
    # Save the model
    trainer.save_model(out_path) # Where to save model weights and config
    tokenizer.save_pretrained(out_path) # for tokenizer stuff
    print("Model saved!")
    print("Cleaning up Checkpoints...")
    checkpoint_dirs = glob.glob(f"{out_path}/checkpoint-*")
    for checkpoint_dir in checkpoint_dirs:
        shutil.rmtree(checkpoint_dir)
    print("Clean up complete!")

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,1.0176,0.909827,0.615141,0.629001,0.691264,0.615141
1000,0.9193,0.928607,0.599478,0.648868,0.75058,0.599478
1500,0.7674,0.707694,0.716017,0.728436,0.745427,0.716017
2000,0.5298,0.821412,0.699422,0.716486,0.748392,0.699422
2500,0.519,1.024353,0.62465,0.677492,0.779201,0.62465
3000,0.3695,1.162384,0.694947,0.71775,0.751558,0.694947
3500,0.4226,1.045302,0.723849,0.728045,0.736437,0.723849
4000,0.3299,1.089929,0.729629,0.735666,0.742561,0.729629
4500,0.2224,1.613433,0.69215,0.705076,0.741069,0.69215
5000,0.1307,1.608569,0.706694,0.719924,0.742852,0.706694


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Model saved!


## Test the Model

This is kind of the manual process I suppose for the time being.

### Loading Model

In [112]:
print("\n")
# Load your fine-tuned model
# TODO: UPDATE!!!
the_judge = pipeline(
    task="text-classification",
    model=str(out_path),
    tokenizer=str(out_path),
    device='cuda'
)

Device set to use cuda






Making up some claims and support for manual testing

In [113]:
fake_test_data = [
    # SUPPORTS - 4 claims
    {
        "id": 1,
        "claim": "The 2024 US presidential election had the highest voter turnout in history.",
        "evidence": [
            "According to the Federal Election Commission, approximately 158 million Americans voted in the 2024 presidential election. This surpassed the previous record of 155 million voters set in 2020. Election officials reported that turnout reached 66.8% of eligible voters, marking a new milestone in American electoral participation.",
        ],
        "label": "SUPPORTS",
        "context": "Short and factual about voter turnout."
    },
    {
        "id": 2, 
        "claim": "Social Security benefits increased by 3.2% in 2024.",
        "evidence": [
            "The Social Security Administration announced a 3.2% cost-of-living adjustment for 2024 benefits. This increase affects over 67 million Social Security beneficiaries and 7 million SSI recipients. The adjustment was based on the Consumer Price Index data from the third quarter of 2023.",
        ],
        "label": "SUPPORTS",
        "context": "Matching percentages."
    },
    {
        "id": 3,
        "claim": "The federal minimum wage in America is $7.25 per hour.",
        "evidence": [
            "The federal minimum wage has remained at $7.25 per hour since July 2009, when it was last increased under the Fair Minimum Wage Act. While many states have implemented higher minimum wages, the federal rate serves as the baseline for all states. Congressional attempts to raise the federal minimum wage to $15 per hour have stalled in recent legislative sessions.",
        ],
        "label": "SUPPORTS",
        "context": "Simple fact about minimum wage."
    },
    {
        "id": 4,
        "claim": "Medicare covers prescription drug costs for seniors.",
        "evidence": [
            "Medicare Part D provides prescription drug coverage for Medicare beneficiaries, covering approximately 63 million seniors and disabled individuals. The program was established in 2006 and helps reduce out-of-pocket prescription costs. Recent legislation has also capped annual prescription drug costs at $2,000 starting in 2025 for Medicare recipients.",
        ],
        "label": "SUPPORTS",
        "context": "More evidence for slightly more vague claim."
    },
    # REFUTES - 4 claims  
    {
        "id": 5,
        "claim": "The US Constitution requires congressional approval for all military deployments overseas.",
        "evidence": [
            "The Constitution grants Congress the power to declare war, but does not require congressional approval for all military actions. The President, as Commander in Chief, has authority to deploy troops for limited periods without congressional authorization. The War Powers Resolution of 1973 requires congressional approval only for deployments lasting more than 60 days.",
        ],
        "label": "REFUTES",
        "context": "A constitutional misconception."
    },
    {
        "id": 6,
        "claim": "Climate change legislation was passed by Congress in 2023 with bipartisan support.",
        "evidence": [
            "No major climate change legislation received bipartisan support in Congress during 2023. The Inflation Reduction Act, which included climate provisions, was passed in 2022 with only Democratic votes. Several climate-related bills were introduced in 2023 but failed to advance due to partisan disagreements over implementation and funding mechanisms.",
        ],
        "label": "REFUTES",
        "context": "A false bipartisan claim about climate legislation."
    },
    {
        "id": 7,
        "claim": "The federal deficit decreased by 50% in 2024.",
        "evidence": [
            "The Congressional Budget Office reported that the federal deficit increased by approximately 8% in fiscal year 2024, reaching $1.9 trillion. This represents a significant increase from the previous year's deficit of $1.7 trillion. Rising interest payments on national debt and increased government spending contributed to the larger deficit.",
        ],
        "label": "REFUTES",
        "context": "A favourite for some, incorrect percentages."
    },
    {
        "id": 8,
        "claim": "All Supreme Court justices must be confirmed by a two-thirds majority in the Senate.",
        "evidence": [
            "Supreme Court nominees require only a simple majority vote for confirmation in the Senate, not a two-thirds majority. This threshold was established by Senate rules and precedent. The confirmation process involves Senate Judiciary Committee hearings followed by a full Senate vote, where 51 votes are sufficient for confirmation.",
        ],
        "label": "REFUTES",
        "context": "Incorrect claim about voting threshold."
    },
    # NOT ENOUGH INFO - 2 claims
    {
        "id": 9,
        "claim": "The new infrastructure bill will create 500,000 jobs in rural communities specifically.",
        "evidence": [
            "The Infrastructure Investment and Jobs Act allocated $1.2 trillion for various infrastructure projects across the United States. The legislation includes funding for roads, bridges, broadband expansion, and water systems. Economic analysts project the bill will create millions of jobs nationwide over the next decade, with significant benefits expected for both urban and rural areas.",
        ],
        "label": "NOT ENOUGH INFO",
        "context": "Evidence is about the project but without the supporting figures."
    },
    {
        "id": 10,
        "claim": "Congressional approval ratings reached their lowest point since 1974 last month.",
        "evidence": [
            "Recent polling shows Congress has historically low approval ratings, with multiple surveys indicating public dissatisfaction with legislative performance. Gallup polling has tracked congressional approval since the 1970s, showing significant fluctuations over the decades. Political polarization and gridlock have contributed to declining public confidence in the institution.",
        ],
        "label": "NOT ENOUGH INFO",
        "context": "Evidence does not specify exact timeframe nor comparison to 1974."
    },
    {
        "id": 11,
        "claim": "Trump and Putin are meeting in Alaska to talk about ending the war in Ukraine.",
        "evidence": [],
        "label": "NOT ENOUGH INFO",
        "context": "Very recent news as of today, withholding evidence even though it does techincally exist."
    }
]



Above: I was going to do the multiple sentences but when we pass data into the model I zip them up anyways so... it's half done for now.

In [114]:
# Create DataFrame and Dataset
def create_fake_dataset():
    """Create fake dataset for testing judge model"""
    df = pd.DataFrame(fake_test_data)
    
    print("Test Dataset Summary:")
    print(f"Total samples: {len(df)}")
    print("\nLabel Distribution:")
    label_counts = df['label'].value_counts()
    for label, count in label_counts.items():
        print(f"{label}: {count}")
    
    print("\nClaim Length Statistics:")
    df['claim_length'] = df['claim'].str.len()
    print(f"Average claim length: {df['claim_length'].mean():.0f} characters")
    print(f"Shortest claim: {df['claim_length'].min()} characters")
    print(f"Longest claim: {df['claim_length'].max()} characters")
    
    return Dataset.from_pandas(df)

# Create the dataset
test_dataset = create_fake_dataset()

Test Dataset Summary:
Total samples: 11

Label Distribution:
SUPPORTS: 4
REFUTES: 4
NOT ENOUGH INFO: 3

Claim Length Statistics:
Average claim length: 71 characters
Shortest claim: 45 characters
Longest claim: 90 characters


In [115]:
# for each test_case
for tc in test_dataset:
    print(f"TEST CASE: {tc['id']:02}")
    print(f"Claim: {tc['claim']}")
    print(f"Evidence: ")
    for x in [y for y in tc['evidence']]:
        print(f"  {x}")
    expected = tc['label']
    print(f"Label: {expected}")
    print(f"ABOUT: {tc['context']}")
    fixed = data_transform(tc['claim'], tc['evidence'])
    result_list = the_judge(fixed)
    result = result_list[0]
    print("RESULT:", json.dumps(result, indent=2))
    actual = result.get('label')
    if "0" in actual:
        actual = "SUPPORTS"
    elif "1" in actual:
        actual = "REFUTES"
    elif "2" in actual:
        actual = "NOT ENOUGH INFO"
    else:
        actual = "ERROR - WHAT?!?"
    print()
    correct = actual == expected
    if correct:
        print(f"✅ PREDICTED: {actual} | ACTUAL: {tc['label']}")
    else:
        print(f"❌ PREDICTED: {actual} | ACTUAL: {tc['label']}")
    print()

TEST CASE: 01
Claim: The 2024 US presidential election had the highest voter turnout in history.
Evidence: 
  According to the Federal Election Commission, approximately 158 million Americans voted in the 2024 presidential election. This surpassed the previous record of 155 million voters set in 2020. Election officials reported that turnout reached 66.8% of eligible voters, marking a new milestone in American electoral participation.
Label: SUPPORTS
ABOUT: Short and factual about voter turnout.
RESULT: {
  "label": "LABEL_0",
  "score": 0.9166038632392883
}

✅ PREDICTED: SUPPORTS | ACTUAL: SUPPORTS

TEST CASE: 02
Claim: Social Security benefits increased by 3.2% in 2024.
Evidence: 
  The Social Security Administration announced a 3.2% cost-of-living adjustment for 2024 benefits. This increase affects over 67 million Social Security beneficiaries and 7 million SSI recipients. The adjustment was based on the Consumer Price Index data from the third quarter of 2023.
Label: SUPPORTS
ABOUT

  return forward_call(*args, **kwargs)


RESULT: {
  "label": "LABEL_2",
  "score": 0.6960554718971252
}

✅ PREDICTED: NOT ENOUGH INFO | ACTUAL: NOT ENOUGH INFO

TEST CASE: 11
Claim: Trump and Putin are meeting in Alaska to talk about ending the war in Ukraine.
Evidence: 
Label: NOT ENOUGH INFO
ABOUT: Very recent news as of today, withholding evidence even though it does techincally exist.
RESULT: {
  "label": "LABEL_1",
  "score": 0.9951300621032715
}

❌ PREDICTED: REFUTES | ACTUAL: NOT ENOUGH INFO



I know the validation data shouldn't be used, but just checking here for fun...

In [116]:
t_preds = []
t_labels = []
min_n = min(pdfd['label'].value_counts())
pdfs = pd.concat([
        pdfd[pdfd['label'] == 'SUPPORTS'].sample(n=min_n, replace=False, ignore_index=True),
        pdfd[pdfd['label'] == 'REFUTES'].sample(n=min_n, replace=False, ignore_index=True),
        pdfd[pdfd['label'] == 'NOT ENOUGH INFO'].sample(frac=1, replace=False, ignore_index=True),
    ])
for _, row in pdfs.iterrows():
    fixed = data_transform(row['claim'], row['evidence'])
    result_list = the_judge(fixed)
    result = result_list[0]
    actual = result.get('label')
    expected = row['label']
    if "0" in actual:
        actual = "SUPPORTS"
    elif "1" in actual:
        actual = "REFUTES"
    elif "2" in actual:
        actual = "NOT ENOUGH INFO"
    else:
        actual = "ERROR - WHAT?!?"
    t_preds.append(actual)
    t_labels.append(expected)

def compute_metrics_simple(predicted_values: List[str], actual_values: List[str]):
    precision, recall, f1, _ = precision_recall_fscore_support(
        actual_values, predicted_values, average='weighted'
    )
    accuracy = accuracy_score(actual_values, predicted_values)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
test_results = compute_metrics_simple(t_preds, t_labels)
print(json.dumps(test_results, indent=2))

  return forward_call(*args, **kwargs)


{
  "accuracy": 0.6145833333333334,
  "f1": 0.5873959778635663,
  "precision": 0.6221580366647493,
  "recall": 0.6145833333333334
}


In [117]:
print(pdfs['label'].value_counts())
print()
print(len(t_preds))
print()
mycounts = {}
for exp, act in zip(t_preds, t_labels):
    try:
        mycounts[exp][act] += 1
    except:
        one = mycounts.get(exp)
        if one is None:
            mycounts[exp] = {}
        mycounts[exp][act] = 1

print(json.dumps(mycounts, indent=2))

label
SUPPORTS           416
REFUTES            416
NOT ENOUGH INFO    416
Name: count, dtype: int64

1248

{
  "SUPPORTS": {
    "SUPPORTS": 334,
    "REFUTES": 74,
    "NOT ENOUGH INFO": 146
  },
  "REFUTES": {
    "SUPPORTS": 45,
    "REFUTES": 315,
    "NOT ENOUGH INFO": 152
  },
  "NOT ENOUGH INFO": {
    "SUPPORTS": 37,
    "REFUTES": 27,
    "NOT ENOUGH INFO": 118
  }
}
