In [5]:
from pathlib import Path
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from datetime import datetime
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, 
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    pipeline,
)
from datasets import Dataset

from improved_data_preprocessor import ImprovedPoliticalDataPreprocessor
from train_improved_model import ImprovedPoliticalClassifierTrainer
from test_improved_model import test_improved_model

In [None]:
preprocessor = ImprovedPoliticalDataPreprocessor()

In [None]:
print(preprocessor)

In [None]:
# Creates and stores data in files.
train_df, val_df, test_df = preprocessor.run_improved_pipeline()

In [None]:
# Object can load its own data
trainer_obj = ImprovedPoliticalClassifierTrainer() # defaults to very small distilbert uncased

In [None]:
trainer, eval_results, model_info = trainer_obj.full_improved_training_pipeline()
print(eval_results)

In [3]:
test_improved_model()

Testing model: /home/ksull18/code/iu-autonomous-fact-checker/aieng/political_detector/trainingresults/latest_improved


Device set to use cuda


Model loaded successfully on cuda

Testing improved model:


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Text: Trump announces new immigration policy
Expected: Political, Predicted: Non-Political (confidence: 1.000) ❌ WRONG
Raw scores: LABEL_0=1.000, LABEL_1=0.000
------------------------------------------------------------
Text: Biden's healthcare reform bill passes
Expected: Political, Predicted: Non-Political (confidence: 1.000) ❌ WRONG
Raw scores: LABEL_0=1.000, LABEL_1=0.000
------------------------------------------------------------
Text: Congressional hearing reveals corruption
Expected: Political, Predicted: Non-Political (confidence: 0.998) ❌ WRONG
Raw scores: LABEL_0=0.998, LABEL_1=0.002
------------------------------------------------------------
Text: New senator elected in swing state
Expected: Political, Predicted: Non-Political (confidence: 1.000) ❌ WRONG
Raw scores: LABEL_0=1.000, LABEL_1=0.000
------------------------------------------------------------
Text: Supreme court decision affects voting rights
Expected: Political, Predicted: Non-Political (confidence: 1.000) ❌ 

In [None]:
test_df = None
with open('./.datasets/test.json', 'r') as file:
    test_df = pd.DataFrame(json.load(file))

# test_df.head(5)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = pipeline(
    task='text-classification',
    # model=str(Path(__file__).resolve().parent / 'trainingresults' / 'improved_political_classifier_20250809_162854'),
    model=str(Path('.').resolve() / 'trainingresults' / 'improved_political_classifier_20250809_162854'),
    # tokenizer=str(Path(__file__).resolve().parent / 'trainingresults' / 'improved_political_classifier_20250809_162854'),
    tokenizer=str(Path('.').resolve() / 'trainingresults' / 'improved_political_classifier_20250809_162854'),
    device=device,
    return_all_scores=True
)

test_cases = [
    # Political texts (should be classified as political)
    ("Trump announces new immigration policy", "Political"),
    ("Biden's healthcare reform bill passes", "Political"), 
    ("Congressional hearing reveals corruption", "Political"),
    ("New senator elected in swing state", "Political"),
    ("Supreme court decision affects voting rights", "Political"),
    
    # Non-political texts (should be classified as non-political)  
    ("I love pizza and going to movies", "Non-Political"),
    ("My cat is sleeping on the couch", "Non-Political"),
    ("Walking in the park on a sunny day", "Non-Political"),
    ("Just watched a great movie on Netflix", "Non-Political"),
    ("The weather is beautiful today", "Non-Political"),
    
    # Edge cases (harder to classify)
    ("I disagree with this decision", "Ambiguous"),
    ("This is important for our future", "Ambiguous"),
    ("People should have the right to choose", "Ambiguous")
]

print("\nTesting improved model:")
print("=" * 80)


correct_predictions = 0
total_clear_cases = 0  # Only count clear political/non-political cases

# for text, expected_category in test_cases:
for i in range(100, 125):
    try:
        # w/DataFrame
        text = test_df['text'][i]
        expected_category = 'Political'
        ### -------------------
        result = model(text)
        
        # Get the prediction
        label_0_score = result[0][0]['score']  # LABEL_0 
        label_1_score = result[0][1]['score']  # LABEL_1
        
        predicted_label = result[0][0]['label'] if label_0_score > label_1_score else result[0][1]['label']
        max_score = max(label_0_score, label_1_score)
        
        # Interpret the prediction (assuming 0=non-political, 1=political)
        if predicted_label == 'LABEL_1':
            prediction = "Political"
        else:
            prediction = "Non-Political"
        
        # Check if prediction is correct (only for clear cases)
        if expected_category != "Ambiguous":
            total_clear_cases += 1
            if prediction == expected_category:
                correct_predictions += 1
                status = "✅ CORRECT"
            else:
                status = "❌ WRONG"
        else:
            status = "⚪ AMBIGUOUS"
        
        print(f"Text: {text}")
        print(f"Expected: {expected_category}, Predicted: {prediction} (confidence: {max_score:.3f}) {status}")
        print(f"Raw scores: LABEL_0={label_0_score:.3f}, LABEL_1={label_1_score:.3f}")
        print("-" * 60)
        
    except Exception as e:
        print(f"ERROR processing '{text}': {e}")

# Calculate accuracy for clear cases
if total_clear_cases > 0:
    accuracy = (correct_predictions / total_clear_cases) * 100
    print(f"\nACCURACY ON CLEAR CASES: {accuracy:.1f}% ({correct_predictions}/{total_clear_cases})")
    
    if accuracy >= 80:
        print("✅ MODEL PERFORMANCE: GOOD")
    elif accuracy >= 60:
        print("⚠️ MODEL PERFORMANCE: MODERATE") 
    else:
        print("❌ MODEL PERFORMANCE: POOR")


Device set to use cuda
Token indices sequence length is longer than the specified maximum sequence length for this model (3366 > 512). Running this sequence through the model will result in indexing errors



Testing improved model:
Text: “...washing your hands regularly and covering sneezes and coughs. But if you are “sick and need to go out you should wear a mask.”

Read why the CDC is not recomending a face mask to protect against Coronavirus.

#OneVoice1 

https://t.co/kkbTSlymgW
Expected: Political, Predicted: Non-Political (confidence: 0.982) ❌ WRONG
Raw scores: LABEL_0=0.982, LABEL_1=0.018
------------------------------------------------------------
Text: This must be a joke, satire or parody. The wildfires are directly attributed to your state's abysmal land management history and release more carbon in a year than every vehicle trapped in your ridiculously under-served infrastructure. You are the problem; Trump is the solution.
Expected: Political, Predicted: Non-Political (confidence: 0.999) ❌ WRONG
Raw scores: LABEL_0=0.999, LABEL_1=0.001
------------------------------------------------------------
ERROR processing ' What will the  new cold war  look like? (Illustration by Brad 