In [1]:
# built-in libraries
import csv
import gc
import json
import re
import unicodedata
from pathlib import Path
from typing import List

# 3rd party libraries
import datasets
import pandas as pd
import requests
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# custom libraries


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Absolute path of the LIAR dataset folder, anchored at the current working directory.
data_sets_dir = Path('.').resolve().joinpath('.data_sets', 'liar_dataset')
print(data_sets_dir)

/home/ksull18/code/iu-autonomous-fact-checker/aieng/mini_6atters/.data_sets/liar_dataset


In [3]:
# Column names for the LIAR TSV file; it is read with header=None, so the names
# are supplied here.
column_names = [
    'id', 'label', 'statement', 'subject', 'speaker',
    'speaker_job_title', 'state', 'party', 'barely_true_counts',
    'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context',
]

# quoting=csv.QUOTE_NONE: the statements contain literal double quotes that are
# part of the text, not CSV quoting. With pandas' default quote handling a field
# that opens with a quote can swallow the following tabs/newlines and silently
# merge several rows into one.
training_df = pd.read_csv(
    data_sets_dir / 'train.tsv',
    sep='\t',
    encoding='utf-8',
    header=None,
    names=column_names,
    quoting=csv.QUOTE_NONE,
)

In [4]:
# Eyeball the first ten statements, one per line.
for statement in training_df['statement'].iloc[:10]:
    print(statement)

Says the Annies List political group supports third-trimester abortions on demand.
When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.
Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."
Health care reform legislation is likely to mandate free sex change surgeries.
The economic turnaround started at the end of my term.
The Chicago Bears have had more starting quarterbacks in the last 10 years than the total number of tenured (UW) faculty fired during the last two decades.
Jim Dunnam has not lived in the district he represents for years now.
I'm the only person on this stage who has worked actively just last year passing, along with Russ Feingold, some of the toughest ethics reform since Watergate.
However, it took $19.5 million in Oregon Lottery funds for the Port of Newport to eventually land the new NOAA Marine Operations Center-Pacific.
Says

In [5]:
# Number of missing values in each column.
print(training_df.isna().sum())

id                         0
label                      0
statement                  0
subject                    2
speaker                    2
speaker_job_title       2898
state                   2210
party                      2
barely_true_counts         2
false_counts               2
half_true_counts           2
mostly_true_counts         2
pants_on_fire_counts       2
context                  102
dtype: int64


In [6]:
# Distribution of labels: how many statements carry each truthfulness rating.
label_column = training_df['label']
print(label_column.value_counts())

label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64


In [None]:
# https://huggingface.co/datasets/roupenminassian/twitter-misinformation?library=datasets

# twitter_ds = datasets.load_dataset("roupenminassian/twitter-misinformation")
# train_df = twitter_ds['train'].to_pandas()
# test_df = twitter_ds['test'].to_pandas()

In [None]:
# train_df.to_csv('.data_sets/twitter_misinformation/train.csv', index=False)
# test_df.to_csv('.data_sets/twitter_misinformation/test.csv', index=False)
# train_df.to_json('.data_sets/twitter_misinformation/train.json', orient='records')
# test_df.to_json('.data_sets/twitter_misinformation/test.json', orient='records')


In [2]:
# Read the train and test splits of the twitter-misinformation dataset and
# stack them into a single frame.
raw_dfs: List[pd.DataFrame] = []
for split in ('train', 'test'):
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(Path(f'./.data_sets/twitter_misinformation/{split}.json'), 'r', encoding='utf-8') as file:
        raw_dfs.append(pd.DataFrame(json.load(file)))

# ignore_index=True: each split arrives with its own 0..n-1 index, so a plain
# concat would leave duplicate labels and make label-based lookups ambiguous.
df = pd.concat(raw_dfs, ignore_index=True)

# tweet_train_df = pd.read_csv('./.data_sets/twitter_misinformation/train.csv')
# tweet_test_df = pd.read_csv('./.data_sets/twitter_misinformation/test.csv')

In [6]:
# Apply Unicode NFKD normalization to every tweet, then save the frame as JSON records.
df['text'] = df['text'].map(lambda raw_text: unicodedata.normalize('NFKD', raw_text))
df.to_json('./.data_sets/twitter_misinformation/cleaned.json', orient='records')

# Print the first ten tweets and flag any that still contain a literal backslash-u sequence.
for tweet in df['text'].iloc[:10]:
    print(tweet)
    has_escape = r"\u" in tweet
    if has_escape:
        print("Yes")

Local Charlotte, NC news station WSOCTV is reporting that sources tell them dash cameras captured Keith Scott getting out of car and coming towards officers with a gun in his hand:#BREAKING: Sources tell Channel 9 dash camera video shows #KeithScott getting out car, coming toward officers with gun in his hand pic.twitter.com/GGuM2Ow3wk  WSOCTV (@wsoctv) September 21, 2016For a second night, protests over a deadly officer-involved shooting in Charlotte, North Carolina, turned violent, with police firing tear gas and demonstrators throwing objects and trying to damage vehicles.Keith Lamont Scott, a father of seven, was killed by police in an apartment complex parking lot Tuesday as officers looked for another man named in a warrant they were trying to serve. The shooting set off a long night of violent protests and Wednesday the demonstrations continued for a second night, starting off as a peaceful march through downtown Charlotte. But when the demonstrators neared an Omni Hotel, some p

In [None]:
# The following will require ollama to be running as well - or a model of your choosing.

# Did not work...
def extract_5w1h(tweet_text, model='deepseek-llm:7b',
                 url='http://localhost:11434/api/generate', timeout=120):
    """Ask an Ollama-style generate endpoint to extract 5W1H elements from a tweet.

    Args:
        tweet_text: Text of the tweet; it is embedded verbatim in the prompt.
        model: Model name sent in the request payload.
        url: Generate endpoint to POST to. Override it when the server is not
            reachable on localhost (e.g. it runs on another host).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on an unresponsive server.

    Returns:
        The decoded JSON body of the HTTP response.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-2xx HTTP status.
    """
    prompt = f"""
    Extract the 5W1H elements from this tweet. Return as JSON:
    
    Tweet: "{tweet_text}"
    
    Extract:
    - WHO: The main person/entity being discussed
    - WHAT: The main action or claim
    - WHERE: Location (if mentioned)  
    - WHEN: Time reference (if mentioned)
    - WHY: Reason or motivation (if mentioned)
    - HOW: Method or manner (if mentioned)
    
    Return only valid JSON. Use null for missing elements.
    """

    response = requests.post(
        url,
        json={'model': model, 'prompt': prompt, "stream": False},
        timeout=timeout,
    )
    # Surface HTTP errors here instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

https://superuser.com/questions/1679757/accessing-windows-localhost-from-wsl2
WSL2 runs like a VM inside Windows. It has a virtual router on the Windows host that connects it to the Windows host and the outside world.
Use `ip route` and look for "default via"

I used the "172.17.0.1" IP address because it is another known name for WSL2 - localhost did not work. 

In [None]:
# Testing!!!

# Could not get WSL2 to successfully connect to Ollama on Windows, and I don't want to install it again inside WSL, so trying a new approach.
# thing = extract_5w1h(df.iloc[1]['text'])
# print(thing)

In [None]:
# Trying small hugging face model
# def get_info_hf_small(text: str):
#     pl = pipeline("text2text-generation", model="google/flan-t5-small") # ~308M
#     prompt = f"Extract WHO, WHAT, WHERE, WHEN, WHY, HOW FROM: {text}"
#     return pl(prompt, max_length=150, do_sample=False)

# result = get_info_hf_small(df['text'].iloc[1])
# print(result)

https://huggingface.co/HuggingFaceTB/SmolLM3-3B
could try SmolLm3
Smol Stats -> 5GB and 1GB
But it keeps crashing the Kernel

http://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct
SmolLM3 only comes in a 3B flavor, so dropping down to SmolLM2 because SmolLM3 keeps crashing the kernel.
SmolLM2 -> 3.42G

In [None]:
# model_name = "HuggingFaceTB/SmolLM3-3B"
# device = "cuda"  # for GPU usage or "cpu" for CPU usage

# # load the tokenizer and the model
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
# ).to(device)

# # prepare the model input
# prompt = "Give me a brief explanation of gravity in simple terms."
# messages_think = [
#     {"role": "user", "content": prompt}
# ]

# text = tokenizer.apply_chat_template(
#     messages_think,
#     tokenize=False,
#     add_generation_prompt=True,
# )
# model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# # Generate the output
# generated_ids = model.generate(**model_inputs, max_new_tokens=32768)

# # Get and decode the output
# output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
# print(tokenizer.decode(output_ids, skip_special_tokens=True))

In [4]:
# Load the model at most once per kernel session: reloading the same model
# appeared to cause RAM issues.
if 'smol_model' in globals():
    print("Model already loaded, skip loading again...")
else:
    print("Loading model for first time...")
    checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    device = "cpu"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    smol_model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

    # Fall back to the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded")

# TODO: Proper Class for the return type might make this easier.
# The Function  
def extract_5w1h_chat(text: str, max_text_chars: int = 2000):
    """Extract 5W1H information from ``text`` with the loaded chat model.

    Relies on the module-level ``tokenizer``, ``smol_model`` and ``device``.

    Args:
        text: Source text to analyse.
        max_text_chars: Longest prefix of ``text`` that is sent to the model.
            Longer inputs are cut to this many characters to keep the prompt
            (and CPU generation time) bounded. Pass 0 to disable truncation.

    Returns:
        The parsed JSON object (a dict) when the model's reply contains one,
        otherwise ``{"raw": <reply text>}``.
    """
    if max_text_chars and len(text) > max_text_chars:
        text = text[:max_text_chars]

    messages = [{
        "role": "user", 
        "content": f"""Extract 5W1H information from this text. Return only valid JSON. If a value cannot be determined, please indicate with 'null':

Text: {text}

JSON format:
{{"WHO": "person", "WHAT": "action", "WHERE": "location", "WHEN": "time", "WHY": "reason", "HOW": "method"}}"""
    }]

    # add_generation_prompt=True appends the assistant header to the prompt so
    # the model starts answering right away instead of spending new tokens on
    # writing the header itself.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)

    outputs = smol_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # explicit mask avoids the padding warning
        max_new_tokens=120,
        do_sample=False,  # greedy decoding: same input gives the same extraction
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_len = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    # Defensive: drop a leading "assistant" role marker if one still shows up.
    if response.startswith("assistant"):
        response = response[len("assistant"):].strip()

    parsed = _first_json_object(response)
    if parsed is None:
        return {"raw": response}
    return parsed


def _first_json_object(raw: str):
    """Return the JSON object that starts at the first ``{`` in ``raw``, else ``None``.

    Parsing stops at the end of that object, so text the model appends after
    the JSON (even text containing ``}``) does not break the parse.
    """
    start = raw.find("{")
    if start == -1:
        return None
    try:
        parsed, _ = json.JSONDecoder().raw_decode(raw, start)
    except json.JSONDecodeError:
        return None
    return parsed

# Smoke test: run the extractor over the first 25 tweets and keep input/output pairs.
# test_text = "Biden announced new climate policies in Washington yesterday"
test_results = []
for tweet in df['text'].iloc[:25]:
    print(f"Tweet: {tweet}")
    result = extract_5w1h_chat(tweet)
    print(f"Result: {result}")
    test_results.append(dict(input=tweet, output=result))

# Save the collected pairs as pretty-printed JSON.
with open('./test_results.json', 'w') as file:
    file.write(json.dumps(test_results, indent=2))

# Try to parse as JSON
# print("Clean JSON:")
# print(json.dumps(result, indent=2))

Loading model for first time...
Model loaded
Tweet: Local Charlotte, NC news station WSOCTV is reporting that sources tell them dash cameras captured Keith Scott getting out of car and coming towards officers with a gun in his hand:#BREAKING: Sources tell Channel 9 dash camera video shows #KeithScott getting out car, coming toward officers with gun in his hand pic.twitter.com/GGuM2Ow3wk  WSOCTV (@wsoctv) September 21, 2016For a second night, protests over a deadly officer-involved shooting in Charlotte, North Carolina, turned violent, with police firing tear gas and demonstrators throwing objects and trying to damage vehicles.Keith Lamont Scott, a father of seven, was killed by police in an apartment complex parking lot Tuesday as officers looked for another man named in a warrant they were trying to serve. The shooting set off a long night of violent protests and Wednesday the demonstrations continued for a second night, starting off as a peaceful march through downtown Charlotte. But

KeyboardInterrupt: 

The first SmolLM2-1.7B-Instruct test was not successful, but it did produce some results.
Then the kernel crashed.