In [20]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [21]:
import pandas as pd
from utils import llama_chat_hf
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [22]:
torch.cuda.get_device_name()

'NVIDIA A100-PCIE-40GB'

In [23]:
df = pd.read_csv("../data/fakenewsnet.csv")
df = df[df["dataset"] == "politifact"]
signal_df = pd.read_csv("../data/signals.csv")

In [5]:
model = llama_chat_hf(size=13)

Loading checkpoint shards: 100%|██████████| 3/3 [00:20<00:00,  6.96s/it]


In [6]:
system_context = \
    """You are a helpful and unbiased news verification assistant. You will be provided with the title and the full body of text of a news article. Then, you will answer further questions related to the given article. Ensure that your answers are grounded in reality, truthful and reliable. {abstain_context}It is essential that you only answer objectively with one of the following options: {options}. Please do not answer with anything other than the option provided."""

prompt = """{title}\n{text}\n\n{question} ({options})"""
abstain_context = "You are only allowed to answer with 'Unsure' if you do not have enough information or context to provide a reliable answer."
abstain_context = ""
# fake_news_context = "Fake news is false or inaccurate information, especially that which is deliberately intended to deceive." # Decreases performance.

In [7]:
def category_mapping(answer):
    if answer.lower().startswith("no"):
        category = 0
    elif answer.lower().startswith("yes"):
        category = 1
    else:
        category = -1

    return category

In [17]:
preds = []
trues = []
with tqdm(total=len(df)) as pbar:
    for i, article_row in enumerate(df.sample(frac=1).itertuples()):
        # ZS Question
        system_context_zs = system_context.format(options="Yes/No", abstain_context="")
        prompt_formatted = prompt.format(title=article_row.title, text=article_row.text, question="Can this article be categorised as fake news?", options="Yes/No")
        try:
            answer_zs = model.prompt(prompt_formatted, system_context=system_context_zs, allow_abstain=False)
        except torch.cuda.OutOfMemoryError as e:
            continue # stop processing this example
        
        category = category_mapping(answer_zs)
        preds.append(category)
        label_converter = lambda x: 0 if x == "real" else 1
        trues.append(label_converter(article_row.objective))

        acc = accuracy_score(trues, preds)
        f1 = f1_score(trues, preds, average="macro")
        num_yes = len([x for x in preds if x == 1])
        num_no = len([x for x in preds if x == 0])
        num_abstain = len([x for x in preds if x == -1])
        updated_description = f"Acc={acc*100:.2f}, F1={f1*100:.2f}, Total={i}, Num_Yes={num_yes}, Num_No={num_no}, Num_Abstain={num_abstain}"
        pbar.set_description(updated_description)
        
        # print(answer_zs)
        # WS Questions
        # print(10*"-")
        # print("Objective:", article_row.objective)
        # for j, question_row in enumerate(signal_df.itertuples()):
        #     system_context_ws = system_context.format(options="Yes/Unsure/No", abstain_context=abstain_context)
        #     prompt_formatted_ws = prompt.format(title=article_row.title, text=article_row.text, question=question_row.Question, options="Yes/Unsure/No")

        #     try:
        #         answer_ws = model.prompt(prompt_formatted_ws, system_context=system_context_ws, allow_abstain=True)
        #     except torch.cuda.OutOfMemoryError as e:
        #         break # stop processing this example and the next questions

        #     category_ws = category_mapping(answer_ws)
        #     print(question_row.Question, category_ws)
        #     print(answer_ws)
        if i > 100:
            break

  0%|          | 0/922 [00:00<?, ?it/s]

Acc=78.57, F1=63.48, Total=14, Num_Yes=1, Num_No=13, Num_Abstain=0:   0%|          | 0/922 [00:18<?, ?it/s]


KeyboardInterrupt: 

In [12]:
print(f"Coverage: {len([x for x in preds if x != -1])/len(preds)*100:.2f}%")
print("Acc:", accuracy_score(trues, preds))
print("F1-macro:", f1_score(trues, preds, average="macro"))
print(classification_report(trues, preds))

Coverage: 100.00%
Acc: 0.5280898876404494
F1-macro: 0.4159375
              precision    recall  f1-score   support

           0       0.52      0.93      0.67        46
           1       0.57      0.09      0.16        43

    accuracy                           0.53        89
   macro avg       0.55      0.51      0.42        89
weighted avg       0.55      0.53      0.42        89

