In [29]:
import openai
import pandas as pd
import time

In [30]:
# Smoke test: send one trivial prompt and print the reply to confirm the
# API client works before running the (much more expensive) classification.
# NOTE(review): `client` is not created in the cells shown here -- it must
# already exist in the kernel; confirm it is defined in an earlier cell.
response = client.chat.completions.create(
    messages=[dict(role="user", content="Say 'Hello, world!'")],
    model="gpt-4-turbo",
)
print(response.choices[0].message.content)


Hello, world!


In [31]:
def classify_news(article_text, model="gpt-4-turbo"):
    """Ask the chat model whether a news article is real or fake.

    Args:
        article_text: Body of the article. Anything that is not a non-blank
            string is rejected without contacting the API.
        model: Name of the chat-completion model to query.

    Returns:
        1 if the model answered "1" (real), 0 if it answered "0" (fake), and
        None when the input is unusable, the API call raised, or the reply
        was anything other than a bare "0" or "1".
    """
    # Guard clause: skip the API round-trip for missing / blank text.
    has_usable_text = isinstance(article_text, str) and article_text.strip() != ""
    if not has_usable_text:
        return None

    # The continuation lines keep their leading spaces because they are part
    # of the text that is sent to the model.
    prompt = (
        "Determine whether the following news article is real (1) or fake (0). \n"
        "        Provide only the number 1 or 0 as output.\n"
        "        \n"
        "        Article:\n"
        f"        {article_text}\n"
        "        "
    )
    chat_messages = [
        {"role": "system", "content": "You are a fact-checking AI."},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=chat_messages,
            temperature=0,
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller drop this row.
        print(f"Error processing article: {e}")
        return None

    # Only a bare "0" or "1" counts as a valid label.
    if answer in ("0", "1"):
        return int(answer)
    return None

def process_news_file(input_file, output_file):
    """Classify every article in a CSV with GPT and save the labelled rows.

    Reads ``input_file``, calls ``classify_news`` once per value of the
    ``text`` column, stores the results in a ``gpt labels`` column, drops the
    rows for which no valid 0/1 prediction came back, and writes the
    remaining rows to ``output_file`` as a UTF-8 CSV. No metrics are computed
    here.

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            ``text`` and ``original label``.
        output_file: Path of the CSV to write.

    Raises:
        ValueError: If either required column is missing from the input.
    """
    df = pd.read_csv(input_file, low_memory=False)

    if not {"text", "original label"}.issubset(df.columns):
        raise ValueError("CSV file must contain 'text' and 'original label' columns.")

    # One API call per article; classify_news returns None on any failure.
    predictions = [classify_news(text) for text in df["text"]]

    # Assign the whole column at once so it exists even when the input has no
    # data rows (a per-row assignment would never create it, and the dropna
    # below would then fail with KeyError).
    labelled = (
        df.assign(**{"gpt labels": predictions})
        .dropna(subset=["gpt labels"])     # discard rows without a valid prediction
        .astype({"gpt labels": int})       # missing values forced a non-int dtype
    )

    labelled.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Results saved to {output_file}")

def evaluate_metrics(processed_file):
    """Print accuracy, precision, recall and F1 for the saved GPT predictions.

    The confusion-matrix counts are computed directly from the ``gpt labels``
    (prediction) and ``original label`` (ground truth) columns, treating 1 as
    the positive class. Accuracy is taken over all rows of the file. Any
    metric whose denominator is zero -- including accuracy on a file with no
    rows -- is reported as 0 instead of raising.

    Args:
        processed_file: Path of a CSV containing the columns ``gpt labels``
            and ``original label``.

    Raises:
        ValueError: If either required column is missing.
    """
    df = pd.read_csv(processed_file)

    if "gpt labels" not in df.columns or "original label" not in df.columns:
        raise ValueError("Processed CSV file must contain 'gpt labels' and 'original label' columns.")

    predicted = df["gpt labels"]
    actual = df["original label"]

    # Vectorised counts; int() turns the numpy scalars into plain Python ints.
    true_positives = int(((predicted == 1) & (actual == 1)).sum())
    false_positives = int(((predicted == 1) & (actual == 0)).sum())
    false_negatives = int(((predicted == 0) & (actual == 1)).sum())
    true_negatives = int(((predicted == 0) & (actual == 0)).sum())

    total_rows = len(df)
    predicted_positive = true_positives + false_positives
    actual_positive = true_positives + false_negatives

    # Every ratio is guarded the same way so an empty or one-sided file
    # yields 0 rather than ZeroDivisionError.
    accuracy = (true_positives + true_negatives) / total_rows if total_rows > 0 else 0
    precision = true_positives / predicted_positive if predicted_positive > 0 else 0
    recall = true_positives / actual_positive if actual_positive > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {f1:.2%}")

In [32]:
if __name__ == "__main__":
    # Source dataset and destination for the rows annotated with GPT labels.
    input_file = "../data/news_data.csv"
    output_file = "ISOT_Data_With_LLM_Predictions.csv"

    # Step 1: classify every article (one API call per row) and save the result.
    process_news_file(input_file=input_file, output_file=output_file)

    # Step 2: score the saved predictions against the original labels.
    evaluate_metrics(processed_file=output_file)

Results saved to news_data_with_predictions.csv
Accuracy: 88.00%
Precision: 82.76%
Recall: 96.00%
F1 Score: 88.89%
