## Import Libraries

In [None]:
import pandas as pd
import plotly.express as px
import gradio as gr
import warnings
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import TFAutoModelForSequenceClassification
import torch
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.inspection import permutation_importance
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


## Read Data

In [None]:
# Read the two source CSVs: genuine articles ("True") and fabricated ones ("Fake").
true_df, false_df = map(pd.read_csv, ("archive/True.csv", "archive/Fake.csv"))

In [None]:
# Attach a human-readable authenticity tag to every row of each source frame.
true_df['auth'], false_df['auth'] = "real", "fake"

In [None]:
# Binary target used by the classifiers: 0 = real article, 1 = fake article.
true_df['label'], false_df['label'] = 0, 1

In [None]:
# Stack the real and fake articles into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it both
# source frames keep their own 0-based labels, so every label appears twice
# and any later label-based lookup (e.g. .loc[X_test.index, ...]) returns
# duplicated, misaligned rows.
news_df = pd.concat([true_df, false_df], ignore_index=True)

In [None]:
# Build the model input text as "<title>:<text>".
news_df['article'] = news_df['title'].add(":").add(news_df['text'])


In [None]:
# Display the combined frame as the cell output for a quick sanity check.
news_df

# Testing models 
#### (Exploratory cells; these do not need to be run)

## Zero-Shot Classification

In [None]:
# Zero-shot classification with an NLI model (inference only, no training).
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_bart_mnli():
    """Load the BART-MNLI model and its tokenizer once and reuse them.

    zero_shot is called once per article, so reloading the checkpoint on
    every call dominates the run time; caching keeps a single copy alive.
    """
    checkpoint = 'facebook/bart-large-mnli'
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model.eval()  # inference only
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer


def zero_shot(df):
    """Score one text against the hypotheses "true" and "false".

    Args:
        df: The text to classify. Despite the name this is a single string
            (the caller passes one article at a time), not a DataFrame.

    Returns:
        dict mapping "true" and "false" to a float in [0, 1]: the softmax
        weight of logit column 2 versus column 0 for the hypothesis
        "This example is about <label>.".
    """
    bart, tokenizer = _load_bart_mnli()

    results = {}
    classifications = ["true", "false"]

    for classification in classifications:
        statement = f"This example is about {classification}."

        # Encode (premise, hypothesis) as one pair, truncated to the model limit.
        token = tokenizer.encode(df, statement, return_tensors="pt", truncation=True)

        # No gradients are needed for scoring; skipping them saves memory and time.
        with torch.no_grad():
            logits = bart(token)[0]

        # Keep columns 0 and 2 and drop column 1. For this MNLI checkpoint
        # these are presumably contradiction / entailment with neutral dropped
        # (verify against the model card if the checkpoint is swapped).
        contra_entail = logits[:, [0, 2]]

        prob = contra_entail.softmax(dim=1)
        results[classification] = prob[:, 1].item()
    return results
        

In [None]:
# Score every article with zero_shot, using only its first 512 characters.
# Takes too long to run on the full dataset.
news_df['predict'] = [zero_shot(text[:512]) for text in news_df['article']]


In [None]:
from transformers import pipeline

# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
news_df['article'] = news_df['title'] + ":" + news_df['text']

# The pipeline expects plain Python lists: one of input texts and one of
# candidate labels (here, the distinct article subjects).
candidate_labels = news_df['subject'].unique().tolist()
result = classifier(news_df['article'].tolist(), candidate_labels)

# One result per input text, in input order, so positional assignment is safe.
news_df['bart_result'] = result

print(result)

# Final Run

## Base Model

In [None]:
# Create training and test datasets: an 80/20 split with a fixed seed,
# stratified on the label so both splits keep the same real/fake ratio.
X_train, X_test, y_train, y_test = train_test_split(
    news_df['article'],
    news_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=news_df['label'],
)


In [None]:
# Baseline binary classifier: TF-IDF features fed into logistic regression.
# The vectorizer drops English stop words and is capped at 10,000 features.
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred = lr.predict(X_test_tfidf)


## Performance Metrics

#### Domain Specific

In [None]:
# Per-domain (subject) evaluation of the baseline classifier.
# X_test keeps the row labels of news_df, so they are used to look up each
# test article's subject. That lookup is only one-to-one when the labels are
# unique; fail early with a clear message instead of reporting misaligned rows.
if not news_df.index.is_unique:
    raise ValueError(
        "news_df has duplicate index labels; rebuild it with "
        "pd.concat(..., ignore_index=True) or reset_index(drop=True) before "
        "splitting so each test row maps to exactly one subject."
    )

test_domains = news_df.loc[X_test.index, 'subject']
uv = news_df['subject'].unique()

# Plain arrays so the boolean mask is applied by position to both the
# true labels and the predictions (y_pred is already a NumPy array).
y_test_values = y_test.to_numpy()
for domain in uv:
    idx = (test_domains == domain).to_numpy()
    if not idx.any():
        # No test articles for this subject; nothing to report.
        continue
    print(f"\nDomain: {domain}")
    # zero_division=0 reports undefined precision/recall as 0 without the
    # warning, which matters when a subject holds only one class.
    print(classification_report(y_test_values[idx], y_pred[idx], zero_division=0))

## Visualizations

In [None]:
# Feature-importance view: the 15 largest and 15 smallest logistic-regression
# coefficients, each labelled with its TF-IDF term.
importance = lr.coef_[0]
features = np.array(tfidf.get_feature_names_out())
neg = np.argsort(importance)[:15]   # smallest (most negative) weights
pos = np.argsort(importance)[-15:]  # largest (most positive) weights

plt.figure(figsize=(10, 6))
plt.barh(y=features[pos], width=importance[pos], color='blue')
plt.barh(y=features[neg], width=importance[neg], color='orange')
plt.xlabel("Weight")
plt.title("Weight of Identified Key Words from Logistic Regression")
plt.tight_layout()
plt.show()

## Improvement

### RoBERTa Model

In [None]:
# Load RoBERTa with a two-class sequence-classification head.
# NOTE(review): "FacebookAI/roberta-base" is a base checkpoint, so the
# classification head is presumably freshly initialised and predictions are
# not meaningful until the model is fine-tuned on this task. Confirm.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "FacebookAI/roberta-base",
    # Each article has exactly one label (real or fake) and predictions are
    # decoded with argmax, so this is a single-label problem.
    problem_type="single_label_classification",
    num_labels=2,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The inputs are moved to `device` below; the model must live there too,
# otherwise the forward pass fails on a CUDA machine.
model.to(device)
model.eval()


def roberta(texts, batch_size=16):
    """Predict a class index for every text, processing them in batches.

    Args:
        texts: Sliceable collection of strings (pandas Series, NumPy array
            or plain list).
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        list[int]: One argmax class index per input text, in input order.
    """
    preds = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        # Series/ndarray expose tolist(); plain lists are copied as-is.
        batch_texts = batch.tolist() if hasattr(batch, "tolist") else list(batch)
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True,
                           padding=True, max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return preds

In [None]:
news_df['roberta'] = roberta(news_df['article'])

# Compare integer predictions against the integer 'label' column; the string
# 'auth' column cannot be mixed with numeric predictions. labels/target_names
# follow the encoding used when the data was tagged: 0 = real, 1 = fake.
print(classification_report(
    news_df['label'],
    news_df['roberta'],
    labels=[0, 1],
    target_names=["real", "fake"],
    zero_division=0,
))