In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

def load_data(real_path, fake_path):
    """Read real and fake headlines from two text files into one labelled frame.

    Every non-blank line of each file is taken as one headline; it is
    stripped of surrounding whitespace and lower-cased. Real headlines are
    labelled 1 and fake headlines 0, with the real rows placed first.

    Args:
        real_path: Path to the UTF-8 text file of real headlines.
        fake_path: Path to the UTF-8 text file of fake headlines.

    Returns:
        A DataFrame with the columns ``headline`` and ``label`` and a
        freshly numbered index.
    """

    def read_clean(path):
        # Whitespace-only lines are dropped; the rest are trimmed and lower-cased.
        with open(path, 'r', encoding='utf-8') as handle:
            trimmed = (raw.strip() for raw in handle.readlines())
            return [text.lower() for text in trimmed if text]

    frames = [
        pd.DataFrame({'headline': read_clean(real_path), 'label': 1}),  # Real = 1
        pd.DataFrame({'headline': read_clean(fake_path), 'label': 0}),  # Fake = 0
    ]
    return pd.concat(frames, ignore_index=True)

def preprocess_data(df):
    """Split the labelled headlines into training and test partitions.

    The split is 70% train / 30% test, uses a fixed random seed so it is
    repeatable, and is stratified on the ``label`` column.

    Args:
        df: DataFrame containing at least a ``label`` column.

    Returns:
        The result of ``train_test_split``: the train partition followed by
        the test partition.
    """
    split_options = {
        'test_size': 0.3,
        'random_state': 42,
        'stratify': df['label'],
    }
    return train_test_split(df, **split_options)

def extract_features(train_data, test_data):
    """Turn headline text into bag-of-words count matrices.

    The vocabulary is learned from the training headlines only (English
    stop words excluded); the test headlines are encoded with that same
    vocabulary.

    Args:
        train_data: Frame with a ``headline`` column used to fit the vectorizer.
        test_data: Frame with a ``headline`` column to be transformed.

    Returns:
        A tuple ``(vectorizer, train_counts, test_counts)``.
    """
    bag_of_words = CountVectorizer(stop_words='english')
    train_matrix = bag_of_words.fit_transform(train_data['headline'])
    test_matrix = bag_of_words.transform(test_data['headline'])
    return bag_of_words, train_matrix, test_matrix

def filter_words(vectorizer, X_train_counts):
    """Build a vectorizer restricted to words of moderate document frequency.

    A word is kept when the number of training headlines containing it lies
    between 0.5% and 70% of all training headlines (both bounds inclusive).

    Args:
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()``
            columns correspond to the columns of ``X_train_counts``.
        X_train_counts: Document-term count matrix of the training headlines
            (one row per headline, one column per word).

    Returns:
        A new, unfitted ``CountVectorizer`` whose vocabulary is fixed to the
        retained words.
    """
    n_docs = X_train_counts.shape[0]

    # Document frequency: in how many headlines each word occurs at least once.
    doc_freq = np.asarray((X_train_counts > 0).sum(axis=0)).flatten()

    # Words in >70% or <0.5% of headlines are removed.
    min_threshold = 0.005 * n_docs
    max_threshold = 0.70 * n_docs
    keep = (doc_freq >= min_threshold) & (doc_freq <= max_threshold)
    valid_indices = np.where(keep)[0]

    # Fetch the feature names once: calling get_feature_names_out() per kept
    # index would rebuild the full name array on every iteration.
    feature_names = vectorizer.get_feature_names_out()
    valid_vocab = [feature_names[i] for i in valid_indices]

    return CountVectorizer(stop_words='english', vocabulary=valid_vocab)

def train_naive_bayes(X_train_filtered, train_data, alpha=1):
    """Estimate multinomial Naive Bayes parameters in log space.

    Args:
        X_train_filtered: Document-term count matrix of the training
            headlines (sparse or dense); rows align with ``train_data``.
        train_data: Frame with a ``label`` column where 1 marks a real
            headline and 0 a fake one.
        alpha: Additive (Laplace) smoothing added to every per-class word
            count. Defaults to 1, the value previously hard-coded.

    Returns:
        A tuple ``(log_P_real, log_P_fake, log_P_word_given_real,
        log_P_word_given_fake)``: the two log class priors followed by the
        two 1-D arrays of per-word log likelihoods.
    """
    labels = train_data['label']
    n_docs = len(train_data)

    # Class priors: labels are 0/1, so their sum is the number of real rows.
    total_real = labels.sum()
    total_fake = n_docs - total_real
    P_real = total_real / n_docs
    P_fake = total_fake / n_docs

    # Plain boolean NumPy masks so they can index the count matrix by row.
    real_mask = np.asarray(labels == 1)
    fake_mask = np.asarray(labels == 0)

    def _log_word_likelihood(class_rows):
        # Smoothed word totals for one class, normalised to a distribution.
        counts = np.asarray(class_rows.sum(axis=0)).flatten() + alpha
        return np.log(counts / counts.sum())

    log_P_word_given_real = _log_word_likelihood(X_train_filtered[real_mask])
    log_P_word_given_fake = _log_word_likelihood(X_train_filtered[fake_mask])

    # Logs keep later products of many small probabilities numerically stable.
    return np.log(P_real), np.log(P_fake), log_P_word_given_real, log_P_word_given_fake


def predict_naive_bayes(X_test_filtered, log_P_real, log_P_fake, log_P_word_given_real, log_P_word_given_fake):
    """Classify headlines as real (1) or fake (0) with a Naive Bayes model.

    Args:
        X_test_filtered: Document-term count matrix to classify, either a
            SciPy sparse matrix or a dense 2-D array.
        log_P_real: Log prior of the real class.
        log_P_fake: Log prior of the fake class.
        log_P_word_given_real: Per-word log likelihoods for the real class.
        log_P_word_given_fake: Per-word log likelihoods for the fake class.

    Returns:
        A 1-D integer array with one entry per row: 1 where the real-class
        score is strictly greater than the fake-class score, otherwise 0
        (ties therefore go to fake).
    """
    real_weights = np.asarray(log_P_word_given_real).ravel()
    fake_weights = np.asarray(log_P_word_given_fake).ravel()

    # Multiply the matrix by the weight vector directly: this works for both
    # sparse and dense input and avoids materialising a dense copy of the
    # whole test matrix.
    log_P_real_test = log_P_real + np.asarray(X_test_filtered @ real_weights).ravel()
    log_P_fake_test = log_P_fake + np.asarray(X_test_filtered @ fake_weights).ravel()

    # Predict class (1 if log P(Real) > log P(Fake), else 0)
    return (log_P_real_test > log_P_fake_test).astype(int)

def evaluate_model(y_true, y_pred):
    """Compute accuracy and the confusion matrix of a set of predictions.

    Args:
        y_true: Ground-truth labels (any array-like).
        y_pred: Predicted labels (any array-like of the same length).

    Returns:
        A tuple ``(accuracy, conf_matrix)`` where ``accuracy`` is the
        fraction of positions at which the two inputs agree and
        ``conf_matrix`` is the result of ``confusion_matrix(y_true, y_pred)``.
    """
    # Coerce to arrays so the comparison is element-wise and positional;
    # comparing two plain lists with == would yield a single bool instead.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = np.mean(y_pred == y_true)
    conf_matrix = confusion_matrix(y_true, y_pred)
    return accuracy, conf_matrix

# Main Execution
def main(real_path="real.txt", fake_path="fake.txt"):
    """Run the full pipeline: load, split, vectorize, train, predict, report.

    Prints the test-set accuracy (as a percentage) and the confusion matrix.

    Args:
        real_path: Path to the text file of real headlines.
        fake_path: Path to the text file of fake headlines.

    Raises:
        FileNotFoundError: If either headline file does not exist.
    """
    # Load and split the data
    df = load_data(real_path, fake_path)
    train_data, test_data = preprocess_data(df)

    # Extract features; the unfiltered test matrix is not needed because the
    # test set is re-encoded below with the filtered vocabulary.
    vectorizer, X_train_counts, _ = extract_features(train_data, test_data)

    # Filter out uninformative words and re-encode both partitions
    filtered_vectorizer = filter_words(vectorizer, X_train_counts)
    X_train_filtered = filtered_vectorizer.fit_transform(train_data['headline'])
    X_test_filtered = filtered_vectorizer.transform(test_data['headline'])

    # Train Naive Bayes model
    log_P_real, log_P_fake, log_P_word_given_real, log_P_word_given_fake = train_naive_bayes(
        X_train_filtered, train_data
    )

    # Predict on test set
    y_pred = predict_naive_bayes(
        X_test_filtered, log_P_real, log_P_fake, log_P_word_given_real, log_P_word_given_fake
    )

    # Evaluate model
    accuracy, conf_matrix = evaluate_model(test_data['label'].to_numpy(), y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Confusion Matrix:")
    print(conf_matrix)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'real.txt'