In [1]:
import sys
import pandas as pd
import numpy as np
import nltk
import re
from math import log
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict



In [2]:
# Command-line switch: when exactly one extra argument is given and it equals
# "YES" (case-insensitive), ignore_step becomes 'lowercase'; otherwise it is ''.
lowercase = None
ignore_step = 'lowercase' if (len(sys.argv) == 2 and sys.argv[1].upper() == 'YES') else ''

In [3]:
# Load the two source tables from the working directory:
# Fake.csv holds the fake articles, True.csv the real ones.
fake_news = pd.read_csv(filepath_or_buffer='Fake.csv')
real_news = pd.read_csv(filepath_or_buffer='True.csv')

In [4]:
# Label each source before merging: 0 = fake, 1 = real.
fake_news['class'] = 0
real_news['class'] = 1

data = pd.concat([fake_news, real_news], ignore_index=True)

# Shuffle with a fixed seed so that a fresh "Restart & Run All" produces the
# same row order every time; without it the seeded train/test split further
# down still changes from run to run.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title and body into a single text field. Missing values are replaced
# with empty strings first; otherwise NaN would propagate through the
# concatenation and break the string operations in the preprocessing step.
data['text'] = data['title'].fillna('') + ' ' + data['text'].fillna('')

In [5]:
ps = PorterStemmer()  # Stemmer used for the stemming step
stop_words = set(stopwords.words('english'))

# Lowercasing is the only optional step: when the YES argument was given,
# ignore_step is 'lowercase' and this step is skipped.
if ignore_step != 'lowercase':
    data['text'] = data['text'].str.lower()

# Stop-word removal and stemming always run. They used to be indented under the
# `if` above, so the YES switch silently disabled all preprocessing instead of
# just the lowercasing step.

# Remove stop words
data['text'] = data['text'].apply(
    lambda doc: ' '.join(token for token in doc.split() if token not in stop_words)
)

# Perform stemming
# NOTE(review): recent NLTK releases lowercase inside PorterStemmer.stem by
# default (to_lowercase=True) — confirm whether that defeats the
# skip-lowercase experiment.
data['text'] = data['text'].apply(
    lambda doc: ' '.join(ps.stem(token) for token in doc.split())
)


In [6]:
# Hold out 20% of the documents for evaluation; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    test_size=0.2,
    random_state=42,
)

In [7]:
def my_preprocessor(text):
    """Return ``text`` with every character that is neither an ASCII letter
    nor whitespace replaced by a single space.

    Replacement is one-for-one, so the length of the string is unchanged.
    """
    non_letters = re.compile(r'[^a-zA-Z\s]')
    return non_letters.sub(' ', text)

In [58]:
def train_naive_bayes(X_train, labels=None):
    """Fit a two-class Naive Bayes model on binary bag-of-words features.

    Parameters
    ----------
    X_train : iterable of str
        Training documents (assumed to be raw text accepted by
        ``CountVectorizer``).
    labels : array-like of int, optional
        Class of each document in ``X_train``, by position (0 = fake,
        1 = real). When omitted, the notebook-level ``y_train`` is used, so
        the existing ``train_naive_bayes(X_train)`` call keeps working.

    Returns
    -------
    log_prior : dict
        ``{class: log P(class)}`` for classes 0 and 1.
    log_likelihood : dict
        ``{class: {word: log P(word | class)}}`` computed with add-one
        (Laplace) smoothing.
    V_list : list of str
        Vocabulary in the vectorizer's column order.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook-level training labels.
        labels = y_train
    labels = np.asarray(labels)

    # Binary count vectorizer: each feature records presence, not frequency.
    vectorizer = CountVectorizer(binary=True, preprocessor=my_preprocessor)

    # Keep the document-term matrix sparse. Densifying it allocates
    # documents x vocabulary integers, which can exhaust memory on a corpus
    # of this size.
    X_bow = vectorizer.fit_transform(X_train)

    # Calculate the P(c) term.
    numb_classes = 2
    numb_doc = X_bow.shape[0]
    class_counts = np.bincount(labels)
    log_prior = {
        label: np.log(class_counts[label] / numb_doc)
        for label in range(numb_classes)
    }

    # Vocabulary of the training corpus.
    V = vectorizer.get_feature_names_out()
    V_list = V.tolist()
    vocab_size = len(V_list)

    # Per-class word probabilities with Laplace smoothing of 1:
    # (documents of the class containing the word + 1) /
    # (sum of those counts over the vocabulary + vocabulary size).
    log_likelihood = {}
    for label in range(numb_classes):
        class_rows = X_bow[labels == label]
        word_counts = np.asarray(class_rows.sum(axis=0)).ravel()
        log_probs = np.log((word_counts + 1) / (word_counts.sum() + vocab_size))
        log_likelihood[label] = dict(zip(V_list, log_probs))

    return log_prior, log_likelihood, V_list




In [8]:
# Binary bag-of-words vectorizer: a feature is 1 when the token occurs in the
# document, regardless of how many times.
vectorizer = CountVectorizer(binary=True, preprocessor=my_preprocessor)

# Learn the vocabulary from the training documents and encode them in a single
# call, then convert the result to a dense array.
X_train_bow_matrix = vectorizer.fit_transform(X_train).toarray()



In [9]:
# Separate the training matrix by label: rows with class 0 (fake) and rows
# with class 1 (real).
X_fake, X_real = (X_train_bow_matrix[y_train == label, :] for label in (0, 1))


In [55]:
# P(c) term: log(number of training documents in class c / all training documents).
numb_classes = 2
numb_doc = len(X_train_bow_matrix)
class_counts = np.bincount(y_train)
log_prior = {
    label: np.log(class_counts[label] / numb_doc)
    for label in range(numb_classes)
}

In [None]:
# Vocabulary of the training corpus, in the vectorizer's column order.
V = vectorizer.get_feature_names_out()

# Counts needed for the probabilities: for each word, the number of documents
# of the class that contain it, plus the sum of those counts per class.
real_word_counts = np.sum(X_real, axis=0)
fake_word_counts = np.sum(X_fake, axis=0)
real_words_total = np.sum(real_word_counts)
fake_words_total = np.sum(fake_word_counts)
real_doc_count = len(X_real)
fake_doc_count = len(X_fake)

# Log-probabilities with Laplace smoothing of 1, computed for the whole
# vocabulary at once instead of word by word.
vocab_size = len(V)
fake_probs = dict(zip(V, np.log((fake_word_counts + 1) / (fake_words_total + vocab_size))))
real_probs = dict(zip(V, np.log((real_word_counts + 1) / (real_words_total + vocab_size))))

# log_likelihood[class][word] -> log P(word | class)
log_likelihood = {0: fake_probs, 1: real_probs}

# This cell used to end with `return V.tolist()`. A `return` outside a function
# is a SyntaxError, which prevented the whole cell from running; the vocabulary
# list is bound to a name instead.
V_list = V.tolist()


In [63]:
def test_naive_bayes(X_test, log_prior, log_likelihood, C, V):
    """Predict a class for every document in ``X_test``.

    Parameters
    ----------
    X_test : iterable of str
        Documents to classify (assumed to be raw text accepted by
        ``CountVectorizer``).
    log_prior : dict
        ``{class: log P(class)}``.
    log_likelihood : dict
        ``{class: {word: log P(word | class)}}``.
    C : sequence
        Class labels to score, e.g. ``[0, 1]``.
    V : list of str
        Vocabulary; its order defines the feature columns.

    Returns
    -------
    numpy.ndarray
        One predicted label from ``C`` per document.

    Raises
    ------
    KeyError
        If a class in ``C`` or a word in ``V`` is missing from the model
        dictionaries.
    """
    # Encode the documents against the training vocabulary. The matrix is kept
    # sparse; the product below does not need a dense copy of it.
    vectorizer = CountVectorizer(vocabulary=V, binary=True, preprocessor=my_preprocessor)
    testdoc = vectorizer.transform(X_test)

    # (vocabulary x classes) matrix of log-likelihoods. Values are looked up by
    # word and class so the rows line up with the feature columns and the
    # columns with C, rather than depending on dict insertion order.
    log_likelihood_matrix = np.array(
        [[log_likelihood[c][word] for word in V] for c in C]
    ).T
    class_log_priors = np.array([log_prior[c] for c in C])

    # Per-document, per-class score: summed log-likelihood of the words present
    # plus the class log-prior (broadcast across documents).
    sum_c = (testdoc @ log_likelihood_matrix) + class_log_priors

    # Pick the highest-scoring column and translate it back to its label in C.
    # For C == [0, 1] the position and the label coincide.
    best_c = np.argmax(sum_c, axis=1)
    return np.asarray(C)[best_c]


In [60]:
# Fit the model on the training documents. The function returns the class
# log-priors, the per-class word log-likelihoods and the vocabulary list,
# which is bound to V here.
log_prior, log_likelihood, V = train_naive_bayes(X_train)

In [64]:
# Classify the held-out documents. The vocabulary returned by the
# train_naive_bayes call is bound to V, so V is what gets passed here.
y_pred = test_naive_bayes(X_test, log_prior, log_likelihood, [0, 1], V)