In [1]:
import sys
import pandas as pd
import numpy as np
import nltk
from math import log
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

In [2]:
if len(sys.argv) == 2 and sys.argv[1].upper() == 'YES':
    ignore_step = 'lowercase'
else:
    ignore_step = ''

In [3]:
fake_news = pd.read_csv('Fake.csv')
real_news = pd.read_csv('True.csv')

In [4]:
fake_news['class'] = 0  
real_news['class'] = 1  

data = pd.concat([fake_news, real_news], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)  ## Shuffle
data['text'] = data['title'] + ' ' + data['text']  ## Combine title and text

In [5]:
ps = PorterStemmer() #Stemmer that will be used for stemming
stop_words = set(stopwords.words('english'))
##If argument YES given, ingore_step will be 'lowercase', and lowercasing step will be skipped.
if ignore_step != 'lowercase':
    data['text'] = data['text'].apply(lambda x: x.lower())  # Lowercase
##Remove Stopwords
    data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))  # Remove stop words
##Perfrom Stemming
    data['text'] = data['text'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))  # Stemming


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.2, random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object with binary=True and add-1 smoothing
vectorizer = CountVectorizer(binary=True, lowercase=True, analyzer='word', min_df=1, stop_words='english')

# Fit the vectorizer on the documents
vectorizer.fit(X_train)

# Transform the documents into a binary bag of words
binary_bow_matrix = vectorizer.transform(X_train).toarray()


In [33]:
vocabulary = set(vectorizer.get_feature_names_out())
vocabulary

{'imchriskelly',
 'counselors',
 'dichotom',
 'hickory',
 'grant',
 'chozick',
 'naiveti',
 'restrepo',
 '2lnpkaq',
 'wwba',
 'breaks',
 'fitzmorri',
 'psyd',
 'shakespeare',
 'bernie',
 'barometr',
 'floundering',
 'imbassahi',
 'drewluminati_',
 'moors',
 'hubertus',
 'clny',
 'soot',
 'frizzel',
 'fever',
 'gwadar',
 'tunics',
 'trudi',
 'mineworkers',
 'segolen',
 'haven',
 'iacaucu',
 'redbox',
 'aie0wvbokv',
 'repos',
 'roschdi',
 'galaxy',
 'breastpocket',
 'recognizes',
 'goali',
 'cbn',
 'treepublican',
 'genis',
 'hendrean',
 'hierarch',
 'reformation',
 'liberty',
 'o362ugxmd2',
 'gjdtaki6o',
 'clint',
 'best',
 'rajapaksa',
 'bstandsforb',
 'kangaroo',
 'mad_jamaican',
 'minu',
 'barnett',
 'frommer',
 'dixit',
 'transparently',
 'hollingsbee',
 'onda',
 'wfmz',
 'librevil',
 'murithi',
 'tristano',
 'rabbis',
 'synopses',
 'ipera',
 'richey',
 'giffiords',
 'kaoru',
 'kvvt2tgwfd',
 'grammy',
 'wh2mpjvqzlgirls',
 'lilt',
 'simonwdc',
 'thirti',
 'species',
 'techno',
 'vind

In [34]:
import numpy as np

class NaiveBayesClassifier:
    def __init__(self, fake_word_counts={}, real_word_counts={}, fake_docs_count=0, real_docs_count=0):
        self.fake_word_counts = fake_word_counts
        self.real_word_counts = real_word_counts
        self.fake_docs_count = fake_docs_count
        self.real_docs_count = real_docs_count

        # Add dictionary attributes to keep track of probabilities
        self.fake_probs = {}
        self.real_probs = {}
        self.p_fake = None
        self.p_real = None
    
    
    def train(self, X, y):
        # Split the bag of words matrix into two matrices, one for spam and one for ham
        X_fake = X[y == 1]
        X_real = X[y == 0]

        # Calculate the prior probabilities for each class
        self.p_fake = len(X_fake) / len(X)
        self.p_real = len(X_real) / len(X)

        # Calculate the likelihood probabilities for each word
        vocabulary = set(list(self.fake_word_counts.keys()) + list(self.real_word_counts.keys()))
        fake_word_counts = np.array([self.fake_word_counts.get(word, 0) for word in vocabulary])
        real_word_counts = np.array([self.real_word_counts.get(word, 0) for word in vocabulary])
        self.fake_probs = (fake_word_counts + 1) / (np.sum(X_fake) + len(vocabulary))
        self.real_probs = (real_word_counts + 1) / (np.sum(X_real) + len(vocabulary))

    
    def predict(self, X):
        y_pred = []
        for i in range(len(X)):
            # Calculate the log probabilities of the document belonging to each class
            fake_log_prob = np.log(self.p_fake)
            real_log_prob = np.log(self.p_real)
            for j in range(len(X[i])):
                if X[i][j] > 0:
                    fake_log_prob += X[i][j] * np.log(self.fake_probs.get(j, 1))
                    real_log_prob += X[i][j] * np.log(self.real_probs.get(j, 1))
            # Choose the class with the higher probability
            if fake_log_prob > real_log_prob:
                y_pred.append(1)
            else:
                y_pred.append(0)
        return y_pred


In [27]:
# Train the classifier
vocabulary = set()
fake_word_counts = {}
real_word_counts = {}
fake_docs_count = 0
real_docs_count = 0
for i in range(len(X_train)):
    words = set(X_train.iloc[i].split())
    vocabulary = vocabulary.union(words)
    if y_train.iloc[i] == 0:
        fake_docs_count += 1
        for word in words:
            if word not in fake_word_counts:
                fake_word_counts[word] = 1
            else:
                fake_word_counts[word] += 1
    else:
        real_docs_count += 1
        for word in words:
            if word not in real_word_counts:
                real_word_counts[word] = 1
            else:
                real_word_counts[word] += 1

fake_prior_prob = fake_docs_count / len(X_train)
real_prior_prob = real_docs_count / len(X_train)

In [35]:
cls = NaiveBayesClassifier(fake_word_counts, real_word_counts, fake_docs_count, real_docs_count)

In [36]:
cls.train(binary_bow_matrix,y_train)

In [8]:
def classify(document, fake_word_counts, real_word_counts,fake_prior_prob, real_prior_prob, vocabulary):
    words = set(document.split())
    fake_prob = 0.0 
    real_prob = 0.0
    ## Apply Bayes Rule with lapace 1 smoothing
    for word in vocabulary:
        if word in fake_word_counts: 
            fake_prob += log((fake_word_counts[word] + 1) /
                (sum(fake_word_counts.values()) + len(vocabulary))) 
        
        else:
            fake_prob += log(1 / (sum(fake_word_counts.values()) + len(vocabulary)))
        if word in real_word_counts:
            real_prob += log((real_word_counts[word] + 1) / (sum(real_word_counts.values()) + len(vocabulary)))
        else:
            real_prob += log(1 / (sum(real_word_counts.values()) + len(vocabulary)))
        
    fake_prob += log(fake_prior_prob)
    real_prob += log(real_prior_prob)

    if fake_prob > real_prob:
        return 0
    else:
        return 1        

In [16]:
# Test the classifier
tp = tn = fp = fn = 0
for i in range(len(X_test)):
    document = X_test.iloc[i]
    predicted_class = classify(document, fake_word_counts, real_word_counts, fake_prior_prob, real_prior_prob, vocabulary)
    true_class = y_test.iloc[i]
    
    # Calculate the counts of true positives, true negatives, false positives, and false negatives
    if true_class == 0 and predicted_class == 0:
        tn += 1
    elif true_class == 0 and predicted_class == 1:
        fp += 1
    elif true_class == 1 and predicted_class == 0:
        fn += 1
    elif true_class == 1 and predicted_class == 1:
        tp += 1

# Calculate the various metrics
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
precision = tp / (tp + fp)
negative_predictive_value = tn / (tn + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_score = 2 * precision * sensitivity / (precision + sensitivity)

# Display the results
print("Number of true positives: ", tp)
print("Number of true negatives: ", tn)
print("Number of false positives: ", fp)
print("Number of false negatives: ", fn)
print("Sensitivity: ", sensitivity)
print("Specificity: ", specificity)
print("Precision: ", precision)
print("Negative predictive value: ", negative_predictive_value)
print("Accuracy: ", accuracy)
print("F-score: ", f_score)

KeyboardInterrupt: 