In [1]:
import sys
import pandas as pd
import numpy as np
import nltk
from math import log
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

In [2]:
# Exactly one command-line argument equal to "YES" (case-insensitive) selects
# the 'lowercase' step to be ignored; any other invocation ignores nothing.
skip_lowercase = len(sys.argv) == 2 and sys.argv[1].upper() == 'YES'
ignore_step = 'lowercase' if skip_lowercase else ''

In [3]:
# Read both article collections from CSV files in the working directory.
fake_news, real_news = (pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv'))

In [4]:
# Label the two sources: rows from fake_news get class 0, rows from real_news get class 1.
fake_news['class'] = 0
real_news['class'] = 1

data = pd.concat([fake_news, real_news], ignore_index=True)
# Shuffle with a fixed seed so that the later train/test split (which is itself
# seeded) sees the same row order on every Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)  ## Shuffle
# Combine title and text. Missing values are replaced by '' first, because
# NaN + str yields NaN and the string preprocessing applied later would fail on it.
data['text'] = data['title'].fillna('') + ' ' + data['text'].fillna('')  ## Combine title and text

In [5]:
ps = PorterStemmer()  # Stemmer that will be used for stemming
stop_words = set(stopwords.words('english'))
lowercase_enabled = ignore_step != 'lowercase'

## If argument YES was given, ignore_step is 'lowercase' and ONLY the lowercasing
## step is skipped; stop-word removal and stemming below always run.
if lowercase_enabled:
    data['text'] = data['text'].apply(lambda x: x.lower())  # Lowercase

## Remove stopwords (comparison is case-sensitive against NLTK's English list)
data['text'] = data['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

## Perform stemming
# PorterStemmer.stem lowercases its input by default; pass to_lowercase so the
# stemmer does not silently undo the decision to skip lowercasing.
data['text'] = data['text'].apply(
    lambda x: ' '.join([ps.stem(word, to_lowercase=lowercase_enabled) for word in x.split()])
)


In [6]:
# Hold out 20% of the rows for testing; the split itself is seeded with 42.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Train the classifier
# For each class, count in how many training documents every word occurs
# (each document contributes a word at most once because its tokens are
# deduplicated into a set), and collect the overall vocabulary.
vocabulary = set()
fake_word_counts = {}
real_word_counts = {}
fake_docs_count = 0
real_docs_count = 0
for text, label in zip(X_train, y_train):
    words = set(text.split())
    # In-place update: rebuilding the set with union() for every document
    # copies the whole vocabulary each time and is needlessly quadratic.
    vocabulary.update(words)
    if label == 0:
        fake_docs_count += 1
        class_counts = fake_word_counts
    else:
        real_docs_count += 1
        class_counts = real_word_counts
    for word in words:
        class_counts[word] = class_counts.get(word, 0) + 1

# Class priors: fraction of training documents carrying each label.
fake_prior_prob = fake_docs_count / len(X_train)
real_prior_prob = real_docs_count / len(X_train)

In [8]:
def classify(document, fake_word_counts, real_word_counts,fake_prior_prob, real_prior_prob, vocabulary):
    """Label a document as fake (0) or real (1) using add-one smoothed log scores.

    Args:
        document: Whitespace-separated text to classify.
        fake_word_counts: Mapping word -> count observed for the fake class.
        real_word_counts: Mapping word -> count observed for the real class.
        fake_prior_prob: Prior probability of the fake class.
        real_prior_prob: Prior probability of the real class.
        vocabulary: Collection of all known words; words of the document that
            are not in it are ignored.

    Returns:
        0 if the fake score is strictly greater than the real score, else 1
        (ties therefore resolve to 1).
    """
    words = set(document.split())

    # The smoothing denominators do not depend on the word being scored, so
    # compute them once instead of re-summing the count tables per word.
    fake_total = sum(fake_word_counts.values()) + len(vocabulary)
    real_total = sum(real_word_counts.values()) + len(vocabulary)

    # A class with zero prior can never win; avoid log(0) raising ValueError.
    fake_prob = log(fake_prior_prob) if fake_prior_prob > 0 else float('-inf')
    real_prob = log(real_prior_prob) if real_prior_prob > 0 else float('-inf')

    # Score only the words that actually occur in this document. Iterating the
    # whole vocabulary would give every document the same score.
    for word in words:
        if word not in vocabulary:
            continue
        fake_prob += log((fake_word_counts.get(word, 0) + 1) / fake_total)
        real_prob += log((real_word_counts.get(word, 0) + 1) / real_total)

    return 0 if fake_prob > real_prob else 1

In [None]:
# Smoke-test the classifier on the first held-out document.
# (Calls classify: no function named classify_parallel is defined in this notebook.)
classify(X_test.iloc[0], fake_word_counts, real_word_counts, fake_prior_prob, real_prior_prob, vocabulary)

In [16]:
# Test the classifier
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Confusion-matrix counts; class 1 is treated as the positive class.
tp = tn = fp = fn = 0
for document, true_class in zip(X_test, y_test):
    predicted_class = classify(document, fake_word_counts, real_word_counts, fake_prior_prob, real_prior_prob, vocabulary)

    # Calculate the counts of true positives, true negatives, false positives, and false negatives
    if true_class == 0 and predicted_class == 0:
        tn += 1
    elif true_class == 0 and predicted_class == 1:
        fp += 1
    elif true_class == 1 and predicted_class == 0:
        fn += 1
    elif true_class == 1 and predicted_class == 1:
        tp += 1

# Calculate the various metrics.
# Every ratio is guarded: if the classifier never predicts one of the classes
# (or a class is absent from the test set) a denominator is zero, and the
# unguarded division would abort the cell with ZeroDivisionError.
sensitivity = _safe_ratio(tp, tp + fn)
specificity = _safe_ratio(tn, tn + fp)
precision = _safe_ratio(tp, tp + fp)
negative_predictive_value = _safe_ratio(tn, tn + fn)
accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
f_score = _safe_ratio(2 * precision * sensitivity, precision + sensitivity)

# Display the results
print("Number of true positives: ", tp)
print("Number of true negatives: ", tn)
print("Number of false positives: ", fp)
print("Number of false negatives: ", fn)
print("Sensitivity: ", sensitivity)
print("Specificity: ", specificity)
print("Precision: ", precision)
print("Negative predictive value: ", negative_predictive_value)
print("Accuracy: ", accuracy)
print("F-score: ", f_score)

KeyboardInterrupt: 