# Fake News Classifier: Naive Bayes, Linear SVM, and Multilayer Perceptron
### CS 4824 / ECE 4424, Spring '21

In [43]:
###### standard imports ######
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from linear_classifier import linear_predict, log_reg_train
from naive_bayes import naive_bayes_train, naive_bayes_predict
from mlp import mlp_train, mlp_predict, logistic, nll
from kernelsvm import kernel_svm_train, kernel_svm_predict
# Fetch the NLTK data packages: stop-word list, WordNet, and the Punkt tokenizer.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Apply the ggplot theme to matplotlib figures.
plt.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
###### Load the train/test CSVs, keep a leading subset of each, and record the dimensions ######
# Only the first 1000 training rows and the first 160 test rows are kept.
train_data = pd.read_csv("train.csv").head(1000)
test_data = pd.read_csv("test.csv").head(160)

# (rows, columns) of each truncated frame; the row counts are reused by later cells.
(rows_train, cols_train), (rows_test, cols_test) = train_data.shape, test_data.shape

In [46]:
###### Check out the training dataset ######
# Display the first 15 rows to eyeball the columns (title, text, subject, date, label).
train_data.head(15)

Unnamed: 0,title,text,subject,date,label
0,Clinton faces pressure to pick VP who is tough...,WASHINGTON (Reuters) - Members of the Democrat...,politicsNews,"July 21, 2016",real
1,"Ryan, Trump cite 'positive step' toward Republ...",WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"May 12, 2016",real
2,WATCH: President Obama Dares Republicans To S...,Conservatives talk the talk but can they walk ...,News,"July 9, 2016",fake
3,Hariri warns Lebanon faces Arab sanctions risk...,BEIRUT (Reuters) - Saad al-Hariri warned on Su...,worldnews,"November 12, 2017",real
4,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,Middle-east,"December 25, 2017",fake
5,Viral Video Shows Trump For The Fascist He Tr...,If you re not scared to death of the prospect ...,News,"February 29, 2016",fake
6,Ana Navarro Lets Trump Know EXACTLY What Lati...,Donald Trump and his campaign know that they s...,News,"August 23, 2016",fake
7,BREAKING: WHY IS OBAMA Sending Huge Number Of ...,The largest armed U.S. military brigade to be ...,Government News,"Jan 9, 2017",fake
8,Australia's population growth outpaces world a...,SYDNEY (Reuters) - Australia s population is e...,worldnews,"December 14, 2017",real
9,Syrian opposition says Russian jets kill civil...,AMMAN (Reuters) - Military jets believed to be...,worldnews,"October 5, 2017",real


In [47]:
###### Preprocess the training data ######
# Build the stop-word set through `nltk.corpus` instead of the imported name
# `stopwords`. The previous version rebound `stopwords` from the corpus reader
# to a set, so running this cell a second time failed on `set.words(...)`.
stop_words = set(nltk.corpus.stopwords.words('english'))
# The test-preprocessing cell filters with `word in stopwords`, so that name
# must still refer to the set once this cell has run.
stopwords = stop_words

cleaned_train_texts = []
for raw_text in train_data['text']:
    # Replace every character outside a-z / A-Z with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()

    # Tokenize, drop stopwords, and join the remaining words into one string.
    kept_words = [word for word in word_tokenize(letters_only) if word not in stop_words]
    cleaned_train_texts.append(' '.join(kept_words))

# Assign the whole column at once. Writing `train_data['text'][i] = ...` is
# chained indexing, which is not guaranteed to write through to the frame.
# Each entry is now a plain string; the later `''.join(...)` conversion leaves
# plain strings unchanged.
train_data['text'] = cleaned_train_texts

In [48]:
###### Preprocess the testing data ######
# Same cleaning as for the training texts. `stopwords` here is the set of
# English stopwords bound by the training-preprocessing cell.
cleaned_test_texts = []
for raw_text in test_data['text']:
    # Replace every character outside a-z / A-Z with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()

    # Tokenize, drop stopwords, and join the remaining words into one string.
    kept_words = [word for word in word_tokenize(letters_only) if word not in stopwords]
    cleaned_test_texts.append(' '.join(kept_words))

# Assign the whole column at once. Writing `test_data['text'][i] = ...` is
# chained indexing, which is not guaranteed to write through to the frame.
# Each entry is now a plain string; the later `''.join(...)` conversion leaves
# plain strings unchanged.
test_data['text'] = cleaned_test_texts

In [49]:
###### Flatten every entry into a single string so the texts can be vectorized ######
# ''.join concatenates the pieces of each entry; an entry that is already a
# plain string comes back unchanged.
X_train = list(map(''.join, train_data['text']))
y_train = list(map(''.join, train_data['label']))
X_test = list(map(''.join, test_data['text']))
y_test = list(map(''.join, test_data['label']))

In [50]:
###### Convert labels to integer values ######
# Encoding: 'fake' -> 0, any other label -> 1.
#
# The lists are rebuilt from the label columns rather than rewritten in place.
# The in-place loop was not safe to run twice: on a second run the entries were
# already ints, none of them equalled 'fake', and every label silently became 1.
y_train = [0 if label == 'fake' else 1 for label in train_data['label']]
y_test = [0 if label == 'fake' else 1 for label in test_data['label']]

In [51]:
###### Obtain Tfidf Vectors #####
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.7 is passed to the vectorizer as its document-frequency cut-off.
tfidf = TfidfVectorizer(max_df=0.7)

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
train_tfidf_sparse = tfidf.fit_transform(X_train)
test_tfidf_sparse = tfidf.transform(X_test)

# Convert to dense arrays, replacing the lists of strings held in X_train / X_test.
X_train = train_tfidf_sparse.toarray()
X_test = test_tfidf_sparse.toarray()

In [53]:
##### Implement Naive Bayes #####
from naive_bayes import naive_bayes_train, naive_bayes_predict

# Labels become a NumPy array; the Naive Bayes helpers are handed the
# transposed feature matrix (X_train.T).
nb_params = {}
y_train = np.array(y_train)
nb_model = naive_bayes_train(X_train.T, y_train, nb_params)

# Training accuracy: mean of the element-wise "prediction equals label" comparison.
nb_train_predictions = naive_bayes_predict(X_train.T, nb_model)
nb_train_hits = nb_train_predictions == y_train
nb_train_accuracy = np.mean(nb_train_hits)
print("Naive Bayes training accuracy: %f" % nb_train_accuracy)

Naive Bayes training accuracy: 0.961000


In [54]:
# Testing accuracy: predict on the transposed test matrix with the trained
# model, then take the mean of the "prediction equals label" comparison.
nb_test_predictions = naive_bayes_predict(X_test.T, nb_model)
nb_test_hits = nb_test_predictions == y_test
nb_test_accuracy = np.mean(nb_test_hits)
print("Naive Bayes testing accuracy: %f" % nb_test_accuracy)

Naive Bayes testing accuracy: 0.893750


In [61]:
# Re-orient the feature matrices so that axis 1 indexes the samples (one column
# per document), which is the layout handed to the SVM and MLP cells.
#
# A bare `X_train = X_train.T` flips the orientation back on every re-run of
# this cell. Checking against the known sample counts makes the cell
# idempotent: it transposes only when the samples are not already on axis 1.
if X_train.shape[1] != rows_train:
    X_train = X_train.T
if X_test.shape[1] != rows_test:
    X_test = X_test.T

In [62]:
##### Implement Linear SVM #####
# Seven log-spaced candidate values for C: 1e-3, 1e-2, ..., 1e3.
c_vals = 10 ** np.linspace(-3, 3, 7)
# NOTE(review): no search over c_vals happens in this cell; the first
# (smallest) value is used directly.
best_params = dict(kernel='linear', C=c_vals[0])

lin_svm_model = kernel_svm_train(X_train, y_train, best_params)
predictions, _ = kernel_svm_predict(X_test, lin_svm_model)

# Rewrite any -1 prediction as 0 so predictions use the same 0/1 encoding as y_test.
# NOTE(review): predictions can be -1 while y_train is encoded as 0/1 -- confirm
# which label convention kernel_svm_train expects.
for idx, predicted in enumerate(predictions):
    if predicted == -1:
        predictions[idx] = 0

svm_hits = predictions == y_test
test_accuracy = np.mean(svm_hits)

print("Linear SVM had test accuracy %f " % (test_accuracy))

Linear SVM had test accuracy 0.543750 


In [57]:
##### Implement Multilayer Perceptron #####
# Candidate hidden-layer layouts and lambda values; one of each is picked by
# index below (no search is performed in this cell).
structures = [[1], [4], [2, 2], [2, 4], [4, 4]]
lambda_vals = [0.01, 0.1, 1]

chosen_structure = structures[4]   # [4, 4]
chosen_lambda = lambda_vals[1]     # 0.1

best_params = {
    'max_iter': 400,
    'activation_function': logistic,
    'loss_function': nll,
    'num_hidden_units': chosen_structure,
    'lambda': chosen_lambda,
}

mlp_model = mlp_train(X_train, y_train, best_params)
predictions, _, _, _ = mlp_predict(X_test, mlp_model)

# Rewrite any -1 prediction as 0 so predictions use the same 0/1 encoding as y_test.
for idx, predicted in enumerate(predictions):
    if predicted == -1:
        predictions[idx] = 0

mlp_hits = predictions == y_test
test_accuracy = np.mean(mlp_hits)

print("MLP had test accuracy %f" % (test_accuracy))

MLP had test accuracy 0.956250
