# Fake News Classifier: Logistic Regression, Multilayer Perceptron, K-Means Clustering
### CS 4824 / ECE 4484, Spring '21

In [172]:
###### standard imports ######
%load_ext autoreload
%autoreload 2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook asks for, in the original order.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Apply the ggplot style sheet to matplotlib.
plt.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johnventura/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [173]:
###### Load the train and test CSVs and record their dimensions ######
# The generator is consumed left to right, so train.csv is read before test.csv.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# .shape is a (n_rows, n_columns) tuple; unpack both frames at once.
(rows_train, cols_train), (rows_test, cols_test) = train_data.shape, test_data.shape

In [185]:
###### Preview the first 15 rows of the testing dataset ######
test_data.head(n=15)

Unnamed: 0,title,text,subject,date,label
0,"Factbox: From taxes to budget, U.S. Congress's...",[reuters u congress careening toward major dea...,politicsNews,"November 28, 2017",real
1,"BREAKING: Israel’s “Worst Fears Confirmed,” S...",[bombshell revelation new york times named isr...,politicsNews,"May 16, 2017",fake
2,U.S. drug enforcement chief to step down from ...,[reuters u drug enforcement administration act...,News,"September 26, 2017",real
3,Factbox: Trump on Twitter (Oct 1) - Rex Tiller...,[following statements posted verified twitter ...,worldnews,"October 2, 2017",real
4,FCC chief plans to ditch U.S. 'net neutrality'...,[washington reuters head u federal communicati...,Middle-east,"November 21, 2017",real
5,Trump Said He Could Easily End The Bloodshed ...,[trump repeatedly decried gun violence plagues...,News,"June 21, 2017",fake
6,"Senate panel snubs Trump's pick to run EXIM, O...",[washington reuters u senate banking committee...,News,"December 19, 2017",real
7,U.S. civil liberties group to challenge Trump'...,[washington reuters american civil liberties u...,Government News,"September 29, 2017",real
8,U.S. expresses 'strong disappointment and prot...,[washington reuters u ambassador moscow john t...,worldnews,"July 28, 2017",real
9,Reporter Calls Out The Racist Writer Trump In...,[may work fox radio man deserves ton respect b...,worldnews,"March 10, 2017",fake


In [175]:
###### Preprocess the training data ######
# Reach the corpus through the `nltk` package instead of the bare name
# `stopwords`. This cell rebinds `stopwords` to a plain set (later cells test
# membership against it), so `stopwords.words(...)` would raise AttributeError
# the second time the cell is executed.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Clean every document and assign the whole column once. The previous
# element-wise `train_data['text'][i] = ...` was chained assignment: it emits
# SettingWithCopyWarning, does not write to the frame under pandas
# Copy-on-Write, and only works when the index labels are exactly
# 0..rows_train-1.
# NOTE: the 'text' column is overwritten, so reload the CSV before re-running
# this cell on already-cleaned data.
cleaned_train_text = []
for raw_text in train_data['text']:
    # Replace every character outside a-z / A-Z with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()

    # Tokenize and drop English stop words.
    kept_words = [word for word in word_tokenize(letters_only) if word not in stopwords]

    # Rebuild one sentence from the kept words. It stays wrapped in a
    # one-element list because that is the format the original cell left in
    # the column.
    cleaned_train_text.append([' '.join(kept_words)])

train_data['text'] = cleaned_train_text

In [176]:
###### Preprocess the testing data ######
# Same cleaning steps as the training-data cell: keep letters only, lowercase,
# tokenize, drop stop words. `stopwords` is the set built in that cell.
#
# The column is built in a list and assigned once. The previous element-wise
# `test_data['text'][i] = ...` was chained assignment: it emits
# SettingWithCopyWarning, does not write to the frame under pandas
# Copy-on-Write, and only works when the index labels are exactly
# 0..rows_test-1.
# NOTE: the 'text' column is overwritten, so reload the CSV before re-running
# this cell on already-cleaned data.
cleaned_test_text = []
for raw_text in test_data['text']:
    # Replace every character outside a-z / A-Z with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()

    # Tokenize and drop English stop words.
    kept_words = [word for word in word_tokenize(letters_only) if word not in stopwords]

    # Rebuild one sentence from the kept words. It stays wrapped in a
    # one-element list because that is the format the original cell left in
    # the column.
    cleaned_test_text.append([' '.join(kept_words)])

test_data['text'] = cleaned_test_text

In [181]:
###### Build plain Python lists of strings for the vectorizer ######
# Each 'text' entry is a one-element list after preprocessing, so ''.join
# yields the single sentence it holds. Applying ''.join to a label string
# re-concatenates its characters and leaves the label unchanged.
X_train = list(map(''.join, train_data['text']))
y_train = list(map(''.join, train_data['label']))
X_test = list(map(''.join, test_data['text']))
y_test = list(map(''.join, test_data['label']))

In [182]:
###### Convert labels to integer values ######
# Encoding: 'fake' -> 0, any other label -> 1.
#
# The encoding is read from the DataFrame label columns rather than rewriting
# y_train / y_test in place. The in-place version was not idempotent: on a
# second run the entries are already ints, `0 == 'fake'` is False, and every
# label silently became 1. The label columns are not modified, so this cell
# gives the same result however many times it is executed.
y_train = [0 if label == 'fake' else 1 for label in train_data['label']]
y_test = [0 if label == 'fake' else 1 for label in test_data['label']]

In [183]:
###### Obtain Tfidf Vectors #####
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to encode the test documents.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# X_train / X_test are rebound here from lists of strings to the TF-IDF
# results. This cell expects the string lists on entry, so re-run the cell
# that builds them before re-running this one.
X_train, X_test = X_train_tfidf, X_test_tfidf

In [184]:
###### Fit scikit-learn's logistic regression for comparison #####
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can chain.
log_reg = LogisticRegression(max_iter = 1000).fit(X_train, y_train)

# Score both splits, then report them.
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)
print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

Train Accuracy:  0.9923805704546593
Test Accuracy:  0.9888
