In [5]:
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.utils import shuffle

from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

np.random.seed(0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# kaggle dataset 1

fake_raw_1 = pd.read_csv('https://raw.githubusercontent.com/jacobbarkow/w207-final-project-barkow-laface-meehan-skokowski/main/code/dataset_1/Fake.csv')
fake_raw_1['label'] = 'fake'

true_raw_1 = pd.read_csv('https://raw.githubusercontent.com/jacobbarkow/w207-final-project-barkow-laface-meehan-skokowski/main/code/dataset_1/True.csv')
true_raw_1['label'] = 'true'

In [7]:
# kaggle dataset 2

raw_2 = pd.read_csv('https://raw.githubusercontent.com/jacobbarkow/w207-final-project-barkow-laface-meehan-skokowski/main/code/dataset_2/fake_train.csv')
raw_2['label'] = np.where(raw_2['label'] == 1, 'true', 'fake')

In [8]:
# dataset 3 - COVID

fake_covid = pd.read_csv('https://raw.githubusercontent.com/jacobbarkow/w207-final-project-barkow-laface-meehan-skokowski/main/code/dataset_3/fake_covid_dataset.csv')
fake_covid_clean = fake_covid.loc[~fake_covid["subcategory"].str.contains("partially false")][["title", "text", "subcategory"]]
fake_covid_clean["label"] = np.where(fake_covid_clean["subcategory"].str.contains("false news"), "fake", "true")
fake_covid_shuffled = fake_covid_clean.sample(frac=1)[["title", "text", "label"]].reset_index(drop=True)
fake_covid_final = pd.concat([fake_covid_shuffled.loc[fake_covid_shuffled["label"]=="true"].head(659), fake_covid_shuffled.loc[fake_covid_shuffled["label"]=="fake"]]).dropna().reset_index(drop=True)

In [9]:
# keep only text, title and label and combine to form to full set

full_set = fake_raw_1[['title', 'text', 'label']]
full_set = full_set.append(true_raw_1[['title', 'text', 'label']])
full_set = full_set.append(raw_2[['title', 'text', 'label']])
full_set = full_set.append(fake_covid_final[['title', 'text', 'label']])

print(full_set.shape)

(67014, 3)


In [10]:
full_set = shuffle(full_set)

train_data = full_set[:52000]
test_data = full_set[52000:62000]
dev_data = full_set[62000:]

train_text, train_title, train_labels = train_data['text'], train_data['title'], train_data['label']
test_text, test_title, test_labels = test_data['text'], test_data['title'], test_data['label']
dev_text, dev_title, dev_labels = dev_data['text'], dev_data['title'], dev_data['label']

In [11]:
train_data.head()

Unnamed: 0,title,text,label
19663,SPEAKER SCHEDULED TO Praise Hillary Totally Tr...,A Bernie Sanders supporter who slammed Hillary...,fake
11674,Rory McIlroy Comes Back Early From Golf Rehab ...,Rory McIlroy joined President Donald Trump on ...,fake
12305,German police raid locations linked to Islamis...,BERLIN (Reuters) - German police investigating...,true
5089,Outdoor Clothing Giant Patagonia Pledging to L...,The CEO of outdoor clothing giant Patagonia is...,fake
12845,WOW! DEM STRATEGIST Bob Beckel Says Wikileaks ...,Is CNN contributor and Democratic Strategist B...,fake


In [12]:
# create a CountVectorizer of the training corpus
# the nltk list of stopwords is used, and the word 'reuters' is added to avoid overfitting
# porterstemmer is used to tokenize the input as well as the stopwords list
# the lambda function is used to cast text to strings (causes errors otherwise)

def stem_preprocess(word):
    ps = nltk.stem.PorterStemmer()
    return ps.stem(word)

sw = stopwords.words('english')
sw.append('reuters')
sw_preprocess = sw.copy()

for word in sw:
    sw_preprocess.append(stem_preprocess(word))

cv_train = CountVectorizer(stop_words=sw_preprocess,
                          strip_accents='ascii', 
                          lowercase=True, 
                          preprocessor=stem_preprocess)
train_text_cv = cv_train.fit_transform(train_text.apply(lambda x: np.str_(x)))
test_text_cv = cv_train.transform(test_text.apply(lambda x: np.str_(x)))

print(np.shape(train_text_cv))

(52000, 213537)


# Decision Trees - Exploration

In [19]:
# Decision Tree Classifier - Sample

print("\n---------------------Decision Trees--------------------")

for depth in [2,3,4,5,10,15,20]:
  clf = tree.DecisionTreeClassifier(max_depth=depth)
  clf.fit(train_text_cv, train_labels)
  test_predicted = clf.predict(test_text_cv)
  print("Tree depth: ", depth, "F1 Score: ", metrics.f1_score(test_labels, test_predicted, average="weighted"))


---------------------Decision Tree--------------------
Tree depth:  2 F1 Score:  0.6848124152560653
Tree depth:  3 F1 Score:  0.7356219548544219
Tree depth:  4 F1 Score:  0.7456380977748216
Tree depth:  5 F1 Score:  0.7828158310536464
Tree depth:  10 F1 Score:  0.8208322582494911
Tree depth:  15 F1 Score:  0.8527800499676458
Tree depth:  20 F1 Score:  0.8620249289650428


# Random Forest - Exploration

In [20]:
print("\n---------------------Random Forests--------------------")

for depth in [2,3,4,5,10,15,20]:
  rfc = RandomForestClassifier(max_depth=depth)
  rfc.fit(train_text_cv, train_labels)
  test_predicted = rfc.predict(test_text_cv)
  print("Tree depth: ", depth, "F1 Score: ", metrics.f1_score(test_labels, test_predicted, average="weighted"))


---------------------Random Forests--------------------
Tree depth:  2 F1 Score:  0.572174712139491
Tree depth:  3 F1 Score:  0.7489877280229595
Tree depth:  4 F1 Score:  0.780011607724576
Tree depth:  5 F1 Score:  0.7825628642949174
Tree depth:  10 F1 Score:  0.8055055897832193
Tree depth:  15 F1 Score:  0.8214493521030416
Tree depth:  20 F1 Score:  0.8297816970622229


# Multinomial Naive Bayes - Exploration

In [15]:
alphas = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]
l2s = [0.4,0.5,0.6,0.9,1,1.5,5,10,20]

In [16]:
print("\n---------------------Naive Bayes--------------------")
for a in alphas:
  nb_model = MultinomialNB(alpha=a)
  nb_fit = nb_model.fit(train_text_cv,train_labels)
  pred_nb = nb_model.predict(test_text_cv)
  f1score = metrics.f1_score(test_labels,pred_nb ,average="weighted")
  print("Alpha: ", a, "F1 Score: ",f1score)


---------------------Naive Bayes--------------------
Alpha:  1e-10 F1 Score:  0.8582299813084111
Alpha:  0.0001 F1 Score:  0.8548427071702348
Alpha:  0.001 F1 Score:  0.8544137354028543
Alpha:  0.01 F1 Score:  0.8499620437016919
Alpha:  0.1 F1 Score:  0.8464707776119172
Alpha:  0.5 F1 Score:  0.8407565676132636
Alpha:  1.0 F1 Score:  0.8377795686825913
Alpha:  2.0 F1 Score:  0.8338646447974721
Alpha:  10.0 F1 Score:  0.8198095691961272


# Logistic Regression - Exploration

In [18]:
print("\n---------------------Logistic Regression--------------------")
c=1
for c in l2s:
  lr_model =LogisticRegression(C=c, solver="liblinear", multi_class="auto")
  lr_fit = lr_model.fit(train_text_cv,train_labels)
  pred_lr = lr_model.predict(test_text_cv)
  f1score = metrics.f1_score(test_labels,pred_lr ,average="weighted")
  # Get the weights for each
  weights = lr_model.coef_
  ssw = np.sum(weights,axis=1)**2 ###axis=1 will print out the sum of squared weights for each topic
  print("L2 Regularization Strength: ", c, "F1 Score: ",f1score,"C Values: ",ssw)


---------------------Logistic Regression--------------------
L2 Regularization Strength:  0.4 F1 Score:  0.9224960079283675 C Values:  [9980.13291228]
L2 Regularization Strength:  0.5 F1 Score:  0.9211927381477373 C Values:  [10062.11337502]
L2 Regularization Strength:  0.6 F1 Score:  0.9210924278922512 C Values:  [6857.49217124]
L2 Regularization Strength:  0.9 F1 Score:  0.9178797453414754 C Values:  [8491.37430308]
L2 Regularization Strength:  1 F1 Score:  0.9205869857984273 C Values:  [8575.63130901]
L2 Regularization Strength:  1.5 F1 Score:  0.916982853739491 C Values:  [12718.78775773]
L2 Regularization Strength:  5 F1 Score:  0.9147762256204318 C Values:  [20332.20025108]
L2 Regularization Strength:  10 F1 Score:  0.9177851435405319 C Values:  [15031.63123608]
L2 Regularization Strength:  20 F1 Score:  0.9202947390443358 C Values:  [8504.703211]
