In [28]:
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.utils import shuffle

from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

np.random.seed(0)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jakeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# kaggle dataset 1

fake_raw_1 = pd.read_csv('dataset_1/Fake.csv')
fake_raw_1['label'] = 'fake'

true_raw_1 = pd.read_csv('dataset_1/True.csv')
true_raw_1['label'] = 'true'

In [30]:
# kaggle dataset 2

raw_2 = pd.read_csv('dataset_2/fake_train.csv')
raw_2['label'] = np.where(raw_2['label'] == 1, 'true', 'fake')

In [40]:
# dataset 3 - COVID

fake_covid = pd.read_csv('dataset_3/fake_covid_dataset.csv')
fake_covid_clean = fake_covid.loc[~fake_covid["subcategory"].str.contains("partially false")][["title", "text", "subcategory"]]
fake_covid_clean["label"] = np.where(fake_covid_clean["subcategory"].str.contains("false news"), "fake", "true")
fake_covid_shuffled = fake_covid_clean.sample(frac=1)[["title", "text", "label"]].reset_index(drop=True)
fake_covid_final = pd.concat([fake_covid_shuffled.loc[fake_covid_shuffled["label"]=="true"].head(659), fake_covid_shuffled.loc[fake_covid_shuffled["label"]=="fake"]]).dropna().reset_index(drop=True)

In [44]:
# keep only text, title and label and combine to form to full set

full_set = fake_raw_1[['title', 'text', 'label']]
full_set = full_set.append(true_raw_1[['title', 'text', 'label']])
full_set = full_set.append(raw_2[['title', 'text', 'label']])
full_set = full_set.append(fake_covid_final[['title', 'text', 'label']])

print(full_set.shape)

(67014, 3)


In [50]:
full_set = shuffle(full_set)

train_data = full_set[:52000]
test_data = full_set[52000:62000]
dev_data = full_set[62000:]

train_text, train_title, train_labels = train_data['text'], train_data['title'], train_data['label']
test_text, test_title, test_labels = test_data['text'], test_data['title'], test_data['label']
dev_text, dev_title, dev_labels = dev_data['text'], dev_data['title'], dev_data['label']

In [51]:
train_data.head()

Unnamed: 0,title,text,label
15898,Silicon Valley Slams Elon Musk for Tweeting Re...,Tesla and SpaceX CEO Elon Musk suffered huge ...,fake
9257,WATCH: 3 PRESIDENTS Before Trump Promised To M...,"On Tuesday, Democratic Senator Dianne Feinstei...",fake
19711,Rohingya refugees tell of new violence; call f...,"COX S BAZAR, Bangladesh (Reuters) - Rohingya M...",true
19519,Ukraine's Poroshenko rejects Russia's 'hybrid'...,UNITED NATIONS (Reuters) - Ukrainian President...,true
4944,Florida Republican Has An INSANE Meltdown Aft...,"Dan Bongino, a former Secret Service agent, is...",fake


In [52]:
# create a CountVectorizer of the training corpus
# the nltk list of stopwords is used, and the word 'reuters' is added to avoid overfitting
# porterstemmer is used to tokenize the input as well as the stopwords list
# the lambda function is used to cast text to strings (causes errors otherwise)

def stem_preprocess(word):
    ps = nltk.stem.PorterStemmer()
    return ps.stem(word)

sw = stopwords.words('english')
sw.append('reuters')
sw_preprocess = sw.copy()

for word in sw:
    sw_preprocess.append(stem_preprocess(word))

cv_train = CountVectorizer(stop_words=sw_preprocess,
                          strip_accents='ascii', 
                          lowercase=True, 
                          preprocessor=stem_preprocess)
train_text_cv = cv_train.fit_transform(train_text.apply(lambda x: np.str_(x)))
test_text_cv = cv_train.transform(test_text.apply(lambda x: np.str_(x)))

print(np.shape(train_text_cv))

(52000, 212941)


In [53]:
# Decision Tree Classifier - Sample

clf = tree.DecisionTreeClassifier(max_depth=20)
clf.fit(train_text_cv, train_labels)

DecisionTreeClassifier(max_depth=20)

In [54]:
clf.score(test_text_cv, test_labels)

0.8674