In [70]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.utils import shuffle

from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier

import nltk

# Seed NumPy's global random number generator so that anything drawing from it
# later in the notebook starts from the same state on a fresh top-to-bottom run.
np.random.seed(0)

In [2]:
# Kaggle dataset 1: fake and genuine articles ship as two separate CSV files,
# so each frame is tagged with its class label as it is loaded.
fake_raw = pd.read_csv('Fake.csv').assign(label='fake')
true_raw = pd.read_csv('True.csv').assign(label='true')

In [3]:
# kaggle dataset 2
# Single CSV loaded as-is; later cells read its 'title', 'text' and 'label' columns.

raw_2 = pd.read_csv('archive_2/fake_train.csv')

In [4]:
# Recode dataset 2's label column into the same 'true' / 'fake' strings used for
# dataset 1: 1 -> 'true', anything else -> 'fake'.
#
# The recode only runs while the column is still numeric, which makes this cell
# safe to re-run: once the column holds strings, `== 1` matches nothing and a
# second pass would silently turn every row into 'fake'.
# (Assumes read_csv parsed 'label' as a numeric column.)
#
# NOTE(review): confirm against the dataset description that 1 really marks a
# genuine article here; if the source encodes 1 as "unreliable", this mapping is
# inverted.
if pd.api.types.is_numeric_dtype(raw_2['label']):
    raw_2['label'] = np.where(raw_2['label'] == 1, 'true', 'fake')


In [5]:
# Keep only title, text and label from each source and stack them into one frame.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and builds the result in one pass.
# The original row indices are deliberately kept (no ignore_index), so index
# values repeat across the three sources exactly as they did with append.
full_set = pd.concat(
    [frame[['title', 'text', 'label']] for frame in (fake_raw, true_raw, raw_2)]
)

In [6]:
# Shuffle the combined frame once, then split it by row position:
#   rows [0, TRAIN_END)         -> train
#   rows [TRAIN_END, TEST_END)  -> test
#   rows [TEST_END, end)        -> dev
#
# The shuffle is given its own fixed seed and writes to a new name instead of
# overwriting full_set. Re-running this cell therefore reproduces the same
# split, rather than reshuffling an already-shuffled frame with whatever state
# the global RNG happens to be in (which would silently change the test rows
# under models that were already fitted).
TRAIN_END = 50000
TEST_END = 60000

full_set_shuffled = shuffle(full_set, random_state=0)

# .iloc makes the positional slicing explicit; the index holds repeated,
# non-sequential values after concatenation and shuffling.
train_data = full_set_shuffled.iloc[:TRAIN_END]
test_data = full_set_shuffled.iloc[TRAIN_END:TEST_END]
dev_data = full_set_shuffled.iloc[TEST_END:]

train_text, train_title, train_labels = train_data['text'], train_data['title'], train_data['label']
test_text, test_title, test_labels = test_data['text'], test_data['title'], test_data['label']
dev_text, dev_title, dev_labels = dev_data['text'], dev_data['title'], dev_data['label']

In [7]:
# Peek at the first few training rows to sanity-check the columns and labels.
train_data.head()

Unnamed: 0,title,text,label
470,Georgia Republican Threatens To ‘Disappear’ D...,A furious Georgia Republican issued a horrifyi...,fake
13972,Philippines' Duterte ditches peace process wit...,MANILA (Reuters) - Philippine President Rodrig...,true
14882,What Gives? Saudi King Spends A Fortune In 201...,Even though Obama has to pay for anything he w...,fake
8927,Facebook's political influence under a microscope,(Reuters) - As the U.S. presidential campaign ...,true
5349,Trump's defense chief says has no problems wit...,ABU DHABI (Reuters) - U.S. Defense Secretary J...,true


In [111]:
# Bag-of-words counts over the article bodies.
# No max_features cap is applied here, so the full vocabulary is kept; the only
# token dropped is 'reuters' (presumably because the wire-service dateline gives
# the label away -- confirm).
# Every entry is coerced to a string first so non-string cells (e.g. missing
# text) do not break the vectorizer.
train_docs = train_text.apply(np.str_)
test_docs = test_text.apply(np.str_)

cv_train = CountVectorizer(stop_words=['reuters'])
# Vocabulary is learned from the training documents only and reused for test.
train_text_cv = cv_train.fit_transform(train_docs)
test_text_cv = cv_train.transform(test_docs)

print(train_text_cv.shape)

(50000, 203765)


In [112]:
# Depth-capped decision tree trained directly on the raw count features.
clf = tree.DecisionTreeClassifier(max_depth=20).fit(train_text_cv, train_labels)
clf

DecisionTreeClassifier(max_depth=20)

In [113]:
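# Tree visualisation, left disabled (uncomment to render the fitted tree).
# NOTE: CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# on current versions use cv_train.get_feature_names_out() instead.
# fig = plt.figure(figsize=(50,50))
# _ = tree.plot_tree(clf, 
#                    feature_names=cv_train.get_feature_names(),  
#                    class_names=clf.classes_,
#                    filled=True)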

In [114]:
# Score the count-feature tree on the test split (for a classifier, .score is mean accuracy).
clf.score(test_text_cv, test_labels)

0.8789

In [115]:
# Weighted-average F1 of the count-feature tree on the test split.
test_predicted = clf.predict(test_text_cv)
tree_f1_weighted = metrics.f1_score(
    y_true=test_labels,
    y_pred=test_predicted,
    average="weighted",
)
print(tree_f1_weighted)

0.878923458046983


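- The token `reuters` is removed from the vocabulary (passed as the only stop word) before fitting.
- With 10,000 features and unrestricted depth the score is 0.88; the fitted tree reaches a depth of 243.
- Capping the max depth at 100 does not change accuracy much.
- With a max depth of 20 the score is still 0.87.

- With an unrestricted vocabulary and `max_depth` of 20 the score is 0.88.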

## SVD Tests

In [64]:
# Project the bag-of-words count matrix onto 100 components with truncated SVD
# (n_iter=7 iterations for the solver).
# The projection is fitted on the training matrix only and then applied,
# unchanged, to the test matrix.
svd = TruncatedSVD(n_components=100, n_iter=7)
train_text_svd = svd.fit_transform(train_text_cv)
test_text_svd = svd.transform(test_text_cv)

In [68]:
# Same depth-capped tree as before, but trained on the 100 SVD components
# instead of the raw counts.
clf_svd = tree.DecisionTreeClassifier(max_depth=20).fit(train_text_svd, train_labels)
clf_svd

DecisionTreeClassifier(max_depth=20)

In [69]:
# Weighted-average F1 of the SVD-feature tree on the test split.
test_predicted_svd = clf_svd.predict(test_text_svd)
svd_tree_f1_weighted = metrics.f1_score(
    y_true=test_labels,
    y_pred=test_predicted_svd,
    average="weighted",
)
print(svd_tree_f1_weighted)

0.7563503214547175


## Random Forest

In [82]:
# Random forest (each tree capped at depth 20) trained on the raw count features.
rfc = RandomForestClassifier(max_depth=20).fit(train_text_cv, train_labels)
rfc

RandomForestClassifier(max_depth=20)

In [83]:
# Score the count-feature forest on the test split (for a classifier, .score is mean accuracy).
rfc.score(test_text_cv, test_labels)

0.8688

In [116]:
# Random forest (each tree capped at depth 20) trained on the 100 SVD components,
# then scored on the test split projected with the same SVD.
rfc_svd = RandomForestClassifier(max_depth=20).fit(train_text_svd, train_labels)

rfc_svd.score(X=test_text_svd, y=test_labels)

0.8528