In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import unicodedata

%matplotlib inline

In [2]:
# Load the news articles from news.csv (path is relative to the notebook's working directory).
dataset = pd.read_csv('./news.csv')

In [3]:
# Peek at the first rows to see which columns were loaded.
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [16]:
# Show the first five values of the 'Unnamed: 0' column.
dataset['Unnamed: 0'].head(5)

0     8476
1    10294
2     3608
3    10142
4      875
Name: Unnamed: 0, dtype: int64

In [4]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.0+ KB


In [5]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(6335, 4)

In [6]:
# List the column labels.
dataset.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [7]:
# Count the missing values in each column.
dataset.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [13]:
# Encode the string labels as integers for training: REAL becomes 1 and FAKE becomes 0.
# Series.map() yields NaN for any value that is not a key of the mapping.
label_feature_mapping = dict(REAL=1, FAKE=0)
dataset['label'] = dataset['label'].map(label_feature_mapping)

In [14]:
# Show the first five encoded labels.
dataset['label'].head(5)

0    0
1    0
2    1
3    0
4    1
Name: label, dtype: int64

In [9]:
# Display the article text column.
dataset['text']

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
5         \nI’m not an immigrant, but my grandparents ...
6       Share This Baylee Luciani (left), Screenshot o...
7       A Czech stockbroker who saved more than 650 Je...
8       Hillary Clinton and Donald Trump made some ina...
9       Iranian negotiators reportedly have made a las...
10      CEDAR RAPIDS, Iowa — “I had one of the most wo...
11      Donald Trump’s organizational problems have go...
12      Click Here To Learn More About Alexandra's Per...
13      October 31, 2016 at 4:52 am \nPretty factual e...
14      Killing Obama administration rules, dismantlin...
15      As more women move into high offices, they oft...
16      Shocking! Michele Obama & Hillary Caught Glamo...
17      0 \nHi

In [10]:
# Build a deletion table for str.translate(): every code point whose Unicode
# general category starts with 'P' (punctuation) is mapped to None, which makes
# translate() drop that character.
# range() excludes its stop value, so the stop is sys.maxunicode + 1 to make sure
# the very last code point is inspected as well.
punctuation_dictionary = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

In [15]:
# Remove punctuation from every article with pandas' vectorized string accessor.
# Series.str.translate applies str.translate() element-wise and leaves missing
# values as NaN instead of raising on them, unlike a plain Python loop.
dataset['text'] = dataset['text'].str.translate(punctuation_dictionary)

In [19]:
# Drop the 'Unnamed: 0' and 'title' columns.
dataset = dataset.drop(labels=['Unnamed: 0', 'title'], axis='columns')

In [66]:
# Preview the first rows of the frame again.
dataset.head()

Unnamed: 0,text,label
0,Daniel Greenfield a Shillman Journalism Fellow...,0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,US Secretary of State John F Kerry said Monday...,1
3,Kaydee King KaydeeKing November 9 2016 The le...,0
4,Its primary day in New York and frontrunners H...,1


In [67]:
# Split the rows by class: label 1 (REAL) and label 0 (FAKE).
# Note: despite the names, both variables hold DataFrames, not counts.
number_of_real_news = dataset.loc[dataset['label'].eq(1)]
number_of_fake_news = dataset.loc[dataset['label'].eq(0)]

In [68]:
# (rows, columns) of the subset whose label is 1.
number_of_real_news.shape

(3171, 2)

In [69]:
# (rows, columns) of the subset whose label is 0.
number_of_fake_news.shape

(3164, 2)

In [21]:
# Model input is the article text; the prediction target is the encoded label.
feature, target = dataset['text'], dataset['label']

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [46]:
# Split data into TRAINING and TESTING.
# 80% and 20% for the features (x_train & x_test).
# 80% and 20% for the target (y_train & y_test).
# random_state = 0 fixes the shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size = 0.2, random_state = 0)

In [47]:
# First training texts; the index keeps each article's original row number.
x_train.head()

2402    Christian Whiton is a former deputy special en...
1922    Super Tuesday Brings Harsh Light And Heartache...
3475    Prev post Page 1 of 4 Next \nNurses are among ...
6197    The deadly hostage situation at a luxury hotel...
4748    Our new country Women and minorities hit harde...
Name: text, dtype: object

In [55]:
# Number of training texts (the 80% side of the split).
x_train.shape

(5068,)

In [48]:
# First held-out test texts.
x_test.head()

3789    Watch the above reports by CBNs David Brody an...
733      Bernie Sanders BernieSanders October 27 2016 ...
4783    \nAs of October 29 there have been at least 14...
3067    Members of Congress have said that if Hillary ...
5288    Julian Zelizer is a professor of history and p...
Name: text, dtype: object

In [56]:
# Number of test texts (the 20% side of the split).
x_test.shape

(1267,)

In [49]:
# First training labels; the index should line up with x_train.
y_train.head()

2402    1
1922    1
3475    0
6197    1
4748    0
Name: label, dtype: int64

In [57]:
# Number of training labels; should equal the number of training texts.
y_train.shape

(5068,)

In [50]:
# First test labels; the index should line up with x_test.
y_test.head()

3789    1
733     0
4783    0
3067    0
5288    1
Name: label, dtype: int64

In [58]:
# Number of test labels; should equal the number of test texts.
y_test.shape

(1267,)

In [51]:
# Turn the raw text into TF-IDF feature vectors.
# The vectorizer is fitted on the training split only; the test split is then
# transformed with that already-fitted vectorizer, so the test documents do not
# influence the fit.
vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)

In [71]:
# Logistic Regression.
# Fit on the vectorized training split; fit() returns the estimator itself.
logistic_regressor = LogisticRegression().fit(vectorized_x_train, y_train)
# Predict a label for every document in the held-out test split.
logistic_regressor_predictions = logistic_regressor.predict(vectorized_x_test)
# Share of test labels predicted correctly, as a percentage rounded to two decimals.
accuracy_score_of_logistic_regressor = round(100 * accuracy_score(y_test, logistic_regressor_predictions), 2)
print(accuracy_score_of_logistic_regressor)

90.29


In [73]:
# Decision Tree Classifier.
# random_state pins the tree's internal randomness so that re-running the notebook
# reproduces the same score; 0 matches the seed used for the train/test split.
decision_tree = DecisionTreeClassifier(random_state = 0)
decision_tree.fit(vectorized_x_train, y_train)  # Train on the vectorized training split.
decision_tree_predictions = decision_tree.predict(vectorized_x_test)  # Predict the held-out test split.
# accuracy_score expects (y_true, y_pred); report it as a percentage with two decimals.
accuracy_score_of_decision_tree = round(accuracy_score(y_test, decision_tree_predictions) * 100, 2)
print(accuracy_score_of_decision_tree)

80.35


In [74]:
# Random Forest Classifier.
# random_state pins the forest's internal randomness so that re-running the notebook
# reproduces the same score; 0 matches the seed used for the train/test split.
random_forest_classifier = RandomForestClassifier(random_state = 0)
random_forest_classifier.fit(vectorized_x_train, y_train)  # Train on the vectorized training split.
random_forest_classifier_predictions = random_forest_classifier.predict(vectorized_x_test)  # Predict the held-out test split.
# accuracy_score expects (y_true, y_pred); report it as a percentage with two decimals.
accuracy_score_of_random_forest_classifier = round(accuracy_score(y_test, random_forest_classifier_predictions) * 100, 2)
print(accuracy_score_of_random_forest_classifier)

82.16
