In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv('fake_or_real_news.csv')

print(df.head())
print(df.info())

   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6

In [13]:
# Create a pandas series to store the labels
y = df['label']

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],y,test_size = 0.33,random_state = 53)
# Why not use 'title' as well?

# Initialize a CountVectorizer object: simply counts number of times a word appears
count_vectorizer = CountVectorizer(stop_words = 'english')

# Fit-Transform the training data into word vectors, ignoring stop words
count_train = count_vectorizer.fit_transform(X_train.values)

# Transform the test data into word vectors
# Potential issue: words appearing in test data and not appearing in training data
count_test = count_vectorizer.transform(X_test.values)

print(type(count_train))
print(count_train.A[:25])

<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 3 0 ..., 0 0 0]
 [0 2 0 ..., 0 0 0]]


In [14]:
# Initialize a TfidfVectorizer object: weight given by product of word frequency and log of inverse document frequency
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df=0.7)
# max_df = 0.7 means "ignore tokens that appear in more than 70% of the documents" (in addition to stop words)

# Fit-Transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# Transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# Print the first 25 vectors of the tfidf training data
print(type(tfidf_train))
print(tfidf_train.A[:25])

<class 'scipy.sparse.csr.csr_matrix'>
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.02017005  0.         ...,  0.          0.          0.        ]
 [ 0.          0.0332992   0.         ...,  0.          0.          0.        ]]


In [21]:
print(type(count_vectorizer), type(tfidf_vectorizer))

# Create the CountVectorizer DataFrame
count_df = pd.DataFrame(data = count_train.A, columns = count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame
tfidf_df = pd.DataFrame(data = tfidf_train.A, columns = tfidf_vectorizer.get_feature_names())

print(count_df.head())
print(tfidf_df.head())

# Calculate the difference between sets of columns
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference) # difference is the empty set

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))
# Same columns but different word vectors

<class 'sklearn.feature_extraction.text.CountVectorizer'> <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
   00  000  0000  00000031  000035  00006  0001  0001pt  000ft  000km  ...    \
0   0    0     0         0       0      0     0       0      0      0  ...     
1   0    0     0         0       0      0     0       0      0      0  ...     
2   0    0     0         0       0      0     0       0      0      0  ...     
3   0    0     0         0       0      0     0       0      0      0  ...     
4   0    0     0         0       0      0     0       0      0      0  ...     

   حلب  عربي  عن  لم  ما  محاولات  من  هذا  والمرضى  ยงade  
0    0     0   0   0   0        0   0    0        0      0  
1    0     0   0   0   0        0   0    0        0      0  
2    0     0   0   0   0        0   0    0        0      0  
3    0     0   0   0   0        0   0    0        0      0  
4    0     0   0   0   0        0   0    0        0      0  

[5 rows x 56922 columns]
    00  000

In [26]:
# Import the necessary modules
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train,y_train.values)

# Create the predicted tags
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score
accuracy = metrics.accuracy_score(y_test.values, pred)
print(accuracy)

# Calculate the confusion matrix
cm = metrics.confusion_matrix(y_test, pred, labels = ['FAKE', 'REAL'])
print(cm)

0.893352462936
[[ 865  143]
 [  80 1003]]


In [29]:
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags
tfidf_pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score
tfidf_accuracy = metrics.accuracy_score(y_test, tfidf_pred)
print(tfidf_accuracy)

# Calculate the confusion matrix
tfidf_cm = metrics.confusion_matrix(y_test, tfidf_pred, labels=['FAKE', 'REAL'])
print(tfidf_cm)

0.856527977044
[[ 739  269]
 [  31 1052]]


In [31]:
import numpy as np

# Create the list of alpha values
alphas = np.arange(0.1,1.1,0.1)
# 1.0 is the default alpha value in MultinomialNB

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier
    nb_classifier = MultinomialNB(alpha = alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.1
Score:  0.897656623625

Alpha:  0.2
Score:  0.893830703013

Alpha:  0.3
Score:  0.890004782401

Alpha:  0.4
Score:  0.885700621712

Alpha:  0.5
Score:  0.884265901483

Alpha:  0.6
Score:  0.874701099952

Alpha:  0.7
Score:  0.870396939264

Alpha:  0.8
Score:  0.866092778575

Alpha:  0.9
Score:  0.858919177427

Alpha:  1.0
Score:  0.856527977044



In [48]:
# Get the class labels, namely FAKE and REAL
class_labels = nb_classifier.classes_

# Extract the features, as before
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights (in what order?)
feat_with_weights = sorted(zip(feature_names, nb_classifier.coef_[0]))

# Top 50 features for FAKE classification (along with their weights)
#print(class_labels[0], feat_with_weights[:50])

# Top 50 features for REAL classification (equivalently, bottom 50 features for FAKE classification) 
print(class_labels[1], feat_with_weights[-50:])

REAL [('ראש', -11.316312804238807), ('רבה', -11.316312804238807), ('רלוונטיים', -11.316312804238807), ('רק', -11.316312804238807), ('שאוסלו', -11.316312804238807), ('שהוגדר', -11.316312804238807), ('שהיא', -11.316312804238807), ('שהיו', -11.316312804238807), ('שהמבצע', -11.316312804238807), ('שוך', -11.316312804238807), ('שולטים', -11.316312804238807), ('שזו', -11.316312804238807), ('שטחים', -11.316312804238807), ('שינוי', -11.316312804238807), ('שיתעקש', -11.316312804238807), ('שכל', -11.316312804238807), ('שכמוני', -11.316312804238807), ('של', -11.316312804238807), ('שלו', -11.316312804238807), ('שנדרש', -11.316312804238807), ('שני', -11.316312804238807), ('שעת', -11.316312804238807), ('שתי', -11.316312804238807), ('תאמצנה', -11.316312804238807), ('תוצאה', -11.316312804238807), ('תחל', -11.316312804238807), ('תיירות', -11.316312804238807), ('תנותק', -11.316312804238807), ('תעודת', -11.316312804238807), ('תתרכז', -11.316312804238807), ('أن', -11.316312804238807), ('إجلاء', -11.3163128