## Explore the data

In [60]:
# import libs
import pandas as pd
import numpy as np

# read the labelled news CSV; its first column becomes the row index
data_path = './data/fake_or_real_news.csv'
df = pd.read_csv(data_path, index_col=0)

# report the frame's dimensions, then preview its leading rows
print(df.shape, df.head(), sep='\n')

(6335, 3)
                                                   title  \
8476                        You Can Smell Hillary’s Fear   
10294  Watch The Exact Moment Paul Ryan Committed Pol...   
3608         Kerry to go to Paris in gesture of sympathy   
10142  Bernie supporters on Twitter erupt in anger ag...   
875     The Battle of New York: Why This Primary Matters   

                                                    text label  
8476   Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
10294  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
3608   U.S. Secretary of State John F. Kerry said Mon...  REAL  
10142  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
875    It's primary day in New York and front-runners...  REAL  


In [20]:
# tally how many rows carry each label value
label_counts = df.groupby('label')['label'].count()
label_counts

label
FAKE    3164
REAL    3171
Name: label, dtype: int64

## Prepare training and testing data

In [41]:
# import libs
from sklearn.model_selection import train_test_split

# the 'text' column is the model input; the 'label' column is the target
features, labels = df['text'], df['label']

# hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=25)

# preview the training texts
X_train.head()

1502    Republican presidential frontrunner Donald Tru...
3072    Data scientists at Facebook recently published...
8996    Get short URL 0 0 0 0 US Deputy Secretary of D...
799     Republican presidential candidate Donald Trump...
976     Republican presidential candidate Ted Cruz is ...
Name: text, dtype: object

## Create word vectors

In [42]:
# import classes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Count Vectorizer

In [44]:
# Build a bag-of-words vectorizer that filters out English stop words.
count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training texts only, and encode them.
X_train_count_vec = count_vectorizer.fit_transform(X_train)

# Encode the test texts with the vocabulary learned above (no refit,
# so nothing from the test set influences the features).
X_test_count_vec = count_vectorizer.transform(X_test)

# Display the training matrix shape. The cell's recorded output is a shape
# tuple, but no statement in the original cell produced it; emitting it here
# keeps the output reproducible on a fresh run.
X_train_count_vec.shape

(4751, 59469)

In [52]:
# Peek at the last ten entries of the learned vocabulary; in the recorded
# output these are tokens containing non-Latin characters.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm the installed version before re-running.
count_vectorizer.get_feature_names()[-10:]

['ťthird',
 'ťtwo',
 'ťwho',
 'ťđ',
 'ź50',
 'νοεμβρίου',
 'главная',
 'октября',
 'яркий',
 'القادمون']

### TF-IDF Vectorizer

In [45]:
# TF-IDF weighted vectorizer, again filtering out English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# learn the vocabulary and weights from the training texts and encode them in one pass
X_train_tfidf_vec = tfidf_vectorizer.fit_transform(raw_documents=X_train)

# encode the test texts with the already-fitted vectorizer
X_test_tfidf_vec = tfidf_vectorizer.transform(raw_documents=X_test)

In [49]:
# Peek at the first ten entries of the vocabulary learned by the TF-IDF
# vectorizer; in the recorded output these are purely numeric tokens.
tfidf_vectorizer.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '000000031',
 '00000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '0002']

Looking at the bag of words, some of the tokens are nonsense, such as '00' and tokens made up of non-English characters.

### Encoding labels

In [54]:
# import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# learn the label -> integer mapping from the training labels only
le = LabelEncoder().fit(y_train)

# apply that same mapping to both the training and the test labels
y_train_encode, y_test_encode = (le.transform(part) for part in (y_train, y_test))

In [55]:
# Show the label values the encoder learned from y_train.
# NOTE(review): the position of each value in this array is presumably the
# integer code it is mapped to -- confirm against the LabelEncoder docs.
le.classes_

array(['FAKE', 'REAL'], dtype=object)

## Build the model

Using a naive Bayes model (MultinomialNB)

In [58]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


def predict(X_train, X_test, vectorizer, y_train=None, y_test=None):
    """Fit a MultinomialNB on one feature matrix and print its test accuracy.

    Parameters
    ----------
    X_train : feature matrix the classifier is fitted on.
    X_test : feature matrix the fitted classifier predicts on.
    vectorizer : display name of the vectorization method; it is only
        interpolated into the printed message.
    y_train : optional training labels. When omitted, the notebook-level
        ``y_train_encode`` is used (the original behaviour).
    y_test : optional test labels. When omitted, the notebook-level
        ``y_test_encode`` is used (the original behaviour).

    Returns
    -------
    None. The accuracy is reported on stdout only.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working, while letting callers pass labels explicitly instead of relying
    # on hidden kernel state.
    if y_train is None:
        y_train = y_train_encode
    if y_test is None:
        y_test = y_test_encode

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    y_predict = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)

    print('The accuracy of {} Vectorizer method is {}'.format(vectorizer, accuracy))


# score both feature representations with the same classifier setup
for train_matrix, test_matrix, method in (
        (X_train_count_vec, X_test_count_vec, 'Count'),
        (X_train_tfidf_vec, X_test_tfidf_vec, 'TF-IDF')):
    predict(train_matrix, test_matrix, method)

The accuracy of Count Vectorizer method is 0.9002525252525253
The accuracy of TF-IDF Vectorizer method is 0.8535353535353535


## Extra - using Pipeline and RandomizedSearchCV to find better parameters

In [77]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

# Two-step pipeline: vectorize the raw text, then classify. The 'vector' step
# is itself one of the searched parameters below, so the CountVectorizer given
# here only serves as the initial placeholder.
pipe = Pipeline([('vector', CountVectorizer()),
                 ('clf', MultinomialNB())])

# Pool of 1000 candidate values drawn once from scipy's default uniform
# distribution with a fixed seed. The original code made this identical draw
# twice (same size, same seed), so max_df and alpha already shared the same
# candidate values; drawing once makes that explicit.
unit_candidates = uniform.rvs(size=1000, random_state=25)

# Parameter options to sample from. Keys use the '<step>__<param>' convention
# to reach into the pipeline steps.
param_dist = {
    'vector': [CountVectorizer(), TfidfVectorizer()],
    'vector__max_df': unit_candidates,
    'vector__lowercase': [True, False],
    'vector__stop_words': ['english', None],
    'clf__alpha': unit_candidates
}

# Randomized search (not an exhaustive grid): evaluate 10 sampled parameter
# combinations with 10-fold cross-validation. random_state seeds the sampling
# so the chosen combinations, and therefore the scores, are reproducible.
# The name `grid` is kept because a later cell reads grid.cv_results_.
grid = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=10,
                          scoring='accuracy', cv=10, random_state=25)
grid.fit(X_train, y_train)

# evaluate the fitted search on the held-out test split
test_accuracy = grid.score(X_test, y_test)
print('Accuracy of Grid search CV is {}'.format(test_accuracy))

Accuracy of Grid search CV is 0.9191919191919192


In [81]:
# tabulate the per-candidate cross-validation results of the search
results = pd.DataFrame(data=grid.cv_results_)
results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__alpha,param_vector,param_vector__lowercase,param_vector__max_df,param_vector__stop_words,params,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,2.149603,0.243885,0.890549,0.949601,0.588797,"CountVectorizer(analyzer='word', binary=False,...",True,0.375215,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.905263,0.949953,0.892632,0.949252,0.888186,0.949264,0.076742,0.009199,0.011149,0.001351
1,2.115995,0.219956,0.909703,0.962745,0.661949,"CountVectorizer(analyzer='word', binary=False,...",False,0.212457,english,"{'vector__stop_words': 'english', 'vector__max...",...,0.924211,0.960711,0.905263,0.962816,0.902954,0.962824,0.030318,0.006661,0.010293,0.0012
2,2.116126,0.220981,0.913702,0.966136,0.433797,"CountVectorizer(analyzer='word', binary=False,...",False,0.719055,english,"{'vector__stop_words': 'english', 'vector__max...",...,0.924211,0.964453,0.917895,0.965622,0.907173,0.964695,0.029326,0.006829,0.007576,0.001099
3,2.121314,0.223713,0.86466,0.929629,0.591205,"TfidfVectorizer(analyzer='word', binary=False,...",False,0.665257,english,"{'vector__stop_words': 'english', 'vector__max...",...,0.869474,0.928438,0.863158,0.931244,0.869198,0.927285,0.032162,0.005072,0.011409,0.001403
4,2.226498,0.249053,0.911177,0.961903,0.505411,"CountVectorizer(analyzer='word', binary=False,...",False,0.686236,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.928421,0.960711,0.913684,0.962348,0.909283,0.961188,0.023214,0.006201,0.008931,0.001244
5,2.138186,0.225069,0.849926,0.914498,0.80522,"TfidfVectorizer(analyzer='word', binary=False,...",False,0.921901,english,"{'vector__stop_words': 'english', 'vector__max...",...,0.852632,0.915809,0.852632,0.915341,0.85654,0.910919,0.035922,0.008178,0.0144,0.002136
6,2.233769,0.247308,0.863608,0.930868,0.437611,"TfidfVectorizer(analyzer='word', binary=False,...",False,0.910929,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.865263,0.931244,0.858947,0.931712,0.871308,0.927519,0.026698,0.005981,0.008164,0.001338
7,2.221219,0.24808,0.870133,0.935452,0.514244,"TfidfVectorizer(analyzer='word', binary=False,...",False,0.325647,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.873684,0.934284,0.871579,0.936857,0.871308,0.933131,0.017949,0.00769,0.006734,0.001281
8,2.230145,0.254175,0.866765,0.932529,0.519465,"TfidfVectorizer(analyzer='word', binary=False,...",True,0.374185,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.867368,0.933583,0.873684,0.934518,0.873418,0.930325,0.022595,0.006164,0.011657,0.001278
9,2.219765,0.239571,0.879394,0.971819,0.408203,"TfidfVectorizer(analyzer='word', binary=False,...",True,0.0314957,,"{'vector__stop_words': None, 'vector__max_df':...",...,0.877895,0.972404,0.877895,0.971235,0.896624,0.971475,0.027118,0.006756,0.012197,0.001009
