# Building a basic fake news classifier using machine learning.  
### Steps:  
1. Importing libraries 
2. Data preprocessing 
3. TF-IDF Vectorizer Calculation  
4. Model Building  
5. Model Evaluation - Cross Validation  
6. Result Analysis  

In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score, confusion_matrix
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [2]:
alldata = pd.read_csv('news_dataset.csv')

In [3]:
alldata.head()

Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


In [4]:
alldata['label'].replace({'fake':1, 'real':0}, inplace=True)

In [5]:
alldata.isnull().values.any()

True

In [6]:
alldata.dropna(inplace=True)

In [7]:
X_body_text = alldata['content']
X_headline_text = alldata['title']
X_combined_text = 10*(alldata['title'] + " ") + alldata['content']
y = alldata['label']

In [10]:
# max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
# max_df = 25 means "ignore terms that appear in more than 25 documents".
# The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.
tfidf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(1,2),max_df= 0.85, min_df= 0.01)

In [11]:
X_body_tfidf = tfidf.fit_transform(X_body_text)
X_headline_tfidf = tfidf.fit_transform (X_headline_text)
X_combined_tfidf = tfidf.fit_transform (X_combined_text)

In [13]:
X_headline_tfidf_train, X_headline_tfidf_test, y_headline_train, y_headline_test = train_test_split(X_headline_tfidf,y, test_size = 0.2, random_state=1234)
X_body_tfidf_train, X_body_tfidf_test, y_body_train, y_body_test = train_test_split(X_body_tfidf,y, test_size = 0.2, random_state=1234)
X_combined_tfidf_train, X_combined_tfidf_test, y_combined_train, y_combined_test = train_test_split(X_combined_tfidf,y, test_size = 0.2, random_state=1234)

## Logistic Regression

#### Using Headline

In [11]:
lr_headline = LogisticRegression(penalty='l2', solver='lbfgs')
lr_headline.fit(X_headline_tfidf_train, y_headline_train)
y_headline_pred = lr_headline.predict(X_headline_tfidf_test)
print ("Logistic Regression:\n")
print ("F1 Score {:.4}%".format( f1_score(y_headline_test, y_headline_pred, average='macro')*100 ) )
print ("Accuracy Score {:.4}%".format(accuracy_score(y_headline_test, y_headline_pred)*100) )
print(f"Recall Score: {recall_score(y_headline_test, y_headline_pred)}")
print(f"Precision Score: {precision_score(y_headline_test, y_headline_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_headline_pred))

Logistic Regression:

F1 Score 61.16%
Accuracy Score 65.54%
Recall Score: 0.37014061207609594
Precision Score: 0.6879323597232898

Confusion Matrix:
[[2773  406]
 [1523  895]]


In [12]:
cros_val_list = cross_val_score(lr_headline, X_headline_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.64607143 0.66035714 0.65916399 0.64724803 0.6336669  0.6350965
 0.63652609 0.65618299 0.65582559 0.64081487]

Cross Validation Mean Score: 0.6470953530689779


### Using content

In [13]:
lr_body = LogisticRegression(penalty='l2', solver='lbfgs')
lr_body.fit(X_body_tfidf_train, y_body_train)
y_body_pred = lr_body.predict(X_body_tfidf_test)
print ("Logistic Regression:\n")
print ("F1 Score {:.4}%".format( f1_score(y_body_test, y_body_pred, average='macro')*100 ) )
print ("Accuracy Score {:.4}%".format(accuracy_score(y_body_test, y_body_pred)*100) )
print(f"Recall Score: {recall_score(y_body_test, y_body_pred)}")
print(f"Precision Score: {precision_score(y_body_test, y_body_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_body_pred))

Logistic Regression:

F1 Score 92.77%
Accuracy Score 92.91%
Recall Score: 0.9181141439205955
Precision Score: 0.9177346010748243

Confusion Matrix:
[[2980  199]
 [ 198 2220]]


In [14]:
cros_val_list = cross_val_score(lr_body, X_body_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.92285714 0.92285714 0.93068953 0.92530379 0.94031451 0.93459614
 0.93280915 0.93209435 0.93173695 0.91887062]

Cross Validation Mean Score: 0.9292129335917296


### Using headline and content

In [15]:
lr_combined = LogisticRegression(penalty='l2', solver='lbfgs')
lr_combined.fit(X_combined_tfidf_train, y_combined_train)
y_combined_pred = lr_combined.predict(X_combined_tfidf_test)
print ("Logistic Regression:\n")
print ("F1 Score {:.4}%".format( f1_score(y_combined_test, y_combined_pred, average='macro')*100 ) )
print ("Accuracy Score {:.4}%".format(accuracy_score(y_combined_test, y_combined_pred)*100) )
print(f"Recall Score: {recall_score(y_combined_test, y_combined_pred)}")
print(f"Precision Score: {precision_score(y_combined_test, y_combined_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_combined_test, y_combined_pred))

Logistic Regression:

F1 Score 89.98%
Accuracy Score 90.17%
Recall Score: 0.8842018196856907
Precision Score: 0.8878737541528239

Confusion Matrix:
[[2909  270]
 [ 280 2138]]


In [16]:
cros_val_list = cross_val_score(lr_combined, X_combined_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.89857143 0.9        0.90139335 0.90564689 0.90850608 0.9042173
 0.90135811 0.9031451  0.9038599  0.89706934]

Cross Validation Mean Score: 0.9023767499566866


## Random Forest

### Using headline

In [17]:
rcf_headline = RandomForestClassifier(n_estimators=50,n_jobs=3)
rcf_headline.fit(X_headline_tfidf_train, y_headline_train)
y_rc_headline_pred = rcf_headline.predict(X_headline_tfidf_test)
print("Random Forest: \n")
print("F1 Score {:.4}%".format( f1_score(y_headline_test, y_rc_headline_pred, average='macro')*100 ))
print("Accuracy Score {:.4}%".format(accuracy_score(y_headline_test, y_rc_headline_pred)*100))
print(f"Recall Score: {recall_score(y_headline_test, y_rc_headline_pred)}")
print(f"Precision Score: {precision_score(y_headline_test, y_rc_headline_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_rc_headline_pred))

Random Forest: 

F1 Score 61.17%
Accuracy Score 66.02%
Recall Score: 0.35525227460711334
Precision Score: 0.7146422628951747

Confusion Matrix:
[[2836  343]
 [1559  859]]


In [18]:
cros_val_list = cross_val_score(rcf_headline, X_headline_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.66607143 0.655      0.66416577 0.654396   0.6343817  0.64045747
 0.65046462 0.66619014 0.66511794 0.64974982]

Cross Validation Mean Score: 0.6545994886122004


### Using content

In [19]:
rcf_body = RandomForestClassifier(n_estimators=10,n_jobs=3)
rcf_body.fit(X_body_tfidf_train, y_body_train)
y_rc_body_pred = rcf_body.predict(X_body_tfidf_test)
print("Random Forest: \n")
print("F1 Score {:.4}%".format( f1_score(y_body_test, y_rc_body_pred, average='macro')*100 ))
print("Accuracy Score {:.4}%".format(accuracy_score(y_body_test, y_rc_body_pred)*100))
print(f"Recall Score: {recall_score(y_body_test, y_rc_body_pred)}")
print(f"Precision Score: {precision_score(y_body_test, y_rc_body_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_rc_body_pred))

Random Forest: 

F1 Score 86.05%
Accuracy Score 86.6%
Recall Score: 0.7725392886683209
Precision Score: 0.9032882011605415

Confusion Matrix:
[[2979  200]
 [ 550 1868]]


In [20]:
cros_val_list = cross_val_score(rcf_body, X_body_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.87357143 0.86071429 0.8742408  0.86025733 0.85561115 0.87312366
 0.87240886 0.8645461  0.87455325 0.8652609 ]

Cross Validation Mean Score: 0.867428777261299


### Using headline and content

In [21]:
rcf_combined = RandomForestClassifier(n_estimators=10,n_jobs=3)
rcf_combined.fit(X_combined_tfidf_train, y_combined_train)
y_rc_combined_pred = rcf_combined.predict(X_combined_tfidf_test)
print("Random Forest: \n")
print("F1 Score {:.4}%".format( f1_score(y_combined_test, y_rc_combined_pred, average='macro')*100 ))
print("Accuracy Score {:.4}%".format(accuracy_score(y_combined_test, y_rc_combined_pred)*100))
print(f"Recall Score: {recall_score(y_combined_test, y_rc_combined_pred)}")
print(f"Precision Score: {precision_score(y_combined_test, y_rc_combined_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_combined_test, y_rc_combined_pred))

Random Forest: 

F1 Score 84.85%
Accuracy Score 85.51%
Recall Score: 0.7485525227460711
Precision Score: 0.8991554893194238

Confusion Matrix:
[[2976  203]
 [ 608 1810]]


In [22]:
cros_val_list = cross_val_score(rcf_combined, X_combined_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.86178571 0.86035714 0.85780636 0.85632595 0.86669049 0.86347391
 0.86812009 0.86347391 0.85739814 0.86132952]

Cross Validation Mean Score: 0.8616761225134489


## XGBoost Classifier

### Using headline

In [23]:
xgb_headline = XGBClassifier()
xgb_headline.fit(X_headline_tfidf_train, y_headline_train)
y_xgb_headline_pred = xgb_headline.predict(X_headline_tfidf_test)
print("XGBoost: \n")
print( "F1 Score {:.4}%".format( f1_score(y_headline_test, y_xgb_headline_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_headline_test, y_xgb_headline_pred)*100))
print(f"Recall Score: {recall_score(y_headline_test, y_xgb_headline_pred)}")
print(f"Precision Score: {precision_score(y_headline_test, y_xgb_headline_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_xgb_headline_pred))

XGBoost: 

F1 Score 59.19%
Accuracy Score 65.86%
Recall Score: 0.2944582299421009
Precision Score: 0.7764449291166848

Confusion Matrix:
[[2974  205]
 [1706  712]]


In [24]:
cros_val_list = cross_val_score(xgb_headline, X_headline_tfidf, y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.655      0.65678571 0.66095034 0.64939242 0.63295211 0.64045747
 0.64045747 0.65832738 0.65582559 0.64724803]

Cross Validation Mean Score: 0.6497396525458192


### Using content

In [25]:
xgb_body = XGBClassifier()
xgb_body.fit(X_body_tfidf_train, y_body_train)
y_xgb_body_pred = xgb_body.predict(X_body_tfidf_test)
print("XGBoost: \n")
print( "F1 Score {:.4}%".format( f1_score(y_body_test, y_xgb_body_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_body_test, y_xgb_body_pred)*100))
print(f"Recall Score: {recall_score(y_body_test, y_xgb_body_pred)}")
print(f"Precision Score: {precision_score(y_body_test, y_xgb_body_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_xgb_body_pred))

XGBoost: 

F1 Score 90.59%
Accuracy Score 90.78%
Recall Score: 0.8875103391232424
Precision Score: 0.897907949790795

Confusion Matrix:
[[2935  244]
 [ 272 2146]]


In [26]:
cros_val_list = cross_val_score(xgb_body, X_body_tfidf, y, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.89773824 0.90201544 0.89644082]

Cross Validation Mean Score: 0.8987314987766005


### Using headline and content

In [27]:
xgb_combined = XGBClassifier()
xgb_combined.fit(X_combined_tfidf_train, y_combined_train)
y_xgb_combined_pred = xgb_combined.predict(X_combined_tfidf_test)
print("XGBoost: \n")
print( "F1 Score {:.4}%".format( f1_score(y_combined_test, y_xgb_combined_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_combined_test, y_xgb_combined_pred)*100))
print(f"Recall Score: {recall_score(y_combined_test, y_xgb_combined_pred)}")
print(f"Precision Score: {precision_score(y_combined_test, y_xgb_combined_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_combined_test, y_xgb_combined_pred))

XGBoost: 

F1 Score 90.01%
Accuracy Score 90.19%
Recall Score: 0.8875103391232424
Precision Score: 0.8856789104416013

Confusion Matrix:
[[2902  277]
 [ 272 2146]]


In [28]:
cros_val_list = cross_val_score(xgb_combined, X_combined_tfidf, y, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.9011684  0.90501715 0.89944254]

Cross Validation Mean Score: 0.9018760302887502


## Naive Bayes

### Using headline

In [29]:
nb_headline = GaussianNB()
nb_headline.fit(X_headline_tfidf_train.toarray(), y_headline_train)
y_nb_headline_pred = nb_headline.predict(X_headline_tfidf_test.toarray())
print("Naive Bayes: \n")
print( "F1 Score {:.4}%".format( f1_score(y_headline_test, y_nb_headline_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_headline_test, y_nb_headline_pred)*100))
print(f"Recall Score: {recall_score(y_headline_test, y_nb_headline_pred)}")
print(f"Precision Score: {precision_score(y_headline_test, y_nb_headline_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_nb_headline_pred))

Naive Bayes: 

F1 Score 61.85%
Accuracy Score 64.5%
Recall Score: 0.4412737799834574
Precision Score: 0.6265413975337639

Confusion Matrix:
[[2543  636]
 [1351 1067]]


In [30]:
cros_val_list = cross_val_score(nb_headline, X_headline_tfidf.toarray(), y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.64285714 0.6475     0.64951768 0.64152966 0.62830593 0.62294496
 0.63045032 0.64832023 0.65010722 0.62473195]

Cross Validation Mean Score: 0.6386265106515154


### Using content

In [31]:
nb_body = GaussianNB()
nb_body.fit(X_body_tfidf_train.toarray(), y_body_train)
y_nb_body_pred = nb_body.predict(X_body_tfidf_test.toarray())
print("Naive Bayes: \n")
print( "F1 Score {:.4}%".format( f1_score(y_body_test, y_nb_body_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_body_test, y_nb_body_pred)*100))
print(f"Recall Score: {recall_score(y_body_test, y_nb_body_pred)}")
print(f"Precision Score: {precision_score(y_body_test, y_nb_body_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_nb_body_pred))

Naive Bayes: 

F1 Score 75.75%
Accuracy Score 75.84%
Recall Score: 0.9499586435070306
Precision Score: 0.6510770975056689

Confusion Matrix:
[[1948 1231]
 [ 121 2297]]


In [32]:
cros_val_list = cross_val_score(nb_body, X_body_tfidf.toarray(), y, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.76714286 0.76678571 0.75884244 0.7687634  0.76626162 0.76590422
 0.76590422 0.77877055 0.76090064 0.7698356 ]

Cross Validation Mean Score: 0.7669111258189212


### Using headline and content

In [33]:
nb_combined = GaussianNB()
nb_combined.fit(X_combined_tfidf_train.toarray(), y_combined_train)
y_nb_combined_pred = nb_combined.predict(X_combined_tfidf_test.toarray())
print("Naive Bayes: \n")
print( "F1 Score {:.4}%".format( f1_score(y_combined_test, y_nb_combined_pred, average='macro')*100 ))
print( "Accuracy Score {:.4}%".format(accuracy_score(y_combined_test, y_nb_combined_pred)*100))
print(f"Recall Score: {recall_score(y_combined_test, y_nb_combined_pred)}")
print(f"Precision Score: {precision_score(y_combined_test, y_nb_combined_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_combined_test, y_nb_combined_pred))

Naive Bayes: 

F1 Score 76.84%
Accuracy Score 76.84%
Recall Score: 0.8672456575682382
Precision Score: 0.6826171875

Confusion Matrix:
[[2204  975]
 [ 321 2097]]


In [34]:
cros_val_list = cross_val_score(nb_combined, X_combined_tfidf.toarray(), y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1234))
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

[0.76777421 0.76813148 0.77077006 0.77698356 0.77090779]

Cross Validation Mean Score: 0.7709134173904008
