# word2vec with Simple Models 

In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score, confusion_matrix
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
%matplotlib inline
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load the headline and body vector tables from their CSV files.
headline_vec, body_vec = (
    pd.read_csv("Headline Vector.csv"),
    pd.read_csv("Body Vector.csv"),
)

In [3]:
# Discard every row that has at least one missing value; both frames are
# modified in place.
headline_vec.dropna(axis=0, how="any", inplace=True)
body_vec.dropna(axis=0, how="any", inplace=True)

In [4]:
# Feature matrices: every column except the "Fake" column.
X_headline_vec = headline_vec.drop("Fake", axis="columns")
X_body_vec = body_vec.drop("Fake", axis="columns")

In [5]:
# Targets: the "Fake" column of each table.
y_headline_vec = headline_vec.loc[:, "Fake"]
y_body_vec = body_vec.loc[:, "Fake"]

In [6]:
# Hold out 20% of the headline rows as a test set; the fixed seed keeps the
# split the same on every run.
(
    X_headline_vec_train,
    X_headline_vec_test,
    y_headline_train,
    y_headline_test,
) = train_test_split(
    X_headline_vec,
    y_headline_vec,
    test_size=0.2,
    random_state=1234,
)

In [7]:
# Hold out 20% of the body rows as a test set; the fixed seed keeps the
# split the same on every run.
(
    X_body_vec_train,
    X_body_vec_test,
    y_body_train,
    y_body_test,
) = train_test_split(
    X_body_vec,
    y_body_vec,
    test_size=0.2,
    random_state=1234,
)

## Using headline

In [11]:
# Logistic regression on the headline vectors: fit on the training split,
# report hold-out metrics, then cross-validate on the full headline data.
lr_headline = LogisticRegression(penalty='l2', solver='lbfgs').fit(
    X_headline_vec_train, y_headline_train
)
y_headline_pred = lr_headline.predict(X_headline_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
lr_headline_f1 = f1_score(y_headline_test, y_headline_pred, average='macro')
lr_headline_accuracy = accuracy_score(y_headline_test, y_headline_pred)
lr_headline_recall = recall_score(y_headline_test, y_headline_pred)
lr_headline_precision = precision_score(y_headline_test, y_headline_pred)

print("Logistic Regression:\n")
print("F1 Score {:.4}%".format(lr_headline_f1 * 100))
print("Accuracy Score {:.4}%".format(lr_headline_accuracy * 100))
print(f"Recall Score: {lr_headline_recall}")
print(f"Precision Score: {lr_headline_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_headline_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
lr_headline_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(
    lr_headline, X_headline_vec, y_headline_vec, cv=lr_headline_folds
)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Logistic Regression:

F1 Score 67.35%
Accuracy Score 68.09%
Recall Score: 0.5888572590483937
Precision Score: 0.6651355075792375

Confusion Matrix:
[[2264  729]
 [1011 1448]]
[0.68683535 0.67290062 0.68390172 0.68451944 0.67241379 0.68561996
 0.69981651 0.68513761 0.67045872 0.67449541]

Cross Validation Mean Score: 0.6816099145473139


In [12]:
# XGBoost (default settings) on the headline vectors: fit on the training
# split, report hold-out metrics, then cross-validate on the full headline data.
xgb_headline = XGBClassifier()
xgb_headline.fit(X_headline_vec_train, y_headline_train)
y_xgb_headline_pred = xgb_headline.predict(X_headline_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
xgb_headline_f1 = f1_score(y_headline_test, y_xgb_headline_pred, average='macro')
xgb_headline_accuracy = accuracy_score(y_headline_test, y_xgb_headline_pred)
xgb_headline_recall = recall_score(y_headline_test, y_xgb_headline_pred)
xgb_headline_precision = precision_score(y_headline_test, y_xgb_headline_pred)

print("XGBoost: \n")
print("F1 Score {:.4}%".format(xgb_headline_f1 * 100))
print("Accuracy Score {:.4}%".format(xgb_headline_accuracy * 100))
print(f"Recall Score: {xgb_headline_recall}")
print(f"Precision Score: {xgb_headline_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_xgb_headline_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
xgb_headline_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(
    xgb_headline, X_headline_vec, y_headline_vec, cv=xgb_headline_folds
)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

XGBoost: 

F1 Score 66.47%
Accuracy Score 67.75%
Recall Score: 0.5339568930459536
Precision Score: 0.6820779220779221

Confusion Matrix:
[[2381  612]
 [1146 1313]]
[0.6919692  0.67546755 0.6813348  0.67608217 0.68818782 0.68525312
 0.70238532 0.68917431 0.6759633  0.68036697]

Cross Validation Mean Score: 0.6846184562862964


In [13]:
# Gaussian naive Bayes on the headline vectors: fit on the training split,
# report hold-out metrics, then cross-validate on the full headline data.
nb_headline = GaussianNB().fit(X_headline_vec_train, y_headline_train)
y_nb_headline_pred = nb_headline.predict(X_headline_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
nb_headline_f1 = f1_score(y_headline_test, y_nb_headline_pred, average='macro')
nb_headline_accuracy = accuracy_score(y_headline_test, y_nb_headline_pred)
nb_headline_recall = recall_score(y_headline_test, y_nb_headline_pred)
nb_headline_precision = precision_score(y_headline_test, y_nb_headline_pred)

print("Naive Bayes: \n")
print("F1 Score {:.4}%".format(nb_headline_f1 * 100))
print("Accuracy Score {:.4}%".format(nb_headline_accuracy * 100))
print(f"Recall Score: {nb_headline_recall}")
print(f"Precision Score: {nb_headline_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_nb_headline_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
nb_headline_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(
    nb_headline, X_headline_vec, y_headline_vec, cv=nb_headline_folds
)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Naive Bayes: 

F1 Score 57.0%
Accuracy Score 58.47%
Recall Score: 0.44326962179747864
Precision Score: 0.5491183879093199

Confusion Matrix:
[[2098  895]
 [1369 1090]]
[0.61716172 0.58452512 0.60359369 0.59831255 0.61482025 0.58950844
 0.61174312 0.57981651 0.59449541 0.59669725]

Cross Validation Mean Score: 0.5990674054205989


In [14]:
# Random forest (50 trees, 3 parallel jobs) on the headline vectors: fit on
# the training split, report hold-out metrics, then cross-validate on the
# full headline data.
# random_state is fixed (same seed as the train/test split and the CV folds)
# so the forest, and therefore every score printed below, is reproducible
# across kernel restarts. Without it each run grows different trees.
rcf_headline = RandomForestClassifier(n_estimators=50, n_jobs=3, random_state=1234)
rcf_headline.fit(X_headline_vec_train, y_headline_train)
y_rc_headline_pred = rcf_headline.predict(X_headline_vec_test)

print("Random Forest: \n")
# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
print("F1 Score {:.4}%".format( f1_score(y_headline_test, y_rc_headline_pred, average='macro')*100 ))
print("Accuracy Score {:.4}%".format(accuracy_score(y_headline_test, y_rc_headline_pred)*100))
print(f"Recall Score: {recall_score(y_headline_test, y_rc_headline_pred)}")
print(f"Precision Score: {precision_score(y_headline_test, y_rc_headline_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_headline_test, y_rc_headline_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed; the seeded
# estimator is cloned per fold, so these scores are reproducible as well.
cros_val_list = cross_val_score(
    rcf_headline,
    X_headline_vec,
    y_headline_vec,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234),
)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Random Forest: 

F1 Score 66.12%
Accuracy Score 68.12%
Recall Score: 0.4855632370882473
Precision Score: 0.7162567486502699

Confusion Matrix:
[[2520  473]
 [1265 1194]]
[0.7037037  0.6809681  0.70187019 0.68818782 0.67865004 0.69809244
 0.70788991 0.69981651 0.68073394 0.6866055 ]

Cross Validation Mean Score: 0.6926518159898764


## Using body

In [15]:
# Logistic regression on the body vectors: fit on the training split,
# report hold-out metrics, then cross-validate on the full body data.
lr_body = LogisticRegression(penalty='l2', solver='lbfgs').fit(
    X_body_vec_train, y_body_train
)
y_body_pred = lr_body.predict(X_body_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
lr_body_f1 = f1_score(y_body_test, y_body_pred, average='macro')
lr_body_accuracy = accuracy_score(y_body_test, y_body_pred)
lr_body_recall = recall_score(y_body_test, y_body_pred)
lr_body_precision = precision_score(y_body_test, y_body_pred)

print("Logistic Regression:\n")
print("F1 Score {:.4}%".format(lr_body_f1 * 100))
print("Accuracy Score {:.4}%".format(lr_body_accuracy * 100))
print(f"Recall Score: {lr_body_recall}")
print(f"Precision Score: {lr_body_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_body_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
lr_body_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(lr_body, X_body_vec, y_body_vec, cv=lr_body_folds)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Logistic Regression:

F1 Score 79.81%
Accuracy Score 80.32%
Recall Score: 0.7145180967873119
Precision Score: 0.8256578947368421

Confusion Matrix:
[[2622  371]
 [ 702 1757]]
[0.81188119 0.80711404 0.8023469  0.81768158 0.80374175 0.7945708
 0.80293578 0.79669725 0.79449541 0.81724771]

Cross Validation Mean Score: 0.8048712411596689


In [16]:
# XGBoost (default settings) on the body vectors: fit on the training split,
# report hold-out metrics, then cross-validate on the full body data.
xgb_body = XGBClassifier()
xgb_body.fit(X_body_vec_train, y_body_train)
y_xgb_body_pred = xgb_body.predict(X_body_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
xgb_body_f1 = f1_score(y_body_test, y_xgb_body_pred, average='macro')
xgb_body_accuracy = accuracy_score(y_body_test, y_xgb_body_pred)
xgb_body_recall = recall_score(y_body_test, y_xgb_body_pred)
xgb_body_precision = precision_score(y_body_test, y_xgb_body_pred)

print("XGBoost: \n")
print("F1 Score {:.4}%".format(xgb_body_f1 * 100))
print("Accuracy Score {:.4}%".format(xgb_body_accuracy * 100))
print(f"Recall Score: {xgb_body_recall}")
print(f"Precision Score: {xgb_body_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_xgb_body_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
xgb_body_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(xgb_body, X_body_vec, y_body_vec, cv=xgb_body_folds)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

XGBoost: 

F1 Score 77.72%
Accuracy Score 78.5%
Recall Score: 0.6628710858072387
Precision Score: 0.8261530663963508

Confusion Matrix:
[[2650  343]
 [ 829 1630]]
[0.78731206 0.78657866 0.7909791  0.78980191 0.7839325  0.77769626
 0.78495413 0.77798165 0.78605505 0.79633028]

Cross Validation Mean Score: 0.7861621588877697


In [17]:
# Gaussian naive Bayes on the body vectors: fit on the training split,
# report hold-out metrics, then cross-validate on the full body data.
nb_body = GaussianNB().fit(X_body_vec_train, y_body_train)
y_nb_body_pred = nb_body.predict(X_body_vec_test)

# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
nb_body_f1 = f1_score(y_body_test, y_nb_body_pred, average='macro')
nb_body_accuracy = accuracy_score(y_body_test, y_nb_body_pred)
nb_body_recall = recall_score(y_body_test, y_nb_body_pred)
nb_body_precision = precision_score(y_body_test, y_nb_body_pred)

print("Naive Bayes: \n")
print("F1 Score {:.4}%".format(nb_body_f1 * 100))
print("Accuracy Score {:.4}%".format(nb_body_accuracy * 100))
print(f"Recall Score: {nb_body_recall}")
print(f"Precision Score: {nb_body_precision}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_nb_body_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed.
nb_body_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
cros_val_list = cross_val_score(nb_body, X_body_vec, y_body_vec, cv=nb_body_folds)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Naive Bayes: 

F1 Score 60.86%
Accuracy Score 65.54%
Recall Score: 0.34322895485969906
Precision Score: 0.7617328519855595

Confusion Matrix:
[[2729  264]
 [1615  844]]
[0.65639897 0.65346535 0.65236524 0.65663977 0.65884079 0.6511372
 0.65431193 0.63743119 0.64844037 0.64880734]

Cross Validation Mean Score: 0.6517838136929347


In [18]:
# Random forest (50 trees, 3 parallel jobs) on the body vectors: fit on the
# training split, report hold-out metrics, then cross-validate on the full
# body data.
# random_state is fixed (same seed as the train/test split and the CV folds)
# so the forest, and therefore every score printed below, is reproducible
# across kernel restarts. Without it each run grows different trees.
rcf_body = RandomForestClassifier(n_estimators=50, n_jobs=3, random_state=1234)
rcf_body.fit(X_body_vec_train, y_body_train)
y_rc_body_pred = rcf_body.predict(X_body_vec_test)

print("Random Forest: \n")
# F1 is macro-averaged; recall and precision use scikit-learn's default averaging.
print("F1 Score {:.4}%".format( f1_score(y_body_test, y_rc_body_pred, average='macro')*100 ))
print("Accuracy Score {:.4}%".format(accuracy_score(y_body_test, y_rc_body_pred)*100))
print(f"Recall Score: {recall_score(y_body_test, y_rc_body_pred)}")
print(f"Precision Score: {precision_score(y_body_test, y_rc_body_pred)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_body_test, y_rc_body_pred))

# 10-fold stratified, shuffled cross-validation with a fixed seed; the seeded
# estimator is cloned per fold, so these scores are reproducible as well.
cros_val_list = cross_val_score(
    rcf_body,
    X_body_vec,
    y_body_vec,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234),
)
print(cros_val_list)
print(f"\nCross Validation Mean Score: {cros_val_list.mean()}")

Random Forest: 

F1 Score 79.44%
Accuracy Score 80.01%
Recall Score: 0.7023180154534363
Precision Score: 0.8282973621103118

Confusion Matrix:
[[2635  358]
 [ 732 1727]]
[0.80894756 0.81004767 0.80821415 0.81034483 0.80741012 0.80484226
 0.80697248 0.80917431 0.80623853 0.82788991]

Cross Validation Mean Score: 0.8100081828995507
