In [2]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
import nltk
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
my_stem = PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
dictionary = set(w.lower() for w in nltk.corpus.words.words())
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

In [3]:
politifact = pd.read_csv("raw-data/FakeNewsNet-master/Data/Cleaned_DSPP/politifact.csv")

In [5]:
### remove articles with no text
politifact = politifact[~politifact.text.isna()]

In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(politifact, test_size=0.4, random_state=9)

In [7]:
train.head()

Unnamed: 0.1,Unnamed: 0,source,date,title,text,fake,date_time
91,PolitiFact_Fake_73-Webpage.json,,,,It seems as though there may be more than the ...,1,
202,PolitiFact_Real_65-Webpage.json,http://cnn.it,,Debate breaks record as most-watched in U.S. h...,Monday night was the most-watched debate in Am...,0,
148,PolitiFact_Real_16-Webpage.json,http://politi.co,2016-09-26,Clinton on Trump's birther claim: A 'racist lie',"""He has a long record of engaging in racist be...",0,2016-09-26 18:27:04
206,PolitiFact_Real_69-Webpage.json,http://politi.co,2016-09-27,Obama nominates ambassador to Cuba in long-sho...,Jeffrey DeLaurentis will serve as America's am...,0,2016-09-27 13:14:10
107,PolitiFact_Fake_88-Webpage.json,http://uspoln.com,2017-05-23,Rubio: “Rape Victims Should Be In Custody If T...,At a campaign town hall event at St. Anselm Co...,1,2017-05-22 20:00:00


In [8]:
the_df = pd.DataFrame()
## clean/prepare text
for rev, outcome in zip(train.text.tolist(), train.fake.tolist()):
    
    # only keep words (remove other characters)
    tmp_read = re.sub('[^a-zA-Z]+', ' ', rev).lower()

    #Tokenization and remove stop words
    tmp_read = [word for word in tmp_read.split() if word not in stop_words]

    #dictionary words
    dict_read = [word for word in tmp_read if word in dictionary]
    
    # stemming
    tmp_read_stm = [my_stem.stem(word) for word in tmp_read]
    dict_read_stm = [my_stem.stem(word) for word in dict_read]

    # lemminization
    tmp_read_lem = [lemmatizer.lemmatize(word) for word in tmp_read]
    dict_read_lem = [lemmatizer.lemmatize(word) for word in dict_read]

    
    # rejoin reviews
    tmp_read = ' '.join(tmp_read)
    tmp_read_stm = ' '.join(tmp_read_stm)
    tmp_read_lem = ' '.join(tmp_read_lem)
    
    dict_read = ' '.join(dict_read)
    dict_read_stm = ' '.join(dict_read_stm)
    dict_read_lem = ' '.join(dict_read_lem)


    # add to new df
    tmp = pd.DataFrame([rev], columns=['original review'])
    tmp['body'] = tmp_read
    tmp['body_stem'] = tmp_read_stm
    tmp['body_lem'] = tmp_read_lem
    tmp['body_dict'] = dict_read
    tmp['body_dict_stem'] = dict_read_stm
    tmp['fake'] = outcome

    the_df = the_df.append(tmp, ignore_index=True)

In [9]:
the_df.head()

Unnamed: 0,original review,body,body_stem,body_lem,body_dict,body_dict_stem,fake
0,It seems as though there may be more than the ...,seems though may normal forces play former gov...,seem though may normal forc play former govern...,seems though may normal force play former gove...,though may normal play former governor alaska ...,though may normal play former governor alaska ...,1
1,Monday night was the most-watched debate in Am...,monday night watched debate american history w...,monday night watch debat american histori well...,monday night watched debate american history w...,monday night watched debate american history w...,monday night watch debat american histori well...,0
2,"""He has a long record of engaging in racist be...",long record engaging racist behavior hillary c...,long record engag racist behavior hillari clin...,long record engaging racist behavior hillary c...,long record engaging racist behavior hillary c...,long record engag racist behavior hillari clin...,0
3,Jeffrey DeLaurentis will serve as America's am...,jeffrey delaurentis serve america ambassador c...,jeffrey delaurenti serv america ambassador com...,jeffrey delaurentis serve america ambassador c...,jeffrey serve america ambassador communist led...,jeffrey serv america ambassador communist led ...,0
4,At a campaign town hall event at St. Anselm Co...,campaign town hall event st anselm college new...,campaign town hall event st anselm colleg new ...,campaign town hall event st anselm college new...,campaign town hall event st anselm college new...,campaign town hall event st anselm colleg new ...,1


In [10]:
label = np.array(the_df['fake'])
# choose column to use has main text for models
body_text = np.array(the_df['body_lem'])

In [11]:
# unigram vectorizer
# vectorize
my_vec_tfidf_out = TfidfVectorizer()
my_xform_tfidf_out = my_vec_tfidf_out.fit_transform(body_text)
my_pd = pd.DataFrame(my_xform_tfidf_out.toarray())
my_pd.columns = my_vec_tfidf_out.get_feature_names()
X = my_pd.values

# vectorize test data
test_vec=my_vec_tfidf_out.transform(test.text.tolist())
my_pd = pd.DataFrame(test_vec.toarray())
my_pd.columns = my_vec_tfidf_out.get_feature_names()
X_test = my_pd.values

In [71]:
# # unigram/bigram vectorizer
# my_vec_tfidf_out = TfidfVectorizer(ngram_range = (1,2)) 
# my_xform_tfidf_out = my_vec_tfidf_out.fit_transform(body_text)
# my_pd = pd.DataFrame(my_xform_tfidf_out.toarray())
# my_pd.columns = my_vec_tfidf_out.get_feature_names()
# X = my_pd.values

# test_vec=my_vec_tfidf_out.transform(test.text.tolist())
# my_pd = pd.DataFrame(test_vec.toarray())
# my_pd.columns = my_vec_tfidf_out.get_feature_names()
# X_test = my_pd.values

In [12]:
# are the classes balanced?
balance_check = pd.concat([the_df.fake.value_counts(), 
                the_df.fake.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
print(balance_check)

   counts  percentage
0      77   54.225352
1      65   45.774648


### Modeling 

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, precision_recall_curve, plot_precision_recall_curve
import matplotlib.pyplot as plt
import timeit
f1_scorer = make_scorer(f1_score)

In [14]:
# scale data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Logistic Regression
start = timeit.default_timer()
logreg = LogisticRegression(penalty='none',random_state=9).fit(X_scaled, label)
stop = timeit.default_timer()

# # cross-validation for model evaluation
cv_mean = np.mean(cross_val_score(LogisticRegression(penalty='none'), X_scaled, label, cv=5, scoring=f1_scorer))
print("best mean cross-validation score: {:.3f}".format(cv_mean))

y_predictions = logreg.predict(X_test_scaled)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))



# print('model time: ', stop - start) 

best mean cross-validation score: 0.657
test-set score: 0.745


In [32]:
# L1 Penalized Logistic Regression
param_gridlasso = {'C': [0.001 , 0.01 , 0.1 , 1 , 10]} 

# cv = number of folds
gridlasso = GridSearchCV(LogisticRegression(penalty='l1',solver='liblinear',random_state=9), 
                    param_grid=param_gridlasso, cv=5, scoring=f1_scorer)

#use meta model methods to fit score and predict model:
start = timeit.default_timer()
gridlasso.fit(X_scaled, label)
stop = timeit.default_timer()

print("best mean cross-validation score: {:.3f}".format(gridlasso.best_score_))
y_predictions = gridlasso.predict(X_test_scaled)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))

print('model time: ', stop - start) 

best mean cross-validation score: 0.775
test-set score: 0.792
model time:  0.827356151999993


**Gradient Boosting**

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {'n_estimators': [400 , 500,600],
             "max_depth":[3,4,5],
             "learning_rate":[.001,.01,.1]} 

# cv = number of folds
gridgb = GridSearchCV(GradientBoostingClassifier(random_state=9), 
                    param_grid=param_grid, cv=5, scoring=f1_scorer)

#use meta model methods to fit score and predict model:
start = timeit.default_timer()
gridgb.fit(X, label)
stop = timeit.default_timer()

print("best parameters: {}".format(gridgb.best_params_))

print("best mean cross-validation score: {:.3f}".format(gridgb.best_score_))
y_predictions = gridgb.predict(X_test)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))

best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400}
best mean cross-validation score: 0.758
test-set score: 0.855


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=500, random_state=9)
model.fit(X, label)
y_predictions = model.predict(X_test)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))
print("test-set score: {:.3f}".format(accuracy_score(np.array(test.fake), y_predictions)))

**Random Forest**

In [17]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [100 , 300,500,800],
             "max_depth":[1,5,10,15]} 

# cv = number of folds
gridrf = GridSearchCV(RandomForestClassifier(random_state=9), 
                    param_grid=param_grid, cv=5, scoring=f1_scorer)

#use meta model methods to fit score and predict model:
start = timeit.default_timer()
gridrf.fit(X, label)
stop = timeit.default_timer()

print("best parameters: {}".format(gridrf.best_params_))
print("best mean cross-validation score: {:.3f}".format(gridrf.best_score_))

y_predictions = gridrf.predict(X_test)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))

best parameters: {'max_depth': 15, 'n_estimators': 300}
best mean cross-validation score: 0.750
test-set score: 0.852


In [34]:
model = RandomForestClassifier(n_estimators=500, max_depth=15,random_state=9)
model.fit(X, label)
y_predictions = model.predict(X_test)
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))
print("test-set score: {:.3f}".format(accuracy_score(np.array(test.fake), y_predictions)))

test-set score: 0.852
test-set score: 0.833


**SVC**

In [19]:
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma':['sclae','auto'],
             "C":[1,5,8,10,15]} 

# cv = number of folds
gridsvc = GridSearchCV(SVC(random_state=9), 
                    param_grid=param_grid, cv=5, scoring=f1_scorer)

#use meta model methods to fit score and predict model:
start = timeit.default_timer()
gridsvc.fit(X, label)
stop = timeit.default_timer()

print("best parameters: {}".format(gridsvc.best_params_))
print("best mean cross-validation score: {:.3f}".format(gridsvc.best_score_))

y_predictions = gridsvc.predict(X_test)
print("test-set f1-score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))
print("test-set acc score: {:.4f}".format(accuracy_score(np.array(test.fake), y_predictions)))

best parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}
best mean cross-validation score: 0.761
test-set f1-score: 0.893
test-set acc score: 0.8854


In [20]:
# body_lem -- unigram and bigram
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=5, gamma='auto', random_state=9).fit(X, label)
# clf.score(X_test, np.array(test.fake))
y_predictions = clf.predict(X_test)
print("test-set score: {:.4f}".format(f1_score(np.array(test.fake), y_predictions)))
print("test-set score: {:.4f}".format(accuracy_score(np.array(test.fake), y_predictions)))

test-set score: 0.9020
test-set score: 0.8958


**XGBoost**

In [18]:
from xgboost import XGBClassifier
param = {'max_depth':[1,2, 3], 'eta':[1,2], 'n_estimators':[100]}

# cv = number of folds
gridxgb = GridSearchCV(XGBClassifier(random_state=9), 
                    param_grid=param, cv=5, scoring=f1_scorer)
#use meta model methods to fit score and predict model:
start = timeit.default_timer()
gridxgb.fit(X, label)
stop = timeit.default_timer()

y_predictions = gridxgb.predict(X_test)
print("best parameters: {}".format(gridxgb.best_params_))
print("best mean cross-validation score: {:.3f}".format(gridxgb.best_score_))
print("test-set score: {:.3f}".format(f1_score(np.array(test.fake), y_predictions)))



























































































































best parameters: {'eta': 1, 'max_depth': 3, 'n_estimators': 100}
best mean cross-validation score: 0.735
test-set score: 0.750


In [74]:
xgboost = XGBClassifier(random_state=9, max_depth=2,eta=1,n_estimators=100)

start = timeit.default_timer()
xgboost.fit(X, label)
stop = timeit.default_timer()

# clf.score(X_test, np.array(test.fake))
y_predictions = xgboost.predict(X_test)
print("test-set score: {:.4f}".format(f1_score(np.array(test.fake), y_predictions)))
print("test-set score: {:.4f}".format(accuracy_score(np.array(test.fake), y_predictions)))

test-set score: 0.8052
test-set score: 0.7917
