## Import Dependencies

In [1]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

import numpy as np
import xgboost as xgb
from tqdm import tqdm

#Keras/TF
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

#SKLearn
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

#NLTK Functions
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

Using TensorFlow backend.


## Define LogLoss Function

In [2]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

## Read in Data

In [3]:
data = pd.read_csv('../merged_df.csv')

In [4]:
#Import stopwords
stopWords = set(stopwords.words('english'))
data['title_tokenized'] = [word_tokenize(i) for i in data['Headline']]

filtered = []
for words in data['title_tokenized']:
    temp = []
    for w in words:
        if w not in stopWords:
            temp.append(w)
    filtered.append(temp)

data['title_no_stops'] = filtered

In [5]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Headline,Negative,Positive,Neutral,Compound Score,Read/Fake,Character Count,Word Count,Upper Characters,Lower Case Characters,SpecialChar Count,title_tokenized,title_no_stops
0,0,0,#2816: Clinton Pride’s 8(a) Pig Farm Bridge – ...,0.0,0.0,1.0,0.0,fake,97,16,13,56,8,"[#, 2816, :, Clinton, Pride, ’, s, 8, (, a, ),...","[#, 2816, :, Clinton, Pride, ’, 8, (, ), Pig, ..."
1,1,1,#2817: Serco's Zulu Starnet Blackmail – Clinto...,0.0,0.0,1.0,0.0,fake,88,15,11,51,7,"[#, 2817, :, Serco, 's, Zulu, Starnet, Blackma...","[#, 2817, :, Serco, 's, Zulu, Starnet, Blackma..."
2,2,2,Roger Stone update on Stop the Steal exit poll...,0.237,0.05,0.713,-0.9313,fake,456,72,14,358,13,"[Roger, Stone, update, on, Stop, the, Steal, e...","[Roger, Stone, update, Stop, Steal, exit, poll..."
3,3,3,#2818: Serco's Zulu Bridge To Mumbai Pig Farm ...,0.0,0.0,1.0,0.0,fake,91,17,12,47,8,"[#, 2818, :, Serco, 's, Zulu, Bridge, To, Mumb...","[#, 2818, :, Serco, 's, Zulu, Bridge, To, Mumb..."
4,4,4,Trump Advocates the American People's Control ...,0.0,0.0,1.0,0.0,fake,66,9,9,46,3,"[Trump, Advocates, the, American, People, 's, ...","[Trump, Advocates, American, People, 's, Contr..."


## Encode y's and train test split

In [7]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data['Read/Fake'].values)
X = data['Headline'].values
X_feat = data[['Negative','Positive','Neutral','Character Count','Word Count','Upper Characters','Lower Case Characters','SpecialChar Count']]

In [8]:
xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [9]:
xfeattrain, xfeatvalid, ytrain, yvalid = train_test_split(X_feat, y, stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

## Use OOTB Vectorizer Functions

In [10]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

## Logistic Function Classifier

In [11]:
# Fitting a simple Logistic Regression on TF-IDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
predictions_y = clf.predict(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xvalid_tfv,yvalid)}')

logloss: 0.355 
[[1331  330]
 [ 216 2102]]
Score: 0.8627795928625283




In [12]:
# Fitting a simple Logistic Regression on Features
clf = LogisticRegression(C=1.0)
clf.fit(xfeattrain, ytrain)
predictions = clf.predict_proba(xfeatvalid)
predictions_y = clf.predict(xfeatvalid)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xfeatvalid,yvalid)}')

logloss: 0.563 
[[ 856  805]
 [ 210 2108]]
Score: 0.744910781603418


## Naive Bayes

In [13]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
predictions_y = clf.predict(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xvalid_tfv,yvalid)}')
# print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.329 
[[1324  337]
 [ 195 2123]]
Score: 0.8662980648404122


In [14]:
# Fitting a simple Naive Bayes on Features
clf = MultinomialNB()
clf.fit(xfeattrain, ytrain)
predictions = clf.predict_proba(xfeatvalid)
predictions_y = clf.predict(xfeatvalid)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xfeatvalid,yvalid)}')

logloss: 0.708 
[[ 907  754]
 [ 443 1875]]
Score: 0.6991706458909274


## XG BOOOOOOOST

In [15]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())
predictions_y = clf.predict(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xvalid_tfv,yvalid)}')

logloss: 0.459 
[[ 975  686]
 [ 161 2157]]
Score: 0.7871324453380246


In [16]:
# Fitting a simple xgboost on Features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xfeattrain, ytrain)
predictions = clf.predict_proba(xfeatvalid)
predictions_y = clf.predict(xfeatvalid)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xfeatvalid,yvalid)}')

logloss: 0.418 
[[1122  539]
 [ 211 2107]]
Score: 0.8115104297562201


## Word Cloud Stuff

In [72]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words("english"))

In [56]:
fakes = data[data['Read/Fake']=='fake'].Headline
reals = data[data['Read/Fake']=='real'].Headline

In [93]:
def uniqueify(listofstrings):
    tokenizer = RegexpTokenizer(r'\w+')
    unique = []
    for item in listofstrings:
        temp = tokenizer.tokenize(item)
        for s in temp:
            if (s.lower() not in stop_words and s.lower() not in unique):
                unique.append(s.lower())
    return unique

In [94]:
def flattenify(listofstrings):
    tokenizer = RegexpTokenizer(r'\w+')
    flat = []
    for item in listofstrings:
        temp = tokenizer.tokenize(item)
        for s in temp:
            if s.lower() not in stop_words:
                flat.append(s.lower())
    return flat

In [95]:
flat_fakes = flattenify(fakes)
flat_reals = flattenify(reals)
unique_fakes = uniqueify(fakes)
unique_reals = uniqueify(reals)

In [96]:
fakecounts = []
realcounts = []

for i in unique_fakes:
    fcount = 0
    for j in flat_fakes:
        if i == j:
            fcount = fcount + 1
    fakecounts.append(fcount)
    
for i in unique_reals:
    rcount = 0 
    for j in flat_reals:
        if i == j:
            rcount = rcount + 1
    realcounts.append(rcount)

In [97]:
fakesdf = pd.DataFrame({'unique_fakes':unique_fakes, 'fake_counts':fakecounts})
realsdf = pd.DataFrame({'unique_reals':unique_reals, 'real_counts':realcounts})

In [98]:
fakesdf = fakesdf.sort_values(by = 'fake_counts', ascending = False)
realsdf = realsdf.sort_values(by = 'real_counts', ascending = False)

In [99]:
fakesdf.head(10)

Unnamed: 0,unique_fakes,fake_counts
71,trump,1797
48,hillary,1020
1,clinton,902
350,new,890
52,election,512
51,us,498
313,video,497
450,man,477
154,news,471
263,russia,412


In [100]:
realsdf.head(10)

Unnamed: 0,unique_reals,real_counts
96,u,2948
672,trump,2107
183,says,1776
9,new,1278
2994,korea,1032
107,paid,1021
108,notice,1011
618,north,983
113,deaths,920
627,china,632


In [102]:
fakesdf.to_csv('./fakesdf_wordcloud.csv')
realsdf.to_csv('./realsdf_wordcloud.csv')