In [126]:
## load dataset

import pandas as pd
import numpy as np

## keras dependencies
from keras import backend as K

from keras.layers import Conv1D, Dense, Input, Lambda, LSTM
from keras.layers.merge import concatenate
from keras.layers.embeddings import Embedding

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
import _pickle as cPickle

from keras.layers import Concatenate, Input, MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences # To make vectors the same size. 
# from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPool1D, MaxPool1D
from keras.optimizers import SGD
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, CSVLogger, EarlyStopping
import matplotlib.pyplot as plt


# Read the two GossipCop CSV files (fake and real news) from the working directory.
legit_df = pd.read_csv('gossipcop_real.csv')
fake_df = pd.read_csv('gossipcop_fake.csv')

In [127]:
## remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

# English stop words as a set so the membership test below is cheap.
stop = set(stopwords.words('english'))

# Lower-case every title, split it on whitespace, drop the stop words and
# write the surviving tokens back as one space-joined string per row.
fake_df['title'] = [
    ' '.join(word for word in headline.lower().split() if word not in stop)
    for headline in fake_df['title']
]
legit_df['title'] = [
    ' '.join(word for word in headline.lower().split() if word not in stop)
    for headline in legit_df['title']
]

## Plain Python lists of the filtered titles, one per class.
fakes = [headline.strip().lower() for headline in fake_df['title']]
legits = [headline.strip().lower() for headline in legit_df['title']]

## preprocessing utilities
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits, ``( ) , ! ? ' ` `` becomes a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd, 'll are split off with a leading space;
      3. ``, ! ( ) ?`` are each surrounded by spaces;
      4. runs of whitespace are collapsed, and the result is stripped and lower-cased.

    Parameters
    ----------
    string : str
        Raw sentence to normalise.

    Returns
    -------
    str
        The cleaned, lower-cased, space-separated sentence.
    """
    # Imported locally because no cell in this notebook imports `re`; without
    # this the function raises NameError on a fresh kernel.
    import re

    text = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Split contraction suffixes off their host word (order kept from the original).
    for pattern, replacement in (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
    ):
        text = re.sub(pattern, replacement, text)

    # Pad punctuation with spaces. The original replacement strings were
    # " \( ", " \) " and " \? ", which inserted a literal backslash into the
    # output; the plain character is what is intended here.
    text = re.sub(r"([,!()?])", r" \1 ", text)

    text = re.sub(r"\s{2,}", " ", text)
    return text.strip().lower()

## clean text
# Fake titles come first, then legitimate ones; the labels below use the same order.
x_text = [clean_str(sent) for sent in fakes + legits]

## Generate labels
# Class index 0 marks a fake title and 1 a legitimate one; to_categorical
# then converts the index vector into a binary class matrix.
labels = to_categorical([0] * len(fakes) + [1] * len(legits))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jshayi2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.layers import Embedding
from keras import optimizers
from keras.utils import plot_model, to_categorical
from sklearn.model_selection import train_test_split

In [129]:
## tokenize for word based embedding
# This cell runs before the parameter cell, and MAX_SEQ_LENGTH was not defined
# anywhere in the notebook, so both limits are set here to keep the cell
# runnable on a fresh kernel.
MAX_NUM_WORDS = 15000   # same value the parameter cell assigns
# NOTE(review): chosen to equal num_steps (500), the Input length the CNN cell
# expects — confirm this is the intended padding length.
MAX_SEQ_LENGTH = 500

# Fit the vocabulary on the cleaned titles and turn each title into a list of word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x_text)
sequences = tokenizer.texts_to_sequences(x_text)
vocab_dict = tokenizer.word_index

# Pad at the end ('post') so every row has exactly MAX_SEQ_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')

In [148]:
## parameter
# Vocabulary cap handed to the Tokenizer.
MAX_NUM_WORDS = 15000

# Vocabulary-derived sizes: one extra embedding row because word ids start at 1.
vocab_length = len(vocab_dict.keys())
max_features = len(vocab_dict) + 1
embedding_dims = 300

# Sequence / recurrent settings.
num_steps = 500
lstm_size = 100
# NOTE(review): the original comment here read "Has to be same as EMBEDDING_DIM",
# yet embedding_dims is 300 — confirm which value is intended.
hidden_size = 150

# Training settings.
num_epochs = 30
batch_size = 64
val_size = 0.2

# Hyperparams for CNN
kernel_sizes = [3, 4, 5]
filter_size = 128

In [149]:
# Fixed seed so the train/validation split is identical on every run. The
# original drew the seed with np.random.randint(234) and never recorded it,
# which made the split (and every metric computed on it) impossible to reproduce.
random_state = 42
X_train, X_val, y_train, y_val = train_test_split(
    data, labels, test_size=val_size, random_state=random_state
)

In [146]:
## load the pre-trained GloVe vectors
def loadGloveModel(gloveFile, embedding_dim=300):
    """
    Read a GloVe text file into a dict mapping word -> float32 vector.

    Each line is split on whitespace. The last `embedding_dim` fields are
    parsed as the vector; every field before them is concatenated (with no
    separator) to form the word, so a token that itself contains whitespace
    does not shift the numeric columns.

    Parameters
    ----------
    gloveFile : str
        Path to a UTF-8 encoded GloVe text file.
    embedding_dim : int, optional
        Number of trailing numeric fields per line. Defaults to 300, the
        value that was previously hard-coded.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy float32 array of length `embedding_dim`.

    Raises
    ------
    ValueError
        If `embedding_dim` is smaller than 1.
    """
    if embedding_dim < 1:
        raise ValueError("embedding_dim must be a positive integer")

    print("Loading Glove Model")
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(gloveFile, encoding='utf8') as glove_handle:
        for line in glove_handle:
            values = line.split()
            # Blank or truncated lines cannot hold a full vector; skip them
            # instead of storing a wrongly sized array.
            if len(values) < embedding_dim:
                continue
            word = ''.join(values[:-embedding_dim])
            coefs = np.asarray(values[-embedding_dim:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Parse the 300-dimensional GloVe 6B file from the working directory into a word -> vector dict.
glove_model = loadGloveModel(gloveFile='glove.6B.300d.txt')

def build_glove_embedding_layers():
    """
    Build the initial weight matrix for the Embedding layer.

    Reads the module-level `tokenizer`, `glove_model`, `max_features` and
    `embedding_dims`. Row i holds the GloVe vector of the word whose tokenizer
    index is i; rows for words missing from `glove_model` stay all-zero.

    Returns
    -------
    numpy.ndarray
        Array of shape (max_features, embedding_dims).
    """
    weights = np.zeros((max_features, embedding_dims))
    for token, row in tokenizer.word_index.items():
        # Ignore indices that would fall outside the matrix.
        if row < max_features:
            vector = glove_model.get(token)
            if vector is not None:
                weights[row] = vector
    return weights


embedding_weights = build_glove_embedding_layers()

Loading Glove Model


### CNN model for new dataset

In [151]:
import keras

## Shared input: one row of `num_steps` integer word ids per title.
statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
# Embedding initialised from the pre-built GloVe matrix and frozen (trainable=False).
embed_sequences = Embedding(vocab_length+1, embedding_dims,
                            weights=[embedding_weights],
                            input_length=num_steps,
                            trainable=False)(statement_input) #Preloaded glove embeddings

## Branch 1: parallel convolutions, one per kernel size, each pooled and flattened.
kernel_arr = []
for kernel in kernel_sizes:
    x_1 = Conv1D(filters=filter_size, kernel_size=kernel,
                 padding="same", activation="relu", strides=1)(embed_sequences)
    x_1 = MaxPool1D(3)(x_1)
    x_flat = Flatten()(x_1)
    # The original also built `x_drop = Dropout(0.85)(x_flat)` here but appended
    # x_flat, so that layer never reached the model. The dead layer is removed;
    # dropout is still applied once, after the concatenation below.
    kernel_arr.append(x_flat)

conv_ins = keras.layers.concatenate(kernel_arr)
conv_ins = Dropout(0.85)(conv_ins)
conv_ins = Dense(128, activation='relu')(conv_ins)

## Branch 2: two stacked convolution + pooling stages on the same embeddings.
## add multi convolutional layer
l_conv1 = Conv1D(128,3,activation="relu", padding="valid", strides=1)(embed_sequences)
l_pool1 = MaxPooling1D(3)(l_conv1)
l_conv2 = Conv1D(128, 3, activation="relu")(l_pool1)
l_pool2 = MaxPooling1D(3)(l_conv2)
# l_conv3 = Conv1D(128,3,activation="relu")(l_pool2)
# l_pool3 = MaxPool1D(3)(l_conv3)
l_flat = Flatten()(l_pool2)
conv_in1 = Dropout(0.85)(l_flat)
conv_in1 = Dense(128, activation='relu')(conv_in1)

## do merge
# Join the 128-unit summaries of both branches into one feature vector.
conv_merged= keras.layers.concatenate([conv_ins, conv_in1])

# #Meta input
# meta_input = Input(shape=(x_train_metadata.shape[1],), name='aux_input')
# x_drop = Dropout(0.9)(meta_input)
# x_meta = Dense(100, activation='relu')(x_drop)
# x = keras.layers.concatenate([conv_merged, x_meta])


# Two-way softmax matching the two-column one-hot labels.
main_output = Dense(2, activation='softmax', name='main_output')(conv_merged)
model_hybird = Model(inputs=[statement_input], outputs=[main_output])

## optimizers
# Only `sgd` is passed to compile() below; `adam` is constructed but not used in this cell.
sgd = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=True)
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.2)

## compile and print the layer table
model_hybird.compile(
    loss='categorical_crossentropy',
    optimizer=sgd,
    metrics=['categorical_accuracy'],
)

model_hybird.summary()

## callbacks
# Best weights (by validation categorical accuracy) are written to this file.
filepath = "weights.best.hdf5"
tb = TensorBoard()
csv_logger = CSVLogger('training.log')
# Stop once validation loss has not improved for 2 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1)
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

## train
# NOTE(review): epochs and batch size are literals here and differ from
# num_epochs (30) / batch_size (64) in the parameter cell — confirm which is intended.
history22 = model_hybird.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=128,
    callbacks=[tb, csv_logger, checkpoint, es],
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 500, 300)     5742600     main_input[0][0]                 
__________________________________________________________________________________________________
conv1d_56 (Conv1D)              (None, 498, 128)     115328      embedding_13[0][0]               
__________________________________________________________________________________________________
conv1d_53 (Conv1D)              (None, 500, 128)     115328      embedding_13[0][0]               
__________________________________________________________________________________________________
conv1d_54 

KeyboardInterrupt: 

In [71]:
# Disabled experiment kept as a bare string literal: nothing inside it executes,
# and the notebook simply echoes the string as this cell's output.
"""
# this part is used to create metadata for given dataset
urls = fake_df.news_url.dropna()
make_me = []
for url in urls:
    lst = url.split("/")
    lst[-2] = lst[-2][:-12]
    make_me.append([x for x in lst if not x.isdigit() and not x == ""])
"""

'\n# this part is used to create metadata for given dataset\nurls = fake_df.news_url.dropna()\nmake_me = []\nfor url in urls:\n    lst = url.split("/")\n    lst[-2] = lst[-2][:-12]\n    make_me.append([x for x in lst if not x.isdigit() and not x == ""])\n'