In [1]:
# Import statements
import pandas as pd
from nltk import word_tokenize
from numpy import array
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate


In [2]:
# Load the preprocessed dataset from the mounted Google Drive folder
PATH = "/content/drive/My Drive/Data/"

# Read the single preprocessed CSV into a DataFrame; rows are terminated by '\n'
data = pd.read_csv(
    f"{PATH}final_preproccessed_data.csv",
    lineterminator='\n',
)

data

Unnamed: 0,title,text,label
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,1
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,1
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,1
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,1
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,1
...,...,...,...
29484,Travel deals: Get $1200 of air credit for two ...,APT is offering savings on its new Cape York a...,0
29485,Hospital patients 'more likely to die if admit...,Patients admitted to NHS hospitals on weekends...,0
29486,A Taiwanese Recycler's Belief That All Waste I...,"TAIPEI, Taiwan , Sept. 8, 2015 /PRNewswire/ --...",0
29487,Season curtain raiser is ideal way to honour John,Blackburn Sunday League John Haydock Memorial ...,0


In [3]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
data_train, data_test = train_test_split(
    data,
    test_size=0.10,
    random_state=42,
)

In [4]:

# data['text_tokens'] = [word_tokenize(text) for text in data.text]
# # data['title_tokens'] = [word_tokenize(title) for title in data.title]

# data

In [7]:

# fit a tokenizer
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: the texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance (constructed with default settings).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Fit on the training split only, so the vocabulary is not built from the test texts
tokenizer = create_tokenizer(data_train['text'])

In [8]:

# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-token count among the given documents.

    Args:
        lines: iterable of strings, one per document.

    Returns:
        The maximum of ``len(s.split())`` over ``lines``, or 0 when ``lines``
        is empty.
    """
    # A generator avoids building a throwaway list, and default=0 keeps an
    # empty corpus from raising ValueError inside max().
    # NOTE(review): this counts whitespace-separated tokens; the Keras
    # Tokenizer may split text differently (e.g. around punctuation), so the
    # encoded sequences are not guaranteed to have this exact length - confirm.
    return max((len(s.split()) for s in lines), default=0)
 
# Longest training document, measured in whitespace-separated tokens;
# reused below as the padded sequence length and the model's input length
maxlen = max_length(data_train['text'])

print('Max document length: %d' % maxlen)

Max document length: 3194


In [9]:

# calculate vocabulary size: number of words the tokenizer indexed, plus one
# NOTE(review): the +1 presumably accounts for index 0, which is used for
# padding rather than for a word - confirm against the Keras Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1

print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 221431


In [11]:

# encode a list of lines
def encode_text(tokenizer, lines, length):
    """Convert raw texts into a fixed-width array of word indices.

    Args:
        tokenizer: object exposing ``texts_to_sequences`` (here a fitted
            Keras ``Tokenizer``).
        lines: the texts to encode.
        length: value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` on the integer-encoded texts, with
        padding appended after each sequence (``padding='post'``).
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

# Integer-encode the training texts and bring every row to length maxlen
encoded_data = encode_text(tokenizer, data_train['text'], maxlen)
# Sanity check: one row per training document, maxlen columns
print(encoded_data.shape)


(26540, 3194)


In [12]:
def _build_channel(length, vocab_size, kernel_size):
    """Build one embedding -> conv -> dropout -> max-pool -> flatten branch.

    Returns:
        A ``(inputs, flat)`` pair: the branch's ``Input`` tensor and its
        flattened output tensor.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


def define_model(length, vocab_size):
    """Build and compile the three-channel 1D CNN binary classifier.

    Each channel has its own input of shape ``(length,)``, its own
    100-dimensional embedding, and a 32-filter ``Conv1D`` with a different
    kernel size (4, 6 and 8), followed by dropout (0.5), max pooling (2) and
    flattening. The three flattened outputs are concatenated and passed
    through ``Dense(10, relu)`` and a single sigmoid output unit.

    Args:
        length: number of token indices per input sequence.
        vocab_size: input dimension of every embedding layer.

    Returns:
        The compiled ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric), which takes a list of three inputs.

    Side effects:
        Prints the model summary and writes the architecture diagram to
        ``multichannel.png``.
    """
    # Channels are created in the same order as before (kernel 4, 6, 8) so
    # that layers are instantiated in the original sequence.
    inputs1, flat1 = _build_channel(length, vocab_size, kernel_size=4)
    inputs2, flat2 = _build_channel(length, vocab_size, kernel_size=6)
    inputs3, flat3 = _build_channel(length, vocab_size, kernel_size=8)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize: summary() prints the table itself and returns None, so
    # wrapping it in print() only appended a stray "None" line.
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model


In [13]:
# Encode the held-out texts with the tokenizer that was fitted on the training split.
# NOTE(review): this same assignment is repeated at the top of the evaluation cell.
encoded_data_testing = encode_text(tokenizer, data_test['text'], maxlen)


In [14]:
# Build and compile the multichannel CNN
model = define_model(maxlen, vocab_size)

# Train: every channel is fed the same encoded training text.
# More epochs means more training steps; an earlier run used epochs=4, batch_size=16.
model.fit(
    [encoded_data, encoded_data, encoded_data],
    array(data_train['label']),
    epochs=6,
    batch_size=32,
)

# Persist the trained model to disk
model.save('model.h5')


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3194)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 3194)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 3194)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3194, 100)    22143100    input_1[0][0]                    
_______________________________________________________________________________________

In [16]:
# Encode the held-out texts with the tokenizer fitted on the training split
encoded_data_testing = encode_text(tokenizer, data_test['text'], maxlen)

# Score the model on the data it was trained on
loss_train, acc_train = model.evaluate(
    [encoded_data, encoded_data, encoded_data],
    array(data_train['label']),
    verbose=0,
)
print('Train Accuracy: %f' % (acc_train*100))

# Score the model on the held-out test split
loss_test, acc_test = model.evaluate(
    [encoded_data_testing, encoded_data_testing, encoded_data_testing],
    array(data_test['label']),
    verbose=0,
)
print('Test Accuracy: %f' % (acc_test*100))


Train Accuracy: 99.913341
Test Accuracy: 89.386231
