In [23]:
# Import statements
import pandas as pd
from nltk import word_tokenize
from numpy import array
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate


In [18]:
# Directory prefix under which the project's data files are stored
PATH = "/content/drive/My Drive/Data/"

# Read the preprocessed CSV into a single DataFrame; '\n' is passed
# explicitly as the line terminator for the parser
data = pd.read_csv(
    PATH + "final_preproccessed_data.csv",
    lineterminator='\n',
)

data

Unnamed: 0,title,text,label
0,muslims busted stole millions gov ’ benefits,print pay back money plus interest entire fami...,1
1,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1
2,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1
3,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1
4,fantastic trumps 7 point plan reform healthcar...,email healthcare reform make america great sin...,1
...,...,...,...
29483,travel deals get 1200 air credit two apt cookt...,apt offering savings new cape york outback wil...,0
29484,atlantis,week ago today september 4th robby went atlant...,0
29485,taiwanese recyclers belief waste simply mispla...,taipei taiwan sept 8 2015 prnewswire recent ye...,0
29486,season curtain raiser ideal way honour john,blackburn sunday league john haydock memorial ...,0


In [25]:
# Hold out 10% of the rows as the test split; random_state fixes the split
data_train, data_test = train_test_split(
    data,
    test_size=0.10,
    random_state=42,
)

In [17]:

# data['text_tokens'] = [word_tokenize(text) for text in data.text]
# # data['title_tokens'] = [word_tokenize(title) for title in data.title]

# data

Unnamed: 0,title,text,label,text_tokens
0,muslims busted stole millions gov ’ benefits,print pay back money plus interest entire fami...,1,"[print, pay, back, money, plus, interest, enti..."
1,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1,"[attorney, general, loretta, lynch, plead, fif..."
2,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1,"[red, state, fox, news, sunday, reported, morn..."
3,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1,"[email, kayla, mueller, prisoner, tortured, is..."
4,fantastic trumps 7 point plan reform healthcar...,email healthcare reform make america great sin...,1,"[email, healthcare, reform, make, america, gre..."
...,...,...,...,...
29483,travel deals get 1200 air credit two apt cookt...,apt offering savings new cape york outback wil...,0,"[apt, offering, savings, new, cape, york, outb..."
29484,atlantis,week ago today september 4th robby went atlant...,0,"[week, ago, today, september, 4th, robby, went..."
29485,taiwanese recyclers belief waste simply mispla...,taipei taiwan sept 8 2015 prnewswire recent ye...,0,"[taipei, taiwan, sept, 8, 2015, prnewswire, re..."
29486,season curtain raiser ideal way honour john,blackburn sunday league john haydock memorial ...,0,"[blackburn, sunday, league, john, haydock, mem..."


In [26]:

# Build a tokenizer whose word index comes from the given texts
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` after fitting it on ``lines``.

    Args:
        lines: collection of text documents passed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Fit on the training split's text column only
tokenizer = create_tokenizer(data_train['text'])

In [32]:

# calculate the maximum document length
def compute_max_length(lines):
    """Return the largest whitespace-delimited token count among ``lines``.

    Args:
        lines: iterable of strings.

    Returns:
        int: number of tokens (as produced by ``str.split()``) in the
        longest string.

    Raises:
        ValueError: if ``lines`` is empty (propagated from ``max``).
    """
    return max(len(s.split()) for s in lines)

# The helper has its own name so that assigning the integer `max_length`
# (read by the later cells) no longer overwrites the function. Previously
# `max_length = max_length(...)` made this cell fail with
# "TypeError: 'int' object is not callable" when it was run a second time.
max_length = compute_max_length(data_train['text'])

print('Max document length: %d' % max_length)

Max document length: 2202


In [31]:

# Vocabulary size = number of words in the tokenizer's index, plus one
# (presumably because index 0 is kept for padding -- confirm against the
# Keras Tokenizer documentation)
vocab_size = 1 + len(tokenizer.word_index)

print('Vocabulary size: %d' % (vocab_size,))

Vocabulary size: 213373


In [47]:

# Convert raw texts into equal-length integer sequences
def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` and pad each sequence to ``length``.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        lines: collection of text documents to encode.
        length: value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

# Encode the training texts and show the resulting array shape
encoded_data = encode_text(tokenizer, data_train['text'], max_length)
print(encoded_data.shape)


In [38]:
def _build_channel(length, vocab_size, kernel_size):
    """Build one embedding -> conv -> dropout -> max-pool -> flatten branch.

    Args:
        length: number of token ids per input sequence.
        vocab_size: input dimension of the embedding layer.
        kernel_size: width of the 1D convolution window for this branch.

    Returns:
        tuple: (input tensor, flattened output tensor) of the branch.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


def define_model(length, vocab_size, kernel_sizes=(4, 6, 8)):
    """Build and compile the multi-channel CNN binary classifier.

    One branch is created per entry of ``kernel_sizes``; the branches differ
    only in their convolution kernel size. Their flattened outputs are
    concatenated and fed through a 10-unit ReLU layer and a single sigmoid
    output unit.

    Side effects: prints the model summary and writes the architecture
    diagram to 'multichannel.png'.

    Args:
        length: number of token ids per input sequence.
        vocab_size: input dimension of each embedding layer.
        kernel_sizes: convolution kernel size for each branch. The default
            reproduces the original three-channel model. Provide at least
            two sizes, since the branches are merged with ``concatenate``.

    Returns:
        The compiled model, which takes one input per kernel size.
    """
    # Branches are built one after another, in kernel_sizes order
    channels = [_build_channel(length, vocab_size, size) for size in kernel_sizes]
    inputs = [channel_input for channel_input, _ in channels]
    flats = [channel_flat for _, channel_flat in channels]
    # merge
    merged = concatenate(flats)
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize: summary() prints the table itself and returns None, so
    # wrapping it in print() only added a stray "None" line
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model


In [None]:
# Build the network for the computed sequence length and vocabulary size
model = define_model(max_length, vocab_size)
# Train it; each of the three input channels receives the same encoded text
model.fit(
    [encoded_data] * 3,
    array(data_train['label']),
    epochs=50,
    batch_size=16,
)
# Write the trained model to disk
model.save('model.h5')

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 2202)]       0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 2202)]       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 2202)]       0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 2202, 100)    21337300    input_10[0][0]                   
_______________________________________________________________________________________

In [None]:
# Encode the held-out texts with the same tokenizer and sequence length
# that were used for the training data
encoded_text_testing = encode_text(tokenizer, data_test['text'], max_length)

# evaluate model on training dataset
loss_train, acc_train = model.evaluate([encoded_data, encoded_data, encoded_data], array(data_train['label']), verbose=0)
# Print the accuracy returned just above; the original referenced an
# undefined name `acc`, which raises NameError on a fresh kernel
print('Train Accuracy: %f' % (acc_train*100))

# evaluate model on test dataset
loss_test, acc_test = model.evaluate([encoded_text_testing, encoded_text_testing, encoded_text_testing],array(data_test['label']), verbose=0)
print('Test Accuracy: %f' % (acc_test*100))