In [1]:
from string import punctuation
from os import listdir
from nltk.corpus import stopwords
from pickle import dump
import nltk
nltk.download('stopwords')


# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	tokens = ' '.join(tokens)
	return tokens

# load all docs in a directory
def process_docs(directory, is_trian):
	documents = list()
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip any reviews in the test set
		if not is_trian:
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# load the doc
		doc = load_doc(path)
		# clean doc
		tokens = clean_doc(doc)
		# add to list
		documents.append(tokens)
	return documents

# save a dataset to file
def save_dataset(dataset, filename):
	dump(dataset, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load all training reviews
negative_docs = process_docs('drive/My Drive/The_Research/test_data/neg', True)
positive_docs = process_docs('drive/My Drive/The_Research/test_data/pos', True)
trainX = negative_docs + positive_docs
trainy = [0 for _ in range(5)] + [1 for _ in range(5)]
save_dataset([trainX,trainy], 'trainbeta.pkl')

# load all test reviews
negative_docs = process_docs('drive/My Drive/The_Research/test_data/neg', False)
positive_docs = process_docs('drive/My Drive/The_Research/test_data/pos', False)
testX = negative_docs + positive_docs
print(len(testX))
testY = [0 for _ in range(5)] + [1 for _ in range(5)]
save_dataset([testX,testY], 'testbeta.pkl')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Saved: trainbeta.pkl
0
Saved: testbeta.pkl


In [21]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

# load a clean dataset
def load_dataset(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# calculate the maximum document length
def max_length(lines):
	return max([len(s.split()) for s in lines])

# encode a list of lines
def encode_text(tokenizer, lines, length):
	# integer encode
	encoded = tokenizer.texts_to_sequences(lines)
	# pad encoded sequences
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

# define the model
def define_model(length, vocab_size):
	# channel 1
  inputs1 = Input(shape=(length,))
  embedding1 = Embedding(vocab_size, 100)(inputs1)
  drop1 = Dropout(0.5)(embedding1)

  embedding2 = Embedding(vocab_size, 100)(drop1)
  conv1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding1)
  pool1 = MaxPooling1D(pool_size=2)(conv1)
  flat1 = Flatten()(pool1)
  dense1 = Dense(10, activation='relu')(flat1)
  outputs = Dense(1, activation='sigmoid')(dense1)




  #drop2 = Dropout(0.5)(dense1)
  #pool1 = MaxPooling1D(pool_size=2)(drop1)
  #flat1 = Flatten()(pool1)

  model = Model(inputs=inputs1, outputs=outputs)
  print("outputs: " + str(outputs))
  # compile
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summarize
  print(model.summary())
  plot_model(model, show_shapes=True, to_file='multichannelbeta.png')
  return model

# load training dataset
trainLines, trainLabels = load_dataset('trainbeta.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

# define model
model = define_model(length, vocab_size)
# fit model
model.fit(trainX, array(trainLabels), epochs=2, batch_size=16)
output = model.predict(trainX)
print(output)
# save the model
model.save('modelbeta.h5')

Max document length: 763
Vocabulary size: 1942
(10, 763)
outputs: Tensor("dense_21/Sigmoid:0", shape=(?, 1), dtype=float32)
Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 763)               0         
_________________________________________________________________
embedding_34 (Embedding)     (None, 763, 100)          194200    
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 761, 32)           9632      
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 380, 32)           0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 12160)             0         
_________________________________________________________________
dense_20 (Dense)             (None, 10)                121610    
_