In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn.model_selection import train_test_split
import collections
import math
import os
import random
import tarfile
import re
from six.moves import urllib
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import Dense, Dropout
from keras.layers import MaxPooling1D
from keras.layers import GlobalMaxPooling1D 
from keras.layers import Convolution1D

from sklearn.metrics import average_precision_score
from keras.layers import LSTM
from keras.models import model_from_json
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Multiple Classification 

In [9]:
# Multiple Classifications 
def get_reviews(dirname, number):
    label = number

    reviews = []
    labels = []
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+', encoding='utf-8-sig') as f:
                review = f.read().lower()
                reviews.append(review)
                labels.append(label)
    return reviews, labels           

def extract_labels_data():
    positive_reviews, positive_labels = get_reviews("G:/Chen/PortClass/CP/", number=2)
    neutral_reviews, neutral_labels = get_reviews("G:/Chen/PortClass/CR/", number=1)
    negative_reviews, negative_labels = get_reviews("G:/Chen/PortClass/CR/", number=0)

    data = negative_reviews + neutral_reviews + positive_reviews
    labels = negative_labels + neutral_labels + positive_labels

    return labels, data

labels, data = extract_labels_data()
MAX_SEQUENCE_LENGTH = max([len(x.split(" ")) for x in data])

In [10]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [38]:
encoder = LabelEncoder()
encoder.fit(labels)
encoded_Y = encoder.transform(labels)
encoded_Y
dummy_y = np_utils.to_categorical(encoded_Y)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, dummy_y, test_size=0.13, random_state=42)

In [42]:
l = list()
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, dummy_y, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(3, kernel_initializer='normal', activation='softmax'))
    model.summary()
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])

    #print(model.summary())
    # epochs = 10, batch_size = 1
    model.fit(X_train, y_train, epochs=10, batch_size=20, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 717, 60)           44700     
_________________________________________________________________
flatten_5 (Flatten)          (None, 43020)             0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 43020)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 256)               11013376  
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 771       
Total params: 11,058,847
Trainable params: 11,058,847
Non-trainable params: 0
________________________________________________________________

random state 6: Accuracy: 44.44%
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 717, 60)           44700     
_________________________________________________________________
flatten_12 (Flatten)         (None, 43020)             0         
_________________________________________________________________
dropout_23 (Dropout)         (None, 43020)             0         
_________________________________________________________________
dense_23 (Dense)             (None, 256)               11013376  
_________________________________________________________________
dropout_24 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 3)                 771       
Total params: 11,058,847
Trainable params: 11,058,847
Non-trainable params: 0
_______________________________

## Models Below: Binary Classification


In [55]:
proxy = 'gw-proxy-la03p.corp.tcw.com:80'
os.environ['https_proxy'] = proxy

TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0

    reviews = []
    labels = []
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+', encoding='utf-8-sig') as f:
                review = f.read().lower()
                reviews.append(review)
                labels.append(label)
    return reviews, labels           

def extract_labels_data():
    positive_reviews, positive_labels = get_reviews("G:/Mohit/PortClass/CP/", positive=True)
    negative_reviews, negative_labels = get_reviews("G:/Mohit/PortClass/CR/", positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

labels, data = extract_labels_data()
MAX_SEQUENCE_LENGTH = max([len(x.split(" ")) for x in data])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

#### Version 1: Embedding, Flatten, Dense.
https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 68.33%


10 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 71.11%


15 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 68.89%

20 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 71.11%

50 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 75.56%

100 epochs
Max Accuracy: 100.00%
Min Accuracy: 55.56%
Mean Accuracy: 82.22%

In [56]:
len(padded_docs)

69

#### Convolution RNN Multiclassifier
https://medium.com/@thoszymkowiak/how-to-implement-sentiment-analysis-using-word-embedding-and-convolutional-neural-networks-on-keras-163197aef623
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 76.67%

In [74]:
l = list()
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(Convolution1D(64, 3, padding='same'))
    model.add(Convolution1D(32, 3, padding='same'))
    model.add(Convolution1D(16, 3, padding='same'))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(180,activation='sigmoid'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())
    model.fit(X_train, y_train, epochs=100, batch_size=20, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

# Max Accuracy: 100.00%
# Min Accuracy: 44.44%
# Mean Accuracy: 76.67%

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_95 (Embedding)     (None, 717, 60)           58080     
_________________________________________________________________
conv1d_138 (Conv1D)          (None, 717, 64)           11584     
_________________________________________________________________
conv1d_139 (Conv1D)          (None, 717, 32)           6176      
_________________________________________________________________
conv1d_140 (Conv1D)          (None, 717, 16)           1552      
_________________________________________________________________
flatten_15 (Flatten)         (None, 11472)             0         
_________________________________________________________________
dropout_106 (Dropout)        (None, 11472)             0         
_________________________________________________________________
dense_106 (Dense)            (None, 180)               2065140   
__________

random state 5: Accuracy: 100.00%
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_101 (Embedding)    (None, 717, 60)           58080     
_________________________________________________________________
conv1d_156 (Conv1D)          (None, 717, 64)           11584     
_________________________________________________________________
conv1d_157 (Conv1D)          (None, 717, 32)           6176      
_________________________________________________________________
conv1d_158 (Conv1D)          (None, 717, 16)           1552      
_________________________________________________________________
flatten_21 (Flatten)         (None, 11472)             0         
_________________________________________________________________
dropout_118 (Dropout)        (None, 11472)             0         
_________________________________________________________________
dense_118 (Dense)            (None, 180)  

#### RNN Multiclassifier 
https://www.kaggle.com/vsmolyakov/keras-cnn-with-fasttext-embeddings

In [113]:
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    
    # Added LSTM Below 
    #model.add(LSTM(20, return_sequences=True,dropout=0.2,recurrent_dropout=0.2))
    model.add(Conv1D(60, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(60, 7, activation='relu', padding='same'))
    model.add(LSTM(60, return_sequences=True,dropout=0.2,recurrent_dropout=0.2))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())
    model.fit(X_train, y_train, epochs=100, batch_size=60, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

# Across 10 States
# Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 86.67%

# Across 20 States
# Max Accuracy: 100.00%
# Min Accuracy: 44.44%
# Mean Accuracy: 79.44%

# Across 20 States -> Added LSTM Layer (first)
# 74%

# Across 20 States -> Added LSTM Layer after Conv layers 
# Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 78.89%

# Across 13 States -> Added to 60 for all 
# Mean Accuracy 80%

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_273 (Embedding)    (None, 717, 60)           58080     
_________________________________________________________________
conv1d_409 (Conv1D)          (None, 717, 60)           25260     
_________________________________________________________________
max_pooling1d_196 (MaxPoolin (None, 358, 60)           0         
_________________________________________________________________
conv1d_410 (Conv1D)          (None, 358, 60)           25260     
_________________________________________________________________
lstm_107 (LSTM)              (None, 358, 60)           29040     
_________________________________________________________________
global_max_pooling1d_176 (Gl (None, 60)                0         
_________________________________________________________________
dense_337 (Dense)            (None, 1)                 61        
Total para

KeyboardInterrupt: 

#### Attempt at Flatten + Others


In [None]:
%%time
l = list()
for i in range(1):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    
    # Added LSTM Below 
    model.add(Conv1D(60, 6, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(60, 3, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    #model.add(LSTM(60, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(64, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


    print(model.summary())
    model.fit(X_train, y_train, epochs=100, batch_size=60, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

# 20 - no dropout, kernel size -> 7
#Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 81.11%

# 10 - Dropout 0.5 added, kernel size -> 6, 3
# Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 83.33%

# 10 - Added LSTM -> with LSTM 
# Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 83.33%

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_324 (Embedding)    (None, 717, 60)           58080     
_________________________________________________________________
conv1d_511 (Conv1D)          (None, 717, 60)           21660     
_________________________________________________________________
max_pooling1d_295 (MaxPoolin (None, 358, 60)           0         
_________________________________________________________________
conv1d_512 (Conv1D)          (None, 358, 60)           10860     
_________________________________________________________________
max_pooling1d_296 (MaxPoolin (None, 179, 60)           0         
_________________________________________________________________
dropout_299 (Dropout)        (None, 179, 60)           0         
_________________________________________________________________
flatten_113 (Flatten)        (None, 10740)             0         
__________

#### Flatten Dense Basic Model


In [81]:
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(Flatten())
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])

    #print(model.summary())
    # epochs = 10, batch_size = 1
    model.fit(X_train, y_train, epochs=10, batch_size=20, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

# *adam optimizer, flatten - Dense 256, relu -> Dropout 0.5 -> Dense 128, relu -> Dropout -> Dense 1, sigmoid
# Mean Accuracy: 82.22% @10 -> 76.69% -> 77.78 

# 10 Runs...Added dropout 0.2 to 2 layers
# Max Accuracy: 100.00%
# Min Accuracy: 33.33%
# Mean Accuracy: 74.44%

# 20 Runs 
# Max Accuracy: 100.00%
# Min Accuracy: 33.33%
# Mean Accuracy: 71.11%

random state 0: Accuracy: 77.78%
random state 1: Accuracy: 77.78%
random state 2: Accuracy: 55.56%
random state 3: Accuracy: 100.00%
random state 4: Accuracy: 66.67%
random state 5: Accuracy: 66.67%
random state 6: Accuracy: 77.78%
random state 7: Accuracy: 66.67%
random state 8: Accuracy: 77.78%
random state 9: Accuracy: 88.89%
random state 10: Accuracy: 88.89%
random state 11: Accuracy: 88.89%
random state 12: Accuracy: 55.56%
random state 13: Accuracy: 33.33%
random state 14: Accuracy: 88.89%
random state 15: Accuracy: 44.44%
random state 16: Accuracy: 77.78%
random state 17: Accuracy: 55.56%
random state 18: Accuracy: 77.78%
random state 19: Accuracy: 55.56%
Max Accuracy: 100.00%
Min Accuracy: 33.33%
Mean Accuracy: 71.11%


In [23]:
l = list()
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(64, activation='relu'))
   # model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=24, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))
# LSTM 128, Dense 1 -> Mean Accuracy: 44.44%
# LSTM 10, Dense 1 -> Mean Accuracy: 44.44%
# LSTM 10 -> Dense 128 -> Dense 1 -> Mean Accuracy: 44.44%
# LSTM 10 -> Dense 128 -> Dropout -> Dense 1 -> Mean Accuracy: 44.44%

random state 0: Accuracy: 22.22%
random state 1: Accuracy: 44.44%
random state 2: Accuracy: 55.56%
random state 3: Accuracy: 55.56%
random state 4: Accuracy: 44.44%
Max Accuracy: 55.56%
Min Accuracy: 22.22%
Mean Accuracy: 44.44%


In [59]:
model_trainingpath = 0
model_savepath = "G:\\Chen\\keras\\model"
results_path = "G:\\Chen\\keras\\results\\0"
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=int(float(model_trainingpath)))

# serialize model to JSON
model_json = model.to_json()

with open(model_savepath + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(model_savepath + ".h5")
print("Saved model to disk")

Saved model to disk


In [45]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=int(float(model_trainingpath)))


json_file = open(model_savepath+'.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
print("Loaded model.")

# load weights into new model
loaded_model.load_weights(model_savepath+".h5")
print("Loaded weights.")

# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#         score = loaded_model.evaluate(X_test, y_test, verbose=0)
#         print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

y_ = [float(round(i[0])) for i in loaded_model.predict(X_test)]
np.savetxt(results_path + "_pred.csv", y_, delimiter = ',')
np.savetxt(results_path + "_test.csv", y_test, delimiter = ',')
print("Saved predictions at: ", results_path + "_pred.csv")
print("Saved actual results at: ", results_path + "_test.csv")

Loaded model.
Loaded weights.
Saved predictions at:  G:\Chen\keras\results\0_pred.csv
Saved actual results at:  G:\Chen\keras\results\0_test.csv


In [44]:
[float(round(i[0])) for i in loaded_model.predict(X_test)]

[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

In [49]:
y_pred = loaded_model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
for i in range(5):
    # fit the model
    loaded_model.fit(X_train, y_train, epochs=100*i, verbose=0)
    loss, accuracy = loaded_model.evaluate(X_test, y_test, verbose=0)
    print((i + 1) * 100, accuracy)
    #print('Accuracy: %f' % (accuracy*100))

Average precision-recall score: 1.00
100 0.8888888955116272
200 1.0
300 1.0
400 1.0
500 1.0


#### Version 2: Multilayer Perceptron (MLP) for binary classification
source: https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 66.67%
Min Accuracy: 33.33%
Mean Accuracy: 50.00%

In [85]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Dense(64, input_dim=max_length, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))# compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    #print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 66.67%
random state 1: Accuracy: 33.33%
random state 2: Accuracy: 33.33%
random state 3: Accuracy: 55.56%
random state 4: Accuracy: 55.56%
random state 5: Accuracy: 33.33%
random state 6: Accuracy: 44.44%
random state 7: Accuracy: 55.56%
random state 8: Accuracy: 55.56%
random state 9: Accuracy: 66.67%
random state 10: Accuracy: 44.44%
random state 11: Accuracy: 44.44%
random state 12: Accuracy: 55.56%
random state 13: Accuracy: 44.44%
random state 14: Accuracy: 66.67%
random state 15: Accuracy: 33.33%
random state 16: Accuracy: 44.44%
random state 17: Accuracy: 44.44%
random state 18: Accuracy: 55.56%
random state 19: Accuracy: 66.67%
Max Accuracy: 66.67%
Min Accuracy: 33.33%
Mean Accuracy: 50.00%


In [19]:
y_pred = model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
for i in range(5):
    # fit the model
    model.fit(X_train, y_train, epochs=100*i, verbose=0)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    a.append(i)
    b.append(accuracy)
    print((i + 1) * 100, accuracy)
    #print('Accuracy: %f' % (accuracy*100))
model = None 

Average precision-recall score: 0.27
100 0.3333333432674408
200 0.4444444477558136
300 0.6666666865348816
400 0.6666666865348816
500 0.5555555820465088


#### Version 3: Basic Binary Classification
Source: https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 77.78%
Min Accuracy: 0.00%
Mean Accuracy: 48.33%

In [86]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=max_length))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    #print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 66.67%
random state 1: Accuracy: 44.44%
random state 2: Accuracy: 33.33%
random state 3: Accuracy: 44.44%
random state 4: Accuracy: 55.56%
random state 5: Accuracy: 55.56%
random state 6: Accuracy: 55.56%
random state 7: Accuracy: 33.33%
random state 8: Accuracy: 77.78%
random state 9: Accuracy: 44.44%
random state 10: Accuracy: 77.78%
random state 11: Accuracy: 33.33%
random state 12: Accuracy: 0.00%
random state 13: Accuracy: 33.33%
random state 14: Accuracy: 55.56%
random state 15: Accuracy: 33.33%
random state 16: Accuracy: 44.44%
random state 17: Accuracy: 55.56%
random state 18: Accuracy: 55.56%
random state 19: Accuracy: 66.67%
Max Accuracy: 77.78%
Min Accuracy: 0.00%
Mean Accuracy: 48.33%


#### LSTM Classification
Source: https://keras.io/getting-started/sequential-model-guide/ <br>
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [88]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%


In [23]:
y_pred = model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
# for i in range(5):
#     # fit the model
#     model.fit(X_train, y_train, epochs=100*i, verbose=0)
#     loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
#     a.append(i)
#     b.append(accuracy)
#     print((i + 1)  * 100, accuracy)
#     #print('Accuracy: %f' % (accuracy*100))
model = None 
# 512 - 0.75
# 1024 - 0.75 
# 2056 - 

Average precision-recall score: 0.22


#### LSTM with Dropout
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [95]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%


#### LSTM & Convolutional Neural Network for Sequence Classification
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [100]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%
