In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn.model_selection import train_test_split
import collections
import math
import os
import random
import tarfile
import re
from six.moves import urllib
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import Dense, Dropout
from keras.layers import MaxPooling1D
from sklearn.metrics import average_precision_score
from keras.layers import LSTM
from keras.models import model_from_json

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
proxy = 'gw-proxy-la03p.corp.tcw.com:80'
os.environ['https_proxy'] = proxy

DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0

    reviews = []
    labels = []
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+', encoding='utf-8-sig') as f:
                review = f.read().lower()
                reviews.append(review)
                labels.append(label)
    return reviews, labels           

def extract_labels_data():
    positive_reviews, positive_labels = get_reviews("G:/Mohit/PortClass/CP/", positive=True)
    negative_reviews, negative_labels = get_reviews("G:/Mohit/PortClass/CR/", positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
labels, data = extract_labels_data()
MAX_SEQUENCE_LENGTH = max([len(x.split(" ")) for x in data])

In [3]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [131]:
t.word_index

{'in': 1,
 'no': 2,
 'of': 3,
 'than': 4,
 'not': 5,
 'more': 6,
 'do': 7,
 'assets': 8,
 'invest': 9,
 'total': 10,
 'securities': 11,
 'any': 12,
 '5': 13,
 'by': 14,
 's': 15,
 'rated': 16,
 'us': 17,
 'highest': 18,
 'm': 19,
 'and': 20,
 'p': 21,
 'f': 22,
 'using': 23,
 'below': 24,
 'excl': 25,
 'issuer': 26,
 'bbb': 27,
 'nrsro': 28,
 'govt': 29,
 'agcy': 30,
 'or': 31,
 'one': 32,
 'barclays': 33,
 'short': 34,
 'duration': 35,
 '2': 36,
 'non': 37,
 'max': 38,
 '10': 39,
 'the': 40,
 'only': 41,
 'index': 42,
 'portfolio': 43,
 'lehaggr': 44,
 'cash': 45,
 'less': 46,
 '20': 47,
 'funds': 48,
 'to': 49,
 'usd': 50,
 'a': 51,
 'must': 52,
 'for': 53,
 'futures': 54,
 'bank': 55,
 'agg': 56,
 'options': 57,
 'reg': 58,
 'swaps': 59,
 'bond': 60,
 'unseasoned': 61,
 'issued': 62,
 'net': 63,
 'sales': 64,
 'be': 65,
 'equivalents': 66,
 'against': 67,
 'per': 68,
 'loans': 69,
 'investments': 70,
 'agency': 71,
 'rating': 72,
 'incl': 73,
 'term': 74,
 'mgr': 75,
 'with': 76,
 '

In [66]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=42)

#### Version 1: Embedding, Flatten, Dense.
https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 68.33%


10 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 71.11%


15 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 68.89%

20 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 71.11%

50 epochs
Max Accuracy: 100.00%
Min Accuracy: 44.44%
Mean Accuracy: 75.56%

100 epochs
Max Accuracy: 100.00%
Min Accuracy: 55.56%
Mean Accuracy: 82.22%

In [172]:
l = list()
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])

    #print(model.summary())
    # epochs = 10, batch_size = 1
    model.fit(X_train, y_train, epochs=10, batch_size=20, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

#lstm 128
# Max Accuracy: 100.00%
# Min Accuracy: 55.56%
# Mean Accuracy: 75.56%

#lstm 64
#random state 0: Accuracy: 22.22%

# flatten - Dense 256, relu -> Dense 1 -> sigmoid 
# Mean Accuracy: 63.33% @10

# rmsprop optimizer, flatten - Dense 256, relu -> Dropout 0.5 -> Dense 1, sigmoid
# Mean Accuracy: 74.44% @10

# *adam optimizer, flatten - Dense 256, relu -> Dropout 0.5 -> Dense 1, sigmoid
# Mean Accuracy: 81.11% @10

random state 0: Accuracy: 88.89%
random state 1: Accuracy: 88.89%
random state 2: Accuracy: 55.56%
random state 3: Accuracy: 100.00%
random state 4: Accuracy: 66.67%
random state 5: Accuracy: 77.78%
random state 6: Accuracy: 88.89%
random state 7: Accuracy: 77.78%
random state 8: Accuracy: 88.89%
random state 9: Accuracy: 77.78%
Max Accuracy: 100.00%
Min Accuracy: 55.56%
Mean Accuracy: 81.11%


In [None]:
# https://github.com/abdulfatir/twitter-sentiment-analysis/blob/master/lstm.py
l = list()
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(Dropout(0.5))
    model.add(LSTM(128))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])

    #print(model.summary())
    # epochs = 10, batch_size = 1
    model.fit(X_train, y_train, epochs=10, batch_size=20, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 22.22%


In [152]:
l = list()
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = 60, 
                        input_length = max_length))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=24, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 22.22%
random state 1: Accuracy: 44.44%
random state 2: Accuracy: 55.56%
random state 3: Accuracy: 55.56%
random state 4: Accuracy: 44.44%
Max Accuracy: 55.56%
Min Accuracy: 22.22%
Mean Accuracy: 44.44%


In [59]:
model_trainingpath = 0
model_savepath = "G:\\Chen\\keras\\model"
results_path = "G:\\Chen\\keras\\results\\0"
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=int(float(model_trainingpath)))

# serialize model to JSON
model_json = model.to_json()

with open(model_savepath + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(model_savepath + ".h5")
print("Saved model to disk")

Saved model to disk


"do not invest in cash equivalents/short-term investments rated below a1/p1 by m/s&p/f using highest (any nrsro, manager rating allowed). no more than 10% of total assets in preferred stock, including adjustable rate & convertible. no more than 5% of total assets in interest only (io's). do not invest in debt rated below b- by m/s&p/f (any nrsro) using highest. max bnk lns 5% (incl. mwflx) -  no more than 5% in bank loans (inclusive of metropolitan west float-i mwflx). no more than 20% of total assets in 144a's without registration rights. no more than 5% of total assets in principal only (po's) securities. do not invest in futures, options and swaps - except for hedging. no more than 15% of total assets in debt rated below bbb- by m/s&p/f/mgr (any nrsro) using highest. notify client with a description of the ongoing investment strategy. no more than 5% of total assets in inverse floaters. no more than 20% of total assets in municipal bonds. no margin purchases. no more than 15% of tot

In [45]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(data)
#print(encoded_docs)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=int(float(model_trainingpath)))


json_file = open(model_savepath+'.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
print("Loaded model.")

# load weights into new model
loaded_model.load_weights(model_savepath+".h5")
print("Loaded weights.")

# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#         score = loaded_model.evaluate(X_test, y_test, verbose=0)
#         print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

y_ = [float(round(i[0])) for i in loaded_model.predict(X_test)]
np.savetxt(results_path + "_pred.csv", y_, delimiter = ',')
np.savetxt(results_path + "_test.csv", y_test, delimiter = ',')
print("Saved predictions at: ", results_path + "_pred.csv")
print("Saved actual results at: ", results_path + "_test.csv")

Loaded model.
Loaded weights.
Saved predictions at:  G:\Chen\keras\results\0_pred.csv
Saved actual results at:  G:\Chen\keras\results\0_test.csv


In [44]:
[float(round(i[0])) for i in loaded_model.predict(X_test)]

[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

In [49]:
y_pred = loaded_model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
for i in range(5):
    # fit the model
    loaded_model.fit(X_train, y_train, epochs=100*i, verbose=0)
    loss, accuracy = loaded_model.evaluate(X_test, y_test, verbose=0)
    print((i + 1) * 100, accuracy)
    #print('Accuracy: %f' % (accuracy*100))

Average precision-recall score: 1.00
100 0.8888888955116272
200 1.0
300 1.0
400 1.0
500 1.0


#### Version 2: Multilayer Perceptron (MLP) for binary classification
source: https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 66.67%
Min Accuracy: 33.33%
Mean Accuracy: 50.00%

In [85]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Dense(64, input_dim=max_length, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))# compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    #print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 66.67%
random state 1: Accuracy: 33.33%
random state 2: Accuracy: 33.33%
random state 3: Accuracy: 55.56%
random state 4: Accuracy: 55.56%
random state 5: Accuracy: 33.33%
random state 6: Accuracy: 44.44%
random state 7: Accuracy: 55.56%
random state 8: Accuracy: 55.56%
random state 9: Accuracy: 66.67%
random state 10: Accuracy: 44.44%
random state 11: Accuracy: 44.44%
random state 12: Accuracy: 55.56%
random state 13: Accuracy: 44.44%
random state 14: Accuracy: 66.67%
random state 15: Accuracy: 33.33%
random state 16: Accuracy: 44.44%
random state 17: Accuracy: 44.44%
random state 18: Accuracy: 55.56%
random state 19: Accuracy: 66.67%
Max Accuracy: 66.67%
Min Accuracy: 33.33%
Mean Accuracy: 50.00%


In [19]:
y_pred = model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
for i in range(5):
    # fit the model
    model.fit(X_train, y_train, epochs=100*i, verbose=0)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    a.append(i)
    b.append(accuracy)
    print((i + 1) * 100, accuracy)
    #print('Accuracy: %f' % (accuracy*100))
model = None 

Average precision-recall score: 0.27
100 0.3333333432674408
200 0.4444444477558136
300 0.6666666865348816
400 0.6666666865348816
500 0.5555555820465088


#### Version 3: Basic Binary Classification
Source: https://keras.io/getting-started/sequential-model-guide/

Max Accuracy: 77.78%
Min Accuracy: 0.00%
Mean Accuracy: 48.33%

In [86]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=max_length))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    #print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=0)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

random state 0: Accuracy: 66.67%
random state 1: Accuracy: 44.44%
random state 2: Accuracy: 33.33%
random state 3: Accuracy: 44.44%
random state 4: Accuracy: 55.56%
random state 5: Accuracy: 55.56%
random state 6: Accuracy: 55.56%
random state 7: Accuracy: 33.33%
random state 8: Accuracy: 77.78%
random state 9: Accuracy: 44.44%
random state 10: Accuracy: 77.78%
random state 11: Accuracy: 33.33%
random state 12: Accuracy: 0.00%
random state 13: Accuracy: 33.33%
random state 14: Accuracy: 55.56%
random state 15: Accuracy: 33.33%
random state 16: Accuracy: 44.44%
random state 17: Accuracy: 55.56%
random state 18: Accuracy: 55.56%
random state 19: Accuracy: 66.67%
Max Accuracy: 77.78%
Min Accuracy: 0.00%
Mean Accuracy: 48.33%


In [21]:
y_pred = model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
for i in range(5):
    # fit the model
    model.fit(X_train, y_train, epochs=100*i, verbose=0)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    a.append(i)
    b.append(accuracy)
    print((i + 1)  * 100, accuracy)
    #print('Accuracy: %f' % (accuracy*100))
model = None 

Average precision-recall score: 0.21
100 0.3333333432674408
200 0.4444444477558136
300 0.4444444477558136
400 0.4444444477558136
500 0.4444444477558136


#### LSTM Classification
Source: https://keras.io/getting-started/sequential-model-guide/ <br>
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [88]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%


In [23]:
y_pred = model.predict(X_test, batch_size=None, verbose=0, steps=None)
average_precision = average_precision_score(y_test, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
# for i in range(5):
#     # fit the model
#     model.fit(X_train, y_train, epochs=100*i, verbose=0)
#     loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
#     a.append(i)
#     b.append(accuracy)
#     print((i + 1)  * 100, accuracy)
#     #print('Accuracy: %f' % (accuracy*100))
model = None 
# 512 - 0.75
# 1024 - 0.75 
# 2056 - 

Average precision-recall score: 0.22


#### LSTM with Dropout
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [95]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%


#### LSTM & Convolutional Neural Network for Sequence Classification
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%

In [100]:
max1 = 0
min1 = 1
l = list()
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.13, random_state=i)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
   # print(model.summary())
    model.fit(X_train, y_train, epochs=5, batch_size=69, verbose=1)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("random state " + str(i) + ": Accuracy: %.2f%%" % (scores[1]*100))
    l.append(scores[1])
print("Max Accuracy: %.2f%%" % (max(l) * 100))
print("Min Accuracy: %.2f%%" % (min(l) * 100))
print("Mean Accuracy: %.2f%%" % (sum(l) * 100 / float(len(l))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 0: Accuracy: 22.22%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 1: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 2: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 3: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 4: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 5: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 6: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 7: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 8: Accuracy: 77.78%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 9: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 10: Accuracy: 66.67%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 11: Accuracy: 66.67%
Ep

Epoch 4/5
Epoch 5/5
random state 16: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 17: Accuracy: 44.44%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 18: Accuracy: 55.56%
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random state 19: Accuracy: 55.56%
Max Accuracy: 77.78%
Min Accuracy: 22.22%
Mean Accuracy: 50.56%
