In [1]:
from __future__ import absolute_import, division, print_function

from sklearn.model_selection import train_test_split
from six.moves import urllib
import os
import numpy as np
from numpy import array
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf
import csv
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Conv1D, Dense, Dropout, LSTM, MaxPooling1D, Convolution1D
from keras.models import model_from_json
from keras.utils import np_utils
from keras.utils import to_categorical

from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelEncoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def get_reviews(dirname, label):
    """Read every ``.txt`` file directly inside a directory as one document.

    Parameters
    ----------
    dirname : str
        Directory to scan (sub-directories are not visited). A trailing path
        separator is accepted but not required.
    label : object
        Value recorded once per file read, so the returned ``labels`` list
        lines up element-for-element with ``reviews``.

    Returns
    -------
    reviews : list of str
        File contents, lower-cased, decoded as UTF-8 with a leading BOM
        stripped if present.
    labels : list
        ``label`` repeated once per file read.
    file_name : list of str
        Base names of the files read, in the same order as ``reviews``.
    """
    reviews = []
    labels = []
    file_name = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any index-based train/test split built on it) reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue
        # Read-only mode: the file is never written to, and 'r+' would fail
        # on files the process is not allowed to modify.
        with open(os.path.join(dirname, filename), 'r', encoding='utf-8-sig') as f:
            reviews.append(f.read().lower())
        labels.append(label)
        file_name.append(filename)
    return reviews, labels, file_name

def extract_labels_data(styles, base_dir="G:/Chen/PortClass/"):
    """Load the documents of every style folder, labelled by list position.

    For each ``style`` at position ``i`` in ``styles`` the ``.txt`` files in
    ``<base_dir>/<style>/`` are read through ``get_reviews`` and tagged with
    the integer label ``i``.

    Parameters
    ----------
    styles : sequence of str
        Sub-directory names under ``base_dir``; the position in this sequence
        becomes the integer class label.
    base_dir : str, optional
        Root directory holding one sub-directory per style. Defaults to the
        location the notebook was originally written against.

    Returns
    -------
    labels : list of int
        Class index of every document.
    data : list of str
        Document texts, aligned with ``labels``.
    file_names : list of str
        Source file name of every document, aligned with ``labels``.

    Side effects
    ------------
    Writes ``style_dict[i] = style`` into the module-level ``style_dict`` for
    every style, so ``style_dict`` must already exist when this is called.
    """
    data = []
    labels = []
    file_names = []
    for i, style in enumerate(styles):
        # The trailing "" makes the joined path end with a separator, so it is
        # a valid directory prefix however get_reviews combines it with a
        # file name.
        style_dir = os.path.join(base_dir, style, "")
        review, label, file_name = get_reviews(style_dir, label=i)
        data.extend(review)
        labels.extend(label)
        file_names.extend(file_name)
        style_dict[i] = style
    return labels, data, file_names

# Class index -> style folder name. extract_labels_data fills this in as a
# side effect, so it has to exist before that call.
style_dict = {}
# Class index i corresponds to style_list[i].
style_list = ['CP','CPOP','CPPE','CPPL','CR','CRRV']
num_styles = len(style_list)
labels, data, file_name = extract_labels_data(style_list)
# Pin the one-hot width to num_styles. Left to infer it, to_categorical uses
# max(label) + 1, which would come out narrower than the num_styles-wide
# softmax layer if the last style folder happened to contain no .txt files.
labels = to_categorical(array(labels), num_classes=num_styles)
#MAX_SEQUENCE_LENGTH = max([len(x.split(" ")) for x in data])

In [3]:
# --- Configuration ---------------------------------------------------------
# model_trainingpath is used below as the random_state of train_test_split;
# model_savepath is the prefix the ".json" and ".h5" files are written under.
model_trainingpath = int(float("0"))
model_savepath = "G:/Chen/sentiment/model"

# --- Text -> padded integer sequences --------------------------------------
t = Tokenizer()
t.fit_on_texts(data)
# One extra slot on top of the vocabulary learned by the tokenizer.
vocab_size = len(t.word_index) + 1
encoded_docs = t.texts_to_sequences(data)
# NOTE(review): the padded length is taken from the raw text split on single
# spaces, not from encoded_docs. If the tokenizer emits more tokens for a
# document than that count, pad_sequences will cut the sequence — confirm.
max_length = max(len(doc.split(' ')) for doc in data)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# --- Train / test split ----------------------------------------------------
# The row indices are split alongside the data so that test rows can be traced
# back to their position in `data` / `file_name`.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    padded_docs,
    labels,
    range(0, len(labels)),
    test_size=0.2,
    random_state=model_trainingpath,
)

# --- Model -----------------------------------------------------------------
# Embedding, two Conv1D + MaxPooling1D stages, dropout, then two stacked LSTMs.
# Both LSTMs return the full sequence, which Flatten turns into one vector for
# the dense layers.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=90, input_length=max_length),
    Conv1D(filters=32, kernel_size=4, activation='elu', padding='same'),
    MaxPooling1D(2),
    Conv1D(filters=32, kernel_size=2, activation='elu', padding='same'),
    MaxPooling1D(2),
    Dropout(0.25),
    LSTM(120, return_sequences=True),
    LSTM(60, return_sequences=True),
    Flatten(),
    Dense(64, kernel_initializer='normal', activation='elu'),
    Dense(num_styles, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# --- Persist ---------------------------------------------------------------
# Architecture goes to JSON, weights to HDF5, both under model_savepath.
model_json = model.to_json()
with open(model_savepath + ".json", "w") as json_file:
    json_file.write(model_json)
model.save_weights(model_savepath + ".h5")
print("Saved model to disk")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 908, 90)           112140    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 908, 32)           11552     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 454, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 454, 32)           2080      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 227, 32)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 227, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 227, 120)          73440     
__________

Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Saved model to disk


In [4]:
# Redo the split with the current model_trainingpath so this cell can run on
# its own; the row indices are split too, to recover each test row's file name.
X_train, X_test, y_train, y_test,indices_train,indices_test = train_test_split(padded_docs, labels, range(0, len(labels)),test_size=0.2, random_state=model_trainingpath)
# File name of every test row, in test-set order.
indice_mappings = np.asarray(file_name)[indices_test]
# Take argmax of the raw scores. Rounding first turns every row in which no
# class scores above 0.5 into all zeros, and argmax of an all-zero row is 0,
# so such rows were silently reported as style_dict[0].
style_mappings = [style_dict[np.argmax(scores)] for scores in model.predict(X_test)]
style_mappings_actual = [style_dict[np.argmax(one_hot)] for one_hot in y_test]
# [file name, style] pairs for the predicted and the true style respectively.
pred = [[name, style] for name, style in zip(indice_mappings, style_mappings)]
actual = [[name, style] for name, style in zip(indice_mappings, style_mappings_actual)]

In [5]:
# l = list()
# for i in range(50):
#     #print(i)
    
#     model_trainingpath = i
#     X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2, random_state=model_trainingpath)
#     score, acc = model.evaluate(X_test, y_test,batch_size=20, verbose=0)
# #     print('Test score:', score)
#     #print('Test accuracy:', acc)
#     l.append(acc)
# sum(l)/len(l)
# #0.9694444462656975 
# # 0.9827777788043022

# # Change to Conv1D from 6, 3 -> 4, 2 
# #0.9827777788043022

# # Added another LSTM layer -> 60, 30 
# #95

# # Added LSTM, 120 60
# # 98.277

# # LSTM 120 60, Add another dropout 0.25 
# #0.9694444462656975

# # Decrease from 60 -> 20 batch sizes

# # Top 3
# # 83.90%

# # Top 6
# # 85.33

In [6]:
from keras import backend as K

# Reset the Keras backend session before rebuilding the model from disk.
K.clear_session()
model_savepath = "G:/Chen/sentiment/model"
model_trainingpath = int(float("9"))
results_path = "G:/Chen/sentiment/results/9"

# Rebuild the padded inputs so this cell does not depend on the training cell
# having been run in the same kernel.
t = Tokenizer()
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
encoded_docs = t.texts_to_sequences(data)
max_length = max([len(i.split(' ')) for i in data])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Architecture from JSON; the context manager closes the handle even if the
# read raises.
with open(model_savepath+'.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(model_savepath+".h5")
# compile so that evaluate() can report loss and accuracy
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Loaded model.")

# NOTE(review): this split is drawn with the seed set above. If the saved
# weights were fitted on a split drawn with a different seed, X_test here
# overlaps the rows the model was trained on — confirm before reading
# accuracy off it.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(padded_docs, labels, range(0, len(labels)),test_size=0.2, random_state=model_trainingpath)
# File name of every test row, in test-set order.
indice_mappings = np.asarray(file_name)[indices_test]
# Take argmax of the raw scores. Rounding first turns every row in which no
# class scores above 0.5 into all zeros, and argmax of an all-zero row is 0,
# so such rows were silently reported as style_dict[0].
style_mappings = [style_dict[np.argmax(scores)] for scores in loaded_model.predict(X_test)]
style_mappings_actual = [style_dict[np.argmax(one_hot)] for one_hot in y_test]
# [file name, style] pairs for the predicted and the true style respectively.
pred = [[name, style] for name, style in zip(indice_mappings, style_mappings)]
actual = [[name, style] for name, style in zip(indice_mappings, style_mappings_actual)]

Loaded model.
0.9166666865348816


In [27]:
# Score loaded_model on the 20% test portion of 500 different random splits
# (random_state 0..499) and display the mean accuracy.
# NOTE(review): loaded_model is not refitted inside this loop. If its weights
# came from a fit on one particular split, the test portions drawn here with
# other seeds include rows it was trained on, so this mean may overstate
# accuracy on unseen data — confirm.
l = list()  # one accuracy value per split
for i in range(500):
    #print(i)
    
    model_trainingpath = i  # the loop index doubles as the split seed
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.20, random_state=model_trainingpath)
    # Unpacked as (loss, accuracy); only the accuracy is kept.
    score, acc = loaded_model.evaluate(X_test, y_test, batch_size=32, verbose=0)
#     print('Test score:', score)
#     print('Test accuracy:', acc)
    l.append(acc)
sum(l)/len(l)  # mean accuracy over all splits, shown as the cell output

# Presumably mean accuracies noted down from earlier runs of this cell:
#0.9694444
#0.8850000005960464
#0.905666666

0.9056666676998139

### Which portfolios are being incorrectly classified?
CP (37): 241 (44), 3662 (36), 3735 (42), 679 (47) = 169 times <br>
CPOP (6): = 0 times <br>
CPPE (32): 1619 (47), 682 (31), 13713 (36), 309 (43) = 157 times<br>
CPPL (6): 678 (37) = 37 times <br>
CR (32) : 3624 (48), 1677 (39) = 87 times<br>
CRRV (6): = 0 times <br>

In [28]:
l = list()  # reset only; nothing in this cell reads it
# Test-set file name -> number of random splits in which it was misclassified.
wrong = {}
for i in range(200):
    model_trainingpath = i  # the loop index doubles as the split seed
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(padded_docs, labels, range(0, len(labels)),test_size=0.2, random_state=model_trainingpath)
    # File name of every test row, in test-set order.
    indice_mappings = np.asarray(file_name)[indices_test]
    # A row is wrong when its highest-scoring class differs from the true
    # class. Comparing rounded probabilities with the one-hot row would also
    # flag rows whose top class is right but scores 0.5 or less.
    predicted_class = np.argmax(loaded_model.predict(X_test), axis=1)
    true_class = np.argmax(y_test, axis=1)
    # Separate name for the row index so it does not shadow the seed `i`.
    for row in np.flatnonzero(predicted_class != true_class):
        wrong[indice_mappings[row]] = wrong.get(indice_mappings[row], 0) + 1
wrong

{'3624.txt': 48,
 '3662.txt': 36,
 '1619.txt': 47,
 '1677.txt': 39,
 '682.txt': 31,
 '679.txt': 47,
 '13713.txt': 36,
 '309.txt': 43,
 '241.txt': 44,
 '3735.txt': 42,
 '678.txt': 37}

In [13]:
# Show every test row's predicted scores rounded to the nearest whole number,
# as plain nested lists.
[[np.float64(round(score)) for score in scores] for scores in loaded_model.predict(X_test)]

[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]]