In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, BatchNormalization, TimeDistributed

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import gensim
from gensim.utils import simple_preprocess

#import multiprocessing as mp
from multiprocessing import cpu_count

# set working directory
# The project root defaults to the original author's location but can be
# overridden with the RNN_PROJECT_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
path = os.environ.get(
    'RNN_PROJECT_DIR',
    '/Users/ianlo/Documents/Data Analyitcs & Data Science/Deep Learning Developer Course/RNNProject/',
)
# Relative paths used later in the notebook (e.g. ./data/...) resolve against
# this directory.
os.chdir(path)

# import custom files
import utils as utils
import global_settings as gs
from parallelproc import applyParallel

Using TensorFlow backend.


In [2]:
# tokenizer: can change this as needed
def tokenize(x):
    """Tokenise a text with gensim's simple_preprocess."""
    return simple_preprocess(x)

# Use roughly 80% of the available cores, but never fewer than one: on a
# single-core machine int(1 * 0.8) evaluates to 0, which would make the later
# `index % _number_of_groups` a modulo by zero and ask for zero workers.
_worker_count = max(1, int(cpu_count() * 0.8))

# number of groups the rows are partitioned into
_number_of_groups = _worker_count

# number of workers handed to applyParallel
_cpu = _worker_count

# initialise global parameters
gs.init()

# max no. of words in a review (sequences are padded / truncated to this length)
MAX_SEQUENCE_LENGTH = 300
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
# training hyper-parameters passed to model.fit
BATCH_SIZE = 32
EPOCHS = 10

Read the XLSX file, drop rows with missing values and duplicate rows, and assign each row a group id for parallel processing

In [3]:
# -----------------------------------------------------------------------------
# extract all the reviews from the XLSX file into a data frame
# Each sheet is read once and collected in a list; the pieces are joined with a
# single pd.concat because DataFrame.append was removed in pandas 2.0 (and
# growing a frame inside a loop is quadratic anyway).
_sheet_frames = []
for i in range(1, 2):  # only sheet 1 is read at present
    rw = utils.readXlsx("./data/Train/hotel_sentiment_v01.xlsx", sheet = i, header=True)
    # NOTE(review): readXlsx is assumed to return a DataFrame or a list of
    # records; pd.DataFrame() accepts either -- confirm against utils.
    _sheet_frames.append(pd.DataFrame(rw))

df = pd.concat(_sheet_frames, ignore_index=True)

# remove no longer used objects in memory
del rw, _sheet_frames

# remove rows with missing values, then duplicate entries
df = df.dropna().drop_duplicates()

# create a group id for easy parallel processing: row label modulo the number
# of groups. Computed directly on the index (no temporary column, no row-wise
# apply); np.asarray avoids any index alignment when the column is inserted.
df.insert(0, 'grpId', np.asarray(df.index) % _number_of_groups)

In [4]:
# Pre-processing pipeline. Every stage is dispatched group-wise (by grpId)
# through applyParallel with "processed_text" as its destination column; only
# the first stage reads the raw "review_text", the others read the
# "processed_text" produced by the previous stage.
# Each entry: (progress message, stage function, source column).
_stages = (
    ('Starting pre-processing: Clean', utils.clean_text, "review_text"),
    ('Starting pre-processing: Lower case', utils.lower_case, "processed_text"),
    ('Starting pre-processing: Restructure text', utils.restructureText, "processed_text"),
    ('Starting pre-processing: Remove custom stop words', utils.remove_stopwords, "processed_text"),
)

for _banner, _stage_fn, _src_col in _stages:
    print(_banner)
    # dest_col_ind is recomputed on every pass because df is replaced by the
    # result of the previous stage.
    _stage_args = {"dest_col_ind": df.shape[1]-1, "dest_col": "processed_text", "src_col": _src_col}
    df = applyParallel(df.groupby(df.grpId), _stage_fn, _stage_args, _cpu)

Starting pre-processing: Clean
Starting pre-processing: Lower case
Starting pre-processing: Restructure text
Starting pre-processing: Remove custom stop words


In [5]:
# =============================================================================
# find the word frequency and build the vocabulary / padded sequences

# words selected by utils.get_words_by_freq with a frequency threshold of 10,
# extended with the special markers PAD, EOS and UNK
wordlist = utils.get_words_by_freq(df['processed_text'], 10)
wordlist.append('PAD')
wordlist.append('EOS')
wordlist.append('UNK')

# Fit the tokenizer on the whole corpus and turn every review into a sequence
# of word ids, padded / truncated at the end to MAX_SEQUENCE_LENGTH.
# No num_words cap is applied: the vocabulary size is derived from the fitted
# tokenizer below instead of being a hand-copied constant.
tokenizer = Tokenizer(num_words=None, split=' ')
tokenizer.fit_on_texts(df['processed_text'].values)
X = tokenizer.texts_to_sequences(df['processed_text'].values)
X = pd.DataFrame(pad_sequences(X, padding='post', truncating='post', maxlen = MAX_SEQUENCE_LENGTH))

# get the word index from the tokenizer and register the special markers:
# PAD takes id 0 (the value pad_sequences fills with by default) and EOS takes
# the next free id after the tokenizer's own entries.
word_index = tokenizer.word_index
word_index['PAD'] = 0
word_index['EOS'] = len(word_index)

# Vocabulary size, kept in step with the data: one slot per word_index entry
# plus one. (Previously the literal 23413, which is only right for one corpus.)
max_features = len(word_index) + 1

# create index to word dictionary; every word that is not in wordlist is
# mapped to 'UNK'. A set makes each membership test O(1) instead of scanning
# the list once per vocabulary word.
_frequent_words = set(wordlist)
index_word = {
    idx: (word if word in _frequent_words else 'UNK')
    for word, idx in word_index.items()
}

In [6]:
# One-hot encode the target, which is taken positionally from column 4 of df.
enc1 = OneHotEncoder()
_target = pd.DataFrame(df.iloc[:,4])
_target_onehot = enc1.fit_transform(_target)
Y = pd.DataFrame(_target_onehot.toarray())


# Hold out 20% of the rows for validation; the split is seeded from the
# project-wide settings so that it is repeatable.
_split = train_test_split(X, Y, test_size = 0.2, random_state = gs.seedvalue)
x_train, x_valid, y_train, y_valid = _split

In [7]:
# Load the pre-trained Google News word2vec vectors (binary format); the file
# is looked up relative to the current working directory.
_W2V_FILE = './GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(_W2V_FILE, binary=True)

# Map every word known to the word2vec model to the index stored on its vocab
# entry.
vocab = {token: entry.index for token, entry in model.vocab.items()}

In [8]:
# embedding_dimension is the size of each word vector; 300 matches the Google
# News vectors loaded above. Defined first so every step below uses the same
# value instead of repeating the literal.
embedding_dimension = 300

# create the embeddings_index so that we can get the embedding from the google
# model for each word of the review vocabulary.
# If a word is not found in the google vectors it is set to all zeros,
# otherwise it gets the corresponding google news vector.
embeddings_index = {}
for word in word_index.keys():
    _row = vocab.get(word)
    if _row is not None:
        embeddings_index[word] = model.syn0[_row]
    else:
        embeddings_index[word] = [0] * embedding_dimension

# the special markers share one small constant vector (0.001 everywhere),
# overriding whatever the loop above stored for them
empty = np.full(embedding_dimension, 0.001, dtype=float)
embeddings_index['PAD'] = empty
embeddings_index['EOS'] = empty
embeddings_index['UNK'] = empty


# one row per word id (plus one spare row), one column per embedding component
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Check for a missing entry BEFORE slicing: subscripting None would raise
    # TypeError, which made the original guard unreachable.
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]

In [None]:
# number of units in each LSTM direction
lstm_out = 1024

# create the model: pre-trained (but trainable) embeddings, two stacked
# bidirectional LSTMs, and a softmax classifier on top
model = Sequential()
# input_dim is read from the weight matrix itself so the layer shape can never
# disagree with the weights that are loaded into it
model.add(Embedding(embedding_matrix.shape[0], embedding_dimension, input_length = x_train.shape[1], weights=[embedding_matrix], trainable=True))
# the first recurrent layer returns the full sequence so that it can feed the second
model.add(Bidirectional(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3)))
#model.add(Dense(1024, activation='relu'))
#model.add(BatchNormalization())
# one output unit per one-hot target column (5 for the original data set)
model.add(Dense(y_train.shape[1], activation='softmax'))

# setting up the optimization of our weights
#sgd = keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
#adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.002)
#nadam = keras.optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)

model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          7023900   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 2048)              10854400  
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 10245     
Total params: 17,888,545
Trainable params: 17,888,545
Non-trainable params: 0
_________________________________________________________________
None


In [None]:

# Train on the training split and evaluate on the held-out split after every
# epoch. The batch size comes from the BATCH_SIZE constant of the configuration
# cell rather than a second, independent literal.
batch_size = BATCH_SIZE
hist = model.fit(np.array(x_train), np.array(y_train),
                 epochs = EPOCHS, batch_size=batch_size, verbose = 1,
                 validation_data=(np.array(x_valid), np.array(y_valid)),
                 shuffle=True)

Epoch 1/10


In [None]:
# Write the Keras model to an HDF5 file in the current working directory.
_model_file = 'hotel-sentiment-model.hdf5'
model.save(_model_file)