In [3]:
import numpy as np
import pandas as pd
import sys

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Dropout, Input, Activation, Embedding, Conv1D, GlobalMaxPooling1D
from keras.optimizers import RMSprop, Adam

from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import np_utils

from nltk import word_tokenize   

import tensorflow as tf

Using TensorFlow backend.


In [4]:
print(tf.__version__)
print(keras.__version__)

1.3.0
2.0.8


In [5]:
## Loading the training and testing dataset
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')
print "Number of training and examples originally:", train_df.shape, test_df.shape

## Split training dataset into training and validation
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_df, test_size = 0.2)
print "Training dataset and validation dataset:", train_df.shape, valid_df.shape

Number of training and examples originally: (19579, 3) (8392, 2)
Training dataset and validation dataset: (15663, 3) (3916, 3)


In [6]:
## Making use of a text corpus within NLTK
## Used to download the various corpus only first time installation
# nltk.download()

# from nltk.corpus import brown
# corpus = brown.words() # Using all the word in the brown corpus
# # corpus = brown.words(categories='mystery') # Using only the mystery category within the brown corpus
# corpus = [w.lower() for w in corpus]

corpus = [word.lower() for sent in train_df.text for word in word_tokenize(sent.decode('utf-8')) ]
# corpus = [w.lower() for w in corpus]

# create mapping of unique chars to integers
words = sorted(list(set(corpus)))
word_to_int = dict((c, i) for i, c in enumerate(words))
int_to_word = dict((i, c) for i, c in enumerate(words))

# summarize the loaded data
n_vocab = len(words)
print "The number of words within the Brown Corpus: ", len(corpus)
print "Total Unique chars: ", n_vocab

The number of words within the Brown Corpus:  477432
Total Unique chars:  23264


In [7]:
## Tokenize the sentences into words and find the longest sentence
maxlen = max([len(word_tokenize(sent.decode('utf-8'))) for sent in train_df.text])
print maxlen

876


In [15]:
trainX = []
validX = []
testX = []
for sentence in train_df.text:
    trainX.append([word_to_int[word.lower()] for word in word_tokenize(sentence.decode('utf-8'))])
for sentence in valid_df.text:
    validX.append([word_to_int[word.lower()] for word in word_tokenize(sentence.decode('utf-8')) if word in word_to_int.keys()] )
for sentence in test_df.text:
    testX.append([word_to_int[word.lower()] for word in word_tokenize(sentence.decode('utf-8')) if word in word_to_int.keys()] )

In [16]:
from keras.preprocessing import sequence

trainX = sequence.pad_sequences(trainX, maxlen=maxlen)
validX = sequence.pad_sequences(validX, maxlen=maxlen)

In [17]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
le = LabelEncoder()
le.fit(train_df.author)

print list(le.classes_)
trainy = to_categorical(le.transform(train_df.author))
validy = to_categorical(le.transform(valid_df.author))

['EAP', 'HPL', 'MWS']


In [18]:
# define the input shape
inp = Input(shape=(trainX.shape[0],trainX.shape[1]))
print 'Our input shape is ',trainX.shape

Our input shape is  (15663, 876)


In [19]:
embedding_vecor_length = 50
model = Sequential()
model.add(Embedding(n_vocab, embedding_vecor_length, input_length=maxlen))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 876, 50)           1163200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 1,223,903
Trainable params: 1,223,903
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
import h5py
filepath="checkpoints/weights-improvement-{epoch:02d}-{loss:.4f}-gentext-CharRNN-simple.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

TB = TensorBoard(log_dir='./logs', histogram_freq=1, write_grads=True, embeddings_freq=1)

callbacks_list = [TB]

In [22]:
model.fit(trainX, trainy, epochs=5, batch_size=32, validation_data=(validX,validy), callbacks=callbacks_list)

Train on 15663 samples, validate on 3916 samples
Epoch 1/5
  448/15663 [..............................] - ETA: 684s - loss: 0.2270 - acc: 0.9159

KeyboardInterrupt: 

In [47]:
# x = LSTM(256, return_sequences = True)(inp)
# x = Dropout(0.2)(x)
# output = Dense(1, activation ='softmax')(x)

# generative_model = Model(inputs = inp, outputs=output )

# optimizer = RMSprop(lr=0.01)
# generative_model.compile(loss='categorical_crossentropy', optimizer='adam')

# filepath="checkpoints/weights-improvement-{epoch:02d}-{loss:.4f}-gentext-CharRNN-simple.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

In [None]:
# generative_model.fit(trainX, trainy, epochs=10, batch_size=64, callbacks=callbacks_list)