## Fetch the data

In [1]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes (or fails), instead of leaving it open
# for the lifetime of the kernel.
with open('data/space.txt', 'r') as file:
    data = file.read()

## Data Preprocessing
* Remove punctuation and numbers
* Remove single characters
* Replace multiple spaces with a single space
* Convert text to lowercase

In [2]:
import re

def preprocess(s):
    """Normalise raw text into lowercase words separated by single spaces.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space
         (drops punctuation and digits).
      2. Replace every stand-alone single letter with a space.
      3. Collapse each run of whitespace into one space.
      4. Lowercase the result.

    Args:
        s: The raw input string.

    Returns:
        The cleaned string. A leading or trailing space may remain when the
        input started or ended with removed characters.
    """
    sentence = re.sub(r'[^a-zA-Z]', ' ', s)
    # Word boundaries (rather than surrounding whitespace) are used so that
    # single letters at the very start/end of the text and consecutive single
    # letters such as "a b c" are all removed; a whitespace-delimited pattern
    # consumes the shared space and skips every second letter.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence.lower()

# Clean the raw corpus held in `data`; `text` is what gets tokenized in the next section.
text = preprocess(data)

## Vectorize words
* Tokenize the text into individual words
* Remove stopwords (optional): application specific
* Convert the tokenized words to numbers (indices)
* Create word-to-index dictionary
* Create index-to-word dictionary (by reversing)

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

# Build the stopword lookup once. Evaluating stopwords.words('english') inside
# the comprehension would rebuild the list for every token and scan it
# linearly; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Tokenize the cleaned text and drop stopwords.
words = [w for w in word_tokenize(text) if w not in english_stopwords]

num_words = len(words)
print('Total words:', num_words)

unique_words = len(set(words))
print('Unique words:', unique_words)

# Keras Tokenizer assigns word indices starting at 1 (0 is reserved), hence
# the "+ 1" both here and in vocab_size below.
tokenizer = Tokenizer(num_words=unique_words + 1)
tokenizer.fit_on_texts(words)

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)

word_to_index = tokenizer.word_index
print('Word to Index dictionary created')

# Invert the mapping so predicted indices can be turned back into words.
index_to_word = {index: word for word, index in word_to_index.items()}
print('Index to Word dictionary created')

Using TensorFlow backend.


Total words: 1889
Unique words: 754
Vocabulary size: 755
Word to Index dictionary created
Index to Word dictionary created


## Many-to-one Sequence problem: LSTM model

### Reshape input and output, and Normalize input

In [4]:
from numpy import reshape
from keras.utils import to_categorical

# Number of consecutive words the model sees before predicting the next one.
input_sequence_len = 100

# Slide a fixed-size window over the corpus: each window of
# `input_sequence_len` word indices is one input sample and the word that
# immediately follows it is the target.
input_sequence = []
output_words = []
for start in range(num_words - input_sequence_len):
    window = words[start:start + input_sequence_len]
    target = words[start + input_sequence_len]
    input_sequence.append([word_to_index[word] for word in window])
    output_words.append(word_to_index[target])

# Shape the inputs as (samples, time steps, features) and scale the indices
# into [0, 1) by dividing by the vocabulary size.
X = reshape(input_sequence, (len(input_sequence), input_sequence_len, 1))
X_trn = X / float(vocab_size)

# Pin the one-hot width to the vocabulary size. Without num_classes the width
# is inferred from the largest target index that happens to occur, so it could
# silently differ from vocab_size.
y_trn = to_categorical(output_words, num_classes=vocab_size)

print("X shape:", X_trn.shape)
print("y shape:", y_trn.shape)

X shape: (1789, 100, 1)
y shape: (1789, 755)


### Create Stacked LSTM Sequential model

In [5]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense

# Stacked LSTM: the first three layers return the full sequence so the next
# LSTM layer receives one vector per time step; the last LSTM returns only its
# final state, which feeds a softmax over the one-hot target width.
model = Sequential()
model.add(LSTM(200, input_shape=(X_trn.shape[1], X_trn.shape[2]), return_sequences=True))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(25))
model.add(Dense(y_trn.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy')

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 200)          161600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 100)          120400    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 50)           30200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 25)                7600      
_________________________________________________________________
dense_1 (Dense)              (None, 755)               19630     
Total params: 339,430
Trainable params: 339,430
Non-trainable params: 0
_________________________________________________________________
None


### Train the model

In [6]:
# Train on the windowed sequences for 10 epochs with mini-batches of 64 samples.
# fit() returns a History object, which is displayed as this cell's output.
model.fit(X_trn, y_trn, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x13a359f28>

### Make prediction
* Randomly select a sequence
* Obtain the words from index-to-word dictionary
* Predict a probability distribution over the vocabulary (the softmax output)
  * the index with the highest probability will be the index value of the next word

In [7]:
from numpy import random, argmax

# numpy's randint excludes the upper bound, so len(input_sequence) (not
# len - 1) is needed for the last training window to be selectable.
random_seq_index = random.randint(0, len(input_sequence))

# Take a copy: random_seq is modified during text generation, and without the
# copy those modifications would be written into the training data itself.
random_seq = list(input_sequence[random_seq_index])

# Show the seed window as words.
word_seq = [index_to_word[val] for val in random_seq]
print(' '.join(word_seq))

neutron stars black holes gravitational pull high even light escape constellations group stars forming various shapes called constellation star patterns laghu saptarshi ursa minor one constellation vrihat saptarshi also known ursa major group seven stars another constellation forms part constellation big bear seen summer time early part night orion mriga another well known constellation seen winter late evenings star sirius brightest star sky located close orion cassiopeia another prominent constellation northern sky visible winter early part night constellation stars large number stars however see bright stars constellation naked eye stars make constellation distance line sight sky galaxies galaxy system stars


In [8]:
# How many words to generate after the seed window.
num_generated_words = 100

# Work on copies so the seed window (random_seq) and its words (word_seq) are
# left untouched: re-running this cell then always starts from the same seed,
# and the training data that random_seq was taken from is never modified.
window = list(random_seq)
generated_words = list(word_seq)

for _ in range(num_generated_words):
    # Same shaping and scaling as the training inputs: (1, time steps, 1),
    # indices divided by the vocabulary size.
    int_sample = reshape(window, (1, len(window), 1))
    int_sample = int_sample / float(vocab_size)

    # The model returns one probability per class; the most likely class is
    # taken as the index of the next word.
    predicted_probs = model.predict(int_sample, verbose=0)
    predicted_word_id = int(argmax(predicted_probs))

    generated_words.append(index_to_word[predicted_word_id])

    # Slide the window forward: drop the oldest index, add the prediction.
    window = window[1:] + [predicted_word_id]

# Seed words followed by the generated words, separated by single spaces.
output = ' '.join(generated_words)
print(output)

 neutron stars black holes gravitational pull high even light escape constellations group stars forming various shapes called constellation star patterns laghu saptarshi ursa minor one constellation vrihat saptarshi also known ursa major group seven stars another constellation forms part constellation big bear seen summer time early part night orion mriga another well known constellation seen winter late evenings star sirius brightest star sky located close orion cassiopeia another prominent constellation northern sky visible winter early part night constellation stars large number stars however see bright stars constellation naked eye stars make constellation distance line sight sky galaxies galaxy system stars earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth earth ea

### Save the model

In [9]:
import pickle

# Use a context manager so the file is flushed and closed once the dump
# completes; passing a bare open() leaves the handle open.
# NOTE(review): this relies on the installed Keras version supporting pickle;
# model.save() is the library's own persistence API - confirm which is wanted.
with open('model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)