In [0]:
!pip install -q keras

In [22]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [26]:
# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1iai6zKxrGYIza4k1Kbwb8eTyD4wybYE8'
downloaded = drive.CreateFile({'id': file_id})
print('Downloaded content "{}"'.format(downloaded.GetContentString()[0:500]))

Downloaded content "﻿CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into the
book her sister was reading, but it had no pictures or conversations in
it, ‘and what is the use of a book,’ thought Alice ‘without pictures or
conversations?’

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure
of making a da"


# How sequences are generated?  
Instead of using a sliding window of a fixed size, we add more charcters to our sequence and ask the network to predict the next character.The sequences will be padded so that all of them will be of length 100.If there are sentences longer than 100 characters, then they will be broken down into smaller pieces and the above mentioned approah will be applied on each of the smaller pieces.

In [38]:
sentence="mary had a little lamb"
for i in range(0,len(sentence)):
  seq_in=sentence[0:i]
  seq_out=sentence[i]
  print('input:',seq_in,".output:",seq_out)

input:  .output: m
input: m .output: a
input: ma .output: r
input: mar .output: y
input: mary .output:  
input: mary  .output: h
input: mary h .output: a
input: mary ha .output: d
input: mary had .output:  
input: mary had  .output: a
input: mary had a .output:  
input: mary had a  .output: l
input: mary had a l .output: i
input: mary had a li .output: t
input: mary had a lit .output: t
input: mary had a litt .output: l
input: mary had a littl .output: e
input: mary had a little .output:  
input: mary had a little  .output: l
input: mary had a little l .output: a
input: mary had a little la .output: m
input: mary had a little lam .output: b


## LSTM Text Generation model

In [0]:
import numpy as np
import string
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

class TextGenerator:
  
  def __init__(self):
    self.clean_text=None
    self.chars=None
    self.char_to_int=None
    self.sequence_model=None
    self.X=None
    self.y=None
    
    
  def load_and_clean_text(self,file_content):
    
    #converting text to lower case
    self.clean_text=file_content.lower()
    
    #strip all of the new line characters so that we have one long sequence of characters separated only by white space.
    tokens=self.clean_text.split()
    self.clean_text=' '.join(tokens)
    
    #removing punctuations other than full stop
    punctuations_to_remove=''.join([ch for ch in string.punctuation if ch!='.'])
    self.clean_text=self.clean_text.translate(str.maketrans('', '', punctuations_to_remove))
    self.clean_text=self.clean_text.replace('\ufeff'," ").replace('\n'," ").replace('\r'," ").replace( '‘',"").replace('’',"").replace('“',"").replace('”',"").replace("  "," ")
    
    # create mapping of unique chars to integers
    self.chars = sorted(list(set(self.clean_text)))
    
    self.char_to_int = dict((c, i) for i, c in enumerate(self.chars))
    self.int_to_char = dict((i, c) for i, c in enumerate(self.chars))
    
    self.n_chars = len(self.clean_text)
    self.n_vocab = len(self.chars)
    print("Total Characters: ", self.n_chars)
    print("Total Vocab: ", self.n_vocab)
    
    
  def prepare_data_set(self,seq_len=100):
    #prepare the dataset of input to output pairs encoded as integers
    seq_length=seq_len
    dataX=[]
    dataY=[]
    
    sentences=self.clean_text.split('.')
    
    for sentence in sentences:
      if len(sentence)<=seq_length:
        for i in range(0,len(sentence)):
          seq_in=sentence[0:i]
          seq_out=sentence[i]
          dataX.append([self.char_to_int[char] for char in seq_in])
          dataY.append(self.char_to_int[seq_out])
      else:
        smaller_sentences=[sentence[i:i+seq_length] for i in range(0, len(sentence), seq_length)]
        for smaller_sentence in smaller_sentences:
          if len(smaller_sentence)<=seq_length:
            for i in range(0,len(smaller_sentence)):
              seq_in=smaller_sentence[0:i]
              seq_out=smaller_sentence[i]
              dataX.append([self.char_to_int[char] for char in seq_in])
              dataY.append(self.char_to_int[seq_out])
              
      
    n_patterns = len(dataX)
    print("Total Patterns: ", n_patterns)
    dataX_padded=np.array(pad_sequences(dataX, maxlen=seq_length, padding='pre'))
    
    print('shape of dataX_padded:',dataX_padded.shape)
    # reshape X to be [samples, time steps, features]
    self.X = np.reshape(dataX_padded, (n_patterns, seq_length, 1))
    # normalize
    self.X = self.X / float(self.n_vocab)
    # one hot encode the output variable
    self.y = np_utils.to_categorical(dataY)
    
    
  def create_sequence_model(self,):
    model = Sequential()
    model.add(LSTM(256, input_shape=(self.X.shape[1],self.X.shape[2]),return_sequences=True))
    model.add(Dropout(0.1))
    model.add(LSTM(256))
    model.add(Dense(self.y.shape[1], activation='softmax'))
    
    
    self.sequence_model=model
    self.sequence_model.summary()
    
  def train_model(self,file_path_to_store_weights,num_epochs=100,batch_size=128):
    
    # Prepare callbacks for model saving and for learning rate adjustment.
    checkpoint = ModelCheckpoint(filepath=file_path_to_store_weights,
                             monitor='loss',
                             verbose=1,
                             save_weights_only=False,
                             save_best_only=False,
                             mode='min',
                             period=10)
    
    callbacks_list = [checkpoint]
    # fit the model
    self.sequence_model.compile(loss='categorical_crossentropy', optimizer='adam')
    self.sequence_model.fit(self.X, self.y, epochs=num_epochs, batch_size=batch_size, callbacks=callbacks_list)
    

  def predict(self,char_seed='cat',num_chars_to_predict=100):
    char_prediction=char_seed
    pattern = [self.char_to_int[ch] for ch in char_seed]
    print("Seed:",char_seed)
  
    # generate characters
    for i in range(num_chars_to_predict):
      x = np.array(pad_sequences([pattern], maxlen=100, padding='pre'))
      x=np.reshape(x, (1, 100, 1))
      x = x / float(self.n_vocab)
      prediction = self.sequence_model.predict(x, verbose=0)
      index = np.argmax(prediction)
      result = self.int_to_char[index]
      seq_in = [self.int_to_char[value] for value in pattern]
      char_prediction+=result
      pattern.append(index)
      pattern = pattern[1:len(pattern)]
  
    return char_prediction  
    
  


In [0]:
txt_generator=TextGenerator()
txt_generator.load_and_clean_text(downloaded.GetContentString())

Total Characters:  135043
Total Vocab:  28


In [0]:
txt_generator.prepare_data_set(seq_len=100)

Total Patterns:  134054
shape of dataX_padded: (134054, 100)


In [0]:
txt_generator.create_sequence_model()

W0725 08:05:56.749824 140528753457024 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0725 08:05:56.792500 140528753457024 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0725 08:05:56.801749 140528753457024 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0725 08:05:57.194788 140528753457024 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0725 08:05:57.208194 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 28)                7196      
Total params: 796,700
Trainable params: 796,700
Non-trainable params: 0
_________________________________________________________________


In [0]:
weights_save_path='/content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/epochs_001_100:{epoch:03d}.hdf5'
txt_generator.train_model(file_path_to_store_weights=weights_save_path,num_epochs=100,batch_size=512)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/epochs_001_100:010.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/epochs_001_100:020.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/epochs_001_100:030.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/e

# Predicting characters

In [28]:
txt_generator.create_sequence_model()
#load weights from the 100th epoch
txt_generator.sequence_model.load_weights("/content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1/July_23/epochs_001_100:100.hdf5")
txt_generator.sequence_model.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_3 (Dense)              (None, 28)                7196      
Total params: 796,700
Trainable params: 796,700
Non-trainable params: 0
_________________________________________________________________


## Predicting 100 characters

In [0]:
txt_generator.predict(char_seed='alice wants to',num_chars_to_predict=100)

Seed: alice wants to


'alice wants toom as she could remember that tar swil loot be in gllly shared the rueen still said the caterpillart'

In [0]:
txt_generator.predict(char_seed='he is mad',num_chars_to_predict=100)

Seed: he is mad


'he is madelt be hrrw erighfn seame tr a strrocsiness she whnhout abcourr the did of the games in she remeeten'

In [0]:
txt_generator.predict(char_seed='jon snow knows nothing',num_chars_to_predict=100)

Seed: jon snow knows nothing


'jon snow knows nothing marger alice said and wavier what said the caterpillar and the moment she and he wes you wouldnt ge'

In [0]:
txt_generator.predict(char_seed='a lannister always',num_chars_to_predict=100)

Seed: a lannister always


'a lannister always get itee snder and whispered again and all to oe and raid to her ssoprs of the words drink me but n'

In [0]:
txt_generator.predict(char_seed='the meaning of life is',num_chars_to_predict=100)

Seed: the meaning of life is


'the meaning of life is pl as the dormowse dound oot a very gurane somnow about this take she heard a lond whthout with its'

In [0]:
txt_generator.predict(char_seed='twinkle twinkle',num_chars_to_predict=100)

Seed: twinkle twinkle


'twinkle twinkle his head tau onent of course i was i aegin with the noral of that does it may i sherle loter that s'

## Predicting 500 characters

In [32]:
prediction=txt_generator.predict(char_seed='alice wants to',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: alice wants to
alice wants toom as she could remember that tar swil loot be in gllly shared the rueen still said th
e caterpillart totnde it myst mot what tork a oarral with the moral of that better tail when i eont 
see soogthedred so blm her vire aldrning as hi course down on the swenm of breadandbdyfragooe oerson
ssiy present them and then all pever cl po mnw rersled for the shorld and then aett foid and she wen
t on growing and she felt that she was allow wpok aetentakd different begind iis she samk of the gryphon rabbings 


In [33]:
prediction=txt_generator.predict(char_seed='jon snow knows nothing',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: jon snow knows nothing
jon snow knows nothing marger alice said and wavier what said the caterpillar and the moment she and
 he wes you wouldnt geet terdated something but she recsod toice and wton any more as all anice for 
inlw it to ilv that cane uime afain in simence in the night with at last she stretch harger and was 
belight it eilishing the words as shey all sereat down at the footman lnkee and mooking at he hande 
liceln byt of course you know what it was addatse the knave of hearts would tee to moen tortowfuloe would be taving and wa


In [34]:
prediction=txt_generator.predict(char_seed='a lannister always',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: a lannister always
a lannister always get itee snder and whispered again and all to oe and raid to her ssoprs of the wo
rds drink me but nobusers alice remarked becuint a good was surp and but it said the mock turtle ten
oered so surn into a pig all sound ht a moments decl id went nn uo iimking so food you know as the d
oom tway wnur alice tolemlent adcidndnly ani as her the stajec being meft hfep what she was cown on 
the swenters thought alice to her and she trial down and walked two and hand in tay only reemvev and the mock turtle t


In [35]:
prediction=txt_generator.predict(char_seed='the meaning of life is',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: the meaning of life is
the meaning of life is pl as the dormowse dound oot a very gurane somnow about this take she heard a
 lond whthout with its aegnne the roof was so swal ly hir sometfine worth her latser which was how t
o get tuuef down with euryner sile the mock turtle angral to dar yhen she walted tas a mong will be 
santen in the siceer cruwp oo their heads down and said to the knave of course the gryphon repeated 
orines the only different and the other said to the great hulg tpy luch she were lived a boms little all fay to meaning oo
