In [0]:
!pip install -q keras

In [7]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [11]:
# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1iai6zKxrGYIza4k1Kbwb8eTyD4wybYE8'
downloaded = drive.CreateFile({'id': file_id})
print('Downloaded content "{}"'.format(downloaded.GetContentString()[0:500]))

Downloaded content "﻿CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into the
book her sister was reading, but it had no pictures or conversations in
it, ‘and what is the use of a book,’ thought Alice ‘without pictures or
conversations?’

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure
of making a da"


# How sequences are generated?  
Instead of using a sliding window of a fixed size, we add more charcters to our sequence and ask the network to predict the next character.The sequences will be padded so that all of them will be of length 100.If there are sentences longer than 100 characters, then they will be broken down into smaller pieces and the above mentioned approah will be applied on each of the smaller pieces.

In [5]:
sentence="jon snow knows nothing"
for i in range(0,len(sentence)):
  seq_in=sentence[0:i]
  seq_out=sentence[i]
  print('input:',seq_in,".output:",seq_out)

input:  .output: j
input: j .output: o
input: jo .output: n
input: jon .output:  
input: jon  .output: s
input: jon s .output: n
input: jon sn .output: o
input: jon sno .output: w
input: jon snow .output:  
input: jon snow  .output: k
input: jon snow k .output: n
input: jon snow kn .output: o
input: jon snow kno .output: w
input: jon snow know .output: s
input: jon snow knows .output:  
input: jon snow knows  .output: n
input: jon snow knows n .output: o
input: jon snow knows no .output: t
input: jon snow knows not .output: h
input: jon snow knows noth .output: i
input: jon snow knows nothi .output: n
input: jon snow knows nothin .output: g


## LSTM Text Generation model

In [0]:
import numpy as np
import string
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

class TextGenerator:
  
  def __init__(self):
    self.clean_text=None
    self.chars=None
    self.char_to_int=None
    self.sequence_model=None
    self.X=None
    self.y=None
    self.train_history=None
    
    
  def load_and_clean_text(self,file_content):
    
    #converting text to lower case
    self.clean_text=file_content.lower()
    
    #strip all of the new line characters so that we have one long sequence of characters separated only by white space.
    tokens=self.clean_text.split()
    self.clean_text=' '.join(tokens)
    
    #removing punctuations other than full stop
    punctuations_to_remove=''.join([ch for ch in string.punctuation if ch!='.'])
    self.clean_text=self.clean_text.translate(str.maketrans('', '', punctuations_to_remove))
    self.clean_text=self.clean_text.replace('\ufeff'," ").replace('\n'," ").replace('\r'," ").replace( '‘',"").replace('’',"").replace('“',"").replace('”',"").replace("  "," ")
    
    # create mapping of unique chars to integers
    self.chars = sorted(list(set(self.clean_text)))
    
    self.char_to_int = dict((c, i) for i, c in enumerate(self.chars))
    self.int_to_char = dict((i, c) for i, c in enumerate(self.chars))
    
    self.n_chars = len(self.clean_text)
    self.n_vocab = len(self.chars)
    print("Total Characters: ", self.n_chars)
    print("Total Vocab: ", self.n_vocab)
    
    
  def prepare_data_set(self,seq_len=100):
    #prepare the dataset of input to output pairs encoded as integers
    seq_length=seq_len
    dataX=[]
    dataY=[]
    
    sentences=self.clean_text.split('.')
    
    for sentence in sentences:
      if len(sentence)<=seq_length:
        for i in range(0,len(sentence)):
          seq_in=sentence[0:i]
          seq_out=sentence[i]
          dataX.append([self.char_to_int[char] for char in seq_in])
          dataY.append(self.char_to_int[seq_out])
      else:
        smaller_sentences=[sentence[i:i+seq_length] for i in range(0, len(sentence), seq_length)]
        for smaller_sentence in smaller_sentences:
          if len(smaller_sentence)<=seq_length:
            for i in range(0,len(smaller_sentence)):
              seq_in=smaller_sentence[0:i]
              seq_out=smaller_sentence[i]
              dataX.append([self.char_to_int[char] for char in seq_in])
              dataY.append(self.char_to_int[seq_out])
              
      
    n_patterns = len(dataX)
    print("Total Patterns: ", n_patterns)
    dataX_padded=np.array(pad_sequences(dataX, maxlen=seq_length, padding='pre'))
    
    print('shape of dataX_padded:',dataX_padded.shape)
    # reshape X to be [samples, time steps, features]
    self.X = np.reshape(dataX_padded, (n_patterns, seq_length, 1))
    # normalize
    self.X = self.X / float(self.n_vocab)
    # one hot encode the output variable
    self.y = np_utils.to_categorical(dataY)
    
    
  def create_sequence_model(self,num_lstm_units=256,drop_out=0.1):
    model = Sequential()
    model.add(LSTM(num_lstm_units, input_shape=(self.X.shape[1],self.X.shape[2]),dropout=drop_out,return_sequences=True))
    model.add(LSTM(num_lstm_units,dropout=drop_out))
    model.add(Dense(self.y.shape[1], activation='softmax'))
    
    self.sequence_model=model
    self.sequence_model.summary()
    
  def train_model(self,file_path_to_store_weights,
                  num_epochs=100,
                  batch_size=128,
                 loss_fn='categorical_crossentropy',
                 optimizer_name='adam'):
    
    # Prepare callbacks for model saving and for learning rate adjustment.
    checkpoint = ModelCheckpoint(filepath=file_path_to_store_weights,
                             monitor='loss',
                             verbose=1,
                             save_weights_only=False,
                             save_best_only=False,
                             mode='min',
                             period=10)
    
    callbacks_list = [checkpoint]
    
    self.sequence_model.compile(loss=loss_fn, optimizer=optimizer_name)
    # fit the model
    self.train_history=self.sequence_model.fit(self.X, self.y, epochs=num_epochs, batch_size=batch_size, callbacks=callbacks_list)
    

  def predict(self,char_seed='cat',num_chars_to_predict=100):
    char_prediction=char_seed
    pattern = [self.char_to_int[ch] for ch in char_seed]
    print("Seed:",char_seed)
  
    # generate characters
    for i in range(num_chars_to_predict):
      x = np.array(pad_sequences([pattern], maxlen=100, padding='pre'))
      x=np.reshape(x, (1, 100, 1))
      x = x / float(self.n_vocab)
      prediction = self.sequence_model.predict(x, verbose=0)
      index = np.argmax(prediction)
      result = self.int_to_char[index]
      seq_in = [self.int_to_char[value] for value in pattern]
      char_prediction+=result
      pattern.append(index)
      pattern = pattern[1:len(pattern)]
  
    return char_prediction  
  
  
  def predict_with_temperature(self,char_seed='cat',num_chars_to_predict=100,temperature=1):
    char_prediction=char_seed
    pattern = [self.char_to_int[ch] for ch in char_seed]
#     print("Seed:",char_seed)
  
    # generate characters
    for i in range(num_chars_to_predict):
      x = np.array(pad_sequences([pattern], maxlen=100, padding='pre'))
      x=np.reshape(x, (1, 100, 1))
      x = x / float(self.n_vocab)
      prediction = self.sequence_model.predict(x, verbose=0)
      
      preds=np.asarray(prediction).astype('float64')
      preds=np.log(preds)/temperature
      index = np.argmax(preds)
      
      result = self.int_to_char[index]
      seq_in = [self.int_to_char[value] for value in pattern]
      char_prediction+=result
      pattern.append(index)
      pattern = pattern[1:len(pattern)]
  
    return char_prediction 
  
  

In [7]:
txt_generator=TextGenerator()
txt_generator.load_and_clean_text(downloaded.GetContentString())

Total Characters:  135043
Total Vocab:  28


In [8]:
txt_generator.prepare_data_set(seq_len=100)

Total Patterns:  134054
shape of dataX_padded: (134054, 100)


In [11]:
txt_generator.create_sequence_model(num_lstm_units=256,drop_out=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_2 (Dense)              (None, 28)                7196      
Total params: 796,700
Trainable params: 796,700
Non-trainable params: 0
_________________________________________________________________


In [12]:
weights_save_path='/content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_v4/July_26/epochs_001_100:{epoch:03d}.hdf5'
txt_generator.train_model(file_path_to_store_weights=weights_save_path,num_epochs=100,batch_size=512)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_v4/July_26/epochs_001_100:010.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_v4/July_26/epochs_001_100:020.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_v4/July_26/epochs_001_100:030.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: saving model to /content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_

## Predicting characters

## Predicting 100 characters

In [14]:
txt_generator=TextGenerator()
txt_generator.load_and_clean_text(downloaded.GetContentString())
txt_generator.prepare_data_set(seq_len=100)
txt_generator.create_sequence_model(num_lstm_units=256,drop_out=0.1)

Total Characters:  135043
Total Vocab:  28
Total Patterns:  134054


W0726 17:45:39.530944 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 17:45:39.547890 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 17:45:39.550692 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



shape of dataX_padded: (134054, 100)


W0726 17:45:39.741150 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0726 17:45:39.754376 140146659190656 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 28)                7196      
Total params: 796,700
Trainable params: 796,700
Non-trainable params: 0
_________________________________________________________________


In [15]:
txt_generator.sequence_model.load_weights("/content/gdrive/My Drive/EIP/Phase2/Assignment2/weights/approach1_v4/July_26/epochs_001_100:100.hdf5")
txt_generator.sequence_model.compile(loss='categorical_crossentropy', optimizer='adam')

W0726 17:45:47.295643 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0726 17:45:47.895800 140146659190656 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



## Predicting 500 characters

In [16]:
prediction=txt_generator.predict(char_seed='alice wants to',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: alice wants to
alice wants tooo the whoug bill re court wie grrtm up shehe ie puertes mooe and the couldnt ceat c t
r suy i seaningd ttbj shf what sorring tailath hanlt sfmarked reraeperwed all toundle poe of tome we
ry murking about anice whought the her woice of hereing douso atd lad larp besot saee alaume of teal
lr said tooe oeart tomethine nook oinutes i tuisting forugh the fouettaii the waid to herself inw tr
yisgter whoue noned atdrply all the coust and tomp weryendadouegsed anl tound alain oock turping tfrpiet inge at s


In [17]:
prediction=txt_generator.predict_with_temperature(char_seed='alice wants to',num_chars_to_predict=500,temperature=0.2)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

alice wants tooo the whoug bill re court wie grrtm up shehe ie puertes mooe and the couldnt ceat c t
r suy i seaningd ttbj shf what sorring tailath hanlt sfmarked reraeperwed all toundle poe of tome we
ry murking about anice whought the her woice of hereing douso atd lad larp besot saee alaume of teal
lr said tooe oeart tomethine nook oinutes i tuisting forugh the fouettaii the waid to herself inw tr
yisgter whoue noned atdrply all the coust and tomp weryendadouegsed anl tound alain oock turping tfrpiet inge at s


In [19]:
prediction=txt_generator.predict_with_temperature(char_seed='alice wants to',num_chars_to_predict=500,temperature=0.5)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

alice wants tooo the whoug bill re court wie grrtm up shehe ie puertes mooe and the couldnt ceat c t
r suy i seaningd ttbj shf what sorring tailath hanlt sfmarked reraeperwed all toundle poe of tome we
ry murking about anice whought the her woice of hereing douso atd lad larp besot saee alaume of teal
lr said tooe oeart tomethine nook oinutes i tuisting forugh the fouettaii the waid to herself inw tr
yisgter whoue noned atdrply all the coust and tomp weryendadouegsed anl tound alain oock turping tfrpiet inge at s


In [20]:
prediction=txt_generator.predict_with_temperature(char_seed='alice wants to',num_chars_to_predict=500,temperature=1.0)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

alice wants tooo the whoug bill re court wie grrtm up shehe ie puertes mooe and the couldnt ceat c t
r suy i seaningd ttbj shf what sorring tailath hanlt sfmarked reraeperwed all toundle poe of tome we
ry murking about anice whought the her woice of hereing douso atd lad larp besot saee alaume of teal
lr said tooe oeart tomethine nook oinutes i tuisting forugh the fouettaii the waid to herself inw tr
yisgter whoue noned atdrply all the coust and tomp weryendadouegsed anl tound alain oock turping tfrpiet inge at s


In [21]:
prediction=txt_generator.predict_with_temperature(char_seed='alice wants to',num_chars_to_predict=500,temperature=1.2)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

alice wants tooo the whoug bill re court wie grrtm up shehe ie puertes mooe and the couldnt ceat c t
r suy i seaningd ttbj shf what sorring tailath hanlt sfmarked reraeperwed all toundle poe of tome we
ry murking about anice whought the her woice of hereing douso atd lad larp besot saee alaume of teal
lr said tooe oeart tomethine nook oinutes i tuisting forugh the fouettaii the waid to herself inw tr
yisgter whoue noned atdrply all the coust and tomp weryendadouegsed anl tound alain oock turping tfrpiet inge at s


In [0]:
prediction=txt_generator.predict(char_seed='jon snow knows nothing',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: jon snow knows nothing
jon snow knows nothing eeat it sather a frcs at well as she could for the fod of the trial dar bette
r now his fisst she said to herself in cand one feel cear so she went on all the sat down a large pa
bbits were gladrels eoot the walked sather the dourt and the mock turtle said to the duchess and the
 mocster was a tery diffirotfd of the sane she wery seldmbering wety gnadendhing the queen said the 
doorman in a parce herself so see iow shan porse ar all the rabbit surtles the walked sather this outping becide she way a


In [0]:
prediction=txt_generator.predict(char_seed='a lannister always',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: a lannister always
a lannister always get to say the mouse who seemed to be no courtered thought alice without becedent
hduieyamd yhat you may gat betadle a baw uaid the duchess the rueen said the duchess as the doumouse
 looked all round the hatter id you dont know what the mouse goo fighing for a minute or two she was
 tilence thinking i should the tort of the thimgs is wer she said to herself which way abdeais puite
 fongotounsy how five to say when i was a little dook and the mock turtle taid alice vhought alice thought it would be


In [0]:
prediction=txt_generator.predict(char_seed='the meaning of life is',num_chars_to_predict=500)
print(prediction[0:100])
print(prediction[100:200])
print(prediction[200:300])
print(prediction[300:400])
print(prediction[400:])

Seed: the meaning of life is
the meaning of life is belongs to a mouse shat came vpon a little door kook at well as she could for
 the fod of the trial dar better now his fisst she said to herself in cand one feel cear so she went
 on all the sat down a large pabbits were gladrels eoot the walked sather the dourt and the mock tur
tle said to the duchess and the mocster was a tery diffirotfd of the sane she wery seldmbering wety 
gnadendhing the queen said the doorman in a parce herself so see iow shan porse ar all the rabbit surtles the walked sathe
