# Writing style mimicry using LSTM
using Herman Melville's _Moby Dick_ as reference

## Setup and Data Loading

In [None]:
import numpy as np
import pandas as pd

In [None]:
def read_file(filepath, encoding='utf-8'):
  """Return the entire contents of a text file as a single string.

  Args:
    filepath: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform's default encoding.

  Returns:
    The decoded file contents.
  """
  with open(filepath, encoding=encoding) as f:
    return f.read()

## Preprocessing

In [None]:
import spacy

In [None]:
# Load spaCy's English pipeline with the parser, tagger and NER components
# disabled; only tokenisation is used in this notebook.
# NOTE(review): the "en" shortcut name is presumably unavailable in spaCy v3+
# (a full package name such as "en_core_web_sm" is expected there) - confirm
# against the installed spaCy version.
nlp = spacy.load("en",disable=['parser','tagger','ner'])

In [None]:
# Raise the pipeline's max_length limit - presumably so that a long text is
# not rejected by spaCy. TODO confirm this value is needed for the
# four-chapter excerpt loaded below.
nlp.max_length = 11198623

In [None]:
def seperate_punc(doc_text):
  """Tokenise `doc_text` and return the lower-cased texts of the kept tokens.

  Tokenisation is done by the module-level spaCy pipeline `nlp`. A token is
  discarded when its text occurs anywhere inside the filter string below.

  Args:
    doc_text: The raw text to tokenise.

  Returns:
    A list of lower-cased token strings, in document order.
  """
  # This is a substring test, not a per-character test: multi-character
  # tokens such as runs of newlines or a double dash are matched as a whole,
  # and the single letters "t" and "n" are dropped too because they appear
  # in the filter string.
  unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_{|}~t\\n '

  kept = []
  for token in nlp(doc_text):
    if token.text in unwanted:
      continue
    kept.append(token.text.lower())
  return kept

In [None]:
# Read the raw text of the four-chapter Moby Dick excerpt into one string.
d = read_file('moby_dick_four_chapters.txt')

In [None]:
# Tokenise the text into lower-cased words, dropping punctuation and
# whitespace tokens.
tokens = seperate_punc(d)

In [None]:
# Number of tokens that survived the filtering.
len(tokens)

11338

## Sequence generation (26-word windows: the first 25 words are X, the 26th word is y)

In [None]:
# Each training example is 25 consecutive words; the network predicts word #26.

In [None]:
# Length of each training window: 25 input words plus the 1 word to predict.
train_len = 25 + 1

# Slide a window of `train_len` tokens over the corpus, one token at a time.
# `end` is the exclusive end index of the window, so the range must run up to
# and including len(tokens); stopping at len(tokens) - 1 would silently drop
# the final window (the one that ends on the last token of the corpus).
text_sequences = [
  tokens[end - train_len:end]
  for end in range(train_len, len(tokens) + 1)
]

In [None]:
# First training window, joined into a string for readability.
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [None]:
# Second window: the same text shifted forward by one token.
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

## Tokenization

In [None]:
# Keras Tokenizer, used below to map every word to an integer id.
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [None]:
# Build the word -> id vocabulary from the token windows.
tokenizer.fit_on_texts(text_sequences)

In [None]:
# Replace every word in every window by its integer id.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Print the id -> word mapping for each position of the first window, as a
# sanity check that the encoding round-trips to the original words.
for i in sequences[0]:
  print(f"{i} : {tokenizer.index_word[i]}")
# tokenizer.index_word

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [None]:
# Number of distinct words seen by the tokenizer. Word ids start at 1, so the
# largest id equals this value; that is why later cells use vocabulary_size+1
# as the number of classes.
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2717

In [None]:
# Convert the list of id lists into a 2-D NumPy array (one row per window) so
# it can be sliced column-wise below. Note that this rebinds `sequences` from
# a list to an array.
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [None]:
# (number of windows, words per window)
sequences.shape

(11312, 26)

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# Inputs: every id in a row except the last one.
X = sequences[:,:-1]
# Target: the last id of each row, i.e. the word to predict.
y = sequences[:,-1]

In [None]:
# One-hot encode the target word ids. The ids are read straight from the last
# column of `sequences` instead of from `y`, so re-running this cell does not
# one-hot encode an already one-hot array.
# num_classes is vocabulary_size+1 because word ids start at 1, which makes
# the largest id equal to vocabulary_size.
y = to_categorical(sequences[:,-1],num_classes=vocabulary_size+1)

In [None]:
# Number of input words per sample (the number of columns in X).
seq_len = X.shape[1]

## Model creation

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size,seq_len):
  """Build, compile and summarise the next-word prediction network.

  The network is an embedding layer, two stacked LSTM layers, a ReLU dense
  layer with 50 units and a softmax output over the vocabulary. The layer
  summary is printed as a side effect.

  Args:
    vocabulary_size: Number of token ids; used both as the embedding input
      size and as the width of the softmax output.
    seq_len: Number of input tokens per sample. It is also used as the
      embedding width, and doubled to give the unit count of each LSTM.

  Returns:
    The compiled Sequential model.
  """
  lstm_units = seq_len*2

  network = Sequential([
    Embedding(vocabulary_size,seq_len,input_length=seq_len),
    # The first LSTM returns the full sequence so a second LSTM can be stacked.
    LSTM(lstm_units,return_sequences=True),
    LSTM(lstm_units),
    Dense(50,activation='relu'),
    Dense(vocabulary_size,activation='softmax'),
  ])

  network.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
  )
  network.summary()

  return network

In [None]:
# One extra class because word ids start at 1, so the largest id equals
# vocabulary_size.
model = create_model(vocabulary_size+1,seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67950     
                                                                 
 lstm (LSTM)                 (None, 25, 50)            15200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 2718)              138618    
                                                                 
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [None]:
from pickle import dump,load

In [None]:
# Train on all windows for 5 epochs with 128 samples per batch.
model.fit(X,y,batch_size=128,epochs=5,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8308c573d0>

In [None]:
# Save the trained model to disk so it can be reloaded without retraining.
model.save('my_mobydick_model.h5')

In [None]:
# Pickle the fitted tokenizer next to the model: the same word -> id mapping
# is needed to encode seed text at generation time. The context manager
# guarantees the file is flushed and closed.
with open('my_simpletokenizer',"wb") as tokenizer_file:
  dump(tokenizer,tokenizer_file)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
  """Greedily extend `seed_text` by `num_gen_words` predicted words.

  On every step the text so far is encoded with `tokenizer`, cut or padded
  to the last `seq_len` ids, and fed to `model`; the most probable word is
  appended to the text and the process repeats.

  Args:
    model: Trained model whose `predict` returns one probability per word id.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and
      `index_word`.
    seq_len: Number of word ids the model expects as input.
    seed_text: Text the generation starts from.
    num_gen_words: Number of words to generate.

  Returns:
    A string of the form "<seed_text> | <generated words>".
  """
  output_text = []
  input_text = seed_text

  for _ in range(num_gen_words):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    # Keep only the most recent `seq_len` ids (shorter inputs are padded).
    pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')

    probabilities = model.predict(pad_encoded,verbose=0)[0]

    # Id 0 is the padding id and has no entry in tokenizer.index_word, so it
    # is excluded from the argmax; picking it would raise a KeyError below.
    pred_word_ind = int(np.argmax(probabilities[1:])) + 1
    pred_word = tokenizer.index_word[pred_word_ind]

    input_text += ' ' + pred_word
    output_text.append(pred_word)

  return seed_text + ' | ' + ' '.join(output_text)

In [None]:
import random
random.seed(42)
# Pick one training window at random to use as the seed. randrange excludes
# its upper bound, so the result is always a valid index; randint(0, n) can
# return n itself, which would raise an IndexError.
random_pick = random.randrange(len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
seed_text

"a horse collar and suddenly felt a slight scratch throwing aside the counterpane there lay the tomahawk sleeping by the savage 's side as if it"

In [None]:
# Generate 25 words that continue the seed text.
generate_text(model,tokenizer,seq_len,seed_text,num_gen_words=25)

"a horse collar and suddenly felt a slight scratch throwing aside the counterpane there lay the tomahawk sleeping by the savage 's side as if it | the room the room the room the room the room the room the room the room the room the room the room the room the"

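The model performs badly: it was trained for only 5 epochs, which is not enough for it to learn much beyond word frequencies. Because generation always picks the single most likely next word, the output collapses into a loop of the most common words ("the room the room ...") :/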