<a href="https://colab.research.google.com/github/jangvu/Project_by_me/blob/main/LSTM_alice_in_wonderland.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
!ls "/content/drive/My Drive/Data"

101_ObjectCategories.tar.gz  Annotations.tar  isbi-datasets.zip  NFLX.csv
alice_in_wonderland.txt      candidate_test   NEU-DET.zip


In [15]:
FILE_PATH = '/content/drive/My Drive/Data/alice_in_wonderland.txt'

In [16]:
raw_text = open(FILE_PATH).read().lower()


In [17]:
import numpy as np
from keras.utils import np_utils
from sklearn.preprocessing import LabelBinarizer

Take the set of characters in the texts

In [18]:
chars = sorted(set(raw_text))

Using a dictionary 59 to transcript the story's characters -> each time a char appears 1 in key of that char and 0 for the rest

In [19]:
char_to_int = dict((c, i) for i, c in enumerate(chars))
print(char_to_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '0': 11, '3': 12, ':': 13, ';': 14, '?': 15, '[': 16, ']': 17, '_': 18, '`': 19, 'a': 20, 'b': 21, 'c': 22, 'd': 23, 'e': 24, 'f': 25, 'g': 26, 'h': 27, 'i': 28, 'j': 29, 'k': 30, 'l': 31, 'm': 32, 'n': 33, 'o': 34, 'p': 35, 'q': 36, 'r': 37, 's': 38, 't': 39, 'u': 40, 'v': 41, 'w': 42, 'x': 43, 'y': 44, 'z': 45}


But we dont need to use all chars because some chars dont affect the sentences like *, #, $, ... => make new chars 

In [20]:
import string
string.ascii_lowercase
chars_new = list(string.ascii_lowercase) + ['0', '.', ',', ' ', '!', '?', 'unk']
chars_to_int = dict((v, k) for k, v in enumerate(chars_new))
int_to_chars = dict((k, v) for k, v in enumerate(chars_new))

print(chars_to_int)
print(int_to_chars)

n_chars = len(raw_text)
n_vocab = len(chars_new)
print('Total characters: ', n_chars)
print('Total Vocab: ', n_vocab)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '0': 26, '.': 27, ',': 28, ' ': 29, '!': 30, '?': 31, 'unk': 32}
{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z', 26: '0', 27: '.', 28: ',', 29: ' ', 30: '!', 31: '?', 32: 'unk'}
Total characters:  148574
Total Vocab:  33


In [21]:
def _encode_sen(text):
  text = text.lower()
  sen_vec = []
  for letter in text:
    if letter in chars_new:
      idx = chars_to_int[letter]
    else:
      idx = chars_to_int['unk']
    sen_vec.append(idx)
  return sen_vec

In [22]:
def _decode_sen(vec):
  text = []
  for idx in vec:
    letter = int_to_chars[idx]
    text.append(letter)
  text = ''.join(text)
  return text

In [23]:
a = _encode_sen('Alice in wonderland. #')
print('Encode')
print(a)
b = _decode_sen(a)
print('Decode')
print(b)

Encode
[0, 11, 8, 2, 4, 29, 8, 13, 29, 22, 14, 13, 3, 4, 17, 11, 0, 13, 3, 27, 29, 32]
Decode
alice in wonderland. unk


Creating Window to slide through text 

In [24]:
def window(text, step = 1, window_size = 100):
  X_train = []
  y_train = []
  for i in range(len(text) - window_size - 1):
    X_train.append(_encode_sen(text[i:i+window_size]))
    y_train.append(_encode_sen(text[i+step+window_size]))
  return X_train, y_train




In [25]:
data_X, data_Y = window(raw_text)

INPUT_SHAPE has form (samples, time_steps, features). So in this problem samples = number of windows, time_steps = length of window, features is dimension of window (we using 1D in this)

In [26]:
num_samples = len(data_X)
window_size = 100

In [54]:
X_train = np.asarray(data_X).reshape(num_samples,window_size,1)
# Normalize
X_train = X_train / float(n_vocab)

#labelBinarizer = LabelBinarizer()
#y_train = labelBinarizer.fit_transform(np.asarray(data_Y))
y_train = np_utils.to_categorical(data_Y)


In [28]:
print(X_train.shape)
print(y_train.shape)

(148473, 100, 1)
(148473, 33)


In [29]:
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd
plt.figure(figsize = (10, 5))
#sn.countplot(np.asarray(data_Y))
#plt.xticks(np.arange(32),np.array(chars_new))


<Figure size 720x360 with 0 Axes>

<Figure size 720x360 with 0 Axes>

Creating Model

In [30]:
import tensorflow as tf
from tensorflow import keras

In [41]:
model_LSTM = keras.models.Sequential()
model_LSTM.add(keras.layers.LSTM(256,input_shape = (X_train.shape[1], X_train.shape[2])))
#model_LSTM.add(keras.layers.LSTM(256))
model_LSTM.add(keras.layers.Dropout(0.2))
model_LSTM.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))

model_LSTM.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model_LSTM.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 33)                8481      
Total params: 272,673
Trainable params: 272,673
Non-trainable params: 0
_________________________________________________________________


In [42]:
filepath = 'weights-improvement-{epoch:02d}-{loss:.4f}.hdf5'
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor = 'val_acc', verbose = 1, save_best_only = True, mode = 'max')
callback_list = [checkpoint]

In [55]:
model_LSTM.fit(X_train, y_train, epochs = 5, batch_size = 128, validation_split=0.33, callbacks = callback_list, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7efbce41e910>

In [63]:
base_word = 'Alice was beginning to get very tired of sitting by her sister on the bank'

def _predict_let(text, len_sen = 1):
    text_for = []
    for i in range(len_sen):
        x_input = np.array(_encode_sen(text)[-100:])/float(n_vocab)
        if x_input.shape[0] < 100:
            x_input = np.concatenate((np.zeros(100-x_input.shape[0]), x_input), axis = 0)
        x_input = np.expand_dims(np.expand_dims(x_input, -1), 0)
        y_prob = model_LSTM.predict(x_input)
        y_let = int_to_chars[np.argmax(y_prob, axis = 1)[0]]
        text = text + y_let
        print(i, text)
    return text[len_sen:]

In [64]:
_predict_let(base_word, 100)


0 Alice was beginning to get very tired of sitting by her sister on the bank 
1 Alice was beginning to get very tired of sitting by her sister on the bank a
2 Alice was beginning to get very tired of sitting by her sister on the bank ad
3 Alice was beginning to get very tired of sitting by her sister on the bank ad 
4 Alice was beginning to get very tired of sitting by her sister on the bank ad n
5 Alice was beginning to get very tired of sitting by her sister on the bank ad n 
6 Alice was beginning to get very tired of sitting by her sister on the bank ad n h
7 Alice was beginning to get very tired of sitting by her sister on the bank ad n h 
8 Alice was beginning to get very tired of sitting by her sister on the bank ad n h o
9 Alice was beginning to get very tired of sitting by her sister on the bank ad n h o 
10 Alice was beginning to get very tired of sitting by her sister on the bank ad n h o e
11 Alice was beginning to get very tired of sitting by her sister on the bank ad n h o

' o h a at o h a at     unk he eten o h a at o e at o h a at o h a at     unk h'