<a href="https://colab.research.google.com/github/krahul2024/machine-learning/blob/main/projects/next_char.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Working on next-character prediction model

In [15]:
import tensorflow as tf
import requests

file_url = 'https://raw.githubusercontent.com/krahul2024/machine-learning/main/projects/Text-Datasets/paul_graham_essays.txt'
# Fail fast on a hung connection or an HTTP error: without the status check an
# error page (e.g. "404: Not Found") would silently become the training corpus.
response = requests.get(file_url, timeout = 60)
response.raise_for_status()
data = response.text
print(len(data))

3023219


In [23]:
#@title Data Pre-Processing

def process_data(content, input_size = 30, step = 3, n_splits = 10):
  """One-hot encode `content` into (character window -> next character) pairs.

  A window of `input_size` characters is taken every `step` characters; the
  character that immediately follows the window is its target.

  Args:
    content: the text to encode (a string).
    input_size: number of characters in each input window.
    step: number of characters to advance between consecutive windows.
    n_splits: number of chunks the encoded arrays are split into, so the
      dataset can be fed to training piece by piece.

  Returns:
    A list `[vocab, char_index, train_X, train_Y]` where
      vocab      - sorted list of the unique characters in `content`,
      char_index - dict mapping each character to its position in `vocab`,
      train_X    - list of `n_splits` bool arrays shaped (chunk, input_size, len(vocab)),
      train_Y    - list of `n_splits` float arrays shaped (chunk, len(vocab)).

  Raises:
    ValueError: if `step` is smaller than 1.
  """
  import numpy as np  # kept local so the function does not rely on a notebook-level import

  if step < 1:
    raise ValueError(f"step must be a positive integer, got {step}")

  vocab = sorted(set(content))
  vocab_size, total_chars = len(vocab), len(content)
  print(f"Total unique characters : {vocab_size}")

  char_index = {c: i for i, c in enumerate(vocab)} # map the characters against their respective indexes

  # start offset of every window; empty when the text is not longer than one window
  starts = range(0, total_chars - input_size, step)
  n_samples = len(starts)

  # print up to 5 sample inputs and outputs (fewer if the dataset is smaller)
  for start in starts[:5]:
    print(f"{content[start:start + input_size]} -> {content[start + input_size]}")

  print(f"input-size : {n_samples}, output-size : {n_samples}")

  # allocate the one-hot arrays
  X = np.zeros((n_samples, input_size, vocab_size), dtype = bool)
  Y = np.zeros((n_samples, vocab_size))

  print(f"Dims of input data : {X.shape} \n Dims of output data : {Y.shape}")

  # encode the text once as integer codes, then fill one sample per iteration;
  # this avoids materialising every window as a separate Python string
  codes = np.fromiter((char_index[c] for c in content), dtype = np.intp, count = total_chars)
  positions = np.arange(input_size)
  for row, start in enumerate(starts):
    X[row, positions, codes[start:start + input_size]] = True
    Y[row, codes[start + input_size]] = 1  # set once per sample, not once per character

  # split the large dataset into multiple parts taking into consideration the resource availability
  train_X = np.array_split(X, n_splits)
  train_Y = np.array_split(Y, n_splits)

  return [vocab, char_index, train_X, train_Y]

In [32]:
#@title Model Building and Training function

# function to create and compile the model
def create_model(input_size, vocab_size):
  """Build and compile the next-character model.

  The network is an LSTM layer with 128 units followed by a softmax Dense
  layer with one output per vocabulary entry.

  Args:
    input_size: number of characters in each input window.
    vocab_size: number of distinct characters (width of the one-hot encoding).

  Returns:
    The compiled Keras Sequential model.
  """
  import tensorflow as tf
  model_ = tf.keras.models.Sequential([
      tf.keras.layers.LSTM(128, input_shape = (input_size, vocab_size)),
      tf.keras.layers.Dense(vocab_size, activation = 'softmax')
  ])

  model_.compile(
      optimizer = 'adam',
      loss = 'categorical_crossentropy',
      metrics = ['accuracy']
  )
  # summary() prints the table itself and returns None, so wrapping it in
  # print() only appended a stray "None" line to the cell output.
  model_.summary()
  return model_

# function to train the model
def train_model(model_, input_data, output_data, batch_size = 128, validation_split = 0.05, verbose = 1, shuffle = True, epochs = 25):
  """Fit `model_` on one set of inputs/targets and return it with its history.

  Args:
    model_: object exposing a Keras-style `fit` method.
    input_data: inputs, passed to `fit` as the first positional argument.
    output_data: targets, passed to `fit` as the second positional argument.
    batch_size, validation_split, verbose, shuffle, epochs: forwarded
      unchanged to `fit` as keyword arguments.

  Returns:
    A tuple `(model_, history)`, where `history` is the `.history` attribute
    of the object returned by `fit`.
  """
  fit_options = dict(
      batch_size = batch_size,
      validation_split = validation_split,
      verbose = verbose,
      shuffle = shuffle,
      epochs = epochs,
  )
  fit_result = model_.fit(input_data, output_data, **fit_options)
  return model_, fit_result.history

In [30]:
# build the vocabulary, the char -> index map and the chunked training arrays
(chars, char_indices, train_x, train_y) = process_data(
    data,
    input_size = 40,
)

Total unique characters : 106
  Anyone can see they're not the same by ->  
nyone can see they're not the same by th -> e
ne can see they're not the same by the n -> u
can see they're not the same by the numb -> e
 see they're not the same by the number  -> o
input-size : 1007727, output-size : 1007727
Dims of input data : (1007727, 40, 106) 
 Dims of output data : (1007727, 106)


In [33]:
# create the model; the window length is read from the encoded data so it
# cannot drift from the input_size that was given to process_data
model = create_model(input_size = train_x[0].shape[1], vocab_size = len(chars))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 128)               120320    
                                                                 
 dense (Dense)               (None, 106)               13674     
                                                                 
Total params: 133994 (523.41 KB)
Trainable params: 133994 (523.41 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# first training pass: fit on chunk 0 of the split dataset
model, history = train_model(
    model,
    train_x[0],
    train_y[0],
    epochs = 50,
)

In [None]:
# train the model on the rest of the dataset
import time # this is to introduce the delay after each training
chunk_histories = [] # one training-history dict per remaining chunk
for i in range(1, len(train_x)):
  # train_model returns (model, history); unpack both so `model` stays a model
  # object - binding the whole tuple to `model` breaks the next iteration's fit()
  model, chunk_history = train_model(model, train_x[i], train_y[i], epochs = 100)
  chunk_histories.append(chunk_history)
  time.sleep(60) # delay of 60 seconds after each training


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100