## Sentence Autocompletion using Tensorflow

#### Data Cleaning-> Tokenize the cleaned data-> Split the data into train and test-> Model-> Training (50 epochs)

In [11]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('Shakespeare_data.csv')

# Keep only 'PlayerLine' column
data = data[['PlayerLine']]

print(data.head())

# Extract the 'PlayerLine' column into a list
text = data['PlayerLine'].tolist()

# Define the text cleaning function
def clean_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    # Remove digits
    text = re.sub('\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Clean the text data
texts = [clean_text(t) for t in text]

# Limit to first 10000 texts
texts = texts[:10000]

# Tokenize the cleaned text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
text_sequences = tokenizer.texts_to_sequences(texts)

# Check sequences length
sequence_lengths = [len(seq) for seq in text_sequences]
print(f"Sequence lengths (first 10): {sequence_lengths[:10]}")
print(f"Max sequence length: {max(sequence_lengths)}")
print(f"Min sequence length: {min(sequence_lengths)}")

# Ensure sequences are not empty
text_sequences = [seq for seq in text_sequences if len(seq) > 0]
print(f"Number of non-empty sequences: {len(text_sequences)}")

# Pad sequences to ensure they have the same length
max_sequence_length = max(sequence_lengths)
text_sequences = pad_sequences(text_sequences, maxlen=max_sequence_length, padding='pre')
print('Max sequence length ->>', max_sequence_length)
print('Text Sequence ->>\n', text_sequences[0])
print('Text Sequence Shape ->>', text_sequences.shape)

# Debugging sequences
for i, seq in enumerate(text_sequences[:5]):
    print(f"Sequence {i}: {seq}")

# Check the shape of padded sequences
print(f"Padded sequences shape: {text_sequences.shape}")

# Splitting the dataset into input and output
X, y = text_sequences[:, :-1], text_sequences[:, -1]
print('First Input: ', X[0])
print('First Output: ', y[0])

word_index = tokenizer.word_index
total_words = len(word_index) + 1
print('Total number of words: ', total_words)
y = to_categorical(y, num_classes=total_words)

print('Input Shape: ', X.shape)
print('Output Shape: ', y.shape)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes to verify
print('Training Data Shapes:')
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('Testing Data Shapes:')
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

# Define the model
model = Sequential(name="LSTM_Model")

# Adding embedding layer
model.add(Embedding(input_dim=total_words, 
                    output_dim=100,  # Fixed dimension for embedding vectors
                    input_shape=(max_sequence_length - 1,)))  # Use input_shape instead of input_length

# Adding a LSTM layer
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.5))

# Adding the final output activation with activation function of softmax
model.add(Dense(total_words, activation='softmax'))

# Printing model summary
print(model.summary())

# Compile the model
model.compile(
    loss="categorical_crossentropy", 
    optimizer='adam', 
    metrics=['accuracy']
)

# Define the checkpoint callback to save every 10 epochs
class CustomCheckpoint(Callback):
    def __init__(self, save_path, save_freq):
        super(CustomCheckpoint, self).__init__()
        self.save_path = save_path
        self.save_freq = save_freq
    
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.save_freq == 0:
            save_filepath = self.save_path.format(epoch=epoch+1)
            self.model.save(save_filepath)
            print(f'Saved model checkpoint at epoch {epoch+1} to {save_filepath}')

checkpoint_callback = CustomCheckpoint(
    save_path='path/to/checkpoint/model_epoch_{epoch:02d}.h5',  # Path to save the model file
    save_freq=10  # Save the model every 10 epochs
)

# Training the model with custom checkpoint callback
history = model.fit(
    X_train, y_train,
    epochs=50,
    verbose=1,
    validation_data=(X_test, y_test),  # Use validation data tuple
    callbacks=[checkpoint_callback]
)


                                          PlayerLine
0                                              ACT I
1                       SCENE I. London. The palace.
2  Enter KING HENRY, LORD JOHN OF LANCASTER, the ...
3             So shaken as we are, so wan with care,
4         Find we a time for frighted peace to pant,
Sequence lengths (first 10): [2, 5, 16, 9, 9, 7, 7, 8, 9, 8]
Max sequence length: 54
Min sequence length: 1
Number of non-empty sequences: 10000
Max sequence length ->> 54
Text Sequence ->>
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455   4]
Text Sequence Shape ->> (10000, 54)
Sequence 0: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455   4]
Seq

  super().__init__(**kwargs)


None
Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 281ms/step - accuracy: 0.0114 - loss: 8.2253 - val_accuracy: 0.0135 - val_loss: 7.6584
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 282ms/step - accuracy: 0.0171 - loss: 7.3341 - val_accuracy: 0.0165 - val_loss: 7.6608
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 284ms/step - accuracy: 0.0190 - loss: 7.0480 - val_accuracy: 0.0240 - val_loss: 7.7315
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 284ms/step - accuracy: 0.0285 - loss: 6.7547 - val_accuracy: 0.0245 - val_loss: 7.8293
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 277ms/step - accuracy: 0.0316 - loss: 6.4903 - val_accuracy: 0.0285 - val_loss: 8.0232
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 271ms/step - accuracy: 0.0388 - loss: 6.1631 - val_accuracy: 0.0295 - val_loss: 8.1547
Epoch



Saved model checkpoint at epoch 10 to path/to/checkpoint/model_epoch_10.h5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 274ms/step - accuracy: 0.1708 - loss: 4.2426 - val_accuracy: 0.0300 - val_loss: 8.9123
Epoch 11/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 279ms/step - accuracy: 0.2425 - loss: 3.7069 - val_accuracy: 0.0250 - val_loss: 9.0578
Epoch 12/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 283ms/step - accuracy: 0.3512 - loss: 3.1407 - val_accuracy: 0.0265 - val_loss: 9.3512
Epoch 13/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 283ms/step - accuracy: 0.4535 - loss: 2.6220 - val_accuracy: 0.0250 - val_loss: 9.5717
Epoch 14/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 281ms/step - accuracy: 0.5500 - loss: 2.1457 - val_accuracy: 0.0315 - val_loss: 9.7740
Epoch 15/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 283ms/step - accuracy: 0.644



Saved model checkpoint at epoch 20 to path/to/checkpoint/model_epoch_20.h5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 276ms/step - accuracy: 0.8884 - loss: 0.6302 - val_accuracy: 0.0295 - val_loss: 10.6854
Epoch 21/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 275ms/step - accuracy: 0.9070 - loss: 0.5505 - val_accuracy: 0.0275 - val_loss: 10.8001
Epoch 22/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 276ms/step - accuracy: 0.9076 - loss: 0.5074 - val_accuracy: 0.0305 - val_loss: 10.9251
Epoch 23/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 277ms/step - accuracy: 0.9231 - loss: 0.4279 - val_accuracy: 0.0320 - val_loss: 10.9856
Epoch 24/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 275ms/step - accuracy: 0.9237 - loss: 0.4104 - val_accuracy: 0.0290 - val_loss: 11.1049
Epoch 25/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 275ms/step - accuracy: 



Saved model checkpoint at epoch 30 to path/to/checkpoint/model_epoch_30.h5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 273ms/step - accuracy: 0.9499 - loss: 0.2558 - val_accuracy: 0.0320 - val_loss: 11.4919
Epoch 31/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 833ms/step - accuracy: 0.9545 - loss: 0.2408 - val_accuracy: 0.0295 - val_loss: 11.5246
Epoch 32/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 272ms/step - accuracy: 0.9505 - loss: 0.2306 - val_accuracy: 0.0295 - val_loss: 11.6430
Epoch 33/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 280ms/step - accuracy: 0.9570 - loss: 0.2112 - val_accuracy: 0.0290 - val_loss: 11.6834
Epoch 34/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 278ms/step - accuracy: 0.9635 - loss: 0.1908 - val_accuracy: 0.0310 - val_loss: 11.7304
Epoch 35/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 283ms/step - accuracy:



Saved model checkpoint at epoch 40 to path/to/checkpoint/model_epoch_40.h5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 280ms/step - accuracy: 0.9598 - loss: 0.1827 - val_accuracy: 0.0315 - val_loss: 11.9757
Epoch 41/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 278ms/step - accuracy: 0.9625 - loss: 0.1707 - val_accuracy: 0.0345 - val_loss: 12.0278
Epoch 42/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 279ms/step - accuracy: 0.9585 - loss: 0.1753 - val_accuracy: 0.0320 - val_loss: 12.0479
Epoch 43/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 277ms/step - accuracy: 0.9646 - loss: 0.1594 - val_accuracy: 0.0305 - val_loss: 12.0843
Epoch 44/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 289ms/step - accuracy: 0.9612 - loss: 0.1704 - val_accuracy: 0.0315 - val_loss: 12.0360
Epoch 45/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4139s[0m 17s/step - accuracy: 



Saved model checkpoint at epoch 50 to path/to/checkpoint/model_epoch_50.h5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 276ms/step - accuracy: 0.9637 - loss: 0.1570 - val_accuracy: 0.0310 - val_loss: 12.2428


#### Save the Tokenizer

In [12]:
import pickle

# Assuming 'tokenizer' is already defined and fitted on the text data
tokenizer_path = 'path/to/tokenizer.pkl'  # Path where the tokenizer will be saved
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


#### Load the tokenizer and model for inference and prediction

In [13]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the tokenizer
tokenizer_path = 'path/to/tokenizer.pkl'  # Path where the tokenizer is saved
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the saved model
saved_model_path = 'path/to/checkpoint/model_epoch_50.h5'  # Update this with the actual path
model = tf.keras.models.load_model(saved_model_path)

# Define max_sequence_length as it was during training
max_sequence_length = 54  # Use the same max_sequence_length as used in training

# Define a function to preprocess input text
def preprocess_text(input_text, tokenizer, max_sequence_length):
    cleaned_text = re.sub('[^a-zA-Z0-9\s]', '', input_text).lower()
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length - 1, padding='pre')
    return padded_sequence

# Example input text for inference
#input_text = "Where valiant Talbot above human " #thought #WRONG
#input_text = "All of one nature, of one substance "#bred
#input_text = "Against acquaintance, kindred and " #allies:
input_text="is not a buff jerkin a most sweet robe of " #durance?
#input_text = "A base Walloon, to win the Dauphin's " #grace, #WRONG

# Preprocess the input text
input_sequence = preprocess_text(input_text, tokenizer, max_sequence_length)

# Make a prediction
predicted = model.predict(input_sequence)

# Decode the prediction to get the word
predicted_word_index = np.argmax(predicted, axis=-1)[0]
predicted_word = tokenizer.index_word[predicted_word_index]

print(f"Predicted next word: {predicted_word}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
Predicted next word: durance


#### Inference and Evaluation using Perplexity Score and BLEU Score

In [14]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

def autoCompletions(text, model, tokenizer, max_sequence_length, max_words, reference_text):
    text_sequence = tokenizer.texts_to_sequences([text])
    word_count = 0
    total_log_prob = 0  # To accumulate log probabilities of predicted words

    while word_count < max_words:
        # Pad the current text sequence
        padded_sequence = pad_sequences(text_sequence, maxlen=max_sequence_length - 1, padding='pre')
        
        # Predict probabilities for the next word
        predictions = model.predict(padded_sequence, verbose=0)
        
        # Get the index of the word with the highest probability
        y_pred_test = np.argmax(predictions)
        
        # Retrieve the predicted word corresponding to the index
        predicted_word = ''
        for word, index in tokenizer.word_index.items():
            if index == y_pred_test:
                predicted_word = word
                break
        
        # Append the predicted word to the text
        text += " " + predicted_word
        print(text)
        word_count += 1
        
        # Update text_sequence with the new text
        text_sequence = tokenizer.texts_to_sequences([text])
        
        # Compute log probability of the predicted word and accumulate it
        predicted_word_prob = predictions[0][y_pred_test]
        total_log_prob += np.log(predicted_word_prob + 1e-10)  # Adding a small epsilon to avoid log(0)
    
    # Compute perplexity
    perplexity = np.exp(-total_log_prob / word_count)
    
    # Tokenize the reference text and the generated text
    reference_tokens = nltk.word_tokenize(reference_text)
    candidate_tokens = nltk.word_tokenize(text)
    
    # Compute BLEU score
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens)
    
    return text, perplexity, bleu_score

# Example usage
reference_text = "This cardinal's more haughty than the devil."
complete_sentence, perplexity, bleu_score = autoCompletions("This cardinal's", model, tokenizer, max_sequence_length, 5, reference_text)
print("Completed Sentence:", complete_sentence)
print("Perplexity:", perplexity)
print("BLEU Score:", bleu_score)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalpi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


This cardinal's shame
This cardinal's shame mine
This cardinal's shame mine blunt
This cardinal's shame mine blunt gentleman
This cardinal's shame mine blunt gentleman fall
Completed Sentence: This cardinal's shame mine blunt gentleman fall
Perplexity: 4.628903715023214
BLEU Score: 3.940055059819774e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


#### Including smoothing to deal with 0 counts of 4-gram

In [15]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

nltk.download('punkt')

def autoCompletions(text, model, tokenizer, max_sequence_length, max_words, reference_text):
    text_sequence = tokenizer.texts_to_sequences([text])
    word_count = 0
    total_log_prob = 0  # To accumulate log probabilities of predicted words

    while word_count < max_words:
        # Pad the current text sequence
        padded_sequence = pad_sequences(text_sequence, maxlen=max_sequence_length - 1, padding='pre')
        
        # Predict probabilities for the next word
        predictions = model.predict(padded_sequence, verbose=0)
        
        # Get the index of the word with the highest probability
        y_pred_test = np.argmax(predictions)
        
        # Retrieve the predicted word corresponding to the index
        predicted_word = ''
        for word, index in tokenizer.word_index.items():
            if index == y_pred_test:
                predicted_word = word
                break
        
        # Append the predicted word to the text
        text += " " + predicted_word
        print(text)
        word_count += 1
        
        # Update text_sequence with the new text
        text_sequence = tokenizer.texts_to_sequences([text])
        
        # Compute log probability of the predicted word and accumulate it
        predicted_word_prob = predictions[0][y_pred_test]
        total_log_prob += np.log(predicted_word_prob + 1e-10)  # Adding a small epsilon to avoid log(0)
    
    # Compute perplexity
    perplexity = np.exp(-total_log_prob / word_count)
    
    # Tokenize the reference text and the generated text
    reference_tokens = nltk.word_tokenize(reference_text)
    candidate_tokens = nltk.word_tokenize(text)
    
    # Compute BLEU score with smoothing
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing_function)
    
    return text, perplexity, bleu_score

# Example usage
reference_text = "This cardinal's more haughty than the devil."
complete_sentence, perplexity, bleu_score = autoCompletions("This cardinal's", model, tokenizer, max_sequence_length, 5, reference_text)
print("Completed Sentence:", complete_sentence)
print("Perplexity:", perplexity)
print("BLEU Score:", bleu_score)

#Low Perplexity: Indicates good next-word prediction but might suggest overfitting.
#Low BLEU Score: Indicates the generated text doesn't match well with the reference text.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalpi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


This cardinal's shame
This cardinal's shame mine
This cardinal's shame mine blunt
This cardinal's shame mine blunt gentleman
This cardinal's shame mine blunt gentleman fall
Completed Sentence: This cardinal's shame mine blunt gentleman fall
Perplexity: 4.628903715023214
BLEU Score: 0.12131756417616475
