## Next word predictor 

In [1]:
import pandas as pd
import numpy as np

### - Opening the dataset and visualizing some of the rows of the book.

In [2]:
df = pd.read_table("Downloads/Sherlock Holmes Dataset.txt")
print(df.head())

                          THE ADVENTURES OF SHERLOCK HOLMES
0                                 Arthur Conan Doyle       
1                                  Table of contents       
2                               A Scandal in Bohemia       
3                              The Red-Headed League       
4                                 A Case of Identity       


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 1 columns):
 #   Column                                                     Non-Null Count  Dtype 
---  ------                                                     --------------  ----- 
 0                           THE ADVENTURES OF SHERLOCK HOLMES  9565 non-null   object
dtypes: object(1)
memory usage: 74.9+ KB


In [4]:
df.tail()

Unnamed: 0,THE ADVENTURES OF SHERLOCK HOLMES
9560,warranties of merchantablity or fitness f...
9561,This text was formatted from various free...
9562,See http://sherlock-holm.es for an electr...
9563,additional information about it.
9564,This text comes from the collection's ver...


In [5]:
print(df.columns)
print(df.head())


Index(['                        THE ADVENTURES OF SHERLOCK HOLMES'], dtype='object')
                          THE ADVENTURES OF SHERLOCK HOLMES
0                                 Arthur Conan Doyle       
1                                  Table of contents       
2                               A Scandal in Bohemia       
3                              The Red-Headed League       
4                                 A Case of Identity       


### - Data preprocessing 

In [6]:
df.columns = df.columns.str.strip()
# Remove the leading and trailing spaces from the column name.

In [7]:
text_data = ' '.join(df['THE ADVENTURES OF SHERLOCK HOLMES'].tolist())
# Concatenate the text data into a single string for preprocessing.

In [8]:
import re

def preprocess_text(text):
    # Convert text to lower case
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clean_text = preprocess_text(text_data)


### - Tokenize

In [9]:
from nltk.tokenize import word_tokenize
import nltk

# nltk.download('punkt')

tokens = word_tokenize(clean_text)


In [10]:
# Generate sequences of a fixed number of words (e.g., sequences of 5 words).
sequence_length = 5
sequences = []

for i in range(sequence_length, len(tokens)):
    seq = tokens[i-sequence_length:i]
    sequences.append(' '.join(seq))

print(sequences[:10])  # Print the first 10 sequences


['arthur conan doyle table of', 'conan doyle table of contents', 'doyle table of contents a', 'table of contents a scandal', 'of contents a scandal in', 'contents a scandal in bohemia', 'a scandal in bohemia the', 'scandal in bohemia the redheaded', 'in bohemia the redheaded league', 'bohemia the redheaded league a']


### - Deep learning from Keras and Tensorflow

In [14]:
# pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [15]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [16]:
# Encode sequences and split into input (X) and output (y).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

vocabulary_size = len(tokenizer.word_index) + 1

sequences = np.array(sequences)

X = sequences[:, :-1]
y = sequences[:, -1]

y = tf.keras.utils.to_categorical(y, num_classes=vocabulary_size)



In [17]:
# Ensure all input sequences are of the same length.
max_sequence_len = X.shape[1]
X = pad_sequences(X, maxlen=max_sequence_len, padding='pre')


In [18]:
# define the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(vocabulary_size, 50, input_length=max_sequence_len))
model.add(LSTM(100))
model.add(Dense(vocabulary_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [20]:
# model training
model.fit(X, y, epochs=100, verbose=1)


Epoch 1/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 23ms/step - accuracy: 0.0562 - loss: 6.7447
Epoch 2/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 25ms/step - accuracy: 0.0926 - loss: 5.8590
Epoch 3/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 27ms/step - accuracy: 0.1274 - loss: 5.3976
Epoch 4/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 30ms/step - accuracy: 0.1455 - loss: 5.0505
Epoch 5/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 26ms/step - accuracy: 0.1631 - loss: 4.7714
Epoch 6/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 30ms/step - accuracy: 0.1793 - loss: 4.5070
Epoch 7/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 28ms/step - accuracy: 0.1931 - loss: 4.2853
Epoch 8/100
[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 26ms/step - accuracy: 0.2159 - loss: 4.0702


<keras.src.callbacks.history.History at 0x22f6c7f2810>

In [22]:
# Generate the next word
def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

print(generate_text("sherlock holmes", 5, max_sequence_len))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
sherlock holmes sprang round and in his


### RNN and LTSM

In [23]:
# RNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Build the RNN model
rnn_model = Sequential()
rnn_model.add(Embedding(vocabulary_size, 50, input_length=max_sequence_len))
rnn_model.add(SimpleRNN(150, return_sequences=False))
rnn_model.add(Dense(vocabulary_size, activation='softmax'))

rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.summary()


In [24]:
# LTSM Model
from tensorflow.keras.layers import LSTM

# Build the LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(vocabulary_size, 50, input_length=max_sequence_len))
lstm_model.add(LSTM(150, return_sequences=False))
lstm_model.add(Dense(vocabulary_size, activation='softmax'))

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()


### Model Training

In [27]:
# RNN model Training
rnn_model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2, verbose=1)


Epoch 1/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 61ms/step - accuracy: 0.0550 - loss: 6.8952 - val_accuracy: 0.0785 - val_loss: 6.1520
Epoch 2/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 53ms/step - accuracy: 0.0965 - loss: 5.8235 - val_accuracy: 0.1118 - val_loss: 5.8597
Epoch 3/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 51ms/step - accuracy: 0.1285 - loss: 5.2824 - val_accuracy: 0.1188 - val_loss: 5.8527
Epoch 4/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 52ms/step - accuracy: 0.1485 - loss: 4.8593 - val_accuracy: 0.1240 - val_loss: 5.9273
Epoch 5/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 55ms/step - accuracy: 0.1715 - loss: 4.4941 - val_accuracy: 0.1212 - val_loss: 6.0174
Epoch 6/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 52ms/step - accuracy: 0.1983 - loss: 4.1593 - val_accuracy: 0.1198 - val_loss: 6.1523
Epoc

<keras.src.callbacks.history.History at 0x23148bc05d0>

In [28]:
# LTSM Model training
lstm_model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2, verbose=1)


Epoch 1/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 66ms/step - accuracy: 0.0546 - loss: 6.9129 - val_accuracy: 0.0581 - val_loss: 6.3873
Epoch 2/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 57ms/step - accuracy: 0.0750 - loss: 6.1386 - val_accuracy: 0.0840 - val_loss: 6.1043
Epoch 3/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 43ms/step - accuracy: 0.1019 - loss: 5.7038 - val_accuracy: 0.1090 - val_loss: 5.9490
Epoch 4/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 43ms/step - accuracy: 0.1265 - loss: 5.3609 - val_accuracy: 0.1188 - val_loss: 5.8997
Epoch 5/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 40ms/step - accuracy: 0.1420 - loss: 5.0768 - val_accuracy: 0.1254 - val_loss: 5.9178
Epoch 6/50
[1m1308/1308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 50ms/step - accuracy: 0.1555 - loss: 4.8115 - val_accuracy: 0.1289 - val_loss: 5.9500
Epoc

<keras.src.callbacks.history.History at 0x233034018d0>

### Model Evaluation

#### RNN Model evaluation

In [32]:
rnn_loss, rnn_accuracy = rnn_model.evaluate(X, y, verbose=1)
print(f'RNN Model - Loss: {rnn_loss}, Accuracy: {rnn_accuracy}')


[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.8404 - loss: 0.8124
RNN Model - Loss: 2.568842887878418, Accuracy: 0.7012012004852295


#### LTSM model evaluation

In [33]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X, y, verbose=1)
print(f'LSTM Model - Loss: {lstm_loss}, Accuracy: {lstm_accuracy}')


[1m3268/3268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8ms/step - accuracy: 0.8706 - loss: 0.7408
LSTM Model - Loss: 2.499688148498535, Accuracy: 0.7274631261825562


### Creativity and language analysis

#### Generate Text

In [34]:
def generate_text(model, tokenizer, seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

seed_text = "sherlock holmes"
print("RNN Generated Text:")
print(generate_text(rnn_model, tokenizer, seed_text, 20, max_sequence_len))
print("LSTM Generated Text:")
print(generate_text(lstm_model, tokenizer, seed_text, 20, max_sequence_len))


RNN Generated Text:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 444ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

#### Linguistic Analysis

##### Frequency Analysis

In [35]:
from collections import Counter

def word_frequency(text):
    tokens = word_tokenize(text.lower())
    return Counter(tokens)

# Original text frequency
original_freq = word_frequency(clean_text)

# RNN generated text frequency
rnn_generated_text = generate_text(rnn_model, tokenizer, seed_text, 100, max_sequence_len)
rnn_freq = word_frequency(rnn_generated_text)

# LSTM generated text frequency
lstm_generated_text = generate_text(lstm_model, tokenizer, seed_text, 100, max_sequence_len)
lstm_freq = word_frequency(lstm_generated_text)

print("Original Text Frequency:")
print(original_freq.most_common(10))
print("RNN Generated Text Frequency:")
print(rnn_freq.most_common(10))
print("LSTM Generated Text Frequency:")
print(lstm_freq.most_common(10))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

##### POS Tagging
compare part-of-speech distribution

In [36]:
# 
import nltk
nltk.download('averaged_perceptron_tagger')

def pos_tag_analysis(text):
    tokens = word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    return Counter(tag for word, tag in tags)

# Original text POS tags
original_pos = pos_tag_analysis(clean_text)

# RNN generated text POS tags
rnn_pos = pos_tag_analysis(rnn_generated_text)

# LSTM generated text POS tags
lstm_pos = pos_tag_analysis(lstm_generated_text)

print("Original Text POS Tags:")
print(original_pos.most_common(10))
print("RNN Generated Text POS Tags:")
print(rnn_pos.most_common(10))
print("LSTM Generated Text POS Tags:")
print(lstm_pos.most_common(10))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Newton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Original Text POS Tags:
[('NN', 17985), ('IN', 13318), ('DT', 10693), ('PRP', 7480), ('JJ', 7179), ('VBD', 6708), ('RB', 6468), ('VB', 4574), ('CC', 3886), ('NNS', 3835)]
RNN Generated Text POS Tags:
[('IN', 17), ('NN', 12), ('JJ', 11), ('VBD', 11), ('RB', 8), ('PRP', 8), ('VBG', 7), ('NNS', 5), ('VBP', 5), ('VBN', 4)]
LSTM Generated Text POS Tags:
[('NN', 19), ('IN', 14), ('DT', 13), ('PRP$', 8), ('JJ', 8), ('VBN', 6), ('VBD', 5), ('NNS', 4), ('VBZ', 4), ('VBP', 4)]
