In [1]:
import tensorflow as tf
# Show the installed TensorFlow version so the environment is recorded alongside the results
print(tf.__version__)

2.17.0


In [2]:
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK 'stopwords' corpus; preprocess_text() reads its 'english' word list.
# When the corpus is already cached, nltk only logs that the package is up-to-date.
nltk.download('stopwords')

def read_text_file(file_path):
    """Return the entire contents of a UTF-8 encoded text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def preprocess_text(text):
    """Lower-case the text and drop stop words, punctuation and non-alphabetic tokens.

    The text is split on whitespace; each token has leading/trailing punctuation
    stripped and is lower-cased. A token is kept only when it is purely alphabetic
    and is not in NLTK's English stop-word list.

    Args:
        text: Raw input text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))

    kept_tokens = []
    for raw_token in text.split():
        token = raw_token.strip(string.punctuation).lower()
        # isalpha() also rejects empty strings and tokens with inner punctuation or digits
        if token.isalpha() and token not in english_stops:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Example usage
# NOTE(review): relative path — 'alllines.txt' must be in the notebook's working directory
file_path = 'alllines.txt'

# First, read the text from the file
raw_text = read_text_file(file_path)

# Then, preprocess the text (lower-cased, stop words and non-alphabetic tokens removed);
# later cells build the vocabulary from cleaned_text
cleaned_text = preprocess_text(raw_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nutech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import tensorflow as tf
# List the GPUs TensorFlow can see and report how many there are
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  0


In [4]:
# Preview the first 100 characters of the cleaned corpus
cleaned_text[:100]

'act scene london palace enter king henry lord john lancaster earl westmoreland sir walter blunt othe'

In [5]:
from collections import defaultdict

def tokenize_text_to_integers(text):
    """Map every whitespace-separated word of `text` to an integer id.

    Ids are handed out in order of first appearance, starting at 0.

    Args:
        text: Text whose words are separated by whitespace.

    Returns:
        A tuple ``(token_ids, vocabulary)`` where ``token_ids`` is the list of ids
        for each word in order and ``vocabulary`` is a plain dict of word -> id.
    """
    vocabulary = {}
    token_ids = []

    for word in text.split():
        # setdefault stores the next unused id only the first time a word is seen
        token_ids.append(vocabulary.setdefault(word, len(vocabulary)))

    return token_ids, vocabulary

# Tokenize the cleaned text:
#   tokenized_text - the corpus as a list of integer ids, one per word
#   vocab          - dict mapping each distinct word to its id
tokenized_text, vocab = tokenize_text_to_integers(cleaned_text)


In [6]:
def create_sequences(tokenized_text, seq_length):
    """Slice a token list into overlapping fixed-length windows plus their next token.

    Window ``i`` is ``tokenized_text[i:i + seq_length]`` and its target is the token
    at position ``i + seq_length``.

    Args:
        tokenized_text: Sequence of tokens.
        seq_length: Number of tokens per window.

    Returns:
        A tuple ``(sequences, targets)`` of equal-length lists. Both are empty when
        the input has no more than ``seq_length`` tokens.
    """
    # A non-positive count gives an empty range, hence two empty lists
    window_count = len(tokenized_text) - seq_length

    sequences = [tokenized_text[start:start + seq_length] for start in range(window_count)]
    targets = [tokenized_text[start + seq_length] for start in range(window_count)]

    return sequences, targets


# Generate sequences and corresponding targets:
# each example is seq_length consecutive token ids, labelled with the id that follows them
seq_length=5
sequences, targets = create_sequences(tokenized_text, seq_length)




In [7]:
# Number of distinct words in the vocabulary
vocab_size = len(vocab)


In [8]:
# Report the vocabulary size (number of distinct words)
print(vocab_size)

21503


In [9]:
# Report how many (sequence, target) windows create_sequences produced
print(len(sequences))

394593


In [11]:
# Inverse lookup (id -> word) used to turn a predicted id back into text
index_to_word = {index: word for word, index in vocab.items()}

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

def prepare_training_data(sequences, targets, vocab_size, max_samples=10000):
    """Turn token windows and next-token targets into arrays ready for model.fit.

    No padding is applied: the windows are expected to already share one length.

    Args:
        sequences: List of equal-length lists of integer token ids.
        targets: List of integer token ids, one per sequence.
        vocab_size: Number of classes for the one-hot encoding of the targets.
        max_samples: Keep only the first ``max_samples`` examples (default 10000,
            the cap that was previously hard-coded). Pass ``None`` to use all of
            them; note the one-hot target matrix has ``samples * vocab_size``
            entries, so memory grows quickly.

    Returns:
        A tuple ``(X, y)``: ``X`` is the array built from the kept sequences and
        ``y`` is the one-hot encoded array of the kept targets.

    Raises:
        ValueError: If ``sequences`` and ``targets`` differ in length.
    """
    if len(sequences) != len(targets):
        raise ValueError(
            f"sequences and targets must have the same length, "
            f"got {len(sequences)} and {len(targets)}"
        )

    # Slicing with None as the stop keeps everything
    X = np.array(sequences[:max_samples])
    y = to_categorical(targets[:max_samples], num_classes=vocab_size)  # One-hot encode the targets
    return X, y

def build_lstm_model(vocab_size, seq_length, embedding_dim, lstm_units):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dense(128) -> Dense(256) -> Dense(512)
    (all ReLU) -> Dense(vocab_size, softmax).

    Args:
        vocab_size: Number of distinct token ids; sizes both the embedding table
            and the softmax output.
        seq_length: Number of token ids in each input window.
        embedding_dim: Width of the learned word vectors.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric) that expects one-hot encoded targets.
    """
    model = Sequential()

    # Declare the input shape explicitly. Keras 3 (the Keras used by the TensorFlow
    # 2.17 this notebook reports) deprecates Embedding's `input_length` argument and
    # ignores it with a warning, which would leave the model without a known input shape.
    model.add(tf.keras.Input(shape=(seq_length,)))

    # Embedding layer to represent each word as a dense vector of given embedding dimension
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    # LSTM layer; only the final state is passed on
    model.add(LSTM(lstm_units, return_sequences=False))

    # Hidden fully connected layers of increasing width
    for hidden_units in (128, 256, 512):
        model.add(Dense(hidden_units, activation='relu'))

    # Output layer: softmax over the vocabulary to predict the next word
    model.add(Dense(vocab_size, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

def train_lstm_model(model, X, y, epochs, batch_size=64):
    """Fit `model` on (X, y), pinning the work to the first GPU only when one exists.

    Args:
        model: Compiled Keras model.
        X: Input array passed to ``model.fit``.
        y: Target array passed to ``model.fit``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size (default 64).

    Returns:
        Whatever ``model.fit`` returns (for Keras models, the training ``History``).
        Earlier versions returned ``None``, so existing callers are unaffected.
    """
    # Requesting '/device:GPU:0' unconditionally is wrong on CPU-only machines
    # (this notebook reports 0 GPUs); only pin the device when a GPU is present.
    if tf.config.list_physical_devices('GPU'):
        with tf.device('/device:GPU:0'):
            return model.fit(X, y, epochs=epochs, batch_size=batch_size)

    return model.fit(X, y, epochs=epochs, batch_size=batch_size)
    
def predict_next_word(model, user_input, word_to_index, index_to_word, seq_length):
    """Predict the most likely next word for the given context words.

    Args:
        model: Trained model whose ``predict`` returns one probability per word id.
        user_input: Either a list of words or a single whitespace-separated string.
        word_to_index: Mapping of word -> integer id.
        index_to_word: Mapping of id -> word. Keys may be ints, or strs if the
            mapping was reloaded from JSON.
        seq_length: Number of token ids the model expects per input.

    Returns:
        The word whose id has the highest predicted probability.

    Raises:
        KeyError: If the predicted id is missing from ``index_to_word``.
    """
    import string  # local import so this definition does not depend on another cell

    # A raw string would otherwise be iterated character by character
    words = user_input.split() if isinstance(user_input, str) else user_input

    # Apply the same lower-casing / punctuation stripping as the training text,
    # otherwise inputs such as 'Was' or 'an ' can never match the vocabulary
    normalized = (word.strip().strip(string.punctuation).lower() for word in words)

    # Words that are not in the vocabulary are skipped (best effort)
    tokenized_input = [word_to_index[word] for word in normalized if word in word_to_index]

    # Pad or truncate input to the required sequence length.
    # NOTE(review): pad_sequences fills with 0 by default, which is also a real word
    # id when the vocabulary numbering starts at 0 — confirm this matches training.
    tokenized_input = pad_sequences([tokenized_input], maxlen=seq_length, padding='pre')

    # Predict the next word (returns probabilities for each word in the vocab)
    predicted_probabilities = model.predict(tokenized_input, verbose=0)[0]

    # Plain int so the lookup and the str fallback below behave predictably
    predicted_index = int(np.argmax(predicted_probabilities))

    if predicted_index in index_to_word:
        return index_to_word[predicted_index]
    # JSON round-trips turn integer keys into strings
    return index_to_word[str(predicted_index)]

# Example Workflow
# This cell re-binds notebook-level names (index_to_word, seq_length, vocab_size,
# sequences, targets) and creates X, y. It expects tokenized_text and vocab to exist already.

# Toy stand-ins for tokenized_text / vocab (disabled; enable only for quick experiments)
#tokenized_text = [0, 1, 2, 3, 4, 5, 6, 7, 2, 3]  # Pre-tokenized example
#vocab = {'this': 0, 'is': 1, 'an': 2, 'example': 3, 'of': 4, 'tokenizing': 5, 'text': 6, 'into': 7, 'integers': 8}
index_to_word = {v: k for k, v in vocab.items()}  # Inverse mapping for predictions

# Parameters
seq_length = 5
embedding_dim = 200
lstm_units = 1024
vocab_size = len(vocab)

# Generate sequences and targets
sequences, targets = create_sequences(tokenized_text, seq_length)

# Prepare data for training (X: token-id windows, y: one-hot next-word targets)
X, y = prepare_training_data(sequences, targets, vocab_size)

# Build the model (disabled: a saved model is loaded from disk in a later cell)
#lstm_model = build_lstm_model(vocab_size, seq_length, embedding_dim, lstm_units)

# Train the model (disabled together with the build step above)
#train_lstm_model(lstm_model, X, y, epochs=10)  # You can adjust the number of epochs

# Prediction (disabled; requires lstm_model from the steps above)
#user_input = ['this', 'is', 'an', 'example', 'of']  # Example user input
#predicted_word = predict_next_word(lstm_model, user_input, vocab, index_to_word, seq_length)

#print(f"Predicted next word: {predicted_word}")


In [7]:
def predict_next_word(model, user_input, word_to_index, index_to_word, seq_length):
    """Predict the most likely next word for the given context words.

    Args:
        model: Trained model whose ``predict`` returns one probability per word id.
        user_input: Either a list of words or a single whitespace-separated string.
        word_to_index: Mapping of word -> integer id.
        index_to_word: Mapping of id -> word. Keys may be ints, or strs if the
            mapping was reloaded from JSON.
        seq_length: Number of token ids the model expects per input.

    Returns:
        The word whose id has the highest predicted probability.

    Raises:
        KeyError: If the predicted id is missing from ``index_to_word``.
    """
    import string  # local import so this definition does not depend on another cell

    # A raw string would otherwise be iterated character by character
    words = user_input.split() if isinstance(user_input, str) else user_input

    # Lower-case and strip punctuation/whitespace so inputs such as 'Was' or 'an '
    # can match a vocabulary built from lower-cased, punctuation-stripped text
    normalized = (word.strip().strip(string.punctuation).lower() for word in words)

    # Words that are not in the vocabulary are skipped (best effort)
    tokenized_input = [word_to_index[word] for word in normalized if word in word_to_index]

    # Pad or truncate input to the required sequence length.
    # NOTE(review): pad_sequences fills with 0 by default, which is also a real word
    # id when the vocabulary numbering starts at 0 — confirm this matches training.
    tokenized_input = pad_sequences([tokenized_input], maxlen=seq_length, padding='pre')

    # Predict the next word (returns probabilities for each word in the vocab)
    predicted_probabilities = model.predict(tokenized_input, verbose=0)[0]

    # Plain int so the lookup and the str fallback below behave predictably
    predicted_index = int(np.argmax(predicted_probabilities))

    if predicted_index in index_to_word:
        return index_to_word[predicted_index]
    # JSON round-trips turn integer keys into strings
    return index_to_word[str(predicted_index)]

In [5]:
# Load the saved model from 'new_model.h5' (relative path: resolved against the working directory)
loaded_model=tf.keras.models.load_model('new_model.h5')



In [10]:
# Prediction
# Tokens must be in the vocabulary's form: lower-case with no surrounding whitespace
# (the original 'an ' carried a trailing space and 'Was' was capitalised). Words that
# are not in `vocab` — including stop words removed by preprocess_text — are skipped.
user_input = ['was', 'this', 'an', 'answer', 'to']  # Example user input
predicted_word = predict_next_word(loaded_model, user_input, vocab, index_to_word, seq_length)

print(f"Predicted next word: {predicted_word}")

Predicted next word: john


In [12]:
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Streamlit front-end for the next-word predictor.
# NOTE: this only renders when the code is executed with `streamlit run`; run from a
# notebook kernel, Streamlit just logs a warning. It relies on loaded_model, vocab,
# index_to_word and predict_next_word already being defined.
st.title("Next Word Prediction with LSTM")
st.write("Enter a sequence of 5 words, and the model will predict the next word.")

# User input
user_input = st.text_input("Enter 5 words separated by space:")

seq_length = 5  # Assuming you're using 5-word sequences

if user_input:
    input_words = user_input.split()

    # Ensure exactly 5 words are input
    if len(input_words) == 5:
        # Pass the list of words, not the raw string: iterating a string yields single
        # characters, so the model would be fed letters instead of words. Lower-case
        # them because the vocabulary was built from lower-cased text.
        model_words = [word.lower() for word in input_words]
        predicted_word = predict_next_word(loaded_model, model_words, vocab, index_to_word, seq_length)
        st.write(f"Next word predicted: {predicted_word}")

        # Button to append the predicted word and allow continued text generation
        if st.button("Continue Prediction"):
            input_words.append(predicted_word)
            st.text(f"Updated sequence: {' '.join(input_words)}")
    else:
        st.write("Please enter exactly 5 words.")


2024-09-28 23:03:05.822 
  command:

    streamlit run C:\Users\Nutech\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-09-28 23:03:05.822 Session state does not function when running a script without `streamlit run`


In [12]:
import json



# Save both lookup tables to a JSON file.
# NOTE: JSON object keys are always strings, so the integer keys of index_to_word
# come back as str after json.load — convert them with int() when reloading.
lookup_tables = {'vocab': vocab, 'index_to_word': index_to_word}
with open('variables.json', 'w') as f:
    json.dump(lookup_tables, f)