In [None]:
import torch
import torch.functional as F
import torch.nn as nn

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Model, Input, layers, models, optimizers


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pprint import pprint
import re

In [None]:
# Display the installed PyTorch version string as this cell's output.
torch.__version__

In [None]:
# Use the CUDA device when PyTorch reports one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Display which device was selected above.
device

In [None]:
from os import linesep
import string

# Location of the raw corpus (Kaggle dataset path).
file_path = '/kaggle/input/text-for-next-word-predictor/leo tolstoy - war and peace.txt'

# Read the whole book into memory as one string.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Normalise the text:
#   1. hyphens become spaces, so hyphenated words split into their parts;
#   2. everything except ASCII letters, digits, spaces, periods and newlines is dropped;
#   3. the periods themselves are then removed;
#   4. the result is lower-cased.
# Raw strings are used for every pattern so that `\.` reaches the regex engine
# as written instead of being an invalid Python string escape.
filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub(r'[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = re.sub(r'\.{1,}', '', filtered_text)
filtered_text = filtered_text.lower()

# Unique words in first-occurrence order. dict.fromkeys de-duplicates with
# hash lookups and keeps insertion order, replacing a list-membership scan
# that re-walked the growing vocabulary list for every word of the book.
words = list(dict.fromkeys(filtered_text.split()))

# Paragraphs are separated by blank lines.
para = filtered_text.split("\n\n")
print("Total no. of para: ", len(para))
print("Total unique words: ", len(words))

In [None]:
# Word -> id table: vocabulary words are numbered from 1 in list order.
stoi = dict(zip(words, range(1, len(words) + 1)))
# Id 0 is assigned to '.'; the dataset-building code fills empty context slots with 0.
stoi['.'] = 0
# Reverse table: id -> word.
itos = dict(zip(stoi.values(), stoi.keys()))
print(len(itos))

In [None]:
# Hyperparameter
block_size = 5  # context length: number of preceding words used to predict the next one

# Training examples, collected as plain Python lists first:
#   X -> one context window (block_size word ids) per example
#   Y -> the id of the word that follows that window
X, Y = [], []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for paragraph in para:
  # Every paragraph starts from an all-zero window (0 is the id stored for '.').
  window = [0] * block_size

  for token in paragraph.split():
    target = stoi[token.rstrip(string.punctuation)]
    X.append(window)
    Y.append(target)
    # Slide the window: drop the oldest id, append the word just seen.
    # A new list is built each time, so rows already stored in X are not mutated.
    window = window[1:] + [target]


# Convert both lists to tensors and move them to the selected device.
X, Y = (torch.tensor(rows).to(device) for rows in (X, Y))


X.shape, Y.shape, X.dtype, Y.dtype

In [None]:
emb_dim = 64  # Hyperparameter: length of each word vector

# Stand-alone embedding layer with one row per entry of stoi.
emb = Embedding(input_dim=len(stoi), output_dim=emb_dim)

# The layer is not part of a model here, so it is called once on a dummy batch
# before its weights are read back below.
dummy_ids = tf.constant([[0]])
emb(dummy_ids)

# Print the shape of the embedding matrix.
emb_matrix = emb.get_weights()[0]
print(emb_matrix.shape)


In [None]:
from tensorflow.keras import layers, Model

class Next_Word_Predictor(Model):
    """Feed-forward next-word model.

    An embedding lookup is flattened and passed through three ReLU dense
    layers, followed by a softmax dense layer over the vocabulary.

    Args:
        block_size: number of context word ids per example.
        vocab_size: number of rows in the embedding and units in the output layer.
        emb_dim: length of each embedding vector.
        hidden_dim: number of units in each of the three hidden layers.
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        # Kept so call() can compute the flattened width.
        self.block_size = block_size
        self.emb_dim = emb_dim

        self.emb = layers.Embedding(vocab_size, emb_dim)
        self.dense1 = layers.Dense(hidden_dim, activation='relu')
        self.dense2 = layers.Dense(hidden_dim, activation='relu')
        self.dense3 = layers.Dense(hidden_dim, activation='relu')
        # Output layer: softmax, so the model returns probabilities, not logits.
        self.dense4 = layers.Dense(vocab_size, activation='softmax')

    def call(self, x):
        """Return next-word probabilities for a batch of context windows.

        Args:
            x: integer word ids; the reshape below requires block_size ids per row.

        Returns:
            The output of the softmax layer, one row per input row.
        """
        embedded = self.emb(x)

        # Flatten each example's block_size embeddings into a single vector.
        batch = tf.shape(embedded)[0]
        out = tf.reshape(embedded, (batch, self.block_size * self.emb_dim))

        for layer in (self.dense1, self.dense2, self.dense3, self.dense4):
            out = layer(out)
        return out


In [None]:
# Generate next words from a model


def generate_next_words(model, itos, stoi, content, block_size, k=10, max_len=10):
    """Extend ``content`` with ``k`` words sampled from ``model``.

    Args:
        model: callable that maps an integer array of shape (1, block_size) to
            next-word *probabilities* of shape (1, vocab_size), e.g. the output
            of a softmax layer. The result may expose ``.numpy()`` or be
            anything ``np.asarray`` accepts.
        itos: mapping from word id to word.
        stoi: mapping from word to word id.
        content: seed text typed by the user.
        block_size: number of context ids fed to the model.
        k: number of words to append.
        max_len: unused; kept so existing calls keep working.

    Returns:
        ``content`` followed by the ``k`` sampled words, separated by spaces.
    """
    # Clean the seed the same way the training text is cleaned: lower-case,
    # hyphens become spaces, anything other than letters/digits/spaces/periods
    # is dropped.
    seed = re.sub(r'-', ' ', content.lower())
    seed = re.sub(r'[^a-zA-Z0-9 \.]', '', seed)

    # Tokens that are not in the vocabulary (or that are nothing but
    # punctuation) fall back to id 0, the same id used for padding below,
    # instead of raising KeyError on arbitrary user input.
    context = [stoi.get(token.strip(string.punctuation), 0) for token in seed.split()]

    # Left-pad short seeds with 0; keep only the most recent ids of long ones.
    if len(context) <= block_size:
        context = [0] * (block_size - len(context)) + context
    else:
        context = context[-block_size:]

    for _ in range(k):
        x = np.array(context).reshape(1, -1)
        y_pred = model(x)
        raw = y_pred.numpy() if hasattr(y_pred, "numpy") else y_pred

        # The model returns probabilities, so sample from them directly.
        # (tf.random.categorical expects unnormalised log-probabilities; feeding
        # it probabilities flattens the distribution to almost uniform.)
        probs = np.asarray(raw, dtype=np.float64)[0]
        # float32 softmax output sums to 1 only approximately, and
        # np.random.choice validates the sum, so renormalise in float64.
        probs = probs / probs.sum()
        ix = int(np.random.choice(len(probs), p=probs))

        content += " " + itos[ix]
        context = context[1:] + [ix]

    return content


In [None]:
# Copy the torch tensors back to host memory as NumPy arrays for model.fit.
X_np, Y_np = X.cpu().numpy(), Y.cpu().numpy()

# Build the Keras model, then attach the optimizer and loss.
model = Next_Word_Predictor(
    block_size=block_size,
    vocab_size=len(stoi),
    emb_dim=emb_dim,
    hidden_dim=1024,
)
adam = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy')

# Mini-batch training parameters
batch_size = 1024
print_every = 100  # not referenced anywhere in this cell
model.fit(X_np, Y_np, batch_size=batch_size, epochs=20, verbose=1)

In [None]:
# Generate several continuations of a user-supplied prompt with the trained model

content = input("Enter some context: ")
k = int(input("Enter no. of words to be generated: "))

# Number of independent continuations to sample for the same prompt.
n_samples = 10
samples = [
    generate_next_words(model, itos, stoi, content, block_size, k)
    for _ in range(n_samples)
]

# Deliberately not stored in `para`: that name holds the list of corpus
# paragraphs the dataset-building cell iterates over, and rebinding it to a
# string would break that cell when it is re-run.
generated = " " + "".join(sample + "\n\n" for sample in samples)
print(generated)

In [None]:
# NOTE(review): `TSNE` is not imported in any cell visible here — presumably
# sklearn.manifold.TSNE; confirm and add the import to the imports cell.

# Pull the learned embedding matrix out of the trained model.
embedding_weights = model.emb.weights[0].numpy()
print(embedding_weights.shape)

# Reduce dimensionality to 2-D using t-SNE (fixed random_state).
tsne = TSNE(n_components=2, random_state=42)
embeddings_tsne = tsne.fit_transform(embedding_weights)

# Visualize embeddings: one point per embedding row.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.5)
ax.set_title('t-SNE Visualization of Embeddings')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
plt.show()