# Vocabulary Dataset Tokenized

In [28]:
vocab_tokenized = {
    "hello": 0, "world": 1, "how": 2, "are": 3, "you": 4,
    "good": 5, "morning": 6, "night": 7, "bye": 8
}

reverse_vocab_tokenized = {idx: word for word, idx in vocab_tokenized.items()}

reverse_vocab_tokenized

{0: 'hello',
 1: 'world',
 2: 'how',
 3: 'are',
 4: 'you',
 5: 'good',
 6: 'morning',
 7: 'night',
 8: 'bye'}

# Simple LLM Model

In [37]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class TextGeneratorModel(nn.Module):
    def __init__(self, vocab_size, emb_size=32, h_size=128, max_seq_length=10):
        super(TextGeneratorModel, self).__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm1 = nn.LSTM(input_size=emb_size, hidden_size=h_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=h_size, hidden_size=h_size, batch_first=True)
        self.fc1 = nn.Linear(h_size, 64)
        self.fc2 = nn.Linear(64, vocab_size)
    def forward(self, x):
        x = self.emb(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x[:, -1, :]
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

vocab_size = len(vocab_tokenized)
model = TextGeneratorModel(vocab_size)
criterion = nn.CrossEntropyLoss()
adam_optim = optim.Adam(model.parameters(), lr=0.01)

# Train LLM Model

In [38]:
import torch

# tokenize training data
training_sentences = [
    ["hello", "world"],
    ["how", "are", "you"],
    ["good", "morning"],
    ["good", "night"],
    ["bye"]
]

tokenized_train_sentences = [[vocab_tokenized[word] for word in seq] for seq in training_sentences]

# retrieve X_train and y_train
X_train = []
y_train = []
for seq in tokenized_train_sentences:
    X_train.append(seq[:-1])
    y_train.append(seq[-1])

# uniforme the sequences dimension
max_seq_length = max(len(seq) for seq in X_train)
for seq in X_train:
    while len(seq) < max_seq_length:
        seq.append(0)

X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)

X_train, y_train

(tensor([[0, 0],
         [2, 3],
         [5, 0],
         [5, 0],
         [0, 0]]),
 tensor([1, 4, 6, 7, 8]))

In [39]:
# epoch training loop
epochs = 500
for epoch in range(epochs):
    adam_optim.zero_grad()
    predictions = model(X_train)
    loss = criterion(predictions, y_train)
    loss.backward()
    adam_optim.step()
    if (epoch+1) % 50 == 0:
        print(f"Epoch {epoch}, loss: {loss.item():.4f}")

Epoch 49, loss: 0.5551
Epoch 99, loss: 0.5547
Epoch 149, loss: 0.5545
Epoch 199, loss: 0.5545
Epoch 249, loss: 0.5549
Epoch 299, loss: 0.5545
Epoch 349, loss: 0.5547
Epoch 399, loss: 0.5545
Epoch 449, loss: 0.5545
Epoch 499, loss: 0.5595


# Evaluate the model

In [42]:
def generate_next_word(text):
    sentence = [vocab_tokenized[word] for word in text.split() if word in vocab_tokenized]
    while len(sentence) < max_seq_length:
        sentence.append(0)
    input = torch.tensor([sentence], dtype=torch.long)
    with torch.no_grad():
        output = model(input)
        word_index = torch.argmax(output, dim=1).item()
    word = reverse_vocab_tokenized[word_index]
    return word

In [50]:
print("hello -> ", generate_next_word("hello"))
print("good ->", generate_next_word("good"))
print("how are ->", generate_next_word("how are"))

hello ->  bye
good -> morning
how are -> you
