In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
import matplotlib.pyplot as plt

In [4]:
!pip install datasets



In [5]:
import re
from collections import Counter

In [6]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [7]:
from datasets import load_dataset

# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
# Keep only training rows whose text is not empty or whitespace-only.
lines = [x['text'] for x in dataset['train'] if x['text'].strip() != ""]
text = " ".join(lines[:100])  # only the first 100 non-empty lines

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [8]:
def tokenizer(text):
  """Split text into lowercase word tokens with punctuation removed.

  Every character that is neither a word character (letters, digits,
  underscore) nor whitespace is deleted, then the result is split on runs
  of whitespace.

  Args:
    text: Input string.

  Returns:
    List of token strings; empty when no word characters remain.
  """
  lowered = text.lower()
  without_punctuation = re.sub(r"[^\w\s]", "", lowered)
  return without_punctuation.split()

In [9]:
# Flat list of tokens for the joined corpus text.
tokens = tokenizer(text)

In [10]:
def buildVocab(tokens):
  """Assign an integer id to every distinct token and encode the sequence.

  Ids are handed out in order of first appearance, starting at 0.

  Args:
    tokens: Sequence of hashable tokens (words).

  Returns:
    A 4-tuple ``(vocab, inv_vocab, token_ids, vocab_size)``:
      vocab: dict mapping token -> id.
      inv_vocab: dict mapping id -> token.
      token_ids: list with the id of each token, in input order.
      vocab_size: number of distinct tokens.
  """
  vocab = {}
  for word in tokens:
    # The next free id is always the current size of the table.
    if word not in vocab:
      vocab[word] = len(vocab)

  inv_vocab = {}
  for word, idx in vocab.items():
    inv_vocab[idx] = word

  token_ids = [vocab[word] for word in tokens]
  return vocab, inv_vocab, token_ids, len(vocab)

In [11]:
# Word <-> id lookup tables, the corpus encoded as ids, and the number of distinct words.
vocab, inv_vocab, token_ids, vocab_size = buildVocab(tokens)
# Number of preceding tokens used as the context for each next-word target.
context_size = 4

In [12]:
def buildInputs(context_size, ids=None):
  """Build sliding-window (context, next-token) training pairs.

  Args:
    context_size: Number of consecutive token ids in each context window.
      Must be at least 1.
    ids: Sequence of integer token ids to slide over. When omitted, the
      notebook-level ``token_ids`` is used, so the original
      ``buildInputs(context_size)`` call keeps working.

  Returns:
    ``(inputs, targets)``, two parallel lists. ``inputs[k]`` is a 1-D tensor
    holding ``context_size`` consecutive ids and ``targets[k]`` is a 0-D
    tensor holding the id that immediately follows them. Both lists are empty
    when the sequence has no more than ``context_size`` ids.

  Raises:
    ValueError: If ``context_size`` is smaller than 1.
  """
  if context_size < 1:
    # A zero or negative window would yield empty or wrapped-around slices.
    raise ValueError(f"context_size must be >= 1, got {context_size}")

  # Fall back to the notebook global only when no sequence is passed in.
  source = token_ids if ids is None else ids

  inputs, targets = [], []
  for end in range(context_size, len(source)):
    inputs.append(torch.tensor(source[end - context_size:end]))
    targets.append(torch.tensor(source[end]))
  return inputs, targets

In [13]:
# One (context window, next token) pair per position in the encoded corpus.
inputs, targets = buildInputs(context_size)

In [14]:
class NextWordDataSet(Dataset):
  """Map-style dataset pairing each context with its next-word target.

  Args:
    X: Sized, indexable collection of contexts.
    y: Sized, indexable collection of targets, aligned index-for-index with X.

  Raises:
    ValueError: If ``X`` and ``y`` do not have the same length.
  """

  def __init__(self, X, y):
    super().__init__()
    # Fail here rather than with an IndexError in the middle of an epoch.
    if len(X) != len(y):
      raise ValueError(
          f"X and y must have the same length, got {len(X)} and {len(y)}")
    self.X = X
    self.y = y

  def __len__(self):
    """Return the number of (context, target) pairs."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return the ``(context, target)`` pair stored at position ``idx``."""
    return self.X[idx], self.y[idx]

In [15]:
# NOTE(review): this rebinds `dataset`, which until now held the object
# returned by load_dataset(); the raw corpus is no longer reachable under
# that name after this cell runs.
dataset = NextWordDataSet(inputs, targets)
# Batches of 2 samples, reshuffled on every pass over the loader.
loader = DataLoader(dataset, batch_size = 2, shuffle=True)

In [16]:
class LSTMModel(nn.Module):
  """Next-word model: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: Number of distinct token ids (embedding rows and output scores).
    embed_size: Width of each token embedding.
    hidden_size: Width of the LSTM hidden state.
  """

  def __init__(self, vocab_size, embed_size, hidden_size):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embed_size)
    self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size,
                        batch_first=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x):
    """Return vocabulary logits computed from the LSTM's final hidden state.

    Args:
      x: Integer tensor of token ids, batch dimension first.

    Returns:
      Tensor of unnormalised scores, one per vocabulary entry, for each
      sequence in the batch.
    """
    embedded = self.embedding(x)
    # Only the final hidden state is needed; per-step outputs and the cell
    # state are discarded.
    _, (hidden, _) = self.lstm(embedded)
    last_layer_hidden = hidden[-1]
    return self.fc(last_layer_hidden)

In [17]:
# Model dimensions.
vocab_size = len(vocab)
embed_size, hidden_size = 100, 128

# Network, loss over vocabulary logits, and optimizer.
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
losses = []  # mean training loss of each epoch, in order
epochs = 20

# inferNextWord() switches the model to eval mode, so re-running this cell
# after inference would otherwise train in eval mode. Set the mode explicitly.
model.train()

for epoch in range(epochs):
  epoch_loss = 0
  for X, y in loader:
    optimizer.zero_grad()        # drop gradients left over from the previous step
    logits = model(X)
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()    # .item() keeps a plain float, not the graph
  # Average over the number of batches, not the number of samples.
  avg_loss_in_epoch = epoch_loss/len(loader)
  losses.append(avg_loss_in_epoch)
  print(f'Epoch {epoch+1}: Loss = {avg_loss_in_epoch}')

In [None]:
# Mean training loss per epoch, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(losses, marker='o')
ax.set_title("Training Loss vs Epoch (Next Word Prediction)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.grid(True)
plt.show()

In [None]:
def inferNextWord(self, input_context):
  """Predict the most likely next word for a list of context words.

  Args:
    self: The model to query. The name is kept for compatibility with
      existing callers, which pass the model as the first positional
      argument; it is used directly instead of the notebook-level ``model``.
    input_context: List of words. Only the last ``context_size`` words
      (notebook-level setting) are fed to the model.

  Returns:
    The word whose logit is highest, looked up in the notebook-level
    ``inv_vocab``.

  Raises:
    ValueError: If ``input_context`` is empty.
    KeyError: If any word in the context window is not in ``vocab``.
  """
  window = input_context[-context_size:]
  if not window:
    raise ValueError("input_context must contain at least one word")

  # Report every unknown word at once instead of failing on the first lookup.
  unknown = [word for word in window if word not in vocab]
  if unknown:
    raise KeyError(f"words not in vocabulary: {unknown}")

  self.eval()
  with torch.no_grad():
    input_context_ids = [vocab[word] for word in window]
    x = torch.tensor(input_context_ids).unsqueeze(0)  # add a batch dimension of 1
    logits = self(x)
    pred_id = torch.argmax(logits, dim=-1).item()
  return inv_vocab[pred_id]

In [None]:
# Starting words; inferNextWord only looks at the last `context_size` of them.
seed = ["most", "of", "the", "equipment", "arms", "and", "machinery"]
outputString = ' '.join(seed)

# Greedy generation: predict a word, show the text so far, then slide the
# window forward by one word.
for _ in range(20):
    next_word = inferNextWord(model, seed)
    outputString = outputString + ' ' + next_word
    print(outputString)
    seed = seed[1:] + [next_word]
