In [1]:
from google.colab import drive
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class SimpleRnn(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(SimpleRnn, self).__init__()

    self.W_xh = nn.Parameter(torch.randn(input_size, hidden_size).to(device))
    self.W_hh = nn.Parameter(torch.randn(hidden_size, hidden_size).to(device))
    self.b_h = nn.Parameter(torch.zeros(hidden_size).to(device))

    self.W_yh = nn.Parameter(torch.randn(hidden_size, output_size).to(device))
    self.b_y = nn.Parameter(torch.zeros(output_size).to(device))

  def forward(self, x):
    batch_size, seq_length, _ = x.size()

    h = torch.zeros(batch_size, self.W_hh.size(0), device=device)

    for i in range(seq_length):
      xt = x[:, i, :]
      h = torch.tanh(torch.matmul(xt, self.W_xh) + torch.matmul(h, self.W_hh) + self.b_h)

    y = torch.matmul(h, self.W_yh) + self.b_y
    return y

In [3]:
class SimpleLSTMCell(nn.Module):
  def __init__(self, input_size, hidden_size):
    super(SimpleLSTMCell, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size

    # Input gate components
    self.W_ii = nn.Parameter(torch.Tensor(input_size, hidden_size))
    self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.b_i = nn.Parameter(torch.Tensor(hidden_size))

    # Forget gate components
    self.W_if = nn.Parameter(torch.Tensor(input_size, hidden_size))
    self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.b_f = nn.Parameter(torch.Tensor(hidden_size))

    # Cell gate components
    self.W_ig = nn.Parameter(torch.Tensor(input_size, hidden_size))
    self.W_hg = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.b_g = nn.Parameter(torch.Tensor(hidden_size))

    # Output gate components
    self.W_io = nn.Parameter(torch.Tensor(input_size, hidden_size))
    self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.b_o = nn.Parameter(torch.Tensor(hidden_size))

    self.init_weights()

  def init_weights(self):
    for param in self.parameters():
      nn.init.uniform_(param, -0.1, 0.1)

  def forward(self, x, hidden):
    h_prev, c_prev = hidden

    i_t = torch.sigmoid(x @ self.W_ii + h_prev @ self.W_hi + self.b_i)
    f_t = torch.sigmoid(x @ self.W_if + h_prev @ self.W_hf + self.b_f)
    g_t = torch.tanh(x @ self.W_ig + h_prev @ self.W_hg + self.b_g)
    o_t = torch.sigmoid(x @ self.W_io + h_prev @ self.W_ho + self.b_o)

    c_t = f_t * c_prev + i_t * g_t
    h_t = o_t * torch.tanh(c_t)

    return h_t, (h_t, c_t)



class SimpleLSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers):
    super(SimpleLSTM, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.cells = nn.ModuleList([SimpleLSTMCell(input_size, hidden_size) if i == 0
                                else SimpleLSTMCell(hidden_size, hidden_size)
                                for i in range(num_layers)])


  def forward(self, x):
    batch_size, seq_len, _ = x.size()
    h = [torch.zeros(batch_size, self.hidden_size).to(device) for _ in range(self.num_layers)]
    c = [torch.zeros(batch_size, self.hidden_size).to(device) for _ in range(self.num_layers)]

    for t in range(seq_len):
      x_t = x[:, t, :]
      for i, cell in enumerate(self.cells):
        h[i], (h[i], c[i]) = cell(x_t, (h[i], c[i]))
        x_t = h[i]

    return h[-1]

In [21]:
# creating model
g = torch.Generator(device=device).manual_seed(2)

C_dimensions = 30

class Model(nn.Module):
  def __init__(self, vocab_size, embedding_dim=200, lstm_hidden_size=800, dropout_rate=0.2, num_classes=10):
    super(Model, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    self.lstm = SimpleLSTM(input_size=embedding_dim, hidden_size=lstm_hidden_size, num_layers=2)

    self.dropout1 = nn.Dropout(dropout_rate)

    self.fc_layers = nn.Sequential(
        nn.Linear(lstm_hidden_size, 2500),
        nn.ReLU(),
        nn.Dropout(dropout_rate),

        nn.Linear(2500, 700),
        nn.ReLU(),
        nn.Dropout(dropout_rate),

        nn.Linear(700, num_classes)
    )

    self.apply(self._init_weights)

  def _init_weights(self, m):
      if isinstance(m, nn.Linear):
          nn.init.xavier_uniform_(m.weight)
          if m.bias is not None:
              nn.init.zeros_(m.bias)

  def forward(self, x):
    x = self.embedding(x)

    lstm_out = self.lstm(x)

    logits = self.fc_layers(lstm_out)

    return logits

  def forward_train(self, x, y):
    logits = self.forward(x)
    loss = F.cross_entropy(logits, y)
    return loss


# Shecspir Dataset

In [22]:
import requests

url = "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt"

text_data = requests.get(url)
text_data = text_data.text

In [23]:
# genereting tokens
tokens = []

for letter in text_data:
  if letter not in tokens:
    tokens.append(letter)

tokens.sort()
tokens = ["."] + tokens
stoi = {}
itos = {}
for i in range(len(tokens)):
  itos[i] = tokens[i]
  stoi[tokens[i]] = i
tokens_length = len(stoi)

# transform to tokens
X = []
Y = []

for i in range(len(text_data) - 1):
  X.append(stoi[text_data[i]])
  Y.append(stoi[text_data[i + 1]])

X_tensor = torch.tensor(X)
Y_tensor = torch.tensor(Y)

train_size = int(len(X_tensor) * 0.8)
val_size = int(len(X_tensor) * 0.1)

X_train, Y_train = X_tensor[:train_size], Y_tensor[:train_size]
X_val, Y_val = X_tensor[train_size:train_size + val_size], Y_tensor[train_size:train_size + val_size]
X_test, Y_test = X_tensor[train_size + val_size:], Y_tensor[train_size + val_size:]

In [28]:
model = Model(vocab_size=tokens_length, num_classes=tokens_length).to(device)
print(sum(p.numel() for p in model.parameters()))

12161591


In [29]:
loss_ar = []
test_loss_ar = []
loss = torch.tensor(0)

batch_size = 32
context_length = 400
accumulation_steps = 10
learning_rate_schedule = {20: 0.01}
num_epochs = 750
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(50):
  if epoch % 3 == 0:
    print(epoch, loss.item())

  optimizer.zero_grad()

  epoch_loss = 0

  for step in range(accumulation_steps):
    ix = torch.randint(0, X_train.shape[0] - context_length, (batch_size,), device=device)

    X_batch = torch.stack([X_train[i:i+context_length] for i in ix])
    Y_batch = torch.stack([Y_train[i+context_length] for i in ix])

    X_batch = X_batch.to(device)
    Y_batch = Y_batch.to(device)

    # forward pass
    loss = model.forward_train(X_batch, Y_batch)
    epoch_loss += loss.item()

    loss.backward()

  if epoch in learning_rate_schedule:
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate_schedule[epoch]

  optimizer.step()

  loss_ar += [epoch_loss / accumulation_steps]

plt.plot(loss_ar, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Test Loss')
plt.legend()
plt.show()


0 0
5 4.504734039306641
10 4.223953723907471
15 4.303626537322998
20 4.1727399826049805
25 4.127109527587891
30 4.080804347991943
35 4.086588382720947
40 3.9969305992126465
45 4.133274555206299


KeyboardInterrupt: 

In [36]:
torch.cuda.empty_cache()

In [38]:
# test dataset
ix = torch.randint(0, X_test.shape[0] - context_length, (10,), device=device)

X_batch = torch.stack([X_test[i:i+context_length] for i in ix])
Y_batch = torch.stack([X_test[i+context_length] for i in ix])
X_batch = X_batch.to(device)
Y_batch = Y_batch.to(device)
model.forward_train(X_batch, Y_batch)
loss.item()

In [None]:
# validate dataset
ix = torch.randint(0, X_val.shape[0] - context_length, (1000,), device=device)

X_batch = torch.stack([X_val[i:i+context_length] for i in ix])
Y_batch = torch.stack([Y_val[i+context_length] for i in ix])
X_batch = X_batch.to(device)
Y_batch = Y_batch.to(device)
model.forward_train(X_batch, Y_batch)
loss.item()

In [12]:
def generate_content(model, start_token, max_length=100, temperature = 1):
  model.eval()

  context = [start_token] * context_length
  generated_tokens = context.copy()

  context_tensor = torch.tensor(context, dtype=torch.long, device=device).unsqueeze(0)

  with torch.no_grad():
    for _ in range(max_length):
      logits = model(context_tensor)
      logits = logits / temperature
      probs = F.softmax(logits, dim=-1)
      next_token = torch.multinomial(probs, 1).item()

      generated_tokens.append(next_token)

      context_tensor = torch.cat([context_tensor[:, 1:], torch.tensor([[next_token]], device=device)], dim=1)

  generated_text = ''.join([itos[idx] for idx in generated_tokens])

  return generated_text

In [39]:
start_token = stoi["K"]

generated_text = generate_content(model, start_token=start_token, max_length=500)

# Print the generated content
print(generated_text)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 1.06 MiB is free. Process 272284 has 14.74 GiB memory in use. Of the allocated memory 13.46 GiB is allocated by PyTorch, and 1.15 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)