In [5]:
# Imports
import torch
from torch import nn
from torch.nn import functional as F

import numpy as np

from matplotlib import pyplot as plt

import time

import pandas as pd

import urllib.request

In [19]:
# Model parameters
MASTER_CONFIG = {
    "batch_size": 8,      # Number of sequences sampled per batch
    "context_window": 16  # Number of characters (tokens) in each sequence
}

# Data Preprocessing

In [7]:
# Tiny Shakespeare
# Download the corpus and save it next to the notebook under `file_name`.
# NOTE: the file is fetched again every time this cell is executed.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_name = "tinyshakespeare.txt"
# urlretrieve returns a (local filename, headers) tuple, which is what the cell displays
urllib.request.urlretrieve(url, file_name)

('tinyshakespeare.txt', <http.client.HTTPMessage at 0x7eab0accaf50>)

In [10]:
# Create vocab list of unique chars
# Read the whole corpus as one string. The context manager closes the file
# handle deterministically, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open("tinyshakespeare.txt", "r", encoding="utf-8") as corpus_file:
    lines = corpus_file.read()

# Every distinct character is one token; sorting makes the id assignment stable
vocab = sorted(set(lines))

print("First 10 chars of vocab list:", vocab[:10])
print("Vocab size:", len(vocab))

First 10 chars of vocab list: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']
Vocab size: 65


In [11]:
# Int to string: position in the sorted vocab -> character
itos = dict(enumerate(vocab))

# String to int: inverse lookup, character -> position in the vocab
stoi = {ch: idx for idx, ch in itos.items()}

In [15]:
# Encoding and decoding
def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Each character is looked up in the `stoi` table; a character that is
    missing from the table raises KeyError.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the sequence of integer ids `l`.

    Each id is looked up in the `itos` table and the resulting characters are
    concatenated; an id that is missing from the table raises KeyError.
    """
    return ''.join(map(itos.__getitem__, l))

# Example: show the ids for a short string, then round-trip them back through
# decode, which should print the original text unchanged
print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [16]:
# Convert dataset into a tensor
# int8 keeps the full corpus compact; it can only hold ids up to 127, which is
# enough for the 65-character vocab reported above. A larger vocabulary would
# need a wider dtype. Batches are cast to long when they are sampled.
dataset = torch.tensor(encode(lines), dtype=torch.int8)

# Approximately 1 million tokens (chars)
print(dataset.shape)

torch.Size([1115394])


In [17]:
# Split dataset into train/val/test and sample a random batch from one split
def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):
  """Sample a random batch of (input, target) sequences from one split of `data`.

  Args:
    data: 1-D tensor of token ids (indexed with slices and `.size(0)`).
    split: "train", "val" or "test". Any other value falls back to the
      training portion, as before.
    batch_size: number of sequences to sample.
    context_window: length of every sampled sequence.
    config: unused; kept so existing callers keep working.

  Returns:
    A tuple `(x, y)` of long tensors, each of shape
    `(batch_size, context_window)`, where `y` is `x` shifted one position to
    the right (the next-token targets).

  Raises:
    ValueError: if the chosen split has fewer than `context_window + 1`
      tokens, so no full input/target window fits.
  """
  # Train/val/test split = 0.8/0.1/0.1
  train_end = int(0.8 * len(data))
  val_end = int(0.9 * len(data))
  splits = {
    "train": data[:train_end],
    "val": data[train_end:val_end],
    "test": data[val_end:],
  }

  # Determine which split to sample from (unknown names default to train)
  batch_data = splits.get(split, splits["train"])

  # A window starting at i reads tokens i .. i + context_window (inclusive of
  # the extra target token), so valid starts are 0 .. size - context_window - 1.
  num_starts = batch_data.size(0) - context_window
  if num_starts < 1:
    raise ValueError(
      f"split '{split}' has {batch_data.size(0)} tokens, "
      f"need at least {context_window + 1} for context_window={context_window}"
    )

  # batch_size number of random starting points in the data.
  # torch.randint's upper bound is exclusive, so passing num_starts makes the
  # last valid window reachable (the previous bound skipped it).
  ix = torch.randint(0, num_starts, (batch_size,)).tolist()

  # Input and target sequences (targets are the inputs shifted by one token)
  x = torch.stack([batch_data[i:i + context_window] for i in ix]).long()
  y = torch.stack([batch_data[i + 1:i + context_window + 1] for i in ix]).long()

  return x, y

In [23]:
# Obtain batches for training
xs, ys = get_batches(
    dataset,
    "train",
    MASTER_CONFIG["batch_size"],
    MASTER_CONFIG["context_window"],
)

# Decode every input row together with its target row
decoded_samples = [
    (decode(inputs.tolist()), decode(targets.tolist()))
    for inputs, targets in zip(xs, ys)
]

# (sample, target)
print(decoded_samples)

[('desires,\nI am fr', 'esires,\nI am fri'), ('r heirs, God, if', ' heirs, God, if '), ('hurch,\nShall hap', 'urch,\nShall happ'), ('hat make her goo', 'at make her good'), ('nown to the camp', 'own to the camp,'), ('And then awake a', 'nd then awake as'), ('onour\nThan one o', 'nour\nThan one on'), ('\nNot fearing out', 'Not fearing outw')]


# Loss

In [24]:
# Computes the mean loss over `num_batches` random batches (10 by default) for train/val
@torch.no_grad()
def evaluate_loss(model, config=MASTER_CONFIG, num_batches=10):
  """Estimate the model's mean loss on the train and val splits.

  Args:
    model: callable as `model(xb, yb)` returning `(output, loss)`; must
      provide `eval()` and `train()`.
    config: mapping with "batch_size" and "context_window", forwarded to
      `get_batches`.
    num_batches: how many random batches to average per split.

  Returns:
    A dict `{"train": mean_loss, "val": mean_loss}`.

  The model is put in evaluation mode while the losses are computed and is
  returned to whatever mode it was in on entry, even if an error occurs.
  """
  out = {}

  # Remember the current mode so evaluation does not silently flip an
  # eval-mode model into train mode; objects without the flag are treated as
  # training, which matches the previous unconditional model.train().
  was_training = getattr(model, "training", True)

  # Set the model to evaluation mode
  model.eval()

  try:
    # Iterate through train and val splits
    for split in ["train", "val"]:
      losses = []

      # Average over several independently sampled batches
      for _ in range(num_batches):
        # Input and target sequences
        xb, yb = get_batches(dataset, split, config["batch_size"], config["context_window"])
        # Run the model and calculate the loss
        _, loss = model(xb, yb)
        losses.append(loss.item())

      out[split] = np.mean(losses)
  finally:
    # Restore the mode the caller had; the model is already in eval mode
    # here, so only a switch back to training is needed.
    if was_training:
      model.train()

  return out

# Base NN