In [1]:
import torch

print(torch.cuda.is_available())  # Check if CUDA (GPU) is available
print(torch.cuda.device_count()) # Print number of available GPUs

import torch.nn as nn
from torch.nn import functional as F

import requests

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

True
1
cuda


In [2]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 50
learning_rate = 1e-2
eval_iters = 200

# GPT-1 hyperparameters
n_embd = 768  # embedding dimension (was smaller in Karpathy's version)
n_head = 12   # number of attention heads (was fewer)
n_layer = 12  # number of transformer blocks (was fewer)
block_size = 512  # context window (matches GPT-1)
dropout = 0.1    # dropout rate

# ------------

torch.manual_seed(1337)

<torch._C.Generator at 0x783fc440e730>

In [3]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
r = requests.get(url=url)
text = r.text

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

In [4]:
class Head(nn.Module):
  """ one head of self-attention """
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)    # (B,T,C)
    q = self.query(x)  # (B,T,C)
    # compute attention scores ("affinities")
    wei = q @ k.transpose(-2,-1) * C**-0.5  # (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
    wei = F.softmax(wei, dim=-1)  # (B, T, T)
    wei = self.dropout(wei)
    # perform the weighted aggregation of the values
    v = self.value(x)  # (B,T,C)
    out = wei @ v  # (B, T, C)
    return out

class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in parallel """
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)
    # Add layer normalization
    self.norm = nn.LayerNorm(n_embd)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):
  """ GPT-1 style feed-forward network """
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embd, 4 * n_embd),
      nn.GELU(),  # GPT-1 used GELU instead of ReLU
      nn.Linear(4 * n_embd, n_embd),
      nn.Dropout(dropout),
    )
    # Add layer normalization
    self.norm = nn.LayerNorm(n_embd)

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  """ Transformer block: communication followed by computation """
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    # GPT-1 used layer norm before attention and ffwd (pre-norm)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    # Change order of layer norm (pre-norm)
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

class GPT1Model(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    # Token embeddings
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    # Transformer blocks
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

    # Final layer norm
    self.ln_f = nn.LayerNorm(n_embd)

    # Language model head
    self.lm_head = nn.Linear(n_embd, vocab_size)

    # Better initialization (as used in GPT-1)
    self.apply(self._init_weights)

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    elif isinstance(module, nn.LayerNorm):
      torch.nn.init.zeros_(module.bias)
      torch.nn.init.ones_(module.weight)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    # Get embeddings
    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb

    # Apply transformer blocks
    x = self.blocks(x)
    x = self.ln_f(x)

    # Get logits
    logits = self.lm_head(x)

    # Calculate loss if targets provided
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, _ = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] / temperature

      # optionally crop probabilities to only the top k options
      if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits[logits < v[:, [-1]]] = float('-inf')

      # apply softmax to convert logits to probabilities
      probs = F.softmax(logits, dim=-1)

      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1)

      # print the generated token
      print(decode([idx_next[0].item()]), end='', flush=True)

      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1)

    print()  # new line at the end
    return idx

model = GPT1Model(vocab_size=vocab_size)
m = model.to(device)

In [7]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters).to(device)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    model.generate(context, max_new_tokens=200)
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

uIe
sd A 
  u sea
rO;ahes ad 
akeosoe 
U 
T    Go eg,oa eh  y fk
,
  ui t
tdoef
l  yntfioh
 e twt -
 dgie wnpeetn  me
aleAs.? oh r h e s rse eU   tB
BeA 
Y gaTohnelhw
oetee ri:oekaelcteeeefe U ose e n
step 0: train loss 3.3527, val loss 3.3916
ue oPsroiea anuPgp u Y;clfHndsBtulUnidoona has, oBr nmnh. b: eyIB d  oitm o'ih'rU;kmano o:nilwroo esnyi dnn.rs iog
u t paeeratOntlt.eYu tes
f eudv lnsoGeseen foa Inhud tprv weecReh noaam O sn

e  tn i
step 50: train loss 3.3205, val loss 3.3597
ael go V t ghhd-
CR r nsD
YySee dr,
, t seeiimiflisisenkig aor l dond cmsomN;Pe,  te bai'

T
L efolhi Do?:T'd heviohteab ?NSebn d nle k t yF fd uaths s
I socheehts d cr,sn .ut,EOtls slLona;atoperas ws
step 100: train loss 3.0422, val loss 3.0539
oian d  I ,he hsnd:

 wean vamis sek.War
hyh m canlhneene w onlpedeowmrutejl tasoanle t nshend fdees oyh t r teru ddRA ly:hoaaa' egteamshe inamyeang reenrtcoctsounettisyir thad
 othau pe d ble rn, yhd
step 150: train loss 2.8802, val loss 2.8920
ae oteshrllr alt oc

In [9]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
model.generate(context, max_new_tokens=1200)

 opnd yelal trs
e s? w tom t
L
Ccuid h I d ro tianeepot hhasoueror d ceptitor nerovet ld mceenre s, ts arororus atofenouer mesttom idng sse bhom mis ln a tenoufo'nte o s
Mnn is n  ainand menekenais go ke whapar a f rest s'sh uendn ansth bea dercaey Ieanlacovotit lareon wigut w' eutathpinast thin
B
Odendstiai'ouiveanot; dyoubes oowe!B
Dtor bee he?
Whr iccengofortuny d otor w scrayosh sive thamW orinc., ith binor, msig h y en l ccowhindeeabhoritilise owo t.
or arr ncie ur,
Ont sand g me doulie t:

Sy s
CBas,etatefot, lngithe t be WEhe icrineloowterat th,AWhane s mmnythinde,olishemeRGhat  s thth,n ebrdelloo the t her;puy

sewcheugcWglellld ow:
IUF hee
Ifldre t perth we,in, s s br wotcele 

J US:



Aanast wyth ony me wanat'treaurhe:
OUCathes t s, t bnoureethr the d mere d bethilthe gofthome tlor ouacather sorge ow hisesplervins toly strthoknenr t Fnebo theshondiabp dIshas
Wtej f;etuturura e:
CNI thon? u y.
AU
Whomuleeonthit wle otegcurdb'tary, Pt ce wowNwer p e r ld t in 'ency; mROnghat  

tensor([[ 0,  1, 53,  ...,  1, 39, 52]], device='cuda:0')

In [12]:
from google.colab import drive
drive.mount('/content/drive')

torch.save(model.state_dict(), '/content/drive/My Drive/gp1-shakespeare.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
