In [1]:
# Standard libraries
import os
import numpy as np
import random
import math
from functools import partial

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

In [2]:
import torchvision
from torchvision.datasets import CIFAR100
from torchvision import transforms

In [3]:
import pytorch_lightning as pl
# Set the global random seed to 42 so that runs of this notebook are repeatable.
pl.seed_everything(42)

Global seed set to 42


42

In [4]:
# Pick the compute device: Apple's Metal backend (MPS) when it is available,
# otherwise fall back to the CPU. This only selects the device; it does not
# change any determinism settings.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: mps


In [5]:
# Folders for the datasets and for the pretrained / saved model checkpoints.
# The defaults are the original author's local folders; set the environment
# variables NLP_BOOK_DATASET_PATH / NLP_BOOK_CHECKPOINT_PATH to run the
# notebook on another machine without editing this cell.
dataset_path = os.environ.get(
    "NLP_BOOK_DATASET_PATH", "/Users/joesh/Documents/nlp_book/data"
)
checkpoint_path = os.environ.get(
    "NLP_BOOK_CHECKPOINT_PATH", "/Users/joesh/Documents/nlp_book/models"
)

In [6]:
import urllib.request
from urllib.error import HTTPError

# GitHub URL prefix under which the saved models for this tutorial are stored;
# each file name below is appended to it to form the download URL.
base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/"
# Checkpoint files to download (names are relative to base_url and are saved
# under checkpoint_path with the same relative name).
pretrained_files = ["ReverseTask.ckpt", "SetAnomalyTask.ckpt"]

# Create the checkpoint folder if it doesn't exist yet (no error if it already does).
os.makedirs(checkpoint_path, exist_ok=True)

In [7]:
# Download every pretrained checkpoint that is not already present on disk.
for file_name in pretrained_files:
    file_path = os.path.join(checkpoint_path, file_name)

    # A name containing "/" lives in a sub-folder, which must exist before
    # the file can be written there.
    if "/" in file_name:
        parent_dir = file_path.rsplit("/", 1)[0]
        os.makedirs(parent_dir, exist_ok=True)

    # Nothing to do when the file was downloaded on an earlier run.
    if os.path.isfile(file_path):
        continue

    file_url = base_url + file_name
    print(f"Downloading {file_url}...")
    try:
        urllib.request.urlretrieve(file_url, file_path)
    except HTTPError as e:
        # Report the failure and carry on with the remaining files.
        print("Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", e)

In [8]:
# Scaled dot-product attention. Note the argument order: key, value, query.
def scaled_dot_product(k, v, q, mask=None):
    """Compute scaled dot-product attention.

    Note the argument order: keys, values, queries.

    Args:
        k: Key tensor. Its last two dimensions are swapped before it is
            multiplied with ``q``, so its last dimension must match ``q``'s.
        v: Value tensor; it is multiplied by the attention weights.
        q: Query tensor. The size of its last dimension is the ``d_k`` used
            to scale the logits.
        mask: Optional tensor broadcastable to the attention logits.
            Positions where ``mask == 0`` are excluded from the softmax.

    Returns:
        A tuple ``(values, attention)``: the attention-weighted values and
        the attention weights (softmax over the last dimension).
    """
    d_k = q.size(-1)
    # Similarity of every query with every key, scaled by sqrt(d_k).
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Fill masked positions with the most negative finite value of the
        # logits' dtype. A hard-coded constant such as -9e15 cannot be
        # represented in float16 and makes masked_fill raise there; for
        # float32/float64 the resulting attention weights are unchanged.
        fill_value = torch.finfo(attn_logits.dtype).min
        attn_logits = attn_logits.masked_fill(mask == 0, fill_value)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [9]:
# scaled_dot_product works on the last two dimensions only, so any extra
# leading dimensions (e.g. a batch dimension) are supported as well.
# Here we try it on a small, unbatched example with random inputs.

seq_len, d_k = 3, 2
pl.seed_everything(42)

# Draw the tensors in the order q, k, v so the random stream is consumed
# in a fixed, reproducible order.
q, k, v = (torch.randn(seq_len, d_k) for _ in range(3))

# Keyword arguments make the (k, v, q) parameter order explicit.
values, attention = scaled_dot_product(k=k, v=v, q=q)

for heading, tensor in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("Values\n", values),
    ("Attention\n", attention),
):
    print(heading, tensor)

Global seed set to 42


Q
 tensor([[ 0.3367,  0.1288],
        [ 0.2345,  0.2303],
        [-1.1229, -0.1863]])
K
 tensor([[ 2.2082, -0.6380],
        [ 0.4617,  0.2674],
        [ 0.5349,  0.8094]])
V
 tensor([[ 1.1103, -1.6898],
        [-0.9890,  0.9580],
        [ 1.3221,  0.8172]])
Values
 tensor([[ 0.5698, -0.1520],
        [ 0.5379, -0.0265],
        [ 0.2246,  0.5556]])
Attention
 tensor([[0.4028, 0.2886, 0.3086],
        [0.3538, 0.3069, 0.3393],
        [0.1303, 0.4630, 0.4067]])
