## Self Attention Model
\begin{align}
Q &= X W^Q, \\
K &= X W^K, \\
V &= X W^V, \\
\text{Attention}(Q, K, V) 
  &= \operatorname{Softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right)V.
\end{align}

\begin{align}
\text{MultiHead}(X) 
  &= \text{Concat}\big(\text{head}_1, \ldots, \text{head}_h\big) W^O, \\
\text{head}_i 
  &= \operatorname{Attention}(X W_i^Q, X W_i^K, X W_i^V).
\end{align}


In [1]:
import torch
import torch.nn as nn
class myselfattention(nn.Module):
    """Single-head causal scaled dot-product self-attention.

    Computes ``Softmax(Q K^T / sqrt(d_k)) V`` where ``Q``, ``K`` and ``V`` are
    bias-optional linear projections of the input. An upper-triangular mask
    stops every position from attending to positions that come after it.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections (and of the output).
        context_length: Side length of the pre-built causal mask; the input
            must not contain more tokens than this.
        bias: Whether the three projection layers carry a bias term.
    """

    def __init__(self, d_in, d_out,context_length,bias=False):
        super().__init__()
        # Layers are created in the original order (value, query, key) so a
        # seeded initialisation produces the same weights as before.
        self.W_value = nn.Linear(d_in, d_out, bias=bias)
        self.W_query = nn.Linear(d_in, d_out, bias=bias)
        self.W_key = nn.Linear(d_in, d_out, bias=bias)
        # Ones strictly above the diagonal mark the "future" positions.
        # Stored as a buffer so it follows the module across devices.
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Three-dimensional tensor whose middle dimension is the number
                of tokens and whose last dimension is ``d_in``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and ``d_out``
            features per token.
        """
        _, num_tokens, _ = x.shape
        value = self.W_value(x)
        query = self.W_query(x)
        key = self.W_key(x)

        # Row i holds the scores of query i against every key (Q K^T), so the
        # softmax over the last dimension normalises per query.
        attn_scores = query @ key.mT

        # Slice first, then convert: only the part of the mask that is
        # actually needed is turned into booleans.
        causal_mask = self.mask[:num_tokens, :num_tokens].bool()
        # Out-of-place fill keeps the matmul result untouched for autograd.
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)

        attn_weights = torch.softmax(attn_scores / key.shape[-1] ** 0.5, dim=-1)
        return attn_weights @ value
# Smoke test: a random batch of 2 sequences, 3 tokens each, 4 features per token.
myattention = myselfattention(d_in=4, d_out=4, context_length=3)
input = torch.rand(2, 3, 4)  # NOTE: this name shadows the builtin `input`
output = myattention(input)
print(f"output size is: {output.shape}")

output size is: torch.Size([2, 3, 4])


In [2]:
class myselfattention_mh(nn.Module):
    """Multi-head causal self-attention built from independent single heads.

    Every head is a separate ``myselfattention`` module producing
    ``d_out // d_head`` features; the head outputs are concatenated along the
    last dimension and passed through a final linear projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_head: Number of attention heads (despite the name, this is a head
            count, not a per-head width).
        d_out: Total output width; must be a multiple of ``d_head``.
        context_length: Maximum number of tokens, forwarded to every head.
        bias: Whether the linear layers (the heads' projections and the
            output projection) carry a bias term.

    Raises:
        ValueError: If ``d_head`` is not positive or does not divide
            ``d_out`` evenly.
    """

    def __init__(self, d_in, d_head, d_out,context_length,bias=False):
        super().__init__()
        # Fail early: with a remainder, the concatenated heads would be
        # narrower than d_out and W_out would only break later in forward().
        if d_head <= 0 or d_out % d_head != 0:
            raise ValueError(
                f"d_out ({d_out}) must be a positive multiple of the number of heads ({d_head})"
            )
        self.heads = nn.ModuleList(
            [
                myselfattention(d_in, d_out // d_head, context_length, bias)
                for _ in range(d_head)
            ]
        )
        self.W_out = nn.Linear(d_out, d_out, bias)

    def forward(self, x):
        """Run every head on ``x``, concatenate the results and project them."""
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.W_out(out)

In [3]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Trainable parameters of an 8-head attention layer mapping 16 -> 16 features
# (causal mask sized for 7 tokens).
count_parameters(myselfattention_mh(d_in=16, d_head=8, d_out=16, context_length=7))

1024

In [4]:
class mySilu(nn.Module):
    """Tanh-approximated GELU activation.

    NOTE: despite the class name, the formula implemented here is the tanh
    approximation of GELU,
    ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``,
    not SiLU (``x * sigmoid(x)``). The name is kept so existing references
    to the class keep working.
    """

    # sqrt(2/pi) as a plain Python float: computed once instead of building a
    # tensor on every call, and not rounded to float32 when x is float64.
    _SQRT_2_OVER_PI = (2 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the result has the shape of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * x ** 3)
        return 0.5 * x * (1 + torch.tanh(inner))

In [5]:
class ffn(nn.Module):
    """Position-wise feed-forward block.

    Expands the input from ``dim`` to ``4 * dim`` features, applies the
    file's ``mySilu`` activation, projects back to ``dim`` and finishes with
    dropout.

    Args:
        dim: Number of input (and output) features.
        dropout: Drop probability of the trailing dropout layer.
        bias: Whether the two linear layers carry a bias term.
    """

    def __init__(self,dim,dropout=0.1,bias=False):
        super().__init__()
        inner_dim = 4 * dim
        self.W1 = nn.Linear(dim, inner_dim, bias=bias)
        self.W2 = nn.Linear(inner_dim, dim, bias=bias)
        self.dropout = nn.Dropout(p=dropout)
        self.silu = mySilu()

    def forward(self, x):
        """Return ``dropout(W2(activation(W1(x))))``."""
        expanded = self.W1(x)
        activated = self.silu(expanded)
        projected = self.W2(activated)
        return self.dropout(projected)

In [6]:
class myTransformer(nn.Module):
    """Pre-LayerNorm transformer block.

    Two sub-layers are applied in turn, multi-head self-attention and a
    feed-forward network. Each one is preceded by its own LayerNorm and
    followed by dropout and a residual (shortcut) addition.

    Args:
        heads: Number of attention heads.
        dropout: Drop probability used after both sub-layers (and handed to
            the feed-forward network).
        hidden: Feature width of the block's input and output.
        context_length: Maximum number of tokens, forwarded to the attention
            layer.
        bias: Whether the linear layers of the attention and feed-forward
            sub-layers carry a bias term.
    """

    def __init__(self, heads, dropout, hidden,context_length,bias=False):
        super().__init__()
        # `bias` is forwarded here as well, so the flag applies to both
        # sub-layers and not only to the feed-forward network.
        self.myselfattention_mh = myselfattention_mh(hidden, heads, hidden, context_length, bias)
        self.layernorm1 = nn.LayerNorm(hidden)
        self.layernorm2 = nn.LayerNorm(hidden)
        self.ffn = ffn(hidden, dropout, bias)
        self.dropout = dropout  # drop probability (a float, not a module)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Attention sub-layer.
        shortcut = x
        x = self.layernorm1(x)
        x = self.myselfattention_mh(x)
        # Functional dropout honours self.training, so model.eval() really
        # turns it off; a Dropout module created inside forward() is always
        # in training mode.
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = x + shortcut

        # Feed-forward sub-layer.
        shortcut = x
        x = self.layernorm2(x)
        x = self.ffn(x)
        # NOTE(review): `ffn` already ends with its own dropout, so this is a
        # second dropout on the same branch. Kept to preserve training
        # behaviour; confirm whether it is intended.
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        return x + shortcut

In [7]:
# Hyper-parameters read by the model constructor below: number of transformer
# blocks, attention heads per block, dropout probability, maximum sequence
# length, embedding width and vocabulary size.
cfg = dict(
    layers=12,
    heads=12,
    dropout=0.1,
    context_length=1024,
    hidden_dim=768,
    voc_size=50257,
)
class myGPT2(nn.Module):
    """GPT-2 style decoder-only language model.

    Token and learned position embeddings are summed, passed through dropout
    and a stack of ``myTransformer`` blocks, normalised by a final LayerNorm
    and projected to vocabulary logits by a bias-free linear layer.

    Args:
        cfg: Mapping with the keys ``"layers"``, ``"heads"``, ``"dropout"``,
            ``"context_length"``, ``"hidden_dim"`` and ``"voc_size"``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.layers = cfg["layers"]
        self.heads = cfg["heads"]
        self.dropout = cfg["dropout"]  # drop probability (a float, not a module)
        self.context_length = cfg["context_length"]
        self.hidden_dim = cfg["hidden_dim"]
        self.voc_size = cfg["voc_size"]
        # Sub-modules are created in the original order so a seeded
        # initialisation produces the same weights as before.
        self.wte = nn.Embedding(self.voc_size, self.hidden_dim)
        self.wpe = nn.Embedding(self.context_length, self.hidden_dim)
        self.layernorm = nn.LayerNorm(self.hidden_dim)
        self.linear = nn.Linear(self.hidden_dim, self.voc_size, bias=False)
        self.transformerBlocks = nn.ModuleList(
            [
                myTransformer(self.heads, self.dropout, self.hidden_dim, self.context_length)
                for _ in range(self.layers)
            ]
        )

    def forward(self, x):
        """Map token ids to next-token logits.

        Args:
            x: Two-dimensional integer tensor of token ids, (batch, tokens).

        Returns:
            Tensor of shape (batch, tokens, voc_size).
        """
        _, length = x.shape
        # Build the position ids on the input's device; a bare
        # torch.arange() lives on the CPU and breaks when the model does not.
        positions = torch.arange(length, device=x.device)
        x = self.wte(x) + self.wpe(positions)
        # Functional dropout honours self.training, so model.eval() really
        # turns it off; a Dropout module created inside forward() is always
        # in training mode.
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        # Plain loop instead of rebuilding an nn.Sequential on every call.
        for block in self.transformerBlocks:
            x = block(x)
        x = self.layernorm(x)
        return self.linear(x)

In [8]:
# Push a toy batch of token ids (2 sequences x 3 tokens: [[1, 2, 3], [4, 5, 6]])
# through a freshly initialised model and show the shape of the logits.
x = torch.arange(1, 7).reshape(2, 3)
model = myGPT2(cfg=cfg)
model(x).shape

torch.Size([2, 3, 50257])

In [9]:
# Total number of trainable parameters of the full model.
count_parameters(model)

162954240

In [10]:
import tiktoken

# Two example prompts that are tokenized together as one batch.
input_text1 = "I study at Clemson"
input_text2 = "I am a student"
input = [input_text1, input_text2]  # NOTE: this name shadows the builtin `input`

# Load the GPT-2 tokenizer.
enc = tiktoken.get_encoding("gpt2")

# One list of token ids per prompt.
tokens = list(map(enc.encode, input))




In [11]:
# Show the token ids: one list per prompt.
tokens

[[40, 2050, 379, 27801], [40, 716, 257, 3710]]

In [12]:
# Both prompts encode to the same number of tokens, so the nested list converts
# to a rectangular tensor; the result holds one row of logits per token.
model(torch.tensor(tokens)).shape

torch.Size([2, 4, 50257])

In [13]:
def generate(model,tokens,max_length):
    """Greedily extend ``tokens`` by ``max_length`` new tokens.

    At every step the model is run on the current sequence and the id with
    the highest logit at the last position is appended.

    Args:
        model: Callable mapping a (batch, tokens) tensor of ids to a
            (batch, tokens, vocabulary) tensor of logits. When it is an
            ``nn.Module`` it is switched to eval mode for the duration of the
            call (so dropout cannot randomise the greedy choice) and every
            sub-module's training flag is restored afterwards.
        tokens: Two-dimensional integer tensor holding the prompt ids.
        max_length: Number of tokens to append (not the final length).

    Returns:
        Tensor of shape (batch, prompt tokens + max_length): the prompt
        followed by the generated ids.
    """
    is_module = isinstance(model, torch.nn.Module)
    # Remember each sub-module's mode so the caller's setup is restored exactly.
    saved_modes = [(m, m.training) for m in model.modules()] if is_module else []
    # Models exposing `context_length` only see the most recent tokens, so
    # generation keeps working once the sequence outgrows the context window.
    context_length = getattr(model, "context_length", None)

    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_length):
                context = tokens[:, -context_length:] if context_length else tokens
                logits = model(context)
                # Only the last position predicts the next token.
                next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
                tokens = torch.cat([tokens, next_token], dim=1)
    finally:
        for module, mode in saved_modes:
            module.training = mode
    return tokens
# Append 6 greedily chosen tokens to each encoded prompt.
generated = generate(model, torch.tensor(tokens), max_length=6)

In [31]:
# Show the prompt ids followed by the generated ids, one row per prompt.
generated

tensor([[   40,  2050,   379, 27801, 34473, 15460, 44910, 21511, 20771, 29794],
        [   40,   716,   257,  3710, 11818, 24990, 19440, 43833, 22896, 36284]])

In [32]:
# Decode every generated row back to text and print it.
# NOTE: `text` stays defined after the loop (holding the last decoded
# sequence) and is read again by the following cell.
for i, seq in enumerate(generated.tolist()):
    text = enc.decode(seq)
    print(f"Sequence {i}: {text}")

Sequence 0: I study at Clemsonヘラlat Witches weekends educateuko
Sequence 1: I am a student Tan Scientology severityFYworkers vehement


In [33]:
# Round-trip check: re-encode the last decoded sequence (`text` left over from
# the loop above) to compare with the ids it was decoded from.
enc.encode(text)

[40, 716, 257, 3710, 11818, 24990, 19440, 43833, 22896, 36284]