In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import requests

In [2]:
# Download the sales-textbook corpus once and cache it next to the notebook.
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/resolve/main/sales_textbook.txt'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of caching an error page as the corpus.
    response.raise_for_status()
    # Store the raw bytes: this avoids a decode/encode round trip through the
    # platform's default encoding, which may not match the UTF-8 read below.
    with open('sales_textbook.txt', 'wb') as f:
        f.write(response.content)

# Load the cached corpus as one string.
with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# text[:500]

In [3]:
# Hyperparameters
batch_size = 4      # number of sequences processed in parallel per batch
context_size = 16   # length of each input sequence, in tokens
d_model = 64        # width of each token embedding vector
num_heads = 4       # number of heads in multi-head attention
seed = 42           # RNG seed so sampled batches and initial weights are repeatable

# Fail fast on inconsistent settings rather than deep inside a later cell:
# the head split reshapes d_model into (num_heads, d_model // num_heads), and the
# sinusoidal positional encoding fills even and odd feature columns in pairs.
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
assert d_model % 2 == 0, "d_model must be even for the sin/cos positional encoding"

# Seed PyTorch's global generator so a fresh Run All gives the same numbers.
torch.manual_seed(seed)

In [4]:
# Load the tiktoken encoding named "o200k_base". `enc` is used in later cells
# to encode the corpus into token ids and to decode token ids back into text.
import tiktoken
enc = tiktoken.get_encoding("o200k_base")

In [None]:
# Encode the whole corpus and keep it as a 1-D tensor of token ids;
# its length is the number of tokens in the text.
tokenized_text = torch.tensor(enc.encode(text), dtype=torch.long)

# Largest token id that actually occurs in the text (not every entry of the
# tokenizer's vocabulary is necessarily used).
max_token_value = tokenized_text.max().item()

# Train/validation split: the first 90% of the tokens train, the rest validate.
train_idx = int(0.9 * len(tokenized_text))
train_data, val_data = tokenized_text[:train_idx], tokenized_text[train_idx:]

# Quick sanity checks: token count, largest id, and the first 10 token ids.
print(len(tokenized_text))
print(max_token_value)
print(tokenized_text[:10])

76923
199853
tensor([ 45990,    220,     16,     25,  22478, 158287,    326,  29742,   4788,
         81932])


In [6]:
data = train_data

# Draw one random start offset per sequence in the batch. randint's upper bound
# is exclusive, so even the largest start leaves room for the target window,
# which is shifted one token to the right.
batch_idxs = torch.randint(0, len(data) - context_size, (batch_size,))
starts = batch_idxs.tolist()

# Inputs are context_size-token windows; targets are the same windows advanced
# by one position, i.e. the "next token" at every input position.
x_batch = torch.stack([data[s:s + context_size] for s in starts])
y_batch = torch.stack([data[s + 1:s + context_size + 1] for s in starts])

print(batch_idxs)
print(x_batch.shape, y_batch.shape)

tensor([24302, 68156, 63139,  2249])
torch.Size([4, 16]) torch.Size([4, 16])


In [7]:
import pandas as pd

# Put the input batch in a table (one row per sequence) and decode the first
# row back to text to eyeball what the model is being fed.
df = pd.DataFrame(data=x_batch.numpy())
enc.decode(df.iloc[0, :].tolist())

' challenges and demonstrating your understanding, you establish yourself as a reliable and trustworthy salesperson.\n'

In [None]:
# Token embedding table: one d_model-wide row for every token id in
# [0, max_token_value].
input_embedding_lookup_table = nn.Embedding(max_token_value + 1, d_model)

# Look up the embeddings for both token batches:
# (batch_size, context_size) -> (batch_size, context_size, d_model)
x_batch_embedding, y_batch_embedding = (
    input_embedding_lookup_table(token_ids) for token_ids in (x_batch, y_batch)
)

print(input_embedding_lookup_table)
print(input_embedding_lookup_table.weight)
print(x_batch_embedding.shape, y_batch_embedding.shape)

Embedding(199854, 64)
Parameter containing:
tensor([[ 0.9746,  0.0257,  1.2428,  ...,  0.8464, -1.5246,  0.5678],
        [-0.6309, -0.7373,  1.3936,  ..., -0.4268, -1.2555,  0.4132],
        [-1.8322, -0.3063, -0.8633,  ...,  0.3685,  1.7548, -1.1050],
        ...,
        [ 0.4536, -0.3777,  1.2460,  ...,  0.2020,  0.8204,  1.1518],
        [-0.4620, -2.5583, -1.1699,  ...,  0.2427, -0.0967, -0.9821],
        [ 1.6256,  1.3452, -1.8877,  ...,  0.3258,  1.2051,  0.5673]],
       requires_grad=True)
torch.Size([4, 16, 64]) torch.Size([4, 16, 64])


In [9]:
# Build the sinusoidal positional-encoding lookup table
import math

position_encoding_lookup_table = torch.zeros(context_size, d_model)       # (context_size, d_model)
position = torch.arange(0, context_size, dtype=torch.float)[:, None]      # (context_size,) -> (context_size, 1)

print(position_encoding_lookup_table)
print(position_encoding_lookup_table.shape)

# One frequency per pair of feature columns.
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # (d_model/2,)

# Position-by-frequency angle grid, shared by the sine and cosine columns.
angles = position * div_term                                               # (context_size, d_model/2)
position_encoding_lookup_table[:, 0::2] = torch.sin(angles)                # even feature columns
position_encoding_lookup_table[:, 1::2] = torch.cos(angles)                # odd feature columns

# Add a leading batch axis and repeat it (as a view) for every sequence in the batch.
position_encoding_lookup_table = position_encoding_lookup_table[None].expand(batch_size, -1, -1)  # (batch_size, context_size, d_model)

print(position_encoding_lookup_table.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([16, 64])
torch.Size([4, 16, 64])


In [10]:
# Add the positional encoding to both embedded batches so every vector carries
# token identity plus position.
x, y = (
    embedding + position_encoding_lookup_table
    for embedding in (x_batch_embedding, y_batch_embedding)
)

print(x.shape, y.shape)

torch.Size([4, 16, 64]) torch.Size([4, 16, 64])


In [11]:
# Query / key / value projections, each a d_model -> d_model linear layer.
Wq, Wk, Wv = (nn.Linear(d_model, d_model) for _ in range(3))

# The linear layers act on the last axis only, so the batch axis is carried along as-is.
Q, K, V = Wq(x), Wk(x), Wv(x)

print(Q.shape, K.shape, V.shape)


def _split_heads(t):
    """Reshape (batch_size, context_size, d_model) into
    (batch_size, num_heads, context_size, d_model // num_heads)."""
    return t.view(batch_size, context_size, num_heads, d_model // num_heads).transpose(1, 2)


# Create multiple heads: give each head its own slice of the feature axis.
Q, K, V = map(_split_heads, (Q, K, V))

print(Q.shape, K.shape, V.shape)

torch.Size([4, 16, 64]) torch.Size([4, 16, 64]) torch.Size([4, 16, 64])
torch.Size([4, 4, 16, 16]) torch.Size([4, 4, 16, 16]) torch.Size([4, 4, 16, 16])


In [12]:
# Scaled dot-product attention scores between every pair of positions:
# (batch_size, num_heads, context_size, context_size)
scores = (Q @ K.transpose(-2, -1)) / math.sqrt(d_model // num_heads)

# Causal mask: True strictly above the diagonal, so a position cannot attend to
# later positions. Masked scores become -inf and get zero weight after softmax.
mask = ~torch.tril(torch.ones(context_size, context_size)).bool()
scores = scores.masked_fill(mask, float('-inf'))

# Softmax over the last axis (the positions being attended to) turns each row of
# scores into attention weights that sum to 1.
weights = torch.softmax(scores, dim=-1)

# Weighted sum of the values: (batch_size, num_heads, context_size, d_model // num_heads)
output = weights @ V

print(output.shape)

# Concatenate the heads back into a single d_model-wide feature axis.
attention = output.transpose(1, 2).reshape(batch_size, context_size, d_model)

# Output projection
Wo = nn.Linear(d_model, d_model)
output = Wo(attention)
print(output.shape)

torch.Size([4, 4, 16, 16])
torch.Size([4, 16, 64])


In [13]:
# Residual (skip) connection: add the attention block's input back onto its output.
# This overwrites `output`, so re-running the cell would add `x` a second time.
output = x + output

In [None]:
# Layer normalization over the trailing d_model feature axis.
layer_norm = nn.LayerNorm(normalized_shape=d_model)
layer_norm_output = layer_norm(output)

In [None]:
# Position-wise feed-forward network: widen to 4*d_model, apply ReLU, project
# back down to d_model.
feed_forward = nn.Sequential(
    nn.Linear(d_model, 4 * d_model),
    nn.ReLU(),
    nn.Linear(4 * d_model, d_model),
)
output = feed_forward(layer_norm_output)

In [None]:
# Second residual connection (around the feed-forward network), followed by a
# fresh LayerNorm over the d_model axis.
layer_norm = nn.LayerNorm(d_model)
output = layer_norm(layer_norm_output + output)

print(output.shape)

torch.Size([4, 16, 64])


In [None]:
# Final linear layer: project each position's d_model features to one score per
# token id, used in the next cell to predict the next token.
lm_head = nn.Linear(in_features=d_model, out_features=max_token_value + 1)
output = lm_head(output)
print(output.shape)

torch.Size([4, 16, 199854])


In [33]:
# Turn the final-layer scores into a probability distribution over token ids.
probs = F.softmax(output, dim=-1)
# `logits` is kept as an alias for code that already uses the name, but the
# values are softmax probabilities, not raw logits.
logits = probs

# Distribution predicted at the first position of the first sequence.
first_position_probs = probs[0, 0]
predicted_index = torch.argmax(first_position_probs, dim=-1).item()

print(first_position_probs)
# Tensor.max() reduces in one vectorized call. Python's builtin max() would
# iterate the tensor element by element, which is very slow for a row with
# one entry per token id.
print(first_position_probs.max().item())

print(predicted_index)
print(enc.decode([predicted_index]))

tensor([6.0390e-06, 4.8566e-06, 6.1546e-06,  ..., 1.4018e-05, 2.5888e-06,
        1.2664e-05], grad_fn=<SelectBackward0>)
5.970880374661647e-05
188862
 perioden
