In [1]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [2]:
# Hyperparameters
batch_size = 4  # Number of sequences sampled into each batch
context_length = 16  # Number of tokens in each sequence of a batch
d_model = 64  # The vector size of the token embeddings
num_layers = 8  # Number of transformer blocks
num_heads = 4  # Number of heads in multi-head attention; head_size is derived as d_model // num_heads
learning_rate = 1e-3  # 0.001
dropout = 0.1  # Dropout rate
max_iters = 500  # Total number of training iterations
eval_interval = 50  # How often to evaluate the model
eval_iters = 20  # How many iterations to average the loss over when evaluating the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use the GPU when one is available, otherwise the CPU

# Fail early on inconsistent settings instead of hitting an opaque shape error later:
# the attention cells reshape d_model into (num_heads, d_model // num_heads), and the
# sinusoidal position encoding fills even and odd columns with equally sized halves.
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
assert d_model % 2 == 0, "d_model must be even for the sin/cos position encoding"

# Seed PyTorch's global RNG so sampling and weight initialisation are repeatable
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x2392d1a2830>

In [3]:
# Read the whole training corpus into memory as a single string
with open('./sales_textbook.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Preview the first 100 characters
print(text[:100])

Chapter 1: Building Rapport and Capturing Attention
Subpoint: Understanding the Importance of Buildi


### tokenizer 分词

In [4]:
# Tokenize the source text with tiktoken's "cl100k_base" encoding
encoding = tiktoken.get_encoding("cl100k_base")
token_ids = encoding.encode(text)

# Keep the token ids as a 1-D long tensor on the selected device
tokenized_text = torch.tensor(token_ids, dtype=torch.long, device=device)

# Largest token id that occurs in this text; the embedding table is sized from it later
max_token_value = tokenized_text.max().item()

print(f"Tokenized text size: {len(tokenized_text)}")
print(f"The maximum value in the tokenized text is: {max_token_value}")

Tokenized text size: 77919
The maximum value in the tokenized text is: 100069


In [6]:
# Hold out the last 10% of the token stream for validation
n_tokens = len(tokenized_text)
split_idx = int(n_tokens * 0.9)
train_data, val_data = tokenized_text[:split_idx], tokenized_text[split_idx:]
len(train_data), len(val_data)

(70127, 7792)

In [7]:
# Prepare data for training batch
data = train_data

# Draw batch_size random start offsets. `high` is exclusive, so the largest offset
# still leaves room for context_length tokens plus the one-step-shifted target.
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
print(idxs)

# Inputs are windows of context_length tokens; targets are the same windows shifted by one token
x_batch = torch.stack([data[idx:idx + context_length] for idx in idxs])
y_batch = torch.stack([data[idx + 1:idx + context_length + 1] for idx in idxs])
print(x_batch.shape, y_batch.shape)  # report both shapes (previously printed x_batch twice)
print(x_batch.data)

# Decode the first input/target pair token by token; .tolist() hands plain ints to the tokenizer
[encoding.decode([i]) for i in x_batch[0].tolist()], [encoding.decode([i]) for i in y_batch[0].tolist()]

tensor([35754, 55550, 63572,  1447])
torch.Size([4, 16]) torch.Size([4, 16])
tensor([[  279,  6763,  1920,    13,   578,  5845,   311, 13750, 19570,   279,
           907,   323,  7720,   315,  1057,  3956],
        [ 3495, 14955,    11,   477,  5064, 23146,   430,  9788,   279, 66732,
           315,   701, 10209,    13,  3296, 32644],
        [38769, 10742,    11, 20958,   264,  6928, 19451,    11, 11125, 64784,
            11,   323, 56501, 54111,   439,  6975],
        [43496,   872,  8830,   719,  1101,  3727,   279,  6130,  2733,  6755,
           323, 16365,   627, 29831, 19682,  5900]], device='cuda:0')


([' the',
  ' sales',
  ' process',
  '.',
  ' The',
  ' ability',
  ' to',
  ' effectively',
  ' communicate',
  ' the',
  ' value',
  ' and',
  ' benefits',
  ' of',
  ' our',
  ' products'],
 [' sales',
  ' process',
  '.',
  ' The',
  ' ability',
  ' to',
  ' effectively',
  ' communicate',
  ' the',
  ' value',
  ' and',
  ' benefits',
  ' of',
  ' our',
  ' products',
  ' or'])

In [8]:
# Illustration only: print the input batch as a table without wrapping wide rows
pd.set_option('display.expand_frame_repr', False)
x_batch_frame = pd.DataFrame(x_batch.detach().cpu().numpy())
print("Our batches:\n", x_batch_frame)

Our batches:
       0      1     2      3     4      5      6      7      8      9    10     11     12     13     14     15
0    279   6763  1920     13   578   5845    311  13750  19570    279  907    323   7720    315   1057   3956
1   3495  14955    11    477  5064  23146    430   9788    279  66732  315    701  10209     13   3296  32644
2  38769  10742    11  20958   264   6928  19451     11  11125  64784   11    323  56501  54111    439   6975
3  43496    872  8830    719  1101   3727    279   6130   2733   6755  323  16365    627  29831  19682   5900


### Embedding 词向量嵌入

In [9]:
# Token embedding look-up table: one d_model-sized row per token id from 0 to max_token_value.
# (Original author's note: GPT-3 uses a 50257 x 12288 table.)
vocab_size = max_token_value + 1
token_embedding_lookup_table = nn.Embedding(vocab_size, d_model).to(device)
print("Token Embedding Look-up table: ", token_embedding_lookup_table)


Token Embedding Look-up table:  Embedding(100070, 64)


In [10]:
# Spot-check: show the token id stored at position 61650 of `data`
# NOTE(review): the index looks arbitrary — confirm whether it has any special meaning
data[61650]

tensor(907, device='cuda:0')

In [13]:

pd.set_option('expand_frame_repr', True)

# Refer to the embedding weights by name. The original used IPython's `_` (the previous
# cell's output), which only works when cells happen to run in one particular order.
# NOTE(review): `_` presumably held these weights — confirm against the deleted cells.
embedding_weights = token_embedding_lookup_table.weight
pd.DataFrame(embedding_weights.detach().cpu().numpy())  # built but not shown: only the last expression is displayed
embedding_weights.shape

torch.Size([100070, 64])

In [14]:
# Look up the embedding vectors for the input and target batches.
# Each result has shape [batch_size, context_length, d_model], i.e. [4, 16, 64] here.
x_batch_embedding, y_batch_embedding = (
    token_embedding_lookup_table(token_batch.detach().to(device))
    for token_batch in (x_batch, y_batch)
)

x_batch_embedding.shape, y_batch_embedding.shape
# Show the embeddings of the first sequence in the batch
first_sequence_embedding = x_batch_embedding[0].detach().cpu().numpy()
pd.DataFrame(first_sequence_embedding)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,1.399742,-1.206918,0.32799,-0.25113,0.531874,-0.248193,0.351938,-0.389238,0.129977,-1.20758,...,-0.547752,1.515671,1.223743,0.83367,-0.404194,0.563555,0.591292,-0.724745,1.670105,-0.241721
1,0.566486,-1.102276,1.712332,-0.354509,0.550577,-0.707943,-0.743899,0.757765,0.018193,1.39238,...,0.873562,1.226714,0.794431,0.598629,0.884421,0.03252,1.353617,0.059697,1.17251,0.527427
2,-0.426478,1.717362,-0.34381,-0.917124,-0.27361,0.695366,-0.849842,-1.301135,-0.162554,-0.25281,...,-0.93496,1.145729,-1.91415,-0.447346,0.597272,1.673483,-1.969475,0.397835,-0.438475,-0.562923
3,0.709939,1.369311,-0.707588,1.538689,-2.110915,0.441344,-0.005807,0.171597,-0.296632,0.20732,...,0.071533,-0.735549,0.069967,-2.74475,1.087368,-0.997812,0.714992,-1.357311,1.603957,0.92029
4,-1.963246,0.298927,0.131364,0.082995,0.153765,-0.821641,-1.220109,-1.088038,1.535371,1.829628,...,0.531913,-0.567854,-2.390947,-0.086596,0.066017,0.655226,0.624369,-0.763375,-0.692774,-0.007724
5,0.811149,0.435134,1.13103,0.816734,-1.013971,-0.052429,-0.527541,-0.710573,-0.163887,-1.343154,...,-0.046995,-1.201052,-0.927833,0.322523,0.586139,0.108184,-1.653296,1.918813,0.941642,0.58433
6,0.317549,2.106441,-0.09221,0.636316,-0.912476,-1.975633,-0.068806,0.201157,0.333519,0.151939,...,0.200116,0.051824,1.304806,0.517675,0.049345,0.044632,1.346794,-0.32139,-0.478787,-0.16692
7,-2.025703,1.256391,-0.318619,1.432163,1.644837,-1.910154,-1.001209,-0.976038,1.502204,0.841974,...,-2.33957,0.190785,-0.0552,2.281739,-0.417175,-0.801704,-1.393716,1.863095,-0.393567,-0.131746
8,2.906978,0.09202,-0.785242,0.609121,-0.79018,-0.026004,-1.156866,0.398455,-0.455395,-0.251288,...,3.363073,-0.796739,1.757077,1.52669,-0.654219,1.660685,0.965431,0.618787,1.662946,1.768386
9,1.399742,-1.206918,0.32799,-0.25113,0.531874,-0.248193,0.351938,-0.389238,0.129977,-1.20758,...,-0.547752,1.515671,1.223743,0.83367,-0.404194,0.563555,0.591292,-0.724745,1.670105,-0.241721


### 位置编码 Position Encoding PE

In [15]:
# Define Position Encoding look-up table (sinusoidal).
# context_length, d_model and batch_size come from the hyperparameter cell. Re-declaring
# context_length here would silently override that setting and make this table disagree
# with the batches that were already sampled.
position_encoding_lookup_table = torch.zeros(context_length, d_model)
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)  # [context_length, 1]

# One frequency per (sin, cos) column pair: 1 / 10000^(2i / d_model), computed in log space
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000) / d_model))

# Even columns hold the sine, odd columns the cosine (requires an even d_model)
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)

# Add a batch dimension; expand() repeats the table as a view without copying it
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)

print("Position Encoding Look-up Table: ", position_encoding_lookup_table.shape) # [4, 16, 64] [batch_size, context_length, d_model]
pd.DataFrame(position_encoding_lookup_table[0].detach().cpu().numpy())


Position Encoding Look-up Table:  torch.Size([4, 16, 64])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.841471,0.540302,0.681561,0.731761,0.533168,0.846009,0.409309,0.912396,0.310984,0.950415,...,0.000422,1.0,0.000316,1.0,0.000237,1.0,0.000178,1.0,0.000133,1.0
2,0.909297,-0.416147,0.99748,0.070948,0.902131,0.431463,0.746903,0.664932,0.591127,0.806578,...,0.000843,1.0,0.000632,1.0,0.000474,1.0,0.000356,1.0,0.000267,1.0
3,0.14112,-0.989992,0.778273,-0.627927,0.993253,-0.115966,0.953634,0.300967,0.812649,0.582754,...,0.001265,0.999999,0.000949,1.0,0.000711,1.0,0.000533,1.0,0.0004,1.0
4,-0.756802,-0.653644,0.141539,-0.989933,0.778472,-0.62768,0.993281,-0.11573,0.953581,0.301137,...,0.001687,0.999999,0.001265,0.999999,0.000949,1.0,0.000711,1.0,0.000533,1.0
5,-0.958924,0.283662,-0.571127,-0.820862,0.323935,-0.946079,0.858896,-0.51215,0.999947,-0.010342,...,0.002108,0.999998,0.001581,0.999999,0.001186,0.999999,0.000889,1.0,0.000667,1.0
6,-0.279415,0.96017,-0.977396,-0.211416,-0.230368,-0.973104,0.574026,-0.818837,0.947148,-0.320796,...,0.00253,0.999997,0.001897,0.999998,0.001423,0.999999,0.001067,0.999999,0.0008,1.0
7,0.656987,0.753902,-0.859313,0.511449,-0.713721,-0.70043,0.188581,-0.982058,0.800422,-0.599437,...,0.002952,0.999996,0.002214,0.999998,0.00166,0.999999,0.001245,0.999999,0.000933,1.0
8,0.989358,-0.1455,-0.280228,0.959933,-0.977262,-0.212036,-0.229904,-0.973213,0.574318,-0.818632,...,0.003374,0.999994,0.00253,0.999997,0.001897,0.999998,0.001423,0.999999,0.001067,0.999999
9,0.412118,-0.91113,0.449194,0.893434,-0.939824,0.34166,-0.608108,-0.793854,0.291259,-0.956644,...,0.003795,0.999993,0.002846,0.999996,0.002134,0.999998,0.0016,0.999999,0.0012,0.999999


In [None]:
# Illustration only
def visualize_pe(pe):
    """Draw a 2-D positional-encoding array as a heatmap with a colorbar and show it.

    Rows are plotted along the y axis ("Position Index") and columns along the
    x axis ("Encoding Dimension").
    """
    ax = plt.gca()
    heatmap = ax.imshow(pe, aspect="auto")
    ax.set_title("Positional Encoding")
    ax.set_xlabel("Encoding Dimension")
    ax.set_ylabel("Position Index")
    plt.colorbar(heatmap, ax=ax)
    plt.show()

# The table is identical for every batch element, so plot the first one
position_encoding_lookup_table2_np = position_encoding_lookup_table[0].cpu().numpy()
visualize_pe(pe=position_encoding_lookup_table2_np)

In [None]:
# Inject position information by adding the positional encoding to the token embeddings.
# Both results keep the shape [batch_size, context_length, d_model] ([4, 16, 64] here).
pe_on_device = position_encoding_lookup_table.to(device)
input_embedding_x = x_batch_embedding + pe_on_device
input_embedding_y = y_batch_embedding + pe_on_device

# Show the position-aware embeddings of the first sequence
pd.DataFrame(input_embedding_x[0].detach().cpu().numpy())

### 多头注意力 Multi-Head Attention

In [None]:
# Self-attention: query, key and value all start from the same input tensor
X = input_embedding_x  # [4, 16, 64] [batch_size, context_length, d_model]
query, key, value = X, X, X
query.shape

In [1]:
# Learnable projections for Query, Key and Value
# (Original author's note: in GPT-3 these are 12228*12228*3.)
Wq = nn.Linear(d_model, d_model).to(device)
Wk = nn.Linear(d_model, d_model).to(device)
Wv = nn.Linear(d_model, d_model).to(device)

# Project, then split the last dimension into (num_heads, head_size):
# [4, 16, 64] -> [4, 16, 4, 16]
head_shape = (batch_size, -1, num_heads, d_model // num_heads)
Q = Wq(query).view(head_shape)
K = Wk(key).view(head_shape)
V = Wv(value).view(head_shape)

# Illustration only: print one (num_heads x head_size) table per position of every sequence
pd.set_option('expand_frame_repr', False)
q_tables = Q.detach().cpu().numpy().reshape(-1, num_heads, d_model // num_heads)
for q_table in q_tables:
    print(pd.DataFrame(q_table))

print(Q.shape) # [4, 16, 4, 16] [batch_size, context_length, num_heads, head_size]

NameError: name 'nn' is not defined

In [None]:
# Swap the context_length and num_heads axes so that each head becomes its own
# [context_length, head_size] matrix:
# [batch_size, context_length, num_heads, head_size] -> [batch_size, num_heads, context_length, head_size]
Q, K, V = (projected.transpose(1, 2) for projected in (Q, K, V))  # each [4, 4, 16, 16]

In [None]:
# Calculate the scaled dot-product attention score: (Q @ K^T) / sqrt(head_size)
attention_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_model // num_heads) # [4, 4, 16, 16]

# Illustration only: plot the first head of the first sequence. The heatmap now uses
# index [0, 0] so it matches the axis labels (tokens of x_batch[0]) and the table
# below; it previously plotted [1, 1].
plt.imshow(attention_score[0, 0].detach().cpu().numpy(), cmap="Accent", aspect="auto")
plt.title("Attention(Q @ K)")
plt.xlabel(encoding.decode(x_batch[0].tolist()))
plt.ylabel(encoding.decode(x_batch[0].tolist()))
plt.colorbar()
pd.DataFrame(attention_score[0][0].detach().cpu().numpy())

In [None]:
# Apply the causal mask: the strictly upper triangle (key position > query position) is
# set to -inf so that softmax gives those positions zero weight.
# The mask is created directly on the scores' device rather than on the CPU and moved.
causal_mask = torch.triu(
    torch.ones(attention_score.shape[-2:], device=attention_score.device),
    diagonal=1,
).bool()  # [context_length, context_length], broadcast over batch and heads
attention_score = attention_score.masked_fill(causal_mask, float('-inf')) # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]

# Illustration only: plot the first head of the first sequence. The heatmap now uses
# index [0, 0] so it matches the axis labels and the table below; it previously
# plotted [1, 1].
plt.imshow(attention_score[0, 0].detach().cpu().numpy(), cmap="Accent", aspect="auto")
plt.title("Attention(Q,K)")
plt.xlabel(encoding.decode(x_batch[0].tolist()))
plt.ylabel(encoding.decode(x_batch[0].tolist()))
plt.colorbar()
pd.DataFrame(attention_score[0][0].detach().cpu().numpy())

In [None]:
# Normalise every query row into a probability distribution over the key positions
attention_score = attention_score.softmax(dim=-1)  # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]

# Show the attention weights of the first head of the first sequence
first_head_weights = attention_score[0][0].detach().cpu().numpy()
pd.DataFrame(first_head_weights)

In [None]:
from bertviz import head_view

# head_view takes a list with one attention tensor per layer, each shaped
# [1, num_heads, seq_len, seq_len]. This walkthrough has a single attention layer and
# the token labels below come from x_batch[0], so only that sequence's attention is
# passed; passing every batch element would show other sequences' attention under
# these tokens.
att_first_head = [attention_score[0].detach().unsqueeze(0)]
print(att_first_head[0].shape)

# One plain string per token. A BPE token may hold only part of a multi-byte UTF-8
# character, so undecodable bytes are replaced instead of raising UnicodeDecodeError.
token_list = [
    encoding.decode_single_token_bytes(i).decode('utf-8', errors='replace')
    for i in x_batch[0].tolist()
]
head_view(att_first_head, token_list, prettify_tokens=False)

In [None]:
# Weight the value vectors by the attention probabilities
print(attention_score.shape)  # [4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(V.shape)  # [4, 4, 16, 16] [batch_size, num_heads, context_length, head_size]
A = attention_score @ V  # [4, 4, 16, 16] [batch_size, num_heads, context_length, head_size]
print(A.shape)

In [None]:
# Merge the heads back together: move num_heads next to head_size, then flatten the two
# axes into d_model.
# [4, 4, 16, 16] -> [4, 16, 4, 16] -> [4, 16, 64] [batch_size, context_length, d_model]
A = A.transpose(1, 2).reshape(batch_size, -1, d_model)
A.shape

In [None]:
# Output projection applied to the concatenated heads
Wo = nn.Linear(d_model, d_model).to(device)
output = Wo(A)  # [4, 16, 64] [batch_size, context_length, d_model]
print(output.shape)

# Show the projected attention output of the first sequence
first_sequence_output = output[0].detach().cpu().numpy()
pd.DataFrame(first_sequence_output)

In [None]:
# Residual connection: add the attention block's input X back onto its output
output = torch.add(output, X)

### 层归一化 Layer Normalization

In [None]:
# Layer normalization over the last (d_model) dimension
layer_norm = nn.LayerNorm(normalized_shape=d_model).to(device)
output_layernorm = layer_norm(output)

### FFN 全连接网络

In [None]:
# Position-wise feed-forward network: expand to 4 * d_model, apply ReLU, project back.
# (Original author's note: GPT-2 uses GELU instead of ReLU.)
feed_forward = nn.Sequential(
    nn.Linear(d_model, d_model * 4),
    nn.ReLU(),
    nn.Linear(d_model * 4, d_model),
).to(device)
output = feed_forward(output_layernorm)

# Dropout is applied in training mode (train=True), so this cell is stochastic
output = torch.dropout(output, p=dropout, train=True)

In [None]:
# Last step of the transformer block: residual connection around the feed-forward
# network (its input was output_layernorm), followed by a fresh layer normalization
layer_norm = nn.LayerNorm(d_model).to(device)
output = layer_norm(output + output_layernorm)
print(output.shape)

### 输出映射到词表 Output Projection to the Vocabulary

In [None]:
# Final linear layer: map every d_model vector to one score (logit) per token id
vocab_projection = nn.Linear(d_model, max_token_value + 1).to(device)
logits = vocab_projection(output)

# Show the logits of the first sequence
pd.DataFrame(logits[0].detach().cpu().numpy())

In [None]:
# Turn the logits into probabilities over the vocabulary.
# Softmax is normally used at inference time; training would use torch.nn.CrossEntropyLoss
# on the logits instead. It is applied here purely for illustration.
probabilities = logits.softmax(dim=-1)
first_sequence_probabilities = probabilities[0].detach().cpu().numpy()
pd.DataFrame(first_sequence_probabilities)
probabilities.shape

In [None]:
# Most likely next token for the first position of the first sequence, decoded back to text
predicted_index = probabilities[0, 0].argmax().item()
encoding.decode([predicted_index])

In [None]:
# For comparison, decode the first input sequence and then its shifted-by-one target
for token_batch in (x_batch, y_batch):
    print(encoding.decode(token_batch[0].tolist()))