In [None]:
# Importing libraries and settings
import torch  # PyTorch is a Python library for Machine Learning and Deep Learning.
import torch.nn as nn   # torch.nn: Building blocks for neural networks
import torch.nn.functional as F   
import torch.optim as optim   # torch.optim: Optimizer (for example, to reduce Loss)

from transformers import AutoTokenizer
# transformers transformers is an open source AI library developed by Firma Hugging Face.
# This library is one of the most commonly used tools for NLP (Natural Language Processing)
# AutoTokenizer is a tool that automatically loads a "tokenizer".
# Turn human language into a “list of numbers” so that the AI model can understand.

from datasets import load_dataset
# From the datasets library, import a function called load_dataset. 
# This function is used to Help you quickly download and load open source datasets for training or testing AI models.

from torch.optim import Adam

import wandb 
# wandb (Weights & Biases) is a tool for recording, tracking, and visualizing the machine learning training process.

import os 
# os= Control and Read operating system: File operations, Path processing, Environment variable management.



In [8]:
# Set some global variables
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32    # The number of training samples per batch.
SEQ_LEN = 64       # The length of each token, which <= 1024 in GPT2.
EPOCHS = 5

# Using GPT2 tokenizer and Tiny Shakespeare text
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT2 does not have pad_token, use eos_token instead

# load Tiny Shakespeare Dataset
dataset = load_dataset("tiny_shakespeare", trust_remote_code=True ) # allow these codes (custom loading logic) to run
text = dataset["train"][0]["text"]  # Get the original text string

# ==== encode into "token" ====
tokens = tokenizer.encode(text, return_tensors="pt").squeeze()
tokens = tokens[:1024]  # Limit the length, GPT2 only accepts a maximum of 1024 tokens

# Split these tokens into fixed-length input-output pairs as training samples
# "make_chunks" is a function to cut the tokens (integer form) of a whole paragraph- 
# -into small training samples (chunks), for language model input (reading) and target (prediction)
def make_chunks(tokens, seq_len):
    inputs = []
    targets = []
    for i in range(0, len(tokens) - seq_len):
        inputs.append(tokens[i:i+seq_len])  
        # Input: the first N words seen by the model
        targets.append(tokens[i+1:i+seq_len+1])  
        # Output (target): The next word that the model is going to predict
    return torch.stack(inputs), torch.stack(targets) 
    # Convert these sliced ​​fragments into tensor format for model training.


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
input_ids, labels = make_chunks(tokens, SEQ_LEN)
# Cut a whole token sequence (tokens) into multiple short input sequences (input_ids) and target output (labels). 
# to train the language model, to let the model learn to predict the following words from the previous ones

# establish DataLoader: make the prepared training data into a form that can be fed to the model in batches
from torch.utils.data import TensorDataset, DataLoader, random_split
# TensorDataset: Pack two Tensors (input_ids, labels) into a dataset
# DataLoader: Loading data in batches
# random_split: randomly divide the data into two parts (training and verification)

dataset = TensorDataset(input_ids, labels) 
# Create the dataset. Each pair of (input, label) is a training sample

train_size = int(0.9 * len(dataset))  
val_size = len(dataset) - train_size
# 90% of the samples for training and 10% for validation

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
# "random_split" will randomly separate the two data sets to ensure that the verification results are fair

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
# The function of DataLoader is to:
# -Cut the data into batches (e.g. BATCH_SIZE = 16)
# -Return (inputs, labels) for each batch
# -Automatically shuffle the order during training (shuffle=True)

In [10]:
# Decoder-only
class MiniDecoderModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, seq_len, d_model]
        x = x.permute(1, 0, 2)  # Transformer expects: [seq_len, batch_size, d_model]
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(x.size(0)).to(x.device)
        output = self.decoder(x, x, tgt_mask=tgt_mask)
        output = output.permute(1, 0, 2)  # Back to [batch_size, seq_len, d_model]
        return self.fc_out(output)



In [None]:
# Train the model and log it with wandb
# Initialize wandb:
wandb.init(project="tiny-transformer", name="decoder-from-scratch")

# Models and Optimizers
model = MiniDecoderModel(vocab_size=tokenizer.vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    wandb.log({
        "epoch": epoch,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss
    })

    print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

# Result:
# Epoch 1/5 - Train Loss: 10.0167 - Val Loss: 9.3827
# Epoch 2/5 - Train Loss: 8.9734 - Val Loss: 8.5459
# Epoch 3/5 - Train Loss: 8.1644 - Val Loss: 7.7465
# Epoch 4/5 - Train Loss: 7.3912 - Val Loss: 6.9782
# Epoch 5/5 - Train Loss: 6.6558 - Val Loss: 6.2553
# This is good. it shows that the model is learning and 
# -its performance is improving both on the training data and on unseen validation data.

Epoch 1/5 - Train Loss: 10.0167 - Val Loss: 9.3827
Epoch 2/5 - Train Loss: 8.9734 - Val Loss: 8.5459
Epoch 3/5 - Train Loss: 8.1644 - Val Loss: 7.7465
Epoch 4/5 - Train Loss: 7.3912 - Val Loss: 6.9782
Epoch 5/5 - Train Loss: 6.6558 - Val Loss: 6.2553


In [None]:
# Upload the model to Hugging Face Hub
# log in 
from huggingface_hub import login
login()  # 输入你在 Hugging Face 网站上的 token

# Define the directory where the model will be saved locally
model_dir = "mini_decoder_model"

# Save the model weights and tokenizer
torch.save(model.state_dict(), "mini_decoder_model.pth")
tokenizer.save_pretrained("mini_decoder_model")

# Upload the Model to Hugging Face
from huggingface_hub import HfApi

# Instantiate the Hugging Face API object
api = HfApi()

# Create a new repository or use an existing one
api.create_repo(repo_id="WenWebProjekt/mini-decoder", exist_ok=True)

# Upload the model directory to the repository
api.upload_folder(
    repo_id="WenWebProjekt/mini-decoder",
    folder_path="mini_decoder_model"     # Path to the local directory containing the model and tokenizer
)


CommitInfo(commit_url='https://huggingface.co/WenWebProjekt/mini-decoder/commit/cb8823f0ebecdbbb69ce678e13fd1d7063042b59', commit_message='Upload folder using huggingface_hub', commit_description='', oid='cb8823f0ebecdbbb69ce678e13fd1d7063042b59', pr_url=None, repo_url=RepoUrl('https://huggingface.co/WenWebProjekt/mini-decoder', endpoint='https://huggingface.co', repo_type='model', repo_id='WenWebProjekt/mini-decoder'), pr_revision=None, pr_num=None)