## 1. 环境准备
- 建议使用 GPU 运行时 (Runtime -> Change runtime type -> GPU).

In [10]:
!nvidia-smi
!pip install --quiet --upgrade pip
# 强制安装 numpy<2 以兼容 torch 2.2.x
!pip install --quiet "numpy<2"
!pip install --quiet torch==2.2.2+cu121 torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
!pip install --quiet tqdm omegaconf matplotlib

Thu Nov 27 04:52:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P0             46W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## 2. Git 设置与仓库同步

In [None]:
import getpass, os, subprocess

REPO_URL = "https://github.com/lachlanye/MLLM-from-scratch.git"
repo_name = REPO_URL.split("/")[-1].replace(".git", "")

# 1. 防止嵌套目录：检查当前是否已经在仓库目录中
if os.getcwd().endswith(repo_name):
    print(f"Already in {repo_name} directory.")
else:
    # 2. 如果不在，检查是否存在，不存在则克隆
    if not os.path.exists(repo_name):
        print(f"Cloning {repo_name}...")
        # 请确保设置了 GITHUB_TOKEN
        os.environ["GITHUB_TOKEN"] = "YOUR_TOKEN"
        subprocess.run(["git", "clone", f"https://{os.environ['GITHUB_TOKEN']}@" + REPO_URL.split("https://")[-1]], check=True)
    
    # 3. 进入目录
    %cd $repo_name

# 4. 配置 Git
!git config user.name "lachlanye"
!git config user.email "colab@example.com"

# 5. 拉取代码 (使用 master 分支)
!git pull origin master

Already in MLLM-from-scratch directory.
From https://github.com/lachlanye/MLLM-from-scratch
 * branch            master     -> FETCH_HEAD
Already up to date.
From https://github.com/lachlanye/MLLM-from-scratch
 * branch            master     -> FETCH_HEAD
Already up to date.


## 3. 数据集准备
加载 Tiny Shakespeare 数据集并查看部分数据。

In [12]:
from datasets.tinyshakespeare import TinyShakespeareDataset
import torch

# Build the dataset (download=True asks it to fetch the corpus automatically)
dataset = TinyShakespeareDataset(root="data", download=True)
print(f"Vocab size: {dataset.vocab_size}")
print(f"Dataset length: {len(dataset)}")

# Preview: decode the first 200 entries of the encoded corpus back into text
print("\n--- Data Sample ---")
sample_ids = dataset.data[:200].tolist()
sample_text = dataset.tokenizer.decode(sample_ids)
print(sample_text)
print("-------------------")

File already exists at data/tinyshakespeare/input.txt, skipping download.
Found existing vocabulary at data/tinyshakespeare/vocab.json. Loading...
Vocabulary loaded from data/tinyshakespeare/vocab.json
Encoding the entire corpus into a single sequence...


File already exists at data/tinyshakespeare/input.txt, skipping download.
Found existing vocabulary at data/tinyshakespeare/vocab.json. Loading...
Vocabulary loaded from data/tinyshakespeare/vocab.json
Encoding the entire corpus into a single sequence...


TypeError: 'ellipsis' object cannot be interpreted as an integer

## 4. 训练配置
查看当前的训练配置 `configs/llm_config.yaml`。

In [None]:
import yaml
from pathlib import Path

config_path = Path("configs/llm_config.yaml")
# Parse the YAML training configuration into plain Python objects
with config_path.open() as config_file:
    config = yaml.safe_load(config_file)

# The configuration can be changed here and written back, e.g. to train for more epochs:
# config['training_params']['num_epochs'] = 5
# with open(config_path, 'w') as f:
#     yaml.dump(config, f)

# Render the loaded configuration back to YAML text for display
config_dump = yaml.dump(config)
print(config_dump)

## 5. 开始训练
运行 `language_model/train_llm.py`。我们用 `tee` 将输出同时显示在 notebook 中并保存到 `training.log`，以便后续绘图。

In [None]:
# 确保保存目录存在
!mkdir -p checkpoints
# 运行训练并将输出同时显示在控制台和保存到 training.log
!python -m language_model.train_llm | tee training.log

## 6. 训练过程可视化 (Loss Curve)
解析 `training.log` 并绘制训练和验证 Loss 曲线。

In [None]:
import matplotlib.pyplot as plt
import re

def parse_training_log(log_path="training.log"):
    """Extract the per-epoch loss summary from a training log.

    Lines are expected to look like
    ``Epoch 1/10 Summary | Train Loss: 2.5000 | Val Loss: 2.4000``;
    every other line is ignored.

    Args:
        log_path: Path of the log file to read.

    Returns:
        A tuple ``(epochs, train_losses, val_losses)`` of equally long lists.
        All three lists are empty when the file does not exist or contains
        no summary line.
    """
    epochs, train_losses, val_losses = [], [], []
    try:
        # The log is raw process output; tolerate undecodable bytes
        # instead of failing on them (the pattern itself is pure ASCII).
        log_file = open(log_path, "r", encoding="utf-8", errors="replace")
    except FileNotFoundError:
        # Training never ran (or ran elsewhere): report "no data" to the caller.
        return epochs, train_losses, val_losses

    with log_file:
        for line in log_file:
            # Expected format: Epoch 1/10 Summary | Train Loss: 2.5000 | Val Loss: 2.4000
            match = re.search(r"Epoch (\d+)/\d+ Summary \| Train Loss: ([\d\.]+) \| Val Loss: ([\d\.]+)", line)
            if match:
                epochs.append(int(match.group(1)))
                train_losses.append(float(match.group(2)))
                val_losses.append(float(match.group(3)))
    return epochs, train_losses, val_losses


# Parse the log written by the training cell
epochs, train_losses, val_losses = parse_training_log("training.log")

if not epochs:
    print("未找到训练日志数据，请检查训练是否成功完成。")
else:
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_losses, label='Train Loss', marker='o')
    plt.plot(epochs, val_losses, label='Validation Loss', marker='s')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

## 7. 文本生成展示
使用训练好的模型生成莎士比亚风格的文本。

In [None]:
# Sample 1: run the text-generation module
!python -m language_model.generate_text

In [None]:
# 样本 2 (修改 configs/llm_config.yaml 中的 start_context 来生成不同的文本)
import yaml

# 读取配置
with open("configs/llm_config.yaml", 'r') as f:
    cfg = yaml.safe_load(f)

# 修改起始文本
cfg['generation_params']['start_context'] = "To be, or not to be"

# 写入临时配置
with open("configs/llm_config_sample2.yaml", 'w') as f:
    yaml.dump(cfg, f)

# 使用新配置生成
# 注意：generate_text.py 默认读取 configs/llm_config.yaml，我们需要修改代码或覆盖文件
# 这里简单起见，我们直接覆盖原配置文件，生成后再改回来 (或者修改 generate_text.py 接受参数)
# 为了演示，我们直接覆盖
with open("configs/llm_config.yaml", 'w') as f:
    yaml.dump(cfg, f)

print("--- Sample 2: 'To be, or not to be' ---")
!python -m language_model.generate_text

# 恢复配置 (可选)
cfg['generation_params']['start_context'] = "You are all resolved"
with open("configs/llm_config.yaml", 'w') as f:
    yaml.dump(cfg, f)

## 8. 保存结果
将训练好的模型和日志推送到 GitHub。

In [None]:
# Show the working-tree state before staging anything
!git status
# Stage the checkpoint and the training log; -f adds them even if an ignore rule matches
!git add -f checkpoints/llm_tinyshakespeare.pth training.log
# Commit; if git commit exits non-zero (e.g. nothing is staged) print a note instead
!git commit -m "Add LLM training results and logs" || echo "Nothing to commit"
# Push the master branch to the origin remote
!git push origin master