https://chat.openai.com/share/ca177a86-984f-4193-80a4-e2b8a69b62b3

python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

In [1]:
import collections
import hashlib
import math
import os
import random
import re
import shutil
import sys
import tarfile
import time
import zipfile
from collections import defaultdict
import pandas as pd
import requests
from IPython import display
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline


In [2]:
import torch
import torch.nn as nn

In [3]:
torch.cuda.is_available()

True

In [4]:
# Small 2x3 float tensor (two rows of three values) used as a scratch example.
x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

In [58]:
# Manual layer normalization of a 2x3 tensor, row by row.
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA (an unconditional .cuda() raises there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], device=device)

# Per-sample (per-row) mean and variance; unbiased=False gives the population
# variance, which is what layer normalization uses.
mean = input_tensor.mean(dim=1, keepdim=True)
variance = input_tensor.var(dim=1, keepdim=True, unbiased=False)

# Small epsilon that guards against division by zero.
epsilon = 1e-5

# Apply the layer-normalization formula.
normalized_tensor = (input_tensor - mean) / torch.sqrt(variance + epsilon)


In [59]:
normalized_tensor

tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], device='cuda:0')

In [63]:
# Create a simple tensor. Use the GPU when one is present and fall back to the
# CPU otherwise, so the cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], device=device)

# Define the layer-normalization layer.
# The feature dimension is 3 here (each input row has 3 features).
layer_norm = nn.LayerNorm(input_tensor.size(1), device=device)

# Apply layer normalization.
output_tensor = layer_norm(input_tensor)

print("原始张量:")
print(input_tensor)
print("经过层归一化后的张量:")
print(output_tensor)

原始张量:
tensor([[1., 2., 3.],
        [4., 5., 6.]], device='cuda:0')
经过层归一化后的张量:
tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)


In [14]:
from minGPT.mingpt.utils import CfgNode as CN

In [64]:
import torch.nn.functional as F
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention restricted by a causal (lower-triangular) mask,
    followed by a linear output projection.

    The layer is written out explicitly instead of delegating to
    torch.nn.MultiheadAttention so that every step stays visible.

    The config object must provide n_embd, n_head, block_size, attn_pdrop and
    resid_pdrop; n_embd has to be divisible by n_head.
    """

    def __init__(self, config):
        super().__init__()
        width, heads = config.n_embd, config.n_head
        assert width % heads == 0
        # one fused linear layer yields query, key and value for every head
        self.c_attn = nn.Linear(width, 3 * width)
        # projection applied to the re-assembled head outputs
        self.c_proj = nn.Linear(width, width)
        # dropout on the attention weights and on the projected output
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # lower-triangular matrix of ones: position i may only look at positions <= i
        span = config.block_size
        lower_triangle = torch.tril(torch.ones(span, span))
        self.register_buffer("bias", lower_triangle.view(1, 1, span, span))
        self.n_head = heads
        self.n_embd = width

    def forward(self, x):
        """
        Apply masked multi-head self-attention.

        x has three dimensions: (batch, sequence length, n_embd). The result
        has the same shape.
        """
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs): heads become a batch-like dimension
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        # split the fused projection into its three parts and reshape each per head
        query, key, value = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scale = 1.0 / math.sqrt(key.size(-1))
        scores = torch.matmul(query, key.transpose(-2, -1)) * scale
        # positions to the right of the current one get -inf, i.e. zero weight after softmax
        hidden = self.bias[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of values: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        per_head = torch.matmul(weights, value)
        # put the heads back side by side: (B, T, C)
        merged = per_head.transpose(1, 2).contiguous().view(batch, seq_len, width)

        # output projection followed by residual dropout
        return self.resid_dropout(self.c_proj(merged))


In [65]:
class GPT(nn.Module):
    """
    GPT language model: token and learned position embeddings, a stack of
    transformer blocks, a final LayerNorm and a bias-free linear head that
    maps back to vocabulary logits.
    """

    @staticmethod
    def get_default_config():
        """
        Return a fresh config node holding the default hyperparameters.

        vocab_size and block_size are left as None and must be filled in by
        the caller. Note that the default model_type 'gpt' is not one of the
        preset names __init__ knows about: either set model_type to a preset
        name, or set it to None and give n_layer, n_head and n_embd explicitly.
        """
        C = CN()
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd =  None
        # these options must be filled in externally
        C.vocab_size = None
        C.block_size = None
        # dropout hyperparameters
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C

    def __init__(self, config):
        """
        Build the model from a config node.

        config.vocab_size and config.block_size must be set. Exactly one of
        config.model_type or the full triple (n_layer, n_head, n_embd) must be
        given; when model_type is given, the matching preset is merged into
        config (the config object is modified in place).
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        type_given = config.model_type is not None
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert type_given ^ params_given # exactly one of these (XOR)
        if type_given:
            # translate from model_type to detailed configuration
            config.merge_from_dict({
                # names follow the huggingface naming conventions
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # I made these tiny models up
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }[config.model_type])

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        """
        Initialise one submodule in place (used with nn.Module.apply).

        Linear and Embedding weights are drawn from N(0, 0.02), Linear biases
        are zeroed, LayerNorm is reset to weight 1 / bias 0. Other module
        types are left untouched.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.

        model_type must be one of 'gpt2', 'gpt2-medium', 'gpt2-large' or
        'gpt2-xl'. Requires the `transformers` package, which is imported
        lazily here.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        # build through cls so that subclasses get an instance of their own type
        model = cls(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len(sd)
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        train_config must provide weight_decay, learning_rate and betas.
        Returns a torch.optim.AdamW with two parameter groups (decay, no decay).
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = dict(self.named_parameters())
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object (names are sorted so the group order is deterministic)
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        """
        Run the model on a batch of token indices.

        idx is an integer tensor of shape (b, t) with t <= block_size.
        targets, when given, is flattened and compared against the flattened
        logits with cross entropy; entries equal to -1 are ignored.

        Returns (logits, loss): logits has shape (b, t, vocab_size); loss is
        None when no targets are passed.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        temperature divides the logits before softmax. With do_sample=False the
        most likely token is taken, otherwise one token is sampled from the
        distribution. top_k, when given, restricts the choice to the k highest
        logits; values larger than the vocabulary size are clamped to it.

        Returns the input with the generated tokens appended, shape (b, t + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                # torch.topk raises when k exceeds the vocabulary size, so clamp it
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [66]:
# Build a tiny configuration for experimenting with the attention layer.
config = GPT.get_default_config()
config.model_type = 'gpt-nano'
config.vocab_size = 3
config.block_size = 11
# n_embd and n_head are set by hand because CausalSelfAttention reads them
# straight from the config; the 'gpt-nano' preset values are only merged into
# the config inside GPT.__init__, which is not called here.
config.n_embd = 48
config.n_head = 3

In [67]:
# Stand-alone attention layer built from the config above. The module is left
# in its default training mode, so its dropout layers are active.
attn = CausalSelfAttention(config)

In [22]:
config.model_type

'gpt'

In [69]:


# Random input with three dimensions (1, 10, config.n_embd), read by
# CausalSelfAttention.forward as (batch, sequence length, embedding width).
# The sequence length of 10 stays within config.block_size (11).
input_tensor = torch.randn(1, 10, config.n_embd) #.cuda()

print(input_tensor) 


tensor([[[-1.0428,  0.0508,  0.5945, -0.1914, -0.7316, -0.3466, -2.6920,
           2.1529,  1.7580, -0.8282, -0.1455,  0.0564,  1.3450,  1.2541,
          -0.2799, -0.3056, -0.3024,  0.2441,  0.3060,  0.0209,  0.0456,
          -1.1879,  0.4372,  1.4288, -1.3912, -0.2410, -2.1609, -0.1044,
          -0.3594, -0.0490, -0.2117, -1.1906, -1.0567,  0.6662, -0.6972,
          -0.7998, -0.1399, -1.0520, -0.4148, -0.1163, -0.4117,  0.0439,
          -0.0959,  0.1125,  1.4421, -0.8242,  0.1227, -1.7319],
         [-0.8110,  0.5994, -0.0590,  0.3937, -0.4602, -0.0283,  1.8238,
          -1.4366, -1.0495,  0.8136, -1.0866, -0.2896, -0.0395,  0.7240,
           0.2066, -0.1457, -0.0401,  0.4626, -0.7670,  1.8677, -0.5092,
           0.1604,  0.7133, -1.8264, -1.8339,  0.4055,  0.5685, -0.2694,
          -1.0105,  1.7701,  1.0843, -1.5971, -1.8077,  0.5500, -0.8081,
           1.7379,  1.3204,  1.2956, -0.3001,  0.3021, -0.2158, -0.4613,
           1.5946,  0.4977, -0.0239, -1.1403, -0.8988, -0.9

In [70]:
# Run the input through CausalSelfAttention to obtain its output.
output_tensor = attn(input_tensor)

In [38]:
output_tensor

tensor([[[-4.5397e-02, -5.3180e-01,  1.0115e+00, -9.7004e-02, -0.0000e+00,
           8.9517e-02,  0.0000e+00, -1.4924e-01, -4.0716e-01, -6.1810e-01,
          -3.7041e-01,  3.6248e-02, -1.7717e-01,  9.4238e-01, -6.0726e-01,
           1.4189e-01,  7.5382e-02,  0.0000e+00, -4.3820e-01,  2.8809e-01,
           6.3346e-02,  1.0342e-01,  8.3648e-01, -1.3487e-01, -3.6065e-02,
          -3.6872e-01,  2.6944e-01, -8.6270e-02, -7.2617e-01,  3.1154e-01,
          -4.6656e-01,  3.2325e-01, -3.8224e-01, -1.0432e+00,  2.8136e-02,
           2.2506e-01, -2.4952e-01,  1.3780e-01,  2.5083e-01, -2.5613e-01,
           5.2677e-01,  1.4351e-01,  8.5896e-02,  8.5359e-02,  6.2343e-01,
           5.9525e-01, -1.1253e+00, -1.3406e+00],
         [-6.1220e-01, -3.9249e-01,  6.7203e-01, -2.7848e-02, -0.0000e+00,
          -1.9633e-01,  2.7571e-01,  2.4278e-01, -7.7353e-01, -3.4798e-01,
           3.9674e-01, -1.1360e-01, -1.0206e-01,  4.3668e-01,  1.9027e-01,
           2.5527e-01, -0.0000e+00,  4.4270e-01, -

In [39]:
input_tensor

tensor([[[-1.1434e+00,  8.5085e-01, -1.1020e-01, -4.1483e-02, -3.7499e-01,
          -2.7098e-01, -2.0127e+00, -1.0807e+00, -2.5766e+00, -1.5124e+00,
          -8.5082e-02, -2.0072e+00, -1.3589e+00,  7.1515e-01,  1.2694e+00,
          -5.0828e-01,  3.2971e-01,  4.7442e-01,  1.7500e-01, -4.7310e-03,
          -2.4749e-01, -1.6138e-01,  6.3162e-01,  2.0431e-01,  1.9941e-01,
          -1.5073e+00, -7.6510e-01, -7.7724e-01, -9.5968e-02,  2.2442e+00,
          -1.0663e+00,  1.0552e+00,  7.8778e-01,  3.9780e-01,  4.6046e-01,
          -3.2837e-01,  1.4538e-01, -1.2118e+00, -1.6320e+00, -1.1839e+00,
          -1.1512e+00, -3.9916e-01, -9.2560e-02,  9.2976e-01, -3.6033e-01,
          -7.4420e-01,  2.7736e-01,  1.4367e+00],
         [-1.4534e+00,  1.6543e+00, -8.8243e-01, -3.2915e-01,  3.4075e-01,
           8.5306e-01, -9.4084e-01, -4.1994e-01,  6.8491e-02,  1.8351e-01,
          -1.3080e+00,  8.6825e-02,  1.0545e+00, -3.8472e-01,  1.4881e-01,
          -1.0184e+00,  1.5137e+00,  3.0889e-01,  

In [42]:
torch.Size([12])

torch.Size([12])

In [43]:
input_tensor.shape

torch.Size([1, 10, 48])

In [45]:
# Show the shape of a tensor built from two rows of three values.
print(torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]).shape)

torch.Size([2, 3])


In [47]:
# Chained indexing: row 0, then column 2 of that row.
torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])[0][2]

tensor(3.)

In [50]:
# Reshape the same six values into a single row (1, 6) and a single column (6, 1).
print(torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]).reshape(1,6))
print(torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]).reshape(6,1))

tensor([[1., 2., 3., 4., 5., 6.]])
tensor([[1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.]])


In [1]:
import collections
import hashlib
import math
import os
import random
import re
import shutil
import sys
import tarfile
import time
import zipfile
from collections import defaultdict
import pandas as pd
import requests
from IPython import display
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline

In [2]:
import numpy as np
import torch

In [3]:
import torchvision
from PIL import Image
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torchvision import transforms