eine TransformerEncoderLayer-basiete Model

In [None]:

import os
import json
import torch
import torch.nn as nn

#eine TransformerEncoderLayer-basiete Model
class GPTDecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, max_seq_length=128, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))

        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ) for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)
        self.max_seq_length = max_seq_length

    def generate_causal_mask(self, seq_len, device):
        return torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1).to(device)

    def forward(self, input_ids):
        B, T = input_ids.shape
        x = self.token_embedding(input_ids)
        x = x + self.positional_embedding[:, :T, :]
        mask = self.generate_causal_mask(T, input_ids.device)

        for block in self.blocks:
            x = block(x, mask)

        x = self.norm(x)
        logits = self.lm_head(x)
        return logits



#eine TransformerDecoderLayer-basiete Model

In [28]:
class GPTStyleDecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, max_seq_length=128, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(
            decoder_layer,
            num_layers=num_layers
        )
        
        self.output_projection = nn.Linear(d_model, vocab_size)
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.blocks = self.decoder.layers

    def generate_causal_mask(self, size):
        # Obere Dreiecksmatrix mit -inf oberhalb der Diagonalen, 0 auf und unterhalb
        mask = torch.triu(torch.ones(size, size) * float('-inf'), diagonal=1)
        return mask

    def forward(self, input_ids):
        batch_size, seq_length = input_ids.size()
        assert seq_length <= self.max_seq_length, "Input sequence too long"

        token_embeddings = self.token_embedding(input_ids)
        position_embeddings = self.positional_embedding[:, :seq_length, :]
        x = token_embeddings + position_embeddings

        # Autoregressive Mask (Causal Mask)
        tgt_mask = self.generate_causal_mask(seq_length).to(input_ids.device)

        # Memory is None since this is decoder-only
        x = self.decoder(tgt=x, memory=x, tgt_mask=tgt_mask)
        logits = self.output_projection(x)
        return logits

Tokeneiser und Model beschtimmen

In [29]:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
#model = GPTDecoderOnlyModel(vocab_size=tokenizer.vocab_size)
model = GPTStyleDecoderOnlyModel(vocab_size=tokenizer.vocab_size)
model.eval()



GPTStyleDecoderOnlyModel(
  (token_embedding): Embedding(50257, 128)
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3

Dataset herunterladen

In [16]:
#!pip install datasets
from datasets import load_dataset



dataset = load_dataset("Trelis/tiny-shakespeare", split="train")
dataset1 = dataset
texts = [tokenizer.encode(x['Text'], truncation=True, max_length=64, padding="max_length") for x in dataset if len(x['Text']) > 0]



# Umwandlung in Tensor mit richtiger Dimension (List[List[int]] -> Tensor)
import torch.utils.data as data

inputs_tensor = torch.tensor(texts, dtype=torch.long)
dataset = data.TensorDataset(inputs_tensor)
batch_size = 32
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)









In [None]:
print(dataset[0])
print(dataloader)

Dataset für Trening und Validation teilen

In [17]:
from torch.utils.data import random_split

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)


Die Texstgenegationsfunktion

In [30]:
def generate(model, tokenizer, prompt, max_new_tokens=50, device="cpu", temperature=1.0, top_k=50):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    for _ in range(max_new_tokens):
        logits = model(input_ids)
        logits = logits[:, -1, :] / temperature

        # top-k sampling
        if top_k is not None:
            top_k = min(top_k, logits.size(-1))
            values, indices = torch.topk(logits, top_k)
            probs = torch.softmax(values, dim=-1)
            next_token = indices.gather(1, torch.multinomial(probs, num_samples=1))
        else:
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        input_ids = torch.cat([input_ids, next_token], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)



WANDB login

In [19]:
import wandb

wandb.login()  # du wirst aufgefordert, deinen API-Key einzugeben

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: vovanew707 (vovanew707-hsh) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

evaluate function

In [31]:
@torch.no_grad()
def evaluate(model, dataloader, vocab_size, device):
    model.eval()
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    for batch in dataloader:
        batch = batch[0]
        inputs = batch[:, :-1].to(device)
        targets = batch[:, 1:].to(device)

        logits = model(inputs)
        logits = logits.reshape(-1, vocab_size)
        targets = targets.reshape(-1)

        loss = loss_fn(logits, targets)
        total_loss += loss.item()

    return total_loss / len(dataloader)


die Trainfunction

In [32]:

from tqdm.notebook import tqdm

import torch.optim as optim

def train(model, train_loader, val_loader, vocab_size, device, epochs=5, lr=1e-4):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    wandb.init(project="tiny-gpt", name="GPT-run-2", config={
    "epochs": epochs,
    "batch_size": batch_size,
    "learning_rate": lr,
    "model_dim": model.token_embedding.embedding_dim,
    "layers": len(model.blocks),
    "vocab_size": vocab_size
})
    wandb.watch(model)
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            batch = batch[0]  # Extrahiere eigentlichen Tensor
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.reshape(-1, vocab_size)  # statt .view
            targets = targets.reshape(-1)


            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        val_loss = evaluate(model, val_loader, vocab_size, device)
        prompt = "i have shoes with"

        generated_text = generate(model, tokenizer, prompt, max_new_tokens=50, device=device)
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_loss,
            "val_loss": val_loss,
            "generated_text": wandb.Html(f"<pre>{generated_text}</pre>"),
        })

        print(f"Epoch {epoch+1}/{wandb.config.epochs}, Train Loss: {avg_loss:.4f}, Val_Loss = {val_loss:.4f}")



    wandb.finish()



Die Model trenieren

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size=tokenizer.vocab_size
train(model, train_loader, val_loader,  vocab_size, device, epochs=5, lr=3e-4)

Training Epoch 1:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 1/5, Train Loss: 10.4984, Val_Loss = 9.8992


Training Epoch 2:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2/5, Train Loss: 9.3569, Val_Loss = 8.9032


Training Epoch 3:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3/5, Train Loss: 8.4384, Val_Loss = 8.1314


Training Epoch 4:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4/5, Train Loss: 7.6615, Val_Loss = 7.5049


Training Epoch 5:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5/5, Train Loss: 7.0546, Val_Loss = 7.0477


0,1
epoch,▁▃▅▆█
train_loss,█▆▄▂▁
val_loss,█▆▄▂▁

0,1
epoch,5.0
train_loss,7.05463
val_loss,7.0477


Beispieltexst generieren

Model speichern  

In [None]:
prompt = "i have shoes with"

generated_text = generate(model, tokenizer, prompt, max_new_tokens=50, device=device)
print(generated_text)

In [None]:
os.makedirs("gpt-mini1", exist_ok=True)

# speihern gewichte
torch.save(model.state_dict(), "gpt-mini1/pytorch_model.bin")

# speihern configuration
config = {
    "vocab_size": tokenizer.vocab_size,
    "d_model": 128,
    "nhead": 4,
    "num_layers": 2,
    "dim_feedforward": 256,
    "max_seq_length": 128,
    "dropout": 0.1
}
with open("gpt-mini1/config.json", "w") as f:
    json.dump(config, f)

# speihern HuggingFace tokenizer
tokenizer.save_pretrained("gpt-mini1")

README.m schreiben

In [None]:


readme_path = os.path.join("gpt-mini1", "README.md")

with open(readme_path, "w", encoding="utf-8") as f:
    f.write(f"""\
# GPT Tiny Shakespeare (Decoder-only)

Ein leichtgewichtiges autoregressives Transformer-Modell (GPT-artig), trainiert auf dem Tiny Shakespeare-Datensatz.

## Architektur
- Decoder-only Transformer (ähnlich GPT-2)
- <1M Parameter
- 2 Layers, 4 Attention Heads
- Embedding-Dimension: 128

## Trainingsdaten
Tiny Shakespeare (ca. 100k Zeichen an Theater-Dialogen von Shakespeare).

## Verwendung
Für einfache Textgenerierung und Experimente auf CPUs.

## Tags
- gpt
- decoder-only
- tiny
- shakespeare
- text-generation
- educational
""")


load auf Hugging Face Hub

In [None]:





from huggingface_hub import HfApi

api = HfApi()
# 1. creat repo auf  Hugging Face
repo_url = api.create_repo(repo_id="vladimir707/gpt-mini1", exist_ok=True)

# 2. load
from huggingface_hub import upload_folder

upload_folder(
    repo_id="vladimir707/gpt-mini1",
    folder_path="gpt-mini1",
    path_in_repo=".",  #
    commit_message="Initial model upload"
)


Pretrenierte Model erstellen

Model herunterladen

In [10]:
from transformers import  GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # wichtig für Padding
model = GPT2LMHeadModel.from_pretrained("gpt2")

Tokeneiserfunction

In [None]:
def tokenize(example):
    tokens = tokenizer(example["Text"], truncation=True, padding="max_length", max_length=128)
                   
                        
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset1.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["Text"])



Dataset teilen

In [None]:
split_dataset = dataset1.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["Text"])
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)
tokenized_val_dataset = tokenized_val_dataset.remove_columns(["Text"])


In [None]:
print(tokenized_train_dataset[1])
print(tokenized_val_dataset[0])

Model trenieren

In [None]:
from transformers import Trainer, TrainingArguments
import time
wandb.init(project="tiny-gpt", name=f"pretrained-{int(time.time())}")

training_args = TrainingArguments(
    output_dir="./pretrained",
    num_train_epochs=25,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    remove_unused_columns=False,
    eval_strategy="epoch",  #  eval nach jeder Epoche
    logging_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="wandb",  # wandb aktiviert

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,


    tokenizer=tokenizer,
)

trainer.train()


Model auf HF hohladen

In [None]:



model.push_to_hub("my-fancy-gpt2")
tokenizer.push_to_hub("my-fancy-gpt2")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/vladimir707/my-fancy-gpt21/commit/2b32d13f51c531f08302ddb56fab6321f7ed0105', commit_message='Upload tokenizer', commit_description='', oid='2b32d13f51c531f08302ddb56fab6321f7ed0105', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vladimir707/my-fancy-gpt21', endpoint='https://huggingface.co', repo_type='model', repo_id='vladimir707/my-fancy-gpt21'), pr_revision=None, pr_num=None)

text generieren

In [12]:
model.eval()
input_ids = tokenizer.encode("i have shoes with", return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_length=100, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


i have shoes with the same material as my own. I really like my new pair of shoes and have a great time with them.

Rated 5 out of 5 by W from Best I have owned I bought this pair of shoes because I bought mine in a different color from the one I bought. They are the BEST shoes I have ever owned. They are not only beautiful, but they are more durable than I thought they would be. I also bought this pair of shoes because I had never


Repo löschen

In [13]:

from huggingface_hub import HfApi
HfApi().delete_repo("my-fancy-gpt21")