In [3]:
!pip install transformers datasets wandb torch

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [53]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from datasets import load_dataset

# Den Datensatz laden (train, validation, test)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [10]:
# 🧩 Initialisierung des Tokenizers
# Wir verwenden den GPT-2 Tokenizer aus Hugging Face.

from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np


tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 hat keinen PAD-Token, also fügen wir ihn manuell hinzu
tokenizer.pad_token = tokenizer.eos_token

# 🔄 Tokenisierung und Vorbereitung des Datensatzes für das Training


block_size = 128  # Länge der Input-Sequenz

def tokenize_function(angabe):
    # Den Text in Token umwandeln
    return tokenizer(angabe["text"])

# Tokenisierung anwenden
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Beispiele anzeigen
print(tokenized_datasets["train"][0])
def group_texts(examples):
    """
    Tokenizes and groups texts into blocks for training.

    Args:
        examples: A dictionary containing the 'text' field.

    Returns:
        A dictionary with 'input_ids' and 'labels' fields,
        where each field is a list of blocks of token IDs.
    """
    # Tokenize all texts together
    tokenized_examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=block_size)

    # Create blocks of size block_size
    input_ids = tokenized_examples["input_ids"]
    labels = input_ids.copy() # Labels are the same as inputs for causal language modeling

    # Reshape into blocks
    # Convert input_ids and labels to NumPy arrays before reshaping
    input_ids = np.array(input_ids)
    input_ids = input_ids.reshape(-1, block_size)
    labels = np.array(labels)
    labels = labels.reshape(-1, block_size)

    return {"input_ids": input_ids.tolist(), "labels": labels.tolist()} # Convert back to lists

    # Die Funktion anwenden

lm_datasets = tokenized_datasets.map(group_texts, batched=True)

def collate_fn(batch):
    # Pad input_ids and labels to the maximum length in the batch
    max_len = max(len(x['input_ids']) for x in batch)
    input_ids = [x['input_ids'] + [tokenizer.pad_token_id] * (max_len - len(x['input_ids'])) for x in batch]
    labels = [x['labels'] + [-100] * (max_len - len(x['labels'])) for x in batch]  # -100 is the ignore index for the loss function

    # Convert to tensors
    input_ids = torch.tensor(input_ids)
    labels = torch.tensor(labels)

    return {'input_ids': input_ids, 'labels': labels}


train_dataloader = DataLoader(lm_datasets["train"], batch_size=8, shuffle=True, collate_fn=collate_fn)
eval_dataloader = DataLoader(lm_datasets["validation"], batch_size=8, collate_fn=collate_fn)


{'text': '', 'input_ids': [], 'attention_mask': []}


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [63]:
#  Modelldefinition
import torch.optim as optim
import torch.nn as nn
import wandb # Import the wandb library
import torch
from transformers import PreTrainedModel, PretrainedConfig
from torch.optim import Adam
from tqdm import tqdm

class DecoderOnlyConfig(PretrainedConfig):
    model_type = "decoder-only"

    def __init__(self, vocab_size=30522, d_model=128, nhead=2, num_layers=2, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers


class DecoderOnlyTransformer(PreTrainedModel):
    config_class = DecoderOnlyConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)

        decoder_layer = nn.TransformerDecoderLayer(d_model=config.d_model, nhead=config.nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_layers)

        self.output_layer = nn.Linear(config.d_model, config.vocab_size)

        self.init_weights()

    def forward(self, input_ids, labels=None):
        embedded = self.embedding(input_ids).transpose(0, 1)  # (Seq, Batch, D)
        seq_len = embedded.size(0)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(input_ids.device)

        decoded = self.transformer_decoder(embedded, embedded, tgt_mask=tgt_mask)
        logits = self.output_layer(decoded).transpose(0, 1)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return {"loss": loss, "logits": logits}

    def generate(self, input_ids, max_length=50, temperature=1.0, top_k=50):
        self.eval()
        generated = input_ids
        for _ in range(max_length):
            outputs = self.forward(generated)
            logits = outputs["logits"][:, -1, :] / temperature
            filtered_logits = self.top_k_top_p_filtering(logits, top_k=top_k)
            probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
        return generated

    def top_k_top_p_filtering(self, logits, top_k=50, top_p=1.0):
        # Только top-k sampling
        top_k = min(top_k, logits.size(-1))
        values, _ = torch.topk(logits, top_k)
        min_values = values[:, -1].unsqueeze(1)
        return torch.where(logits < min_values, torch.full_like(logits, -float("Inf")), logits)
    @classmethod
    def from_pretrained(cls, model_name_or_path, *args, **kwargs):
        """
        This method allows loading the model and its configuration from Hugging Face.
        """
        config = cls.config_class.from_pretrained(model_name_or_path, **kwargs)
        # Here we use the built-in `from_pretrained` for PreTrainedModel
        # The error was caused by using 'model_args' which was not defined.
        # We replace it with *args to accept any positional arguments and pass them to super().from_pretrained
        model = super().from_pretrained(model_name_or_path, *args, config=config, **kwargs)
        return model


In [44]:

#  Modell initialisieren
# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create a configuration object with the desired vocab_size
config = DecoderOnlyConfig(vocab_size=len(tokenizer))
model = DecoderOnlyTransformer(config).to(device) # Pass the config object to the model

#  Optimizer und Loss-Funktion
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()

#  Trainings-Funktion
def train(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in dataloader:
        # Daten auf das Gerät verschieben
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        # Gradienten zurücksetzen
        optimizer.zero_grad()

        # Vorwärtsdurchlauf
        outputs = model(input_ids, labels=labels) # Pass labels to the forward method

        # Get the logits from the outputs dictionary
        logits = outputs["logits"]

        # Berechnung des Losses (wir ignorieren Padding-Tokens)
        loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1)) # Use logits instead of outputs

        # Rückwärtsdurchlauf
        loss.backward()

        # Parameter aktualisieren
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

# 🧪 Evaluierungs-Funktion
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids)
            logits = outputs["logits"] # Assign the logits from the model output
            loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
            total_loss += loss.item()


    return total_loss / len(dataloader)


# Initialisierung von wandb für das Loggen der Trainings- und Validierungsdaten
wandb.init(project="GenAI_project", entity="flora-nuta-hochschule-hannover")
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion)
    eval_loss = evaluate(model, eval_dataloader, criterion)
    print(f"Epoche {epoch + 1}: Train Loss = {train_loss:.4f}, Eval Loss = {eval_loss:.4f}")
    # Logge die Verluste (Training und Validierung) zu wandb
    wandb.log({"train_loss": train_loss, "eval_loss": eval_loss})

Epoche 1: Train Loss = 0.4784, Eval Loss = 0.1001
Epoche 2: Train Loss = 0.0319, Eval Loss = 0.0466
Epoche 3: Train Loss = 0.0078, Eval Loss = 0.0360
Epoche 4: Train Loss = 0.0010, Eval Loss = 0.0343
Epoche 5: Train Loss = 0.0000, Eval Loss = 0.0352


OrderedDict([('embedding.weight', tensor([[ 1.5076, -0.6626,  0.2641,  ...,  1.0476,  0.1600,  1.1030],
        [ 0.4504, -1.9381,  1.8523,  ..., -0.4494, -0.0576, -1.2910],
        [-0.7354, -0.4745, -0.8112,  ...,  0.6471, -1.5315,  0.0932],
        ...,
        [-0.1944, -0.1339, -0.5168,  ...,  1.3343, -1.9359,  0.9119],
        [ 0.2812, -0.2045, -0.5548,  ...,  0.0911,  0.8873, -1.4909],
        [ 1.9837,  0.2043, -0.1915,  ...,  1.3531, -0.2435,  0.1840]],
       device='cuda:0')), ('transformer_decoder.layers.0.self_attn.in_proj_weight', tensor([[ 0.0442, -0.0401,  0.0008,  ..., -0.0089,  0.0060, -0.1122],
        [ 0.0433,  0.1075, -0.0445,  ...,  0.0687, -0.1007, -0.0188],
        [ 0.0835,  0.0666,  0.0744,  ...,  0.0947,  0.0942,  0.0842],
        ...,
        [ 0.0687,  0.0401,  0.0881,  ..., -0.0029, -0.0143, -0.0980],
        [ 0.0739, -0.0032, -0.0232,  ...,  0.0896,  0.0058, -0.0805],
        [ 0.0499,  0.0399, -0.0060,  ..., -0.0025, -0.0849,  0.0865]],
       device=

In [41]:
!pip install --upgrade transformers



In [64]:
# saving model and tokenizer
!mkdir -p "decoder-only-model-with-pipline"
torch.save(model.state_dict(), "decoder-only-model-with-pipline/pytorch_model.bin") # Save the model's state dictionary
tokenizer.save_pretrained("decoder-only-model-with-pipline")

('decoder-only-model-with-pipline/tokenizer_config.json',
 'decoder-only-model-with-pipline/special_tokens_map.json',
 'decoder-only-model-with-pipline/vocab.json',
 'decoder-only-model-with-pipline/merges.txt',
 'decoder-only-model-with-pipline/added_tokens.json',
 'decoder-only-model-with-pipline/tokenizer.json')

In [65]:
!ls -l "decoder-only-model-with-pipline"


total 115948
-rw-r--r-- 1 root root      228 Apr 21 17:28 config.json
-rw-r--r-- 1 root root   456318 Apr 21 17:49 merges.txt
-rw-r--r-- 1 root root 56943324 Apr 21 17:28 model.safetensors
-rw-r--r-- 1 root root 56954147 Apr 21 17:49 pytorch_model.bin
-rw-r--r-- 1 root root      583 Apr 21 17:49 special_tokens_map.json
-rw-r--r-- 1 root root      698 Apr 21 17:49 tokenizer_config.json
-rw-r--r-- 1 root root  3557957 Apr 21 17:49 tokenizer.json
-rw-r--r-- 1 root root   798156 Apr 21 17:49 vocab.json


In [66]:
from huggingface_hub import upload_folder, create_repo

repo_id = "tet-ana/decoder-only-transformer-with-pipeline"

folder_path = "decoder-only-model-with-pipline"

# Create the repository if it doesn't exist
create_repo(repo_id, exist_ok=True)

model_card = """
# Decoder-Only Transformer (Eigenbau)

Dies ist ein einfacher autoregressiver Decoder-Only Transformer, der von Grund auf in PyTorch implementiert wurde.

## Modellbeschreibung

- Transformer-Decoder-Architektur
- Trainiert auf WikiText-2-Dataset
- Tokenizer: GPT-2

## Verwendungszweck

Dieses Modell dient zur Demonstration, wie man ein autoregressives Sprachmodell selbst implementiert.

## Tags

- pytorch
- transformer
- language-modeling
- decoder-only
- education
"""



In [67]:
# Model hochladen
upload_folder(
   repo_id=repo_id,
    folder_path=folder_path,
    path_in_repo=".",
    commit_message="✨ New version"
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tet-ana/decoder-only-transformer-with-pipeline/commit/3964b43a01827a2a8c8aa2a27a7a15380593acc1', commit_message='✨ New version', commit_description='', oid='3964b43a01827a2a8c8aa2a27a7a15380593acc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tet-ana/decoder-only-transformer-with-pipeline', endpoint='https://huggingface.co', repo_type='model', repo_id='tet-ana/decoder-only-transformer-with-pipeline'), pr_revision=None, pr_num=None)

In [69]:
from transformers import AutoModelForCausalLM, AutoTokenizer



# Загружаем конфигурацию и модель с Hugging Face
tokenizer = AutoTokenizer.from_pretrained("tet-ana/decoder-only-transformer-with-pipeline")
model = DecoderOnlyTransformer.from_pretrained("tet-ana/decoder-only-transformer-with-pipeline")


prompt = "How are you?"  # Beispiel eines Starttextes

# Wandeln Sie den Eingabetext in Token um
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Text generieren
output = model.generate(input_ids, max_length=50)

# Dekodieren Sie das Ergebnis
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


How are you? VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM VM
