In [None]:
!pip install -q torch transformers datasets huggingface_hub wandb --upgrade


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import wandb
import json
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

# -------------------------------
# Configuration
# -------------------------------
# Hyperparameters and runtime settings shared by the rest of the notebook.
config = dict(
    SEQ_LEN=64,
    BATCH_SIZE=8,
    EPOCHS=4,
    EMBED_DIM=128,
    NUM_HEADS=4,
    NUM_LAYERS=2,
    LR=3e-4,
    TOKENIZER_NAME="gpt2",
    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)


# -------------------------------
# Load tokenizer
# -------------------------------
# Fetch the pretrained tokenizer named in the configuration.
tokenizer = AutoTokenizer.from_pretrained(config["TOKENIZER_NAME"])
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no PAD token, so EOS is reused as padding

# -------------------------------
# Load dataset (Tiny Shakespeare)
# -------------------------------
dataset = load_dataset("karpathy/tiny_shakespeare")
# Take the "text" field of the first record of the train split as the training corpus.
text = dataset["train"][0]["text"]

# -------------------------------
# Build the token dataset
# -------------------------------
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a single tokenized text.

    Sample ``i`` is the pair ``(tokens[i:i+seq_len], tokens[i+1:i+seq_len+1])``,
    i.e. the target is the input window shifted one token to the right.

    The token ids are stored once as a 1-D ``torch.long`` tensor and windows are
    sliced on demand. Materializing every overlapping window up front would
    duplicate each token roughly ``2 * seq_len`` times.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object with an ``encode(text)`` method returning a sequence
            of integer token ids.
        seq_len: Number of tokens in each input/target window.
    """

    def __init__(self, text, tokenizer, seq_len):
        self.seq_len = seq_len
        self.tokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

    @property
    def samples(self):
        """All ``(input, target)`` windows as Python lists.

        Kept for backward compatibility; this materializes every window, so it
        is expensive on long texts.
        """
        return [
            (
                self.tokens[i:i + self.seq_len].tolist(),
                self.tokens[i + 1:i + self.seq_len + 1].tolist(),
            )
            for i in range(len(self))
        ]

    def __len__(self):
        # The target of window i ends at index i + seq_len, so the last valid
        # start is len(tokens) - seq_len - 1, giving len(tokens) - seq_len
        # windows. Texts that are too short yield an empty dataset.
        return max(0, self.tokens.numel() - self.seq_len)

    def __getitem__(self, idx):
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError("TextDataset index out of range")
        stop = idx + self.seq_len
        # Clone so callers receive independent tensors rather than views of
        # the shared token buffer.
        return self.tokens[idx:stop].clone(), self.tokens[idx + 1:stop + 1].clone()

# Read the held-out text before `dataset` is rebound below, so validation is
# measured on data the model is not trained on.
val_text = dataset["validation"][0]["text"]

train_dataset = TextDataset(text, tokenizer, config["SEQ_LEN"])
val_dataset = TextDataset(val_text, tokenizer, config["SEQ_LEN"])
dataset = train_dataset  # keep the original name bound to the training dataset

train_loader = DataLoader(train_dataset, batch_size=config["BATCH_SIZE"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config["BATCH_SIZE"])

# -------------------------------
# Decoder-only transformer
# -------------------------------
class DecoderOnlyTransformer(nn.Module):
    """Causal language model built from ``nn.TransformerDecoder`` layers.

    Token and learned positional embeddings are summed, passed through the
    decoder stack under a causal (square subsequent) mask, and projected to
    vocabulary logits.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of the
            output logits.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per layer.
        num_layers: Number of decoder layers.
        seq_len: Maximum input length; size of the positional embedding table.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, seq_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(seq_len, embed_dim)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4*embed_dim,
            dropout=0.1,
            activation="gelu"
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        self.embed_dim = embed_dim
        self.seq_len = seq_len

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer token ids of shape ``[B, T]`` with ``T <= seq_len``.

        Returns:
            Logits of shape ``[B, T, vocab_size]``.

        Raises:
            IndexError: If ``T`` exceeds ``seq_len``.
        """
        B, T = x.size()
        if T > self.seq_len:
            # Fail early with a clear message instead of overrunning the
            # positional embedding table inside the embedding lookup.
            raise IndexError(
                f"input length {T} exceeds seq_len={self.seq_len}; "
                "trim the input to the last seq_len tokens"
            )

        positions = torch.arange(0, T, device=x.device).unsqueeze(0)
        x = self.token_embed(x) + self.pos_embed(positions)

        # Causal mask: position t may only attend to positions <= t.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)
        # nn.TransformerDecoder requires a `memory` argument; there is no
        # encoder here, so an all-zero tensor stands in for it.
        dummy_memory = torch.zeros(T, B, self.embed_dim, device=x.device)

        x = x.transpose(0, 1)  # [T, B, E]
        out = self.decoder(tgt=x, memory=dummy_memory, tgt_mask=tgt_mask)
        out = self.fc_out(out.transpose(0, 1))  # [B, T, vocab]
        return out

# Build the network from the shared configuration, then move it to the
# configured device.
model = DecoderOnlyTransformer(
    tokenizer.vocab_size,
    config["EMBED_DIM"],
    config["NUM_HEADS"],
    config["NUM_LAYERS"],
    config["SEQ_LEN"],
)
model = model.to(config["DEVICE"])

# -------------------------------
# Training
# -------------------------------
# Cross-entropy over next-token logits, optimized with Adam over all parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config["LR"])

def evaluate(model, dataloader):
    """Return the mean per-token loss of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled. Batches are weighted by their token count so that a shorter
    final batch does not skew the average.

    Args:
        model: Module mapping token ids ``[B, T]`` to logits ``[B, T, vocab]``.
        dataloader: Iterable of ``(x, y)`` token-id batches.

    Returns:
        The average loss as a float, or ``nan`` when ``dataloader`` yields no
        tokens.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(config["DEVICE"]), y.to(config["DEVICE"])
            logits = model(x)
            loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
            # Assumes loss_fn uses mean reduction (the nn.CrossEntropyLoss
            # default): multiply back by the token count to accumulate a sum.
            n_tokens = y.numel()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens
    if total_tokens == 0:
        return float("nan")
    return total_loss / total_tokens

  #---------------------------------------------------------------------------------#
# Experiment tracking: authenticate with Weights & Biases (only needed once per
# session) and open a run that records the hyperparameters in `config`.
wandb.login()
run = wandb.init(project="shakespeare-transformer", config=config)

# Training loop: one optimization pass over `train_loader` per epoch, followed
# by an evaluation pass over `val_loader`; both losses are logged to the run.
for epoch in range(config["EPOCHS"]):
    model.train()
    train_loss = 0
    for x, y in train_loader:
        x = x.to(config["DEVICE"])
        y = y.to(config["DEVICE"])

        optimizer.zero_grad()
        logits = model(x)
        # Flatten [B, T, vocab] logits and [B, T] targets for the loss.
        flat_logits = logits.view(-1, logits.size(-1))
        loss = loss_fn(flat_logits, y.view(-1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average the training loss over the epoch's batches.
    train_loss /= len(train_loader)
    val_loss = evaluate(model, val_loader)

    metrics = {
        "train_loss": train_loss,
        "validation_loss": val_loss,
        "epoch": epoch + 1,
    }
    wandb.log(metrics)
    print(f"Epoch {epoch+1}/{config['EPOCHS']} | Validation Loss: {val_loss:.4f}")

# -------------------------------
# Text generation
# -------------------------------
def generate(model, prompt, max_tokens=50):
    """Greedily extend ``prompt`` by ``max_tokens`` tokens and return the text.

    At each step the model sees at most the last ``config["SEQ_LEN"]`` tokens
    and the highest-scoring next token is appended. The model is switched to
    eval mode and left there.

    Args:
        model: Module mapping token ids ``[B, T]`` to logits ``[B, T, vocab]``.
        prompt: Text to continue.
        max_tokens: Number of tokens to append.

    Returns:
        The decoded prompt plus generated continuation.

    Raises:
        ValueError: If tokens are requested but ``prompt`` encodes to none,
            leaving the model nothing to condition on.
    """
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(config["DEVICE"])
    if max_tokens > 0 and input_ids.numel() == 0:
        raise ValueError("prompt must encode to at least one token")
    with torch.no_grad():
        for _ in range(max_tokens):
            input_trim = input_ids[:, -config["SEQ_LEN"]:]
            logits = model(input_trim)
            # keepdim=True yields shape [B, 1], which concatenates along the
            # sequence axis for any batch size.
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=1)
    return tokenizer.decode(input_ids[0])

# Example: continue a short prompt with the trained model.
sample_prompt = "ROMEO: "
print("\nGenerated text:\n")
print(generate(model, sample_prompt, max_tokens=50))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

tiny_shakespeare.py:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

The repository for karpathy/tiny_shakespeare contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/karpathy/tiny_shakespeare.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbasan-1994-15[0m ([33mbasan-1994-15-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/4 | Validation Loss: 2.4614
Epoch 2/4 | Validation Loss: 1.8690
Epoch 3/4 | Validation Loss: 1.5273
Epoch 4/4 | Validation Loss: 1.2834

Generated text:

ROMEO: 

JULIET:
I'll gladly learn, I'll tell thee,
I'll not believe it.

ROMEO:
I'll go along with thee.

JULIET:
I'll gladly learn


In [None]:
# Optional one-time setup, to be run from a shell if the Hugging Face CLI is not yet configured:
#   pip install huggingface_hub
#   huggingface-cli login


In [None]:
# Save the model and tokenizer locally, then publish them to the Hugging Face Hub.
from huggingface_hub import create_repo, upload_folder, login

save_dir = Path("my_shakespeare_model")
save_dir.mkdir(exist_ok=True)

# Save the model weights.
torch.save(model.state_dict(), save_dir / "pytorch_model.bin")

# Save the tokenizer files.
tokenizer.save_pretrained(save_dir)

# torch.device is not JSON serializable, so write a copy of the config with
# DEVICE as a string. The shared `config` keeps its torch.device, so earlier
# cells can be re-run unchanged.
serializable_config = {**config, "DEVICE": str(config["DEVICE"])}
with open(save_dir / "config.json", "w") as f:
    json.dump(serializable_config, f)

# Upload to the Hugging Face Hub.
login()

repo_name = "decoder-shakespeare-gpt"
# Use one fully qualified id for both calls so the repo that is created is the
# repo that is uploaded to. NOTE: the namespace is hard-coded; change it to
# publish under a different account.
repo_id = f"NataliiaM15/{repo_name}"
create_repo(repo_id, exist_ok=True)

upload_folder(
    folder_path=save_dir,
    repo_id=repo_id,
    commit_message="Initial upload"
)

# Close the W&B run.
wandb.finish()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

pytorch_model.bin:   0%|          | 0.00/53.8M [00:00<?, ?B/s]

0,1
epoch,▁▃▆█
train_loss,█▄▂▁
validation_loss,█▄▂▁

0,1
epoch,4.0
train_loss,1.82814
validation_loss,1.28343
