## Installing required libraries

In [46]:
%pip install torch transformers datasets wandb




Note: you may need to restart the kernel to use updated packages.


In [47]:
%pip install setuptools
%pip install wheel
%pip install wandb
%pip install huggingface_hub[hf_xet]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## importing libraries 

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer
from datasets import load_dataset
import wandb
from huggingface_hub import HfApi, HfFolder, Repository
import os
import random

## Configurations

In [49]:
class Config:
    block_size = 128
    batch_size = 32
    embed_dim = 128
    num_heads = 4
    num_layers = 2
    dropout = 0.1
    epochs = 5
    lr = 1e-3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    wandb_project = "tiny-shakespeare-decoder-only"
    model_name = "decoder-only-tinyshakespeare"

cfg = Config()



## Preparing and Preparing Data

In [50]:
import requests


data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
with open("shakespeare.txt", "w") as f:
    f.write(requests.get(data_url).text)

with open("shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()
from transformers import GPT2TokenizerFast

## Loading   Tokenizer 


In [51]:

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})



tokens = tokenizer(text, return_tensors="pt")
input_ids = tokens["input_ids"].squeeze(0)

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


## spliting dataset

In [52]:

n = int(0.9 * len(input_ids))
train_ids = input_ids[:n]
val_ids = input_ids[n:]

def get_batch(data, block_size, batch_size):
    if len(data) <= block_size:
        raise ValueError(f"Datensatz ist zu klein für block_size={block_size} (nur {len(data)} Tokens).")

    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(cfg.device), y.to(cfg.device)



## Decoder-only Modell


In [53]:

class DecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, cfg.embed_dim)
        self.pos_embedding = nn.Parameter(torch.zeros(1, cfg.block_size, cfg.embed_dim))

        decoder_layer = nn.TransformerDecoderLayer(d_model=cfg.embed_dim, nhead=cfg.num_heads, dropout=cfg.dropout)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=cfg.num_layers)

        self.output_proj = nn.Linear(cfg.embed_dim, vocab_size)

    def forward(self, x):
        tok_emb = self.embedding(x)
        pos_emb = self.pos_embedding[:, :x.size(1), :]
        x = tok_emb + pos_emb

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)

        # Autoregressives Target als sowohl memory als auch tgt
        out = self.decoder(x.transpose(0, 1), x.transpose(0, 1), tgt_mask=tgt_mask)
        logits = self.output_proj(out.transpose(0, 1))
        return logits

In [54]:
import wandb
wandb.login(key="d2a634cf8a08029fc63ddf4ff73e2da22ac1c1e0")



True

## Training loop

In [82]:
#  Training preparation
model = DecoderOnlyModel(vocab_size=tokenizer.vocab_size).to(cfg.device)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
loss_fn = nn.CrossEntropyLoss()

import wandb
wandb.init(project=cfg.wandb_project, config=vars(cfg))

# raining Loop
for epoch in range(6):  
    model.train()
    total_loss = 0
    for step in range(200):  
        x, y = get_batch(train_ids, cfg.block_size, cfg.batch_size)
        logits = model(x)
        loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / 200

    # Evaluation
    model.eval()
    with torch.no_grad():
        val_x, val_y = get_batch(val_ids, cfg.block_size, cfg.batch_size)
        val_logits = model(val_x)
        val_loss = loss_fn(val_logits.view(-1, val_logits.size(-1)), val_y.view(-1)).item()

    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f}")
    wandb.log({"train_loss": avg_train_loss, "val_loss": val_loss, "epoch": epoch+1})


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▃▅▆█
train_loss,█▄▃▂▁
val_loss,▇▅▅█▁

0,1
epoch,5.0
train_loss,3.34305
val_loss,4.32956


Epoch 1: Train Loss = 5.7444 | Val Loss = 5.3130
Epoch 2: Train Loss = 4.3734 | Val Loss = 4.7451




Epoch 3: Train Loss = 3.9457 | Val Loss = 4.5034
Epoch 4: Train Loss = 3.6516 | Val Loss = 4.8341
Epoch 5: Train Loss = 3.3423 | Val Loss = 4.6937
Epoch 6: Train Loss = 3.0237 | Val Loss = 4.4508


## Text Generation

In [95]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0):
    model.eval()
    generated = tokenizer(prompt, return_tensors="pt")["input_ids"].to(cfg.device)
    
    for _ in range(max_new_tokens):
        input_ids = generated[:, -cfg.block_size:]
        with torch.no_grad():
            logits = model(input_ids)
        logits = logits[:, -1, :] / temperature  
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated = torch.cat((generated, next_token), dim=1)
    
    return tokenizer.decode(generated[0], skip_special_tokens=True)

prompt = " long time ago"
output = generate_text(model, tokenizer, prompt, max_new_tokens=100)
print(output)


 long time ago his power
The senate his power his mighty man his dev:
From his power
The power had swallow his power
Nor his power
The realm for his power
The each his power,
war begin his power dead and his power and freshest his power:
The Rome,
 sovereign? and his tent his power? and bl with his power and his charge?

 his mother? and his, mark his new-wh tongue his? and lam?, who deny his


## Saving the Model

In [91]:
import os
import json
import torch
from huggingface_hub import HfApi, HfFolder, upload_folder, create_repo

save_dir = "decoder-gpt-julienne"
os.makedirs(save_dir, exist_ok=True)

#  Save model weights
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))

#  Save tokenizer
tokenizer.save_pretrained(save_dir)

#  Save model config
model_config = {
    "model_type": "decoder-only",
    "vocab_size": tokenizer.vocab_size,
    "embed_dim": cfg.embed_dim,
    "num_heads": cfg.num_heads,
    "num_layers": cfg.num_layers,
    "dropout": cfg.dropout,
    "block_size": cfg.block_size
}
with open(os.path.join(save_dir, "config.json"), "w") as f:
    json.dump(model_config, f)

# README.md
with open(os.path.join(save_dir, "README.md"), "w") as f:
    f.write(
        "# Decoder Only GPT by Julienne Mizero\n"
        "This repository contains a decoder-only GPT model fine-tuned by Julienne Mizero. The model is designed to be used for various text generation tasks."
        "#Model Description\n"
        "The decoder-only GPT model is a generative language model that is based on the transformer architecture. It is fine-tuned on a specific dataset to improve its ability to generate coherent text based on a given prompt."
    )


## Uploading to Hugging Face Hub

In [92]:
from huggingface_hub import login, upload_folder
import os

# Your Hugging Face token
HUGGINGFACE_TOKEN = "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"

# Login to Hugging Face
login(token=HUGGINGFACE_TOKEN)

#  Your Hugging Face username
username = "JulienneMizero"

# Define repo info (Include the username in repo_id)
repo_name = "decoder-gpt-julienne"
repo_id = f"{username}/{repo_name}"  # Correct repo ID format

# Define the model folder path (replace with the actual folder you want to upload)
model_folder = r"C:\Users\hshakademie7\Desktop\INTOCODE WEITERBILDUNG\Generative_Al_project\GenerativeAI-Project\decoder-gpt-julienne"

# Check if the folder exists
if not os.path.isdir(model_folder):
    raise FileNotFoundError(f"Folder '{model_folder}' not found!")

# Upload the folder to your repo
upload_folder(
    repo_id=repo_id,
    folder_path=model_folder,  # This should be the local path to your model folder
    path_in_repo="https://huggingface.co/JulienneMizero/decoder-gpt-julienne",  # Upload everything at the root of the repo
    commit_message="Upload model from Julienne"
)

print(" Upload finished!")


- empty or missing yaml metadata in repo card
pytorch_model.bin: 100%|██████████| 57.0M/57.0M [01:21<00:00, 700kB/s] 


 Upload finished!
