In [20]:
print("*** Abschlussprojekt  AI Sprachmodell***")        

*** Abschlussprojekt  AI Sprachmodell***


In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
from datasets import load_dataset
from transformers import AutoTokenizer
import wandb
import os
import json
from huggingface_hub import login, create_repo, upload_folder


In [22]:
# 🔐 تسجيل الدخول إلى Hugging Face
login("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")

In [23]:
 #  Hyperparameter - small model suitable for CPU training

config = {
    "vocab_size": None,       #  Vocabulary size
    "d_model": 32,             # حجم تمثيل الكلمات - Embedding dimension
    "nhead": 2,                # عدد الرؤوس في MultiheadAttention
    "num_layers": 1,           # عدد طبقات Transformer
    "dim_feedforward": 128,    # حجم الطبقة الخفية - FFN dimension
    "dropout": 0.1,            # Dropout rate
    "block_size": 32,          # طول التسلسل المدخل - Sequence length
    "batch_size": 2,
    "epochs":5,
    "lr": 5e-4,                #  Learning rate
    "device": "cpu"            
}

In [32]:
# Projektkonfiguration mit wandb
wandb.init(project="my-language-model", config=config)


In [25]:
# 🧠 Laden des Tokenizers

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.model_max_length = config["block_size"]
tokenizer.pad_token = tokenizer.eos_token  #  تسوية الأطوال padding
config["vocab_size"] = tokenizer.vocab_size

In [26]:
# Shakespeare-Daten
text_data = [
    "All the world's a stage, and all the men and women merely players.",
    "The fault, dear Brutus, is not in our stars, but in ourselves."
    ]




In [27]:
tokenized = tokenizer("\n\n".join(text_data), return_tensors="pt")
input_ids = tokenized.input_ids[0]

# ✂️ تقسيم التسلسلات Sequenzteilung
def create_sequences(tokens, block_size):
    return [tokens[i:i+block_size] for i in range(0, len(tokens)-block_size)]

sequences = create_sequences(input_ids, config["block_size"])


Token indices sequence length is longer than the specified maximum sequence length for this model (35 > 32). Running this sequence through the model will result in indexing errors


In [28]:
# Dataset 
class TextDataset(Dataset):
    def __init__(self, sequences):
        self.data = sequences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return torch.tensor(x), torch.tensor(y)

dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True)


In [29]:
# تعريف النموذجTransformer Decoder
class TransformerLanguageModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(config["vocab_size"], config["d_model"])
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=config["d_model"],
            nhead=config["nhead"],
            dim_feedforward=config["dim_feedforward"],
            dropout=config["dropout"],
            batch_first=True
        )
        self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=config["num_layers"])
        self.output = nn.Linear(config["d_model"], config["vocab_size"])

    def forward(self, x):
        # x: [batch_size, seq_len]
        emb = self.embedding(x)  # [batch_size, seq_len, d_model]
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(emb.size(1)).to(emb.device)
        out = self.transformer(emb, emb, tgt_mask=tgt_mask)
        return self.output(out)

In [30]:
# 🛠️ Modell- und Optimierer-Initialisierung 
model = TransformerLanguageModel(config).to(config["device"])
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
loss_fn = nn.CrossEntropyLoss()

In [40]:

#🔁 Training
for epoch in range(config["epochs"]):
    model.train()
    total_loss = 0

    for x, y in dataloader:
        x, y = x.to(config["device"]), y.to(config["device"])
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits.view(-1, config["vocab_size"]), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{config['epochs']} - Loss: {avg_loss:.4f}")
    wandb.init(project="my-language-model", config=config)
    wandb.log({"train_loss": avg_loss, "epoch": epoch+1})
    
    # 🔍 Performance Evaluation  تقييم الأداء
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for x, y in dataloader:
            x, y = x.to(config["device"]), y.to(config["device"])
            logits = model(x)
            loss = loss_fn(logits.view(-1, config["vocab_size"]), y.view(-1))
            val_loss += loss.item()
        val_loss /= len(dataloader)
        wandb.log({"val_loss": val_loss, "epoch": epoch+1})
        
wandb.finish()

  return torch.tensor(x), torch.tensor(y)


Epoch 1/5 - Loss: 8.8433


Epoch 2/5 - Loss: 8.7302


  return torch.tensor(x), torch.tensor(y)


0,1
epoch,▁▁
train_loss,▁
val_loss,▁

0,1
epoch,1.0
train_loss,8.8433
val_loss,8.72608


Epoch 3/5 - Loss: 8.6742


  return torch.tensor(x), torch.tensor(y)


0,1
epoch,▁▁
train_loss,▁
val_loss,▁

0,1
epoch,2.0
train_loss,8.73024
val_loss,8.62052


Epoch 4/5 - Loss: 8.5756


  return torch.tensor(x), torch.tensor(y)


0,1
epoch,▁▁
train_loss,▁
val_loss,▁

0,1
epoch,3.0
train_loss,8.67421
val_loss,8.5209


Epoch 5/5 - Loss: 8.4837


  return torch.tensor(x), torch.tensor(y)


0,1
epoch,▁▁
train_loss,▁
val_loss,▁

0,1
epoch,4.0
train_loss,8.57564
val_loss,8.44951


  return torch.tensor(x), torch.tensor(y)


0,1
epoch,▁▁
train_loss,▁
val_loss,▁

0,1
epoch,5.0
train_loss,8.48365
val_loss,8.35677


In [41]:
# 🧠Generate text 

def generate_text(model, tokenizer, prompt, max_length=50):
    model.eval()
    tokens = tokenizer.encode(prompt, return_tensors="pt").to(config["device"])
    generated = tokens
    with torch.no_grad():
        for _ in range(max_length):
            if generated.size(1) > config["block_size"]:
                generated = generated[:, -config["block_size"]:]
            output = model(generated)
            next_token_logits = output[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            generated = torch.cat((generated, next_token), dim=1)
    result = tokenizer.decode(generated[0], skip_special_tokens=True)
    return result
print(generate_text(model, tokenizer, "The king said"))

,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:

# 💾 Save-Upload 
repo_name = "Mini-GenModel"
os.makedirs(repo_name, exist_ok=True)
torch.save(model.state_dict(), f"{repo_name}/pytorch_model.bin")
tokenizer.save_pretrained(repo_name)
with open(f"{repo_name}/config.json", "w") as f:
    json.dump(config, f)

upload_folder(folder_path=repo_name, repo_id=f"rahaf-aswad/{repo_name}")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/rahaf-aswad/Mini-GenModel/commit/cfe89674e64786fae3a36e210ec66e1ec8d17051', commit_message='Upload folder using huggingface_hub', commit_description='', oid='cfe89674e64786fae3a36e210ec66e1ec8d17051', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rahaf-aswad/Mini-GenModel', endpoint='https://huggingface.co', repo_type='model', repo_id='rahaf-aswad/Mini-GenModel'), pr_revision=None, pr_num=None)