In [7]:
# initializierung
!pip install transformers datasets wandb



In [9]:
## importierungen und wandb anmeldung
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import wandb
wandb.login()

True

## STEP 1 - Dataset Vorbereitung


In [11]:
# Dataset indir
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Train / Validation ayır
train_data = dataset["train"]
val_data = dataset["validation"]




## STEP 2 - Tokenizer ve Tokenization

In [12]:
# Tokenizer indir (GPT2 tokenizer)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token

# Max sequence length
max_len = 64  # Sabit uzunluk

# Textleri tokenize et
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_len, padding="max_length")

# Tüm veriyi tokenize et
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

## STEP 3 — DataLoader Vorbereitung

In [14]:
# 1) Dataset formatını düzelt
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# 2) DataLoader tanımı
from torch.utils.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

## STEP 4  - Model Installation --NanoTransformer--
(Mini Transformer — Decoder Only)
-Amacımız:
-->*Basit bir Decoder mimarisi kurmak.
-->*(GPT tarzı — Text Generation için uygun)

## STEP 4.1 - Transformer Block

In [17]:
# Bir adet Transformer Block tanımlıyoruz
class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()

        # Multi-Head Attention
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

        # Layer Norm (Stabilizasyon için)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Feed Forward Layer (MLP)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x, attn_mask=None):
        # Attention Uygula
        attn_output, _ = self.attention(x, x, x, attn_mask=attn_mask)

        # Residual + Norm
        x = self.norm1(x + attn_output)

        # Feed Forward Uygula
        ff_output = self.ff(x)

        # Residual + Norm
        x = self.norm2(x + ff_output)

        return x


## STEP 4.2 - NanoTransformer Model

In [18]:
# Final Model

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()

        # Embedding
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

        # Transformer Block
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff) for _ in range(num_layers)
        ])

        # Layer Norm
        self.norm = nn.LayerNorm(d_model)

        # Output
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x, attention_mask=None):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)

        # Toplam embedding
        x = self.token_embed(x) + self.pos_embed(positions)


        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        logits = self.output_proj(x)
        return logits


## Adım 5 — Model Kurulumu ve Eğitim Hazırlığı

Bu adımda şunları yapacağız:

-Model objesini oluşturacağız
-Optimizer tanımlayacağız
-Loss function seçeceğiz
-Training Loop hazırlığı yapacağız

## STEP 5.1 - Model Hyperparametern Definieren

In [21]:
# Model Hyperparametrelerini Tanımla
vocab_size = tokenizer.vocab_size
d_model = 64       # Embedding boyutu
n_heads = 2        # Attention Head sayısı
d_ff = 256         # Feedforward genişliği
max_len = 64       # Maksimum sequence uzunluğu
epochs = 5
lr = 1e-3
num_layers = 2     # Transformer blok sayısı

# Model Oluştur
model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)


In [22]:
wandb.login()

wandb.init(
    project="nano-transformer-project",
    name="nano-transformer-v1",
    config={
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "max_len": max_len,
        "d_model": d_model,
        "num_layers": num_layers,
    }
)

## STEP 5.2 -  Optimizer ve Loss Function Tanımla


In [9]:
import torch.optim as optim

# Loss Fonksiyonu — Çoklu sınıflarda klasik olan
criterion = nn.CrossEntropyLoss()

# Optimizer — Adam optimizasyon algoritması
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# GPU var mı kontrol et
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



## STEP 5.4 — Training Loop (Eğitim Döngüsü)


In [25]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        inputs = batch["input_ids"].to(device)
        targets = inputs.clone()

        outputs = model(inputs)  # attention_mask göndermiyoruz
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f}")
    wandb.log({"train_loss": avg_loss, "epoch": epoch+1})

wandb.finish()

Epoch 1/5 | Train Loss: 4.4894


KeyboardInterrupt: 

## 5.5 - Text Generation

In [30]:
def generate(model, start_token, max_len=50, temperature=1.0):
    model.eval()
    input_ids = start_token.to(device)

    for _ in range(max_len):
        logits = model(input_ids)
        next_token_logits = logits[:, -1, :] / temperature
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids.squeeze().tolist()


In [31]:
start_text = "The future of AI"
input_ids = tokenizer.encode(start_text, return_tensors="pt").to(device)

output_ids = generate(model, input_ids, max_len=50)
output_text = tokenizer.decode(output_ids)
print(output_text)


The future of AI shellsster Bangkok On Expressionstersetry research the the rag of of of Ott eclips Servicedensity death  Music end theories Hu prolific sheolithicrealDonaldTrump and and anded Tours announcedadersFly ver Jim-- responds usageGiving investigating statistically ranks dro reliant held terrorism


## STEP 6 - Hugging Face

## STEP 6.5 - Text Generation

In [33]:
# Hugging Face GPT-2 için Text Generation

from transformers import AutoTokenizer, AutoModelForCausalLM

# Model ve Tokenizer yükle
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding Token ayarla (GPT2'de yok)
tokenizer.pad_token = tokenizer.eos_token

# Generate için başlangıç cümlesi
start_text = "The future of AI"

# Encode input
input_ids = tokenizer.encode(start_text, return_tensors="pt").to(model.device)

# Text Generate
output_ids = model.generate(
    input_ids=input_ids,
    max_length=100,          # Üretilecek toplam uzunluk
    num_return_sequences=1,  # Kaç farklı cümle üretilecek
    do_sample=True,          # Sampling açık (rastgelelik için)
    temperature=1.0,         # Random seviyesini belirler
    top_k=50,                # Top-k sampling
    top_p=0.95,              # Top-p sampling
    pad_token_id=tokenizer.eos_token_id
)

# Decode & Print
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The future of AI will change as well, according to the BBC, with the development of machine learning capabilities that are able to help solve complex problems.

One such capability is that of "deep learning", which aims to combine computer programs with a "deep learning network" to build a model of something that resembles humans, but without the need for special computer algorithms to work.

Image copyright The BBC Image caption IBM's Watson, for example, is expected to help develop AI

Image
