In [2]:
import torch
import torch.nn as nn
from kasim_model import Kasim_Model
from kasim_tokenizer import Tokenizer   

k_tokenizer=Tokenizer("tokenizer.json")
sentence = "the capital of united states"
tokens = k_tokenizer.encode(sentence)

# tokens tek boyutlu: torch.Size([20])
#tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # shape: (1, 20)
tokens

tensor([ 0, 61,  1, 61,  2, 61,  3, 61,  4, 58])

In [3]:
import torch
from kasim_model import Kasim_Model
torch.manual_seed(1)
k_model = Kasim_Model(vocab_size=len(k_tokenizer.vocab), embed_dim=12, context_length=32, num_heads=4, num_layers=8)
out= k_model(tokens)
out.shape


torch.Size([1, 10, 64])

In [4]:
with open ("text.txt","r") as f:
    text=f.read()

text    

'the capital of the united states is not london. the capital of france is paris, and berlin is the capital of germany. rome is in italy, \n\nmadrid is in spain, and lisbon is in portugal. the capital of the united kingdom is not paris, and the capital of the united states is not berlin. \nalthough these places are often mentioned together, although these capitals are often mentioned together, although these are often mentioned together, \neach country has its own capital, and each country has its own city, and each capital has its own identity, and each capital has its own history. washington \nis the capital of the united states, and london is the capital of the united kingdom. paris is known for art and fashion, and berlin is known for art and \nhistory, and rome is known for art and history, and madrid is known for culture and history, and lisbon is known for culture and art. rome is rich with culture, \nrome is rich with history, rome is rich with art, and madrid is rich with art a

In [5]:
token_id=k_tokenizer.encode(text)
len(token_id)
len(text)

4104

In [6]:
token_id = token_id.detach().cpu().numpy().tolist()

In [7]:
from text_dataset import TextDataset

stride = 12
context_length=32
dataset=TextDataset(token_id,context_length,stride)
len(dataset.inputs)

131

In [8]:
# model parameters count
parameters_count = sum(p.numel() for p in k_model.parameters())
print(parameters_count)

# model architecture
print(k_model)

23104
Kasim_Model(
  (embedding): Embedding(64, 12)
  (pos_embed): Embedding(32, 12)
  (layers): Sequential(
    (0): KasimDecoderBlock(
      (self_attention): Kasim_Multi_Head_Attention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)
        )
        (projection): Linear(in_features=12, out_features=12, bias=True)
      )
      (norm1): KasimRMSNorm()
      (mlp): KasimMLP(
        (gate_proj): Linear(in_features=12, out_features=48, bias=True)
        (up_proj): Linear(in_features=12, out_features=48, bias=True)
        (down_proj): Linear(in_features=48, out_features=12, bias=True)
        (gelu): KasimGelu()
      )
      (norm2): KasimRMSNorm()
    )
    (1): KasimDecoderBlock(
      (self_attention): Kasim_Multi_Head_Attention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=12, out_features=12, bias=True)
 

In [9]:
k_model.embedding.weight.shape

torch.Size([64, 12])

In [10]:
from torch.utils.data import DataLoader
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

batch = next(iter(dataloader))
inputs = batch["input_ids"]   # zaten tensor
targets = batch["labels"]     # zaten tensor

out = k_model(inputs)
out = out.reshape(-1, out.size(-1))
targets = targets.reshape(-1)
loss = loss_fn(out, targets)
print(loss)


tensor(4.5460, grad_fn=<NllLossBackward0>)


In [11]:
optimizer=torch.optim.AdamW(k_model.parameters(),lr=1e-3)

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
k_model.to(device)
epochs=500
for epoch in range(epochs):
    total_loss = 0.
    for batch in dataloader:
        inputs = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)

        pred = k_model(inputs)
        pred = pred.reshape(-1, pred.size(-1))
        targets = targets.reshape(-1)

        loss = loss_fn(pred, targets)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, average loss: {avg_loss:.4f}")


Epoch 1, average loss: 4.3499
Epoch 2, average loss: 4.0036
Epoch 3, average loss: 3.6388
Epoch 4, average loss: 3.3463
Epoch 5, average loss: 3.0635
Epoch 6, average loss: 2.9429
Epoch 7, average loss: 2.8281
Epoch 8, average loss: 2.7108
Epoch 9, average loss: 2.5849
Epoch 10, average loss: 2.5430
Epoch 11, average loss: 2.4810
Epoch 12, average loss: 2.3911
Epoch 13, average loss: 2.3433
Epoch 14, average loss: 2.3017
Epoch 15, average loss: 2.2534
Epoch 16, average loss: 2.2349
Epoch 17, average loss: 2.2302
Epoch 18, average loss: 2.1499
Epoch 19, average loss: 2.1372
Epoch 20, average loss: 2.1519
Epoch 21, average loss: 2.0808
Epoch 22, average loss: 2.0890
Epoch 23, average loss: 2.0911
Epoch 24, average loss: 2.0413


KeyboardInterrupt: 

In [13]:
epochs = 200

for epoch in range(epochs):
    total_loss = 0.
    for batch in dataloader:
        inputs = batch["input_ids"]
        targets = batch["labels"]

        pred = k_model(inputs)
        pred = pred.reshape(-1, pred.size(-1))
        targets = targets.reshape(-1)

        loss = loss_fn(pred, targets)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, average loss: {avg_loss:.4f} ")


Epoch 1, average loss: 2.0500 
Epoch 2, average loss: 2.0166 
Epoch 3, average loss: 2.0180 
Epoch 4, average loss: 2.0176 
Epoch 5, average loss: 2.0167 
Epoch 6, average loss: 1.9749 
Epoch 7, average loss: 1.9866 
Epoch 8, average loss: 1.9771 
Epoch 9, average loss: 1.9837 
Epoch 10, average loss: 1.9179 
Epoch 11, average loss: 1.9382 
Epoch 12, average loss: 1.9197 
Epoch 13, average loss: 1.9055 
Epoch 14, average loss: 1.9465 
Epoch 15, average loss: 1.9418 
Epoch 16, average loss: 1.9218 
Epoch 17, average loss: 1.9026 
Epoch 18, average loss: 1.9179 
Epoch 19, average loss: 1.8623 
Epoch 20, average loss: 1.8660 
Epoch 21, average loss: 1.8883 
Epoch 22, average loss: 1.8957 
Epoch 23, average loss: 1.8586 
Epoch 24, average loss: 1.8643 
Epoch 25, average loss: 1.8627 
Epoch 26, average loss: 1.8727 
Epoch 27, average loss: 1.8347 
Epoch 28, average loss: 1.8243 
Epoch 29, average loss: 1.8690 
Epoch 30, average loss: 1.8199 
Epoch 31, average loss: 1.8313 
Epoch 32, average

KeyboardInterrupt: 

In [56]:
import torch

out=k_model(tokens)
probs=torch.softmax(out[-1],dim=-1)
max_prob,max_indext=torch.max(probs,dim=-1)
max_prob,max_indext,probs

(tensor([0.9996, 0.3144, 0.9993, 0.5676, 0.9997, 0.3654, 0.9996, 0.2352, 0.9903,
         0.8004], grad_fn=<MaxBackward0>),
 tensor([61,  1, 61,  2, 61,  2, 61,  3, 58, 61]),
 tensor([[2.3912e-09, 9.1006e-10, 4.1367e-11, 2.1925e-12, 5.4059e-09, 1.9880e-06,
          8.8059e-12, 2.9546e-10, 5.7932e-12, 2.9939e-12, 3.3423e-10, 1.0838e-08,
          7.6299e-15, 4.6250e-06, 4.7222e-07, 2.5737e-09, 8.5263e-11, 1.4422e-09,
          3.6366e-12, 5.4855e-12, 2.0818e-11, 9.3524e-10, 2.3824e-11, 7.4980e-11,
          6.4473e-13, 6.3438e-08, 7.0918e-08, 2.0496e-06, 8.5163e-10, 1.8009e-11,
          5.3389e-08, 1.8705e-12, 5.8771e-13, 1.9998e-10, 1.9882e-07, 2.6093e-11,
          1.7134e-15, 2.5912e-09, 1.2627e-10, 1.2760e-11, 2.3119e-12, 1.6592e-09,
          2.4493e-11, 2.6871e-10, 4.7252e-08, 1.6120e-09, 4.7048e-10, 1.5141e-13,
          1.1342e-11, 3.4257e-11, 6.2062e-10, 1.1431e-10, 1.8579e-13, 3.0879e-07,
          1.0523e-10, 2.3922e-10, 9.0388e-16, 3.1162e-06, 1.5734e-08, 4.0243e-04,
     

In [70]:
import torch

# Tokenize ve tensor al
new_tokens = k_tokenizer.encode("madrid is in")  # tensor

# Son token ekle
new_tokens = torch.cat([new_tokens, torch.tensor([61])], dim=0)  # (seq_len+1,)

# Batch dimension ekle
inputs = new_tokens.unsqueeze(0)  # (1, seq_len+1)

# Modeli çalıştır
out = k_model(inputs)  # (1, seq_len+1, vocab_size)

# Son token için olasılık ve tahmin
last_token_logits = out[:, -1, :]           # (1, vocab_size)
probs = torch.softmax(last_token_logits, dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)

print("Max probability:", max_prob.item())
print("Predicted token id:", max_index.item())


Max probability: 0.8347604274749756
Predicted token id: 51


Generated text: madrid is in italy, madrid


  inputs = torch.tensor(prompt_tokens).unsqueeze(0)   # batch dimension


In [71]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

# Cihaz ayarı
device = "cuda" if torch.cuda.is_available() else "cpu"
k_model.to(device)

# Loss fonksiyonu
loss_fn = nn.CrossEntropyLoss()

# Optimizer ve scheduler
lr = 5e-4
optimizer = torch.optim.AdamW(k_model.parameters(), lr=lr, weight_decay=0.01)
epochs = 200
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

# Dataloader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

for epoch in range(epochs):
    k_model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        inputs = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass
        pred = k_model(inputs)
        pred = pred.view(-1, pred.size(-1))
        targets = targets.view(-1)

        # Loss
        loss = loss_fn(pred, targets)

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(k_model.parameters(), max_norm=1.0)

        # Optimizer step
        optimizer.step()

        total_loss += loss.item()

    # Scheduler step
    scheduler.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")


Epoch 1/200: 100%|██████████| 5/5 [00:03<00:00,  1.40it/s]


Epoch 1, Average Loss: 0.2613


Epoch 2/200: 100%|██████████| 5/5 [00:02<00:00,  1.69it/s]


Epoch 2, Average Loss: 0.2677


Epoch 3/200: 100%|██████████| 5/5 [00:02<00:00,  1.71it/s]


Epoch 3, Average Loss: 0.2902


Epoch 4/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 4, Average Loss: 0.2596


Epoch 5/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 5, Average Loss: 0.2800


Epoch 6/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 6, Average Loss: 0.2577


Epoch 7/200: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]


Epoch 7, Average Loss: 0.2590


Epoch 8/200: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]


Epoch 8, Average Loss: 0.2714


Epoch 9/200: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]


Epoch 9, Average Loss: 0.2687


Epoch 10/200: 100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


Epoch 10, Average Loss: 0.2741


Epoch 11/200: 100%|██████████| 5/5 [00:03<00:00,  1.26it/s]


Epoch 11, Average Loss: 0.2513


Epoch 12/200: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]


Epoch 12, Average Loss: 0.2685


Epoch 13/200: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


Epoch 13, Average Loss: 0.2774


Epoch 14/200: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 14, Average Loss: 0.2398


Epoch 15/200: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


Epoch 15, Average Loss: 0.2567


Epoch 16/200: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]


Epoch 16, Average Loss: 0.2606


Epoch 17/200: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 17, Average Loss: 0.2273


Epoch 18/200: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]


Epoch 18, Average Loss: 0.2356


Epoch 19/200: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]


Epoch 19, Average Loss: 0.2492


Epoch 20/200: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]


Epoch 20, Average Loss: 0.2515


Epoch 21/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 21, Average Loss: 0.2508


Epoch 22/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 22, Average Loss: 0.2390


Epoch 23/200: 100%|██████████| 5/5 [00:05<00:00,  1.16s/it]


Epoch 23, Average Loss: 0.2280


Epoch 24/200: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 24, Average Loss: 0.2391


Epoch 25/200: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]


Epoch 25, Average Loss: 0.2629


Epoch 26/200: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 26, Average Loss: 0.2427


Epoch 27/200: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]


Epoch 27, Average Loss: 0.2405


Epoch 28/200: 100%|██████████| 5/5 [00:03<00:00,  1.37it/s]


Epoch 28, Average Loss: 0.2522


Epoch 29/200: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]


Epoch 29, Average Loss: 0.2512


Epoch 30/200: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]


Epoch 30, Average Loss: 0.2529


Epoch 31/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 31, Average Loss: 0.2494


Epoch 32/200: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 32, Average Loss: 0.2379


Epoch 33/200: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]


Epoch 33, Average Loss: 0.2367


Epoch 34/200: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]


Epoch 34, Average Loss: 0.2263


Epoch 35/200: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]


Epoch 35, Average Loss: 0.2467


Epoch 36/200: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]


Epoch 36, Average Loss: 0.2202


Epoch 37/200: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


Epoch 37, Average Loss: 0.2248


Epoch 38/200: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


Epoch 38, Average Loss: 0.2563


Epoch 39/200: 100%|██████████| 5/5 [00:04<00:00,  1.25it/s]


Epoch 39, Average Loss: 0.2477


Epoch 40/200: 100%|██████████| 5/5 [00:04<00:00,  1.17it/s]


Epoch 40, Average Loss: 0.2525


Epoch 41/200: 100%|██████████| 5/5 [00:03<00:00,  1.30it/s]


Epoch 41, Average Loss: 0.2445


Epoch 42/200: 100%|██████████| 5/5 [00:03<00:00,  1.36it/s]


Epoch 42, Average Loss: 0.2555


Epoch 43/200: 100%|██████████| 5/5 [00:03<00:00,  1.38it/s]


Epoch 43, Average Loss: 0.2296


Epoch 44/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 44, Average Loss: 0.2429


Epoch 45/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 45, Average Loss: 0.2481


Epoch 46/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 46, Average Loss: 0.2250


Epoch 47/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 47, Average Loss: 0.2181


Epoch 48/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 48, Average Loss: 0.2581


Epoch 49/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 49, Average Loss: 0.2431


Epoch 50/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 50, Average Loss: 0.2365


Epoch 51/200: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


Epoch 51, Average Loss: 0.2702


Epoch 52/200: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]


Epoch 52, Average Loss: 0.2424


Epoch 53/200: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]


Epoch 53, Average Loss: 0.2515


Epoch 54/200: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]


Epoch 54, Average Loss: 0.2684


Epoch 55/200: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


Epoch 55, Average Loss: 0.2396


Epoch 56/200: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 56, Average Loss: 0.2268


Epoch 57/200: 100%|██████████| 5/5 [00:03<00:00,  1.29it/s]


Epoch 57, Average Loss: 0.2316


Epoch 58/200: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


Epoch 58, Average Loss: 0.2531


Epoch 59/200: 100%|██████████| 5/5 [00:04<00:00,  1.21it/s]


Epoch 59, Average Loss: 0.2445


Epoch 60/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 60, Average Loss: 0.2402


Epoch 61/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 61, Average Loss: 0.2363


Epoch 62/200: 100%|██████████| 5/5 [00:02<00:00,  1.69it/s]


Epoch 62, Average Loss: 0.2278


Epoch 63/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 63, Average Loss: 0.2235


Epoch 64/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 64, Average Loss: 0.2273


Epoch 65/200: 100%|██████████| 5/5 [00:02<00:00,  1.70it/s]


Epoch 65, Average Loss: 0.2258


Epoch 66/200: 100%|██████████| 5/5 [00:02<00:00,  1.77it/s]


Epoch 66, Average Loss: 0.2298


Epoch 67/200: 100%|██████████| 5/5 [00:02<00:00,  1.77it/s]


Epoch 67, Average Loss: 0.2326


Epoch 68/200: 100%|██████████| 5/5 [00:02<00:00,  1.76it/s]


Epoch 68, Average Loss: 0.2338


Epoch 69/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 69, Average Loss: 0.2218


Epoch 70/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 70, Average Loss: 0.2397


Epoch 71/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 71, Average Loss: 0.2257


Epoch 72/200: 100%|██████████| 5/5 [00:02<00:00,  1.72it/s]


Epoch 72, Average Loss: 0.2451


Epoch 73/200: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


Epoch 73, Average Loss: 0.2372


Epoch 74/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 74, Average Loss: 0.2513


Epoch 75/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 75, Average Loss: 0.2075


Epoch 76/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 76, Average Loss: 0.2364


Epoch 77/200: 100%|██████████| 5/5 [00:02<00:00,  1.77it/s]


Epoch 77, Average Loss: 0.2170


Epoch 78/200: 100%|██████████| 5/5 [00:02<00:00,  1.78it/s]


Epoch 78, Average Loss: 0.2131


Epoch 79/200: 100%|██████████| 5/5 [00:02<00:00,  1.79it/s]


Epoch 79, Average Loss: 0.2224


Epoch 80/200: 100%|██████████| 5/5 [00:02<00:00,  1.77it/s]


Epoch 80, Average Loss: 0.2293


Epoch 81/200: 100%|██████████| 5/5 [00:02<00:00,  1.75it/s]


Epoch 81, Average Loss: 0.1945


Epoch 82/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 82, Average Loss: 0.2515


Epoch 83/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 83, Average Loss: 0.2118


Epoch 84/200: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Epoch 84, Average Loss: 0.2234


Epoch 85/200: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s]


Epoch 85, Average Loss: 0.2145


Epoch 86/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 86, Average Loss: 0.2383


Epoch 87/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 87, Average Loss: 0.2055


Epoch 88/200: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]


Epoch 88, Average Loss: 0.2244


Epoch 89/200: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]


Epoch 89, Average Loss: 0.2337


Epoch 90/200: 100%|██████████| 5/5 [00:04<00:00,  1.16it/s]


Epoch 90, Average Loss: 0.2180


Epoch 91/200: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]


Epoch 91, Average Loss: 0.2145


Epoch 92/200: 100%|██████████| 5/5 [00:03<00:00,  1.42it/s]


Epoch 92, Average Loss: 0.2276


Epoch 93/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 93, Average Loss: 0.2337


Epoch 94/200: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Epoch 94, Average Loss: 0.2161


Epoch 95/200: 100%|██████████| 5/5 [00:03<00:00,  1.51it/s]


Epoch 95, Average Loss: 0.2214


Epoch 96/200: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]


Epoch 96, Average Loss: 0.2303


Epoch 97/200: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]


Epoch 97, Average Loss: 0.2306


Epoch 98/200: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]


Epoch 98, Average Loss: 0.2227


Epoch 99/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 99, Average Loss: 0.2146


Epoch 100/200: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]


Epoch 100, Average Loss: 0.1968


Epoch 101/200: 100%|██████████| 5/5 [00:02<00:00,  1.69it/s]


Epoch 101, Average Loss: 0.2220


Epoch 102/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 102, Average Loss: 0.2153


Epoch 103/200: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s]


Epoch 103, Average Loss: 0.2373


Epoch 104/200: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s]


Epoch 104, Average Loss: 0.2110


Epoch 105/200: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Epoch 105, Average Loss: 0.2095


Epoch 106/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 106, Average Loss: 0.2112


Epoch 107/200: 100%|██████████| 5/5 [00:02<00:00,  1.70it/s]


Epoch 107, Average Loss: 0.2240


Epoch 108/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 108, Average Loss: 0.2004


Epoch 109/200: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Epoch 109, Average Loss: 0.2111


Epoch 110/200: 100%|██████████| 5/5 [00:03<00:00,  1.44it/s]


Epoch 110, Average Loss: 0.2115


Epoch 111/200: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


Epoch 111, Average Loss: 0.2316


Epoch 112/200: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]


Epoch 112, Average Loss: 0.2268


Epoch 113/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 113, Average Loss: 0.2092


Epoch 114/200: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]


Epoch 114, Average Loss: 0.1834


Epoch 115/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 115, Average Loss: 0.2256


Epoch 116/200: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Epoch 116, Average Loss: 0.2050


Epoch 117/200: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]


Epoch 117, Average Loss: 0.2459


Epoch 118/200: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s]


Epoch 118, Average Loss: 0.2124


Epoch 119/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 119, Average Loss: 0.2142


Epoch 120/200: 100%|██████████| 5/5 [00:03<00:00,  1.47it/s]


Epoch 120, Average Loss: 0.2247


Epoch 121/200: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


Epoch 121, Average Loss: 0.1923


Epoch 122/200: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


Epoch 122, Average Loss: 0.2208


Epoch 123/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 123, Average Loss: 0.2314


Epoch 124/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 124, Average Loss: 0.2050


Epoch 125/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 125, Average Loss: 0.2140


Epoch 126/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 126, Average Loss: 0.2114


Epoch 127/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 127, Average Loss: 0.2314


Epoch 128/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 128, Average Loss: 0.2189


Epoch 129/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 129, Average Loss: 0.2450


Epoch 130/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 130, Average Loss: 0.2108


Epoch 131/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 131, Average Loss: 0.2006


Epoch 132/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 132, Average Loss: 0.2020


Epoch 133/200: 100%|██████████| 5/5 [00:02<00:00,  1.69it/s]


Epoch 133, Average Loss: 0.2134


Epoch 134/200: 100%|██████████| 5/5 [00:03<00:00,  1.44it/s]


Epoch 134, Average Loss: 0.2244


Epoch 135/200: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]


Epoch 135, Average Loss: 0.2182


Epoch 136/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 136, Average Loss: 0.2184


Epoch 137/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 137, Average Loss: 0.2126


Epoch 138/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 138, Average Loss: 0.2119


Epoch 139/200: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]


Epoch 139, Average Loss: 0.2202


Epoch 140/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 140, Average Loss: 0.2259


Epoch 141/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 141, Average Loss: 0.2309


Epoch 142/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 142, Average Loss: 0.2211


Epoch 143/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 143, Average Loss: 0.2047


Epoch 144/200: 100%|██████████| 5/5 [00:03<00:00,  1.54it/s]


Epoch 144, Average Loss: 0.2054


Epoch 145/200: 100%|██████████| 5/5 [00:02<00:00,  1.71it/s]


Epoch 145, Average Loss: 0.2175


Epoch 146/200: 100%|██████████| 5/5 [00:03<00:00,  1.59it/s]


Epoch 146, Average Loss: 0.2181


Epoch 147/200: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Epoch 147, Average Loss: 0.1953


Epoch 148/200: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]


Epoch 148, Average Loss: 0.2048


Epoch 149/200: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s]


Epoch 149, Average Loss: 0.2096


Epoch 150/200: 100%|██████████| 5/5 [00:03<00:00,  1.56it/s]


Epoch 150, Average Loss: 0.2089


Epoch 151/200: 100%|██████████| 5/5 [00:03<00:00,  1.51it/s]


Epoch 151, Average Loss: 0.2045


Epoch 152/200: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]


Epoch 152, Average Loss: 0.2121


Epoch 153/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 153, Average Loss: 0.2057


Epoch 154/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 154, Average Loss: 0.2235


Epoch 155/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 155, Average Loss: 0.1916


Epoch 156/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 156, Average Loss: 0.1977


Epoch 157/200: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]


Epoch 157, Average Loss: 0.2270


Epoch 158/200: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


Epoch 158, Average Loss: 0.2063


Epoch 159/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 159, Average Loss: 0.1976


Epoch 160/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 160, Average Loss: 0.2046


Epoch 161/200: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


Epoch 161, Average Loss: 0.1911


Epoch 162/200: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Epoch 162, Average Loss: 0.2038


Epoch 163/200: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


Epoch 163, Average Loss: 0.1998


Epoch 164/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 164, Average Loss: 0.1996


Epoch 165/200: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


Epoch 165, Average Loss: 0.2056


Epoch 166/200: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Epoch 166, Average Loss: 0.2042


Epoch 167/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 167, Average Loss: 0.2218


Epoch 168/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 168, Average Loss: 0.2222


Epoch 169/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 169, Average Loss: 0.2144


Epoch 170/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 170, Average Loss: 0.2018


Epoch 171/200: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


Epoch 171, Average Loss: 0.2104


Epoch 172/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 172, Average Loss: 0.2364


Epoch 173/200: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]


Epoch 173, Average Loss: 0.1940


Epoch 174/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 174, Average Loss: 0.2131


Epoch 175/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 175, Average Loss: 0.2172


Epoch 176/200: 100%|██████████| 5/5 [00:03<00:00,  1.40it/s]


Epoch 176, Average Loss: 0.2043


Epoch 177/200: 100%|██████████| 5/5 [00:03<00:00,  1.51it/s]


Epoch 177, Average Loss: 0.1925


Epoch 178/200: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Epoch 178, Average Loss: 0.2270


Epoch 179/200: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


Epoch 179, Average Loss: 0.2256


Epoch 180/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 180, Average Loss: 0.2103


Epoch 181/200: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Epoch 181, Average Loss: 0.1887


Epoch 182/200: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Epoch 182, Average Loss: 0.2138


Epoch 183/200: 100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


Epoch 183, Average Loss: 0.2355


Epoch 184/200: 100%|██████████| 5/5 [00:03<00:00,  1.63it/s]


Epoch 184, Average Loss: 0.2141


Epoch 185/200: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Epoch 185, Average Loss: 0.2105


Epoch 186/200: 100%|██████████| 5/5 [00:03<00:00,  1.44it/s]


Epoch 186, Average Loss: 0.2048


Epoch 187/200: 100%|██████████| 5/5 [00:03<00:00,  1.36it/s]


Epoch 187, Average Loss: 0.1962


Epoch 188/200: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s]


Epoch 188, Average Loss: 0.2202


Epoch 189/200: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


Epoch 189, Average Loss: 0.2461


Epoch 190/200: 100%|██████████| 5/5 [00:03<00:00,  1.56it/s]


Epoch 190, Average Loss: 0.2362


Epoch 191/200: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]


Epoch 191, Average Loss: 0.2067


Epoch 192/200: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Epoch 192, Average Loss: 0.2118


Epoch 193/200: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Epoch 193, Average Loss: 0.2111


Epoch 194/200: 100%|██████████| 5/5 [00:03<00:00,  1.36it/s]


Epoch 194, Average Loss: 0.2180


Epoch 195/200: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


Epoch 195, Average Loss: 0.2079


Epoch 196/200: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 196, Average Loss: 0.2022


Epoch 197/200: 100%|██████████| 5/5 [00:04<00:00,  1.16it/s]


Epoch 197, Average Loss: 0.2092


Epoch 198/200: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 198, Average Loss: 0.2028


Epoch 199/200: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 199, Average Loss: 0.2020


Epoch 200/200: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]

Epoch 200, Average Loss: 0.1915





In [24]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
k_model.to(device)

# Loss ve optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(k_model.parameters(), lr=1e-3)

# Dataloader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Eğitim döngüsü
epochs = 10000  # Denemek için az tutuyoruz
for epoch in range(epochs):
    total_loss = 0
    k_model.train()
    
    for batch in dataloader:
        inputs = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)
        
        optimizer.zero_grad()
        out = k_model(inputs)
        
        # Flatten batch & sequence
        out = out.view(-1, out.size(-1))
        targets = targets.view(-1)
        
        loss = loss_fn(out, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")


Epoch 1, Average Loss: 1.6622
Epoch 2, Average Loss: 1.6098
Epoch 3, Average Loss: 1.6292
Epoch 4, Average Loss: 1.6483
Epoch 5, Average Loss: 1.6216
Epoch 6, Average Loss: 1.6007
Epoch 7, Average Loss: 1.6193
Epoch 8, Average Loss: 1.6087
Epoch 9, Average Loss: 1.6219
Epoch 10, Average Loss: 1.5867
Epoch 11, Average Loss: 1.6110
Epoch 12, Average Loss: 1.6073
Epoch 13, Average Loss: 1.5868
Epoch 14, Average Loss: 1.5635
Epoch 15, Average Loss: 1.5650
Epoch 16, Average Loss: 1.5703
Epoch 17, Average Loss: 1.5735
Epoch 18, Average Loss: 1.5791
Epoch 19, Average Loss: 1.5753
Epoch 20, Average Loss: 1.5446
Epoch 21, Average Loss: 1.5729
Epoch 22, Average Loss: 1.5755
Epoch 23, Average Loss: 1.5573
Epoch 24, Average Loss: 1.5258
Epoch 25, Average Loss: 1.5334
Epoch 26, Average Loss: 1.5563
Epoch 27, Average Loss: 1.5907
Epoch 28, Average Loss: 1.5284
Epoch 29, Average Loss: 1.5244
Epoch 30, Average Loss: 1.5531
Epoch 31, Average Loss: 1.5223
Epoch 32, Average Loss: 1.5143
Epoch 33, Average

In [31]:
import torch

# Başlangıç prompt
prompt_tokens = k_tokenizer.encode("and the capital of the united states is")  # tensor değilse tensor yap
inputs = torch.tensor(prompt_tokens).unsqueeze(0)   # batch dimension

generated_tokens = prompt_tokens.tolist()  # çıktıları saklamak için liste

max_new_tokens = 5  # kaç token üretmek istediğin

for _ in range(max_new_tokens):
    out = k_model(inputs)                     # (1, seq_len, vocab_size)
    last_token_logits = out[:, -1, :]         # son token için logits
    probs = torch.softmax(last_token_logits, dim=-1)
    next_token = torch.argmax(probs, dim=-1)  # tahmin edilen token id

    # Token’ı listeye ekle
    generated_tokens.append(next_token.item())

    # Input’u güncelle
    inputs = torch.tensor(generated_tokens).unsqueeze(0)
    

# Sonuçları decode et
generated_text = k_tokenizer.decode(generated_tokens)
print("Generated text:", generated_text)


Generated text: and the capital of the united states is not berlin.


  inputs = torch.tensor(prompt_tokens).unsqueeze(0)   # batch dimension
