In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import wandb

# Hyperparameters and run settings
vocab_size = 50257  # GPT2 vocab
d_model = 64
nhead = 4
dim_feedforward = 128
num_layers = 1
max_seq_len = 64
batch_size = 4
seq_len = 8
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Weights & Biases
# SECURITY: never commit an API key. Called without `key`, wandb.login() uses
# the WANDB_API_KEY environment variable / stored credentials, or prompts.
# The key that was previously embedded here must be revoked.
wandb.login()
wandb.init(project="transformer_decoder_only_final")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

import requests

# Download the training corpus (tiny Shakespeare) into memory
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url, timeout=30)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly rather than training on an HTTP error page
text = response.text

# Sanity check: corpus length and a short preview
print(f"Длина текста: {len(text)} символов")
print(text[:100])  # first 100 characters

# Encode the whole corpus into one flat list of token ids
tokens = tokenizer.encode(text)

# Dataset
##
class TextDataset(Dataset):
    """Fixed-length windows over a token sequence for next-token training.

    Each item is a 1-D tensor holding ``seq_len + 1`` consecutive token ids
    (inputs plus the shifted targets). Windows start every ``seq_len``
    tokens, so neighbouring items share exactly one boundary token.

    Args:
        token_ids: Sequence of integer token ids.
        seq_len: Number of input positions per window; also the stride.
    """

    def __init__(self, token_ids, seq_len):
        window = seq_len + 1
        # The last start index that still yields a full window is
        # len(token_ids) - window; the exclusive range stop must therefore be
        # one past it, otherwise that final window is silently dropped.
        last_start = len(token_ids) - window
        self.data = [
            torch.tensor(token_ids[start:start + window])
            for start in range(0, last_start + 1, seq_len)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Hold out the tail of the token stream for validation so that val_loss is
# measured on text the model was not trained on.
split_idx = int(len(tokens) * 0.9)

# Window length follows max_seq_len so inputs never exceed the model's
# positional-embedding table.
dataset = TextDataset(tokens[:split_idx], seq_len=max_seq_len)
val_dataset = TextDataset(tokens[split_idx:], seq_len=max_seq_len)
if len(val_dataset) == 0:
    raise ValueError("Corpus is too short to build a validation split")

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
print(f"Num batches per epoch: {len(train_loader)}")

# Model
class DecoderOnlyModel(nn.Module):
    """Small causal language model built on ``nn.TransformerDecoder``.

    Token and learned positional embeddings are summed, passed through the
    decoder stack with a causal self-attention mask, and projected to
    vocabulary logits. The decoder's cross-attention input ("memory") is an
    all-zero tensor, so the model conditions only on the token sequence.

    Args:
        vocab_size: Number of token ids (embedding rows / output logits).
        d_model: Embedding and hidden width.
        nhead: Number of attention heads.
        num_layers: Number of decoder layers.
        dim_feedforward: Width of each layer's feed-forward sublayer.
        max_seq_len: Number of positions in the positional-embedding table.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, max_seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_seq_len, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, seq, vocab) for token ids ``x`` of shape (batch, seq).

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_len = x.size(1)
        max_positions = self.pos_embedding.num_embeddings
        if seq_len > max_positions:
            # Without this check the positional lookup fails with an opaque
            # out-of-range index error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum of {max_positions} positions"
            )

        pos = torch.arange(seq_len, device=x.device).unsqueeze(0).expand_as(x)
        # The decoder is built with the default layout, so move to (seq, batch, d_model).
        tgt = (self.embedding(x) + self.pos_embedding(pos)).transpose(0, 1)

        # Causal mask: each position may attend only to itself and earlier positions.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)

        # Zero "memory": there is no encoder output to attend to.
        memory = torch.zeros_like(tgt)
        hidden = self.decoder(tgt, memory, tgt_mask=tgt_mask)
        return self.output(hidden.transpose(0, 1))

# Build the network from the module-level settings and move it to the selected device
model = DecoderOnlyModel(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
    max_seq_len=max_seq_len,
).to(device)

# Training. validate() evaluates on the validation set; train() calls it at the end of every epoch.
def validate(model, val_loader, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    Each batch is split into inputs (all but the last token) and targets
    (all but the first token). The model is switched to eval mode and left
    there; gradients are not tracked.

    Args:
        model: Module mapping (batch, seq) token ids to (batch, seq, vocab) logits.
        val_loader: Sized iterable of 2-D token-id tensors.
        loss_fn: Callable taking (flattened logits, flattened targets).

    Returns:
        Average of the per-batch losses, or NaN when ``val_loader`` is empty
        (instead of raising ZeroDivisionError).
    """
    model.eval()
    num_batches = len(val_loader)
    if num_batches == 0:
        return float("nan")

    # Follow the model's own device rather than a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x = batch[:, :-1].to(target_device)
            y = batch[:, 1:].to(target_device)
            logits = model(x)
            # Take the vocabulary size from the logits so the function works
            # for any model, not just one matching a global constant.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            total_loss += loss.item()
    return total_loss / num_batches

def train(model, train_loader, val_loader, epochs, lr=1e-4, log_evary=10):
    """Train ``model`` for next-token prediction and log metrics to wandb.

    Every batch is split into inputs (all but the last token) and targets
    (all but the first token). After each epoch the model is evaluated with
    ``validate`` and the epoch averages are logged and printed.

    Args:
        model: Module mapping (batch, seq) token ids to (batch, seq, vocab) logits.
        train_loader: Sized iterable of 2-D token-id tensors used for optimisation.
        val_loader: Sized iterable of 2-D token-id tensors passed to ``validate``.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        log_evary: Log the step loss every this many optimiser steps;
            0 disables per-step logging. (The parameter name is kept as-is
            for compatibility with existing keyword callers.)
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    # Follow the model's own device rather than a module-level global.
    target_device = next(model.parameters()).device
    # Guard the epoch average against an empty loader.
    num_batches = max(len(train_loader), 1)
    global_step = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            x = batch[:, :-1].to(target_device)
            y = batch[:, 1:].to(target_device)
            logits = model(x)
            # Vocabulary size comes from the logits, not from a global constant.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            total_loss += step_loss
            global_step += 1

            # Log only every `log_evary` steps, not on every step.
            if log_evary and global_step % log_evary == 0:
                wandb.log({"step_train_loss": step_loss, "step": global_step})

        # Validation after every epoch
        val_loss = validate(model, val_loader, loss_fn)
        avg_train_loss = total_loss / num_batches
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": val_loss,
            "step": global_step,  # keeps the step axis consistent with per-step logs
        })

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {val_loss:.4f}")

# Run the full training loop with the default learning rate and logging interval
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=epochs,
)

##
# ====== Text generation function ======
def generate_text(model, tokenizer, start_text, max_len=128, temperature=1.0):
    """Autoregressively sample a continuation of ``start_text``.

    The full token sequence is kept for the final decode; only the most
    recent ``model.pos_embedding.num_embeddings`` tokens are fed to the model
    at each step, so long generations no longer lose the prompt.

    Args:
        model: Callable module returning (batch, seq, vocab) logits that
            exposes ``pos_embedding.num_embeddings``.
        tokenizer: Object with ``encode(text, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.
        start_text: Prompt; must encode to at least one token.
        max_len: Number of new tokens to generate.
        temperature: Softmax temperature for sampling; values <= 0 select
            the most likely token (greedy decoding) instead of dividing by zero.

    Returns:
        The decoded prompt followed by the generated tokens.

    Raises:
        ValueError: If ``start_text`` encodes to an empty sequence.
    """
    model.eval()
    # Follow the model's own device rather than a module-level global.
    target_device = next(model.parameters()).device
    generated = tokenizer.encode(start_text, return_tensors="pt").to(target_device)
    if generated.size(1) == 0:
        raise ValueError("start_text must encode to at least one token")

    max_positions = model.pos_embedding.num_embeddings
    # Inference only: skip autograd bookkeeping on every step.
    with torch.no_grad():
        for _ in range(max_len):
            # Sliding context window; `generated` itself is never truncated.
            context = generated[:, -max_positions:]
            next_token_logits = model(context)[:, -1, :]

            if temperature > 0:
                probs = torch.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)

            generated = torch.cat([generated, next_token], dim=1)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# ====== Generation and logging to wandb ======

# start_text = "Shall I compare thee to a summer's day?"
# generated = generate_text(model, tokenizer, start_text, max_len=100)
# Sample a continuation of a fixed prompt from the trained model
start_text = "To be or not to be"
generated_text = generate_text(
    model,
    tokenizer,
    start_text,
    max_len=50,
    temperature=0.8,
)

print("\n===== Generated Text =====", generated_text, sep="\n")

# Record the generated sample in the wandb run
wandb.log({"generated_text": generated_text})

# Report the device in use and, on GPU, the CUDA memory statistics
print(device)
if not torch.cuda.is_available():
    print("CUDA недоступна — используется CPU")
else:
    print(torch.cuda.memory_summary())

##print(device)
##print(torch.cuda.memory_summary())
#
from huggingface_hub import login
import os

# Drop any token inherited from the environment
# (presumably so the token entered below is the one that gets used — confirm).
os.environ.pop("HF_TOKEN", None)
os.environ.pop("HUGGING_FACE_HUB_TOKEN", None)

# SECURITY: ask for the access token at run time instead of pasting it into
# the source, where it would be committed and shared with the notebook.
from getpass import getpass

token = getpass("Hugging Face access token: ")
login(token=token)


#from transformers import AutoModel, AutoTokenizer

#model_name = "bert-base-uncased"

# Загрузка модели и токенизатора
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
#model = AutoModel.from_pretrained(model_name, use_auth_token=True)

######   Save the model and tokenizer locally
from transformers import AutoTokenizer
####from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub import HfApi, create_repo, upload_folder
import torch
import os

import json

model_name = "decoder-only-transformer-small"

# Create the output folder for the model artifacts
save_dir = f"./{model_name}"
os.makedirs(save_dir, exist_ok=True)

# Save the model weights
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))

# Save the configuration. It is generated from the settings used to build the
# model so it cannot drift from them, and it includes the feed-forward width,
# which is needed to rebuild the architecture.
config = {
    "model_type": "decoder-only",
    "vocab_size": vocab_size,
    "hidden_size": d_model,
    "num_attention_heads": nhead,
    "num_hidden_layers": num_layers,
    "max_position_embeddings": max_seq_len,
    "intermediate_size": dim_feedforward,
}
with open(os.path.join(save_dir, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

# Save the tokenizer files next to the weights
tokenizer.save_pretrained(save_dir)

###  Upload the model to Hugging Face
from huggingface_hub import create_repo, upload_folder

repo_id = "hannanechiporenko25/decoder-only-transformer-small"

api = HfApi()

# Create the repository if needed. exist_ok=True makes an already-existing
# repo a no-op, while genuine failures (bad token, network, permissions) now
# raise instead of being printed and ignored.
create_repo(repo_id, private=False, exist_ok=True)

# Upload the folder with the saved model
upload_folder(
    folder_path=save_dir,
    repo_id=repo_id,
    commit_message="Upload small decoder-only model"
)

print(f"Model successfully uploaded to Hugging Face at {repo_id}")
####



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mannadesignerart22[0m ([33mannadesignerart22-uni[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Длина текста: 1115394 символов
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


Num batches per epoch: 1321
Epoch 1: Train Loss = 7.3900, Val Loss = 6.2424
Epoch 2: Train Loss = 6.1255, Val Loss = 5.9801
Epoch 3: Train Loss = 5.9379, Val Loss = 5.8359
Epoch 4: Train Loss = 5.8017, Val Loss = 5.6958
Epoch 5: Train Loss = 5.6609, Val Loss = 5.5453

===== Generated Text =====
To be or not to be theAlthough
Is'd queen:
Why my I an the fore
And, my lord be march.

And, my tell tell well:

ANG men tears I am me, my lord,
Which highest!
ANG

cuda
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  67721 KiB | 300655 KiB |   2388 GiB |   2387 GiB |
|       from large pool |  66898 KiB | 297769 KiB |   2256 GiB |  

pytorch_model.bin:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Model successfully uploaded to Hugging Face at hannanechiporenko25/decoder-only-transformer-small
