<a href="https://colab.research.google.com/github/NataHsH/GenerativeAI-Project/blob/master/LanguageModel_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
# ==========================
!pip install transformers datasets wandb accelerate


In [None]:
!pip install huggingface_hub


In [None]:
# Login to Hugging Face and Weights & Biases
# ==========================
from huggingface_hub import notebook_login
import wandb

# Login to Hugging Face
# Authenticates this session with the Hub; the create_repo / upload_folder
# calls later in the notebook need write access to the target repository.
notebook_login()

# Login to Weights & Biases
# Authenticates the wandb client used by wandb.init / wandb.log during training.
wandb.login()

In [None]:
# Import necessary libraries
# ==========================
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
import wandb
from huggingface_hub import HfApi, create_repo, upload_folder


In [None]:
# Define hyperparameters
# ==========================
# Number of sequences per optimisation step.
batch_size = 64
# Number of tokens per training sequence.
block_size = 128
# Width of the token embeddings.
embedding_dim = 256
# Attention heads per layer and number of stacked layers.
num_heads = 4
num_layers = 4
# Dropout probability.
dropout = 0.1
# Learning rate handed to AdamW.
learning_rate = 3e-4
# NOTE(review): the training cell assigns num_epochs = 5 right before its loop,
# so that later value is the one actually used for training and in the model card.
num_epochs = 3

# Local model name; not referenced by the other cells visible in this notebook.
model_name = "tiny_shakespeare_transformer"
# Hugging Face Hub repository that the saved folder is uploaded to.
repository_id = "NataliaH/tiny_shakespeare_transformer"

In [None]:
# Load dataset
# ==========================
# Fetch the training split of the "tiny_shakespeare" dataset.
dataset = load_dataset("tiny_shakespeare", split="train")
# Raw text column of that split.
# NOTE(review): presumably a single row holding the whole corpus as one string —
# confirm before relying on per-row processing.
text = dataset['text']

In [None]:
# Tokenizer
# ==========================
# Load the pretrained GPT-2 byte-level BPE tokenizer (nothing is trained here)
# and register a dedicated padding token.
tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Encode the dataset.
# No truncation here: ShakespeareDataset slices the token stream into
# block_size windows itself, so truncating each row to block_size tokens
# would throw away everything after the first window.
# padding=True keeps the result rectangular should `text` hold several rows.
inputs = tokenizer(text, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"]


In [None]:
# Prepare the dataset for training
# ==========================
class ShakespeareDataset(torch.utils.data.Dataset):
    """Non-overlapping next-token windows over a 2-D tensor of token ids.

    Item ``idx`` is the pair ``(x, y)`` where ``x`` covers columns
    ``[idx * block_size, (idx + 1) * block_size)`` of ``input_ids`` and ``y``
    is the same window shifted one column to the right.

    Args:
        input_ids: 2-D tensor of token ids; windows are cut along dim 1.
        block_size: number of tokens per window.
        ignore_index: value written into target positions that have no
            following token (only possible in the last window). The default
            matches the default ``ignore_index`` of ``nn.CrossEntropyLoss``,
            so those positions do not contribute to the loss.
    """

    def __init__(self, input_ids, block_size, ignore_index=-100):
        self.input_ids = input_ids
        self.block_size = block_size
        self.ignore_index = ignore_index

    def __len__(self):
        """Number of complete ``block_size`` windows along dim 1."""
        return self.input_ids.size(1) // self.block_size

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``; both have the same shape.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"window index {idx} out of range for {len(self)} windows")

        start_idx = idx * self.block_size
        end_idx = start_idx + self.block_size
        x = self.input_ids[:, start_idx:end_idx]
        y = self.input_ids[:, start_idx + 1:end_idx + 1]

        # When the last window ends exactly at the end of the data, the shifted
        # slice is one token short; fill the gap so x and y stay aligned.
        missing = x.size(1) - y.size(1)
        if missing > 0:
            filler = y.new_full((y.size(0), missing), self.ignore_index)
            y = torch.cat((y, filler), dim=1)

        # squeeze(0) drops the row dimension when input_ids has a single row.
        return x.squeeze(0), y.squeeze(0)

# Wrap the encoded corpus in the windowing dataset and serve it in shuffled mini-batches.
dataset = ShakespeareDataset(block_size=block_size, input_ids=input_ids)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Prepare the dataset
def group_texts(tokenized_text, block_size):
    """Cut a 1-D token tensor into aligned (input, next-token target) blocks.

    Row ``i`` of the inputs holds tokens ``[i * block_size, (i + 1) * block_size)``
    and row ``i`` of the targets holds the same span shifted one token to the
    right, so ``target[i, j]`` is the token that follows ``input[i, j]``.
    One token beyond the last input is needed for the final target, so the
    number of blocks is ``(len(tokenized_text) - 1) // block_size``.

    Args:
        tokenized_text: 1-D tensor of token ids.
        block_size: number of tokens per block.

    Returns:
        ``(input_ids, target_ids)``, both of shape ``(num_blocks, block_size)``.
        ``num_blocks`` is 0 when the text is too short for a single block.
    """
    num_blocks = max(len(tokenized_text) - 1, 0) // block_size
    usable = num_blocks * block_size

    # Explicit row count (instead of -1) so that an empty result is well-defined.
    input_ids = tokenized_text[:usable].view(num_blocks, block_size)
    # clone() gives the targets their own storage, independent of the inputs.
    target_ids = tokenized_text[1:usable + 1].clone().view(num_blocks, block_size)
    return input_ids, target_ids

# Tokenize the entire dataset
# squeeze(0) drops the row dimension of the encoder output.
# NOTE(review): assumes `text` encodes to a single row — confirm.
tokenized_text = tokenizer(text, return_tensors='pt', add_special_tokens=False).input_ids.squeeze(0)

# Group into blocks.
# block_size comes from the hyperparameter cell; it is deliberately not
# re-assigned here so that a changed setting is not silently overridden.
x, y = group_texts(tokenized_text, block_size)


In [None]:
from transformers import AutoTokenizer
import torch
import json

# Directory that holds the saved weights (model.pth) and the tokenizer files
model_save_path = "./tiny_shakespeare_transformer"
# The tokenizer files live in the same directory as the weights
tokenizer_save_path = model_save_path

# Define the model architecture
class TransformerModel(nn.Module):
    """Small autoregressive Transformer language model over token ids.

    Token embeddings are fed through a stack of ``nn.TransformerDecoderLayer``
    blocks and projected back to vocabulary logits. The decoder's
    cross-attention input ("memory") is an all-zero tensor, so only the masked
    self-attention path carries information from the input.

    There is no positional embedding; the only order information comes from
    the causal attention mask.

    Args:
        vocab_size: number of rows in the embedding / size of the output layer.
        embedding_dim: model width (``d_model``).
        num_heads: attention heads per layer; must divide ``embedding_dim``.
        num_layers: number of decoder layers.
        block_size: training sequence length; stored for reference only, the
            forward pass does not restrict the sequence length.
        dropout: dropout probability inside the decoder layers.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, block_size, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=embedding_dim * 4,
            dropout=dropout,
            activation='gelu',
            # Inputs arrive as (batch, seq); without batch_first the layers
            # would treat the batch axis as the sequence axis and attend
            # across unrelated samples.
            batch_first=True,
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        embedded = self.token_embedding(x)

        # Additive causal mask: -inf above the diagonal stops position i from
        # attending to positions j > i, which next-token training requires.
        seq_len = embedded.size(1)
        causal_mask = torch.triu(
            torch.full(
                (seq_len, seq_len),
                float('-inf'),
                device=embedded.device,
                dtype=embedded.dtype,
            ),
            diagonal=1,
        )

        memory = torch.zeros_like(embedded)
        out = self.transformer_decoder(embedded, memory, tgt_mask=causal_mask)
        logits = self.fc_out(out)
        return logits

# Build the tokenizer and the model, then restore saved weights if there are any
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Or adjust to your model type

# If an earlier run left a flat token -> id dump in tokenizer.json, add any
# tokens from it that the base tokenizer does not know yet. On a fresh run the
# file does not exist and the base vocabulary is used as is.
try:
    with open(f"{tokenizer_save_path}/tokenizer.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
except FileNotFoundError:
    vocab = {}

# Only a flat mapping of token -> integer id is treated as a vocabulary dump;
# any other JSON layout found under that name is ignored.
if isinstance(vocab, dict) and vocab and all(isinstance(token_id, int) for token_id in vocab.values()):
    known_tokens = tokenizer.get_vocab()
    extra_tokens = [token for token in vocab if token not in known_tokens]
    if extra_tokens:
        tokenizer.add_tokens(extra_tokens)  # Adding custom vocab

# Add pad token if necessary
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Size the embedding table from the tokenizer so every token id has a row, and
# take the architecture from the hyperparameter cell so that the model, the
# training run and the model card all describe the same network.
model = TransformerModel(
    vocab_size=len(tokenizer),
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    block_size=block_size,
    dropout=dropout,
)
assert len(tokenizer) <= model.token_embedding.num_embeddings, "tokenizer has ids the model cannot embed"

# Restore weights from a previous run when a checkpoint exists; otherwise keep
# the fresh initialisation. map_location lets a GPU checkpoint load on a
# CPU-only machine. A checkpoint saved with a different architecture raises
# here rather than being loaded partially.
try:
    state_dict = torch.load(f"{model_save_path}/model.pth", map_location=device)
except FileNotFoundError:
    state_dict = None
if state_dict is not None:
    model.load_state_dict(state_dict)

model.to(device)
model.eval()


In [None]:
# Loss, optimizer and experiment tracking
# ==========================
# Token-level cross-entropy over the vocabulary logits.
criterion = nn.CrossEntropyLoss()
# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# Start the Weights & Biases run that the training loop logs to.
wandb.init(
    name="tiny_shakespeare_transformer",
    project="LanguageModel_Project",
)


In [None]:
import time
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Move the data to the same device as the model (done once; batches are slices of these tensors)
x_train = x_train.to(device)
y_train = y_train.to(device)
x_val = x_val.to(device)
y_val = y_val.to(device)

# Training loop
# ==========================
# Overrides the value from the hyperparameter cell; this is the epoch count actually used.
num_epochs = 5

for epoch in range(num_epochs):
    start_time = time.time()  # Track the time taken for the epoch

    # The validation phase below switches to eval mode, so training mode has
    # to be re-enabled at the start of every epoch; otherwise dropout would be
    # active during the first epoch only.
    model.train()
    total_train_loss = 0
    num_batches_train = 0

    # Loop through the dataset manually in batches
    for i in range(0, x_train.size(0), batch_size):
        xb = x_train[i:i+batch_size]
        yb = y_train[i:i+batch_size]

        # Skip the last batch if it's smaller than batch_size
        if xb.size(0) != batch_size or yb.size(0) != batch_size:
            continue

        optimizer.zero_grad()               # Reset gradients
        logits = model(xb)                   # Forward pass
        loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))  # Compute loss
        loss.backward()                      # Backward pass
        optimizer.step()                     # Update weights

        total_train_loss += loss.item()
        num_batches_train += 1

    # Validation step
    model.eval()  # Switch the model to evaluation mode
    total_val_loss = 0
    num_batches_val = 0
    with torch.no_grad():  # No need to compute gradients during validation
        for i in range(0, x_val.size(0), batch_size):
            xb_val = x_val[i:i+batch_size]
            yb_val = y_val[i:i+batch_size]

            # Skip the last batch if it's smaller than batch_size
            if xb_val.size(0) != batch_size or yb_val.size(0) != batch_size:
                continue

            logits_val = model(xb_val)  # Forward pass
            val_loss = criterion(logits_val.view(-1, logits_val.size(-1)), yb_val.view(-1))  # Compute loss
            total_val_loss += val_loss.item()
            num_batches_val += 1

    # Average over the batches that were actually processed (0 when there were none)
    avg_train_loss = total_train_loss / num_batches_train if num_batches_train > 0 else 0
    avg_val_loss = total_val_loss / num_batches_val if num_batches_val > 0 else 0

    # Get the current learning rate
    current_lr = optimizer.param_groups[0]['lr']

    # Calculate the time taken for the epoch
    epoch_time = time.time() - start_time

    # Log all metrics
    wandb.log({
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "learning_rate": current_lr,
        "epoch_time": epoch_time
    }, step=epoch)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}, Time: {epoch_time:.2f}s")


In [None]:
import os
import torch
import json
from transformers import AutoTokenizer

# Define the save path
model_save_path = "./tiny_shakespeare_transformer"
tokenizer_save_path = model_save_path

# Create directories if they don't exist
os.makedirs(model_save_path, exist_ok=True)

# Save model state dict
torch.save(model.state_dict(), f"{model_save_path}/model.pth")

# Plain token -> id dump of the vocabulary (the model-setup cell reads this file name).
with open(f"{tokenizer_save_path}/tokenizer.json", "w") as f:
    json.dump(tokenizer.get_vocab(), f)

# Save the tokenizer itself last. A fast tokenizer stores its own serialisation
# under the name tokenizer.json; writing the dump afterwards would overwrite
# that file and leave the folder (and the Hub upload) with a tokenizer.json
# that the transformers loaders cannot parse. With this order the tokenizer's
# own file wins whenever it produces one, and the flat dump stays in place
# otherwise.
tokenizer.save_pretrained(tokenizer_save_path)


In [None]:
# Publish the saved folder on the Hugging Face Hub
# ==========================
api = HfApi()
# exist_ok=True makes the call a no-op when the repository is already there.
create_repo(
    exist_ok=True,
    repo_id=repository_id,
)
# Upload every file currently in the save directory.
upload_folder(
    repo_id=repository_id,
    folder_path=model_save_path,
)

In [None]:
import os
import textwrap

# Define the path to save model and README
model_save_path = "./tiny_shakespeare_transformer"

# Read the architecture numbers from the trained model itself, so the card
# always describes the network whose weights are stored in model.pth.
card_vocab_size = model.token_embedding.num_embeddings
card_embedding_dim = model.token_embedding.embedding_dim
card_num_layers = len(model.transformer_decoder.layers)
card_num_heads = model.transformer_decoder.layers[0].self_attn.num_heads

# Create Model Card
# ==========================
# The usage section shows how to load the raw state dict: the folder contains
# model.pth for a custom nn.Module, which AutoModelForCausalLM cannot load.
model_card = textwrap.dedent(f"""\
    ---
    license: mit
    tags:
    - text-generation
    - transformer
    - tiny-shakespeare
    - decoder-only
    model-index:
    - name: tiny_shakespeare_transformer
      results: []
    ---

    # tiny_shakespeare_transformer

    A small Transformer Decoder model trained from scratch on the Tiny Shakespeare dataset.

    ## Training details
    - Dataset: Tiny Shakespeare
    - Epochs: {num_epochs}
    - Learning Rate: {learning_rate}
    - Batch Size: {batch_size}
    - Block Size: {block_size}
    - Optimizer: AdamW
    - Loss Function: CrossEntropyLoss
    - Dropout Rate: {dropout}
    - Vocabulary Size: {card_vocab_size}
    - Embedding Dimension: {card_embedding_dim}
    - Number of Layers: {card_num_layers}
    - Number of Attention Heads: {card_num_heads}

    ## Usage
    The repository stores a plain PyTorch `state_dict` (`model.pth`) for a custom
    `nn.Module`, so it cannot be loaded with `AutoModelForCausalLM`.
    Copy the `TransformerModel` class from the training notebook, then:

    ```python
    import torch
    from huggingface_hub import hf_hub_download
    from transformers import AutoTokenizer

    # Download the weights and rebuild the network they belong to
    weights_path = hf_hub_download(repo_id="{repository_id}", filename="model.pth")
    model = TransformerModel(
        vocab_size={card_vocab_size},
        embedding_dim={card_embedding_dim},
        num_heads={card_num_heads},
        num_layers={card_num_layers},
        block_size={block_size},
        dropout={dropout},
    )
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    model.eval()

    # The model was trained on GPT-2 token ids
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(input_ids)
    next_token_id = int(logits[0, -1].argmax())
    print(tokenizer.decode([next_token_id]))
    ```

    ## Model Architecture
    This model is a Transformer Decoder-based architecture, optimized for text generation.
    It was trained on the Tiny Shakespeare dataset to generate Shakespeare-like text.

    ## Training Process
    - Training was performed for {num_epochs} epochs.
    - The model uses AdamW optimizer with a learning rate of {learning_rate}.
    - Dropout rate during training was set to {dropout} to reduce overfitting.

    ## License
    This model is released under the MIT License.
""")

# Save the model card to README.md
os.makedirs(model_save_path, exist_ok=True)

readme_path = f"{model_save_path}/README.md"
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(model_card)

# Print the contents of the README to verify (read back from disk)
with open(readme_path, "r", encoding="utf-8") as f:
    print(f.read())


In [None]:
from huggingface_hub import upload_folder

# Settings
model_save_path = "./tiny_shakespeare_transformer"
repo_id = "NataliaH/tiny_shakespeare_transformer"  # Name of the repository, which already exists

# Upload the model folder to the repository
upload_folder(
    repo_id=repo_id,
    folder_path=model_save_path,
    commit_message="Initial commit with model and tokenizer",
)


In [None]:
import torch

# Function for text generation manually
def generate_text_manually(model, tokenizer, prompt, max_length=300):
    """Greedily extend ``prompt`` with up to ``max_length`` new tokens.

    At every step the whole sequence generated so far is fed to ``model`` and
    the highest-scoring token at the last position is appended. Generation
    stops early once the tokenizer's end-of-sequence token has been appended.

    The model is run in eval mode with gradient tracking disabled; its previous
    train/eval mode is restored before returning. The prompt is placed on the
    device of the model's parameters.

    Args:
        model: callable module mapping ids of shape (1, seq) to logits of
            shape (1, seq, vocab).
        tokenizer: tokenizer providing ``__call__``, ``decode`` and ``eos_token_id``.
        prompt: text to continue (a single string; batches are not supported).
        max_length: maximum number of tokens to generate.

    Returns:
        The decoded prompt plus continuation, with special tokens removed.
    """
    # Tokenize the input prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    generated_ids = input_ids

    was_training = model.training
    model.eval()  # disable dropout so greedy decoding is deterministic
    try:
        with torch.no_grad():  # inference only: do not build an autograd graph
            # Text generation loop
            for _ in range(max_length):
                # Pass the generated ids through the model
                output = model(generated_ids)  # Model output is a tensor of logits
                logits = output[:, -1, :]  # Logits for the last position only

                # Get the most probable next token
                next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1)

                # Append the new token to the generated tokens
                generated_ids = torch.cat((generated_ids, next_token_id), dim=-1)

                # Stop if the end of sequence token is generated
                if next_token_id.item() == tokenizer.eos_token_id:
                    break
    finally:
        model.train(was_training)

    # Decode the generated tokens back to text, skip special tokens and clean the output
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    generated_text = generated_text.replace('Ġ', ' ')  # Replace Ġ with spaces
    return generated_text

# Demo: continue a short prompt with the trained model and show the result
prompt = "Once upon a time"
generated_text = generate_text_manually(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=300,
)

print(generated_text)
