In [None]:
import torch

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

#dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")


In [None]:
from transformers import AutoTokenizer

# Tokenizer loaded by name ("gpt2") through AutoTokenizer; the cells below use it to
# inspect how text is split into tokens.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Sample passage that the following cells feed to the tokenizer.
wiki_text = """Initially only available in English, editions of Wikipedia in more than 300 other languages
have been developed. The English Wikipedia, with its over 6.9 million articles, is the largest of the editions,
which together comprise more than 64 million articles and attract more than 1.5 billion unique device visits and
13 million edits per month (about 5 edits per second on average) as of April 2024.[W 1] As of November 2024, over
25% of Wikipedia's traffic was from the United States, followed by Japan at 6.2%, the United Kingdom at 5.6%, Russia
 at 5.0%, Germany at 4.8%, and the remaining 53.3% split among other countries.[8]"""

In [None]:
# Print the token strings produced by tokenizing the sample passage.
print(tokenizer(wiki_text).tokens())

In [None]:
# Print what the backend tokenizer's pre-tokenizer alone returns for the same passage.
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(wiki_text))

In [None]:
# Vocabulary items sorted in descending order of the second element of each pair
# (presumably the token id -- confirm against the tokenizer).
# NOTE(review): the next cell starts with this exact statement, so this cell is redundant.
tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Vocabulary items ordered from the largest second element (the sort key) downwards.
tokens = sorted(tokenizer.vocab.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the first element of each of the 12 leading pairs.
top_tokens = [pair[0] for pair in tokens[:12]]

# Let the tokenizer join those tokens into one string, then show it.
combined_string = tokenizer.convert_tokens_to_string(top_tokens)
print(combined_string)

In [None]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# Forward table returned by `bytes_to_unicode()`.
byte_to_unicode_map = bytes_to_unicode()

# Inverted table: every value of the forward table becomes a key that points back
# to its original key.
unicode_to_byte_map = {char: byte for byte, char in byte_to_unicode_map.items()}

# The base vocabulary is the list of keys of the inverted table.
base_vocab = list(unicode_to_byte_map)

print(f"size of our base vocabulary: {len(base_vocab)}")
print(f'first element: {base_vocab[0]}, last element: {base_vocab[-1]}')

In [None]:
import torch
import torch.nn as nn
import math

In [None]:
class PositionalEncoding(nn.Module):
  """Adds a fixed sinusoidal position table to a batch of embeddings.

  For position ``pos`` and pair index ``i`` in ``range(d_model // 2)`` the table holds
  ``sin(pos / n**(2*i/d_model))`` in column ``2*i`` and the matching cosine in column
  ``2*i + 1``. When ``d_model`` is odd the last column stays zero.

  Args:
    d_model: width of each embedding vector.
    seq_len: number of positions to precompute (the longest supported input).
    n: base of the geometric frequency progression.
  """

  def __init__(self, d_model, seq_len, n=10000):
    super(PositionalEncoding, self).__init__()
    half = d_model // 2

    # Build the whole table with tensor ops instead of a Python double loop.
    # float64 mirrors the precision of the former `math.sin`/`math.cos` computation.
    positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1)
    exponents = 2 * torch.arange(half, dtype=torch.float64) / d_model
    wave_input = positions / (n ** exponents)

    encoding = torch.zeros((seq_len, d_model))
    encoding[:, 0:2 * half:2] = torch.sin(wave_input).to(encoding.dtype)
    encoding[:, 1:2 * half:2] = torch.cos(wave_input).to(encoding.dtype)

    # A buffer follows the module through `.to(device)`; persistent=False keeps it out
    # of `state_dict()`, so the set of saved keys is the same as before.
    self.register_buffer("encoding", encoding, persistent=False)

  def forward(self, x):
    """Return ``x`` plus the encodings of its first ``seq_len`` positions.

    Args:
      x: tensor unpacked as ``(batch_size, seq_len, d_model)``.

    Raises:
      ValueError: if ``x`` is longer than the number of precomputed positions.
    """
    batch_size, seq_len, d_model = x.size()
    if seq_len > self.encoding.size(0):
      raise ValueError(
        f"sequence length {seq_len} exceeds the {self.encoding.size(0)} precomputed positions"
      )

    # Dynamically slice positional encoding to match input sequence length
    pos_encoding = self.encoding[:seq_len, :].unsqueeze(0)
    # No-op once the module lives on x's device; kept for modules that were not moved.
    pos_encoding = pos_encoding.to(x.device)

    return x + pos_encoding


In [None]:
class Attention(nn.Module):
  """Single-head scaled dot-product self-attention.

  Args:
    d_model: width of the input vectors.
    d_k: width of the query and key projections.
    d_v: width of the value projection (and of the output).
  """

  def __init__(self, d_model, d_k, d_v):
    super(Attention, self).__init__()
    self.W_Q = nn.Linear(d_model, d_k)
    self.W_K = nn.Linear(d_model, d_k)
    self.W_V = nn.Linear(d_model, d_v)
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, X, mask=None):
    """Attend over ``X`` and return ``(attention_output, attention_weights)``.

    Args:
      X: input whose last dimension is ``d_model``.
      mask: optional tensor broadcastable to the score matrix; positions where
        ``mask == 0`` get no attention weight. A query row that is masked
        everywhere yields NaN weights, because softmax sees only ``-inf``.
    """
    Q = self.W_Q(X)
    K = self.W_K(X)
    V = self.W_V(X)

    # QK^T / sqrt(d_k)
    scaled_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))

    # The mask has to be applied to the scores *before* softmax: -inf scores turn into
    # exactly zero weight and the remaining weights still sum to one.
    if mask is not None:
      scaled_scores = scaled_scores.masked_fill(mask == 0, float("-inf"))

    attention_weights = self.softmax(scaled_scores)
    attention_output = torch.matmul(attention_weights, V)

    return attention_output, attention_weights


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The ``d_model``-wide projections are split into ``num_heads`` heads of
    ``d_model // num_heads`` dimensions each, attended independently, concatenated
    and passed through an output projection.

    Args:
        d_model: width of the input and output vectors.
        num_heads: number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Otherwise the per-head reshape in forward() cannot work.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads

        # Define learnable projection matrices for Q, K, V
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, mask=None):
        """Return the attended representation of ``X``.

        Args:
            X: tensor of shape ``(batch_size, seq_len, d_model)``.
            mask: optional tensor; positions where ``mask == 0`` receive no attention.
                It must broadcast against scores of shape
                ``(batch_size, num_heads, seq_len, seq_len)``; a 3-D mask is treated as
                ``(batch_size, seq_len, seq_len)`` and shared by all heads.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, d_model = X.shape

        # 1. Project input to Q, K, V spaces
        Q = self.W_Q(X)
        K = self.W_K(X)
        V = self.W_V(X)

        # 2. Split Q, K, V into multiple heads -> (batch, heads, seq, d_k)
        Q = Q.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = K.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = V.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # 3. Compute attention scores: QK^T / sqrt(d_k)
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # 4. Block masked positions before softmax so they end up with zero weight.
        if mask is not None:
            if mask.dim() == 3:
                # Insert the head axis so a per-batch mask is not aligned with the heads.
                mask = mask.unsqueeze(1)
            attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))

        attention_weights = self.softmax(attention_scores)

        attention_output = torch.matmul(attention_weights, V)

        # 5. Merge the heads back into one d_model-wide vector per position.
        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, seq_len, d_model)
        output = self.W_O(attention_output)

        return output


In [None]:
class FeedForward(nn.Module):
    """Two-layer position-wise network: linear -> ReLU -> dropout -> linear.

    Args:
        embed_dim: width of the input and of the output.
        ff_dim: width of the hidden layer.
        dropout_rate: probability handed to ``nn.Dropout``.
    """

    def __init__(self, embed_dim, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.ln1 = nn.Linear(embed_dim, ff_dim)   # expansion
        self.ln2 = nn.Linear(ff_dim, embed_dim)   # projection back to embed_dim
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = self.ln1(x)
        activated = torch.relu(hidden)
        regularised = self.dropout(activated)
        return self.ln2(regularised)


In [None]:
class EncoderTransformer(nn.Module):
    """Transformer block: self-attention then feed-forward, each followed by a
    residual addition and a LayerNorm.

    Args:
        embed_dim: width of the vectors flowing through the block.
        num_heads: passed to ``MultiHeadAttention``.
        ff_dim: hidden width passed to ``FeedForward``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = FeedForward(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        ``mask`` is handed to the attention sub-layer as its second positional
        argument.
        """
        # Attention sub-layer: residual add, then normalise.
        attended = self.norm1(x + self.attention(x, mask))
        # Feed-forward sub-layer: residual add, then normalise.
        return self.norm2(attended + self.feed_forward(attended))


In [None]:
class DecoderTransformer(nn.Module):
    """Decoder-only transformer block: self-attention then feed-forward, each followed
    by a residual addition and a LayerNorm.

    The block contains no cross-attention: ``encoder_output`` and ``memory_mask`` are
    accepted by ``forward`` but never read.

    Args:
        embed_dim: width of the vectors flowing through the block.
        num_heads: passed to ``MultiHeadAttention``.
        ff_dim: hidden width passed to ``FeedForward``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super(DecoderTransformer, self).__init__()
        self.masked_attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = FeedForward(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        # norm2 is not used by forward(); it is kept so that parameter names (and any
        # saved weights that contain them) stay unchanged.
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

    def forward(self, x, encoder_output=None, tgt_mask=None, memory_mask=None):
        """Run the block on ``x``.

        Args:
            x: input to the block.
            encoder_output: unused; now optional so callers of this decoder-only block
                do not have to pass a placeholder. Positional callers are unaffected.
            tgt_mask: handed to the self-attention sub-layer as its second positional
                argument.
            memory_mask: unused.
        """
        attn_out = self.masked_attention(x, tgt_mask)

        # Residual connection + normalisation around attention.
        x = x + attn_out
        x = self.norm1(x)

        # Residual connection + normalisation around the feed-forward network.
        ff_out = self.feed_forward(x)
        x = x + ff_out
        x = self.norm3(x)

        return x


In [None]:
class GPT2(nn.Module):
    """Decoder-only language model: token embedding + positional encoding, a stack of
    ``DecoderTransformer`` blocks and a linear projection onto the vocabulary.

    Args:
        vocab_size: number of token ids (rows of the embedding, width of the logits).
        max_len: number of positions handed to ``PositionalEncoding``.
        embed_dim: width of the embeddings and of every block.
        num_heads: attention heads per block.
        num_layers: number of stacked blocks.
        ff_dim: hidden width of each block's feed-forward network.
    """

    def __init__(self, vocab_size, max_len, embed_dim, num_heads, num_layers, ff_dim):
        super(GPT2, self).__init__()
        self.embed_dim = embed_dim
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_encoding = PositionalEncoding(embed_dim, max_len)
        self.transformer_blocks = nn.ModuleList(
            [DecoderTransformer(embed_dim, num_heads, ff_dim) for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, mask=None):
        """Return next-token logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            input_ids: integer token ids of shape ``(batch, seq_len)``.
            mask: optional attention mask forwarded to every block as ``tgt_mask``.
                When omitted, a lower-triangular ``(seq_len, seq_len)`` matrix of ones
                is used so that a position cannot attend to later positions (zeros mark
                blocked positions, the ``mask == 0`` convention used by the attention
                code in this file). Pass an all-ones mask for unrestricted attention.
        """
        x = self.token_embedding(input_ids)
        x = self.position_encoding(x)

        if mask is None:
            seq_len = input_ids.size(-1)
            mask = torch.tril(torch.ones(seq_len, seq_len, device=input_ids.device))

        for block in self.transformer_blocks:
            # The second positional parameter of the block is `encoder_output`, so the
            # mask has to be passed by keyword to reach the attention layer.
            x = block(x, None, tgt_mask=mask)

        logits = self.fc_out(x)
        return logits


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer
from datasets import load_dataset
import random

# Hyperparameters (read by the model-construction and training cells below)
vocab_size = 50257  # GPT-2 tokenizer's vocabulary size
embed_dim = 768  # width of the token embeddings and of every transformer block
num_heads = 6  # attention heads per block: 768 // 6 = 128 dimensions per head
num_layers = 6  # number of stacked decoder blocks
ff_dim = 3072  # hidden width of each block's feed-forward network
max_len = 1024  # positions precomputed by the positional encoding; also the tokenizer's truncation length
batch_size = 8  # number of texts per optimisation step
learning_rate = 1e-4  # learning rate handed to Adam

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the GPT-2 tokenizer
# NOTE: this rebinds `tokenizer`, replacing the AutoTokenizer instance created in an earlier cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Padding is requested later (padding="max_length"); the EOS token doubles as the pad token.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize and preprocess the dataset
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    The call truncates to ``max_len`` tokens and pads with ``padding="max_length"``;
    whatever the tokenizer returns is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=max_len, padding="max_length")

# Open the "20231101.en" configuration of wikimedia/wikipedia (train split) in streaming
# mode and shuffle it through a 1000-example buffer.
streamed_dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True).shuffle(buffer_size=1000)

In [None]:
# Build the model from the hyperparameters above and place it on `device`.
model = GPT2(vocab_size, max_len, embed_dim, num_heads, num_layers, ff_dim).to(device)  # Move model to the selected device (GPU if available, otherwise CPU)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # constructed with default arguments
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Ask PyTorch to release cached GPU memory that is currently unused before training starts.
torch.cuda.empty_cache()

In [None]:
max_to_process = 12800  # maximum number of streamed examples consumed per epoch


def _next_token_loss(logits, input_ids, attention_mask):
    """Cross-entropy of predicting token t+1 from the logits at position t.

    Args:
        logits: model output of shape (batch, seq_len, vocab).
        input_ids: token ids of shape (batch, seq_len) that produced `logits`.
        attention_mask: same shape as `input_ids`; 0 marks padding.

    Returns:
        The scalar loss, or None when the batch contains no non-padding target
        (the mean over zero targets would be NaN and would corrupt the weights).
    """
    ignore_index = criterion.ignore_index
    # Padding positions must not contribute to the loss.
    labels = input_ids.masked_fill(attention_mask == 0, ignore_index)
    # Shift by one: the logits at position t are scored against the token at t + 1.
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    if not (shift_labels != ignore_index).any():
        return None
    # reshape (not view): the shifted slices are not contiguous.
    return criterion(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )


# Training loop
for epoch in range(5):
    model.train()
    total_loss = 0
    batch_count = 0
    processed = 0
    batch = []
    # Reshuffle by re-seeding the shuffle that is already attached to the stream.
    # Calling `.shuffle()` again here would stack one more shuffle buffer on the global
    # dataset every epoch (and on every re-run of this cell).
    streamed_dataset.set_epoch(epoch)
    for example in streamed_dataset:
        batch.append(example["text"])
        processed += 1
        if len(batch) == batch_size:
            # Tokenize and process
            tokenized = tokenizer(
                batch,
                truncation=True,
                max_length=max_len,
                padding="max_length",
                return_tensors="pt",
            )
            batch = []
            input_ids = tokenized["input_ids"].to(device)
            attention_mask = tokenized["attention_mask"].to(device)

            # Training step
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = _next_token_loss(outputs, input_ids, attention_mask)
            if loss is not None:
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                batch_count += 1
                if batch_count % 10 == 0:  # Print every 10 batches
                    print(f"Epoch {epoch + 1}, Batch {batch_count}, Loss: {loss.item():.4f}")
        # `>=` stops after exactly max_to_process examples (`>` consumed one extra).
        if processed >= max_to_process:
            print("processed over max")
            break
    if batch_count > 0:
        avg_loss = total_loss / batch_count
        print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")
    else:
        print("help")

In [None]:
# Serialises the whole module object (not only its weights) to the working directory.
torch.save(model, 'model_full3.pth')  # Save the entire model


In [None]:
import torch
# The file holds a fully pickled nn.Module, so it can only be read with
# weights_only=False (recent PyTorch releases default to weights_only=True and refuse it).
# SECURITY: unpickling executes arbitrary code -- only load checkpoints you created yourself.
# map_location lets a checkpoint written on a GPU be opened in a CPU-only session.
model = torch.load(
    'model_full3.pth',
    map_location="cuda" if torch.cuda.is_available() else "cpu",
    weights_only=False,
)

In [None]:
# Switch to evaluation mode (this disables the dropout used in the feed-forward layers).
model.eval()
model = model.to(device)  # Move the model to the specified device


In [None]:
def generate_text(model, tokenizer, prompt, max_length=50, temperature=10.0, top_k=50):
    """Sample a continuation of ``prompt`` with temperature-scaled top-k sampling.

    Args:
        model: callable module mapping ids of shape (1, seq) to logits of shape
            (1, seq, vocab). It is switched to eval mode.
        tokenizer: provides ``encode(prompt, return_tensors="pt")``,
            ``decode(ids, skip_special_tokens=True)`` and ``eos_token_id``.
        prompt: text to continue.
        max_length: maximum number of tokens to append to the prompt.
        temperature: divisor applied to the logits before sampling; must be > 0.
        top_k: number of highest-scoring tokens to sample from; values larger than
            the vocabulary are clamped to the vocabulary size.

    Returns:
        The decoded prompt plus generated tokens, with special tokens skipped.

    Raises:
        ValueError: if ``temperature`` is not positive or ``top_k`` is below 1.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    model.eval()

    device = next(model.parameters()).device
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Inference only: without no_grad every step would record an autograd graph.
    with torch.no_grad():
        # Generate tokens iteratively
        for _ in range(max_length):
            outputs = model(input_ids)
            # Only the distribution at the last position is needed.
            logits = outputs[:, -1, :] / temperature

            # torch.topk raises when k exceeds the size of the dimension.
            k = min(top_k, logits.size(-1))
            top_k_values, top_k_indices = torch.topk(logits, k, dim=-1)
            probabilities = torch.softmax(top_k_values, dim=-1)

            # Sample within the top-k set, then map back to vocabulary ids.
            next_token_index = torch.multinomial(probabilities, num_samples=1)
            next_token = top_k_indices.gather(dim=-1, index=next_token_index)

            # Append the token to input
            input_ids = torch.cat([input_ids, next_token], dim=1)

            # Stop generation if EOS token is generated
            if next_token.item() == tokenizer.eos_token_id:
                break

    # Decode the generated tokens into text
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [None]:
# Sample up to 50 new tokens after the prompt; logits are divided by temperature=5 and
# each token is drawn from the 10 highest-scoring candidates.
prompt = "My name is"
generated_text = generate_text(model, tokenizer, prompt, max_length=50, temperature=5, top_k=10)

print("Generated Text:")
print(generated_text)


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

# Save the whole module object to Drive...
torch.save(model, '/content/drive/My Drive/model_full2.pth')

# ...and, separately, only its state_dict.
torch.save(model.state_dict(), '/content/drive/My Drive/model_weights2.pth')
