In [1]:
# Optional: install wandb (used for experiment logging) if it is not already installed
# !pip install wandb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import math
from tqdm import tqdm

# Optional
# import wandb

In [2]:
from model.gpt2 import GPT2


In [3]:
# For GPT-2
from transformers import GPT2Config
from model.gpt2 import GPT2  # assuming you have your custom GPT2 implementation

# Small GPT-2 configuration: 6 transformer layers, 8 attention heads and
# 512-dimensional embeddings; every other field keeps GPT2Config's default.
config = GPT2Config(n_layer=6, n_head=8, n_embd=512)

# Build the project's GPT2 model from that configuration
model = GPT2(config)

# 1. Basic parameter count
def count_parameters(model):
    """Return the number of trainable scalar parameters in `model`.

    Iterates over ``model.parameters()`` and adds up ``numel()`` for every
    parameter whose ``requires_grad`` flag is set; frozen parameters are skipped.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Count the trainable parameters of `model` and print the total
# formatted with thousands separators
total_params = count_parameters(model)
print(f"Total trainable parameters: {total_params:,}")

Total trainable parameters: 45,171,200


In [4]:
# Disabled draft of the training pipeline, kept as a bare string literal so none of it runs
# (the notebook only echoes the string back as this cell's output).
# NOTE(review): inside the draft, `dataset` is rebound to a TextDataset before
# `dataset["validation"]` is read — verify that lookup before re-enabling this code.
"""
from transformers import GPT2Tokenizer
from data.dataset import TextDataset
from train import train_gpt2

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
texts = dataset["train"]["text"]  # Just get the plain text list


# Create dataset
dataset = TextDataset(texts, tokenizer)
val_dataset = TextDataset(dataset["validation"]["text"], tokenizer)

# Train the model
model = train_gpt2(
    train_dataset=dataset,
    val_dataset=val_dataset,
    n_epochs=3,
    batch_size=8,
    learning_rate=3e-4
)

# Save the model
#torch.save(model.state_dict(), 'gpt2_model.pt') 
"""

'\nfrom transformers import GPT2Tokenizer\nfrom data.dataset import TextDataset\nfrom train import train_gpt2\n\n# Initialize tokenizer\ntokenizer = GPT2Tokenizer.from_pretrained(\'gpt2\')\n\nfrom datasets import load_dataset\n\ndataset = load_dataset("wikitext", "wikitext-2-raw-v1")\ntexts = dataset["train"]["text"]  # Just get the plain text list\n\n\n# Create dataset\ndataset = TextDataset(texts, tokenizer)\nval_dataset = TextDataset(dataset["validation"]["text"], tokenizer)\n\n# Train the model\nmodel = train_gpt2(\n    train_dataset=dataset,\n    val_dataset=val_dataset,\n    n_epochs=3,\n    batch_size=8,\n    learning_rate=3e-4\n)\n\n# Save the model\n#torch.save(model.state_dict(), \'gpt2_model.pt\') \n'

In [5]:
from transformers import GPT2Tokenizer
from datasets import load_dataset
from train import train_gpt2

# Raw WikiText-2 splits
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# GPT-2 tokenizer used by tokenize_function below
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, truncating each to at most 1024 tokens.

    Relies on the notebook-level ``tokenizer``; no padding is requested here.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, max_length=1024)

# Tokenize the dataset in batches across 4 worker processes and drop the raw "text" column
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

# Persist the tokenized data to disk
tokenized_dataset.save_to_disk("data/tokenized_wikitext2")

# Keep handles to the training and validation splits
train_dataset, val_dataset = tokenized_dataset["train"], tokenized_dataset["validation"]



Saving the dataset (0/1 shards):   0%|          | 0/4358 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/36718 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3760 [00:00<?, ? examples/s]

In [6]:
# Inspect the tokenized training split (its features and row count)
tokenized_dataset["train"]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 36718
})

In [7]:
# Total token count of the raw training split: each row's text is tokenized on its
# own (no truncation arguments are passed here) and the sequence lengths are summed.
total_tokens = sum(len(tokenizer(row['text'])['input_ids']) for row in dataset['train'])
print(total_tokens)

2391884


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from model.config import GPT2Config
from model.gpt2 import GPT2
from transformers import GPT2Tokenizer
from tqdm.notebook import tqdm   # NOTE: notebook-friendly tqdm
import wandb

In [9]:
#wandb login

In [10]:
from train import train_gpt2

In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Reload the raw WikiText-2 splits and a GPT-2 tokenizer; this rebinds the
# `dataset` and `tokenizer` names created by an earlier cell.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token as the pad token so the padding="max_length" call below can pad
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch's "text" entries into sequences of exactly 512 tokens.

    Longer entries are truncated and shorter ones are padded up to ``max_length``.
    NOTE: this redefines the ``tokenize_function`` from an earlier cell, which
    used max_length=1024 and requested no padding.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Apply the tokenizer to the dataset in batches and drop the raw "text" column
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Collator configured with mlm=False (masked-language-model masking disabled)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Batches of 8 for both loaders; only the training loader shuffles
train_loader = DataLoader(
    tokenized_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_loader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

from train import train_gpt2
# Start training with only the two loaders supplied, so every other train_gpt2
# argument takes its default; the return value is not kept.
train_gpt2(train_loader, val_loader)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33minvi-bhagyesh[0m ([33minvi-bhagyesh-manipal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/3:   0%|          | 2/4590 [01:39<63:37:22, 49.92s/it]


KeyboardInterrupt: 

In [None]:
# Bind the result of training to `model` before saving. The training call is never
# handed the notebook-level `model`, so saving that object without rebinding it
# would write the weights of the model built earlier rather than the trained one.
# NOTE(review): assumes train_gpt2 returns the trained model, as the other
# `model = train_gpt2(...)` cells in this notebook do — confirm in train.py.
model = train_gpt2(train_dataset, val_dataset, n_epochs=1, batch_size=8)
# Save the model after training
torch.save(model.state_dict(), "data/gpt2_pretrained.pth")
print("Model saved to data/gpt2_pretrained.pth")

In [None]:
# Authenticate with Weights & Biases through the Python API; the bare
# `wandb login` shell command is not valid Python inside a code cell.
wandb.login()

In [None]:
# Train for 3 epochs with batch size 8 and learning rate 3e-4, binding the
# return value of train_gpt2 to `model`.
# NOTE(review): this call passes the tokenized dataset splits by keyword, whereas the
# executed training cell passed DataLoaders positionally — confirm against
# train_gpt2's actual signature.
model = train_gpt2(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    n_epochs=3,
    batch_size=8,
    learning_rate=3e-4
)