In [1]:
import tempfile
from datasets import load_dataset
import tiktoken
from functools import partial
from helper_function import *
from torch.utils.data import Dataset, DataLoader
from models_1 import *
from load_gpt2 import *
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import os
from torch.distributed import init_process_group, destroy_process_group
import torch.multiprocessing as mp
from tqdm import tqdm

2024-07-21 18:56:04.016128: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-21 18:56:04.611609: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Prefer the GPU when PyTorch reports one as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
def prepare_dataset(file_path='openorca_30.json', train_frac=0.85, test_frac=0.1):
    """Load the instruction data and split it into train/val/test datasets.

    The JSON file is read as a single sequence and sliced in order (no
    shuffling): the first ``train_frac`` of the records go to training, the
    next ``test_frac`` to testing, and whatever remains to validation.

    Args:
        file_path: Path to the JSON file holding the instruction records.
        train_frac: Fraction of records used for training (default 85%).
        test_frac: Fraction of records used for testing (default 10%).

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple of
        ``InstructionDataset`` objects (class defined outside this cell),
        all built with the GPT-2 tiktoken encoding.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1.
    """
    # Local import: `json` is not imported explicitly in the notebook's
    # import cell, so do not rely on it leaking in through a star import.
    import json

    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1:
        raise ValueError(
            f"train_frac and test_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, test_frac={test_frac}"
        )

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        inst_data = json.load(file)

    # Divide the data into training / testing / validation slices.
    train_end = int(len(inst_data) * train_frac)
    test_end = train_end + int(len(inst_data) * test_frac)

    train_data = inst_data[:train_end]
    test_data = inst_data[train_end:test_end]
    val_data = inst_data[test_end:]  # remainder (5% with the defaults)

    tokenizer = tiktoken.get_encoding("gpt2")  # set up tokenizer

    # Set up datasets.
    train_dataset = InstructionDataset(train_data, tokenizer)
    val_dataset = InstructionDataset(val_data, tokenizer)
    test_dataset = InstructionDataset(test_data, tokenizer)
    return train_dataset, val_dataset, test_dataset
    
def prepare_model(model_name="gpt2-medium (355M)", models_dir="gpt2"):
    """Build a GPTModel and load pretrained GPT-2 weights into it.

    Args:
        model_name: One of the keys of the size table below, e.g.
            ``"gpt2-medium (355M)"``. The size token in parentheses is what
            gets passed to ``download_and_load_gpt2``.
        models_dir: Directory handed to ``download_and_load_gpt2`` as
            ``models_dir``.

    Returns:
        The ``GPTModel`` instance after ``load_weights_into_gpt`` has been
        applied to it.

    Raises:
        ValueError: If ``model_name`` is not a known configuration.
    """
    base_config = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True         # Query-key-value bias
    }

    model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    if model_name not in model_configs:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_configs)}"
        )

    base_config.update(model_configs[model_name])

    # "gpt2-medium (355M)" -> "355M"
    model_size = model_name.split(" ")[-1].lstrip("(").rstrip(")")
    # The returned settings are not needed here; only the weights are used.
    _settings, params = download_and_load_gpt2(
        model_size=model_size,
        models_dir=models_dir
    )

    model = GPTModel(base_config)
    load_weights_into_gpt(model, params)
    return model


class Trainer:
    """Simple training loop with periodic validation.

    The model is moved to ``gpu_id`` on construction; every batch drawn from
    the loaders is moved to the same device before the forward pass. The loss
    is token-level cross-entropy between the flattened logits and targets.

    NOTE(review): ``scheduler`` is stored but never stepped. The per-batch
    ``scheduler.step()`` call was already disabled in the original code;
    confirm whether that is intentional before enabling it.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        tokenizer,
        train_loader: DataLoader,
        val_loader: DataLoader,
        test_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        scheduler,
        gpu_id,
    ) -> None:
        """Store the training components and move ``model`` to ``gpu_id``.

        Args:
            model: Module called as ``model(inputs)`` to produce logits.
            tokenizer: Stored on the instance; not used by the methods here.
            train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
            val_loader: Iterable of ``(input_batch, target_batch)`` pairs.
            test_loader: Stored on the instance; not used by the methods here.
            optimizer: Optimizer stepped once per training batch.
            scheduler: Stored on the instance; currently never stepped.
            gpu_id: Device (or device spec) passed to ``.to(...)``.
        """
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.tokenizer = tokenizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.optimizer = optimizer
        self.scheduler = scheduler

    def _run_batch(self, inputs, targets, eval=False):
        """Run one forward pass and, unless ``eval`` is set, one optimizer step.

        Args:
            inputs: Batch fed to the model.
            targets: Batch of target ids; flattened before the loss.
            eval: When true, only the loss is computed (no backward/step).

        Returns:
            The batch loss as a Python float.
        """
        logits = self.model(inputs)
        loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())
        if not eval:
            # Gradients are only touched on training batches; evaluation must
            # not reset or update optimizer-related state.
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # self.scheduler.step()  # disabled in the original; see class note
        return loss.item()

    def _run_epoch(self, epoch, eval_step):
        """Train for one pass over ``train_loader``.

        Every ``eval_step`` batches, prints the mean training loss over that
        window together with the current validation loss, then resets the
        window. Batches after the last full window are trained on but not
        reported.
        """
        total_loss = 0
        self.model.train()
        progress = tqdm(self.train_loader, desc=f"Epoch {epoch}")
        for step, (input_batch, target_batch) in enumerate(progress, start=1):
            input_batch = input_batch.to(self.gpu_id)
            target_batch = target_batch.to(self.gpu_id)
            total_loss += self._run_batch(input_batch, target_batch)
            if step % eval_step == 0:
                avg_train_loss = total_loss / eval_step
                val_loss = self._validate()
                print(f'Epoch {epoch}, Step {step}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}')
                total_loss = 0
                # _validate() leaves the model in eval mode; switch back.
                self.model.train()

    def _validate(self, max_batches=10):
        """Return the mean loss over the first batches of ``val_loader``.

        Args:
            max_batches: Stop after this many validation batches (at least one
                batch is always evaluated when the loader is non-empty).

        Returns:
            Mean validation loss as a float, or ``nan`` when the loader
            yields no batches (instead of raising ``ZeroDivisionError``).

        The model is left in eval mode; callers switch back to train mode.
        """
        self.model.eval()
        total_val_loss = 0
        num_val_batches = 0
        with torch.no_grad():
            for input_batch, target_batch in self.val_loader:
                input_batch = input_batch.to(self.gpu_id)
                target_batch = target_batch.to(self.gpu_id)
                total_val_loss += self._run_batch(input_batch, target_batch, eval=True)
                num_val_batches += 1
                if num_val_batches >= max_batches:
                    break
        if num_val_batches == 0:
            return float("nan")
        return total_val_loss / num_val_batches

    def train(self, max_epochs, eval_step):
        """Run ``_run_epoch`` for epochs ``0 .. max_epochs - 1``."""
        for epoch in range(max_epochs):
            self._run_epoch(epoch, eval_step)

    def save(self, path):
        """Save the model's ``state_dict`` to ``path``.

        If the model is wrapped (exposes a ``.module`` attribute, as
        DistributedDataParallel wrappers do), the inner module's weights are
        saved; a plain, unwrapped model is saved directly rather than failing
        with ``AttributeError``.
        """
        model_to_save = getattr(self.model, "module", self.model)
        torch.save(model_to_save.state_dict(), path)

In [7]:
# Build the three dataset splits.
train_set, val_set, test_set = prepare_dataset()

# Collate function with its device and length limit pre-bound.
# allowed_max_length equals the context_length configured in prepare_model.
customized_collate_fn = partial(custom_collate_fn, device='cpu', allowed_max_length=1024)

# All three loaders share the same configuration.
# NOTE(review): shuffle=False also applies to the training loader — confirm
# that iterating the training data in file order is intended.
loader_options = dict(
    batch_size=4,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=True,
)
train_loader, val_loader, test_loader = (
    DataLoader(split, **loader_options) for split in (train_set, val_set, test_set)
)

100%|███████████████████████████████████████████████████████████████████████| 1079649/1079649 [08:52<00:00, 2028.75it/s]
100%|███████████████████████████████████████████████████████████████████████████| 63510/63510 [00:34<00:00, 1832.14it/s]
100%|█████████████████████████████████████████████████████████████████████████| 127017/127017 [01:24<00:00, 1507.88it/s]


In [5]:
# Tokenizer and pretrained model.
tokenizer = tiktoken.get_encoding("gpt2")
model = prepare_model()

# Optimizer over all model parameters, plus a one-cycle schedule sized to a
# single epoch of train_loader.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5, weight_decay=0.1)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=5e-5,
    epochs=1,
    steps_per_epoch=len(train_loader),
)

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


  _torch_pytree._register_pytree_node(


In [42]:
# Wire the model, data loaders and optimisation objects into a Trainer on `device`.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    gpu_id=device,
)

In [46]:
# One epoch, reporting train/validation loss every 10 batches.
trainer.train(max_epochs=1, eval_step=10)

Epoch 0:   5%|███▍                                                                     | 11/233 [00:02<00:54,  4.11it/s]

Epoch 0, Step 10, Train Loss: 0.6073, Val Loss: 0.6787


Epoch 0:   9%|██████▌                                                                  | 21/233 [00:04<00:52,  4.05it/s]

Epoch 0, Step 20, Train Loss: 0.6013, Val Loss: 0.6787


Epoch 0:  13%|█████████▋                                                               | 31/233 [00:06<00:51,  3.92it/s]

Epoch 0, Step 30, Train Loss: 0.5437, Val Loss: 0.6787


Epoch 0:  18%|████████████▊                                                            | 41/233 [00:08<00:47,  4.04it/s]

Epoch 0, Step 40, Train Loss: 0.5549, Val Loss: 0.6787


Epoch 0:  22%|███████████████▉                                                         | 51/233 [00:10<00:45,  3.97it/s]

Epoch 0, Step 50, Train Loss: 0.5708, Val Loss: 0.6787


Epoch 0:  26%|███████████████████                                                      | 61/233 [00:12<00:42,  4.09it/s]

Epoch 0, Step 60, Train Loss: 0.4433, Val Loss: 0.6787


Epoch 0:  30%|██████████████████████▏                                                  | 71/233 [00:14<00:39,  4.07it/s]

Epoch 0, Step 70, Train Loss: 0.4218, Val Loss: 0.6787


Epoch 0:  35%|█████████████████████████▍                                               | 81/233 [00:16<00:37,  4.00it/s]

Epoch 0, Step 80, Train Loss: 0.4421, Val Loss: 0.6787


Epoch 0:  39%|████████████████████████████▌                                            | 91/233 [00:18<00:35,  4.03it/s]

Epoch 0, Step 90, Train Loss: 0.3956, Val Loss: 0.6787


Epoch 0:  43%|███████████████████████████████▏                                        | 101/233 [00:20<00:33,  3.98it/s]

Epoch 0, Step 100, Train Loss: 0.3831, Val Loss: 0.6787


Epoch 0:  48%|██████████████████████████████████▎                                     | 111/233 [00:22<00:31,  3.83it/s]

Epoch 0, Step 110, Train Loss: 0.4420, Val Loss: 0.6787


Epoch 0:  52%|█████████████████████████████████████▍                                  | 121/233 [00:24<00:28,  3.91it/s]

Epoch 0, Step 120, Train Loss: 0.4698, Val Loss: 0.6787


Epoch 0:  56%|████████████████████████████████████████▍                               | 131/233 [00:26<00:25,  4.01it/s]

Epoch 0, Step 130, Train Loss: 0.4207, Val Loss: 0.6787


Epoch 0:  61%|███████████████████████████████████████████▌                            | 141/233 [00:28<00:22,  4.07it/s]

Epoch 0, Step 140, Train Loss: 0.4758, Val Loss: 0.6787


Epoch 0:  65%|██████████████████████████████████████████████▋                         | 151/233 [00:29<00:15,  5.15it/s]

Epoch 0, Step 150, Train Loss: 0.5165, Val Loss: 0.6787


Epoch 0:  69%|█████████████████████████████████████████████████▊                      | 161/233 [00:31<00:13,  5.41it/s]

Epoch 0, Step 160, Train Loss: 0.4333, Val Loss: 0.6787


Epoch 0:  73%|████████████████████████████████████████████████████▊                   | 171/233 [00:32<00:11,  5.42it/s]

Epoch 0, Step 170, Train Loss: 0.5059, Val Loss: 0.6787


Epoch 0:  78%|███████████████████████████████████████████████████████▉                | 181/233 [00:34<00:09,  5.45it/s]

Epoch 0, Step 180, Train Loss: 0.4903, Val Loss: 0.6787


Epoch 0:  82%|███████████████████████████████████████████████████████████             | 191/233 [00:35<00:07,  5.38it/s]

Epoch 0, Step 190, Train Loss: 0.5689, Val Loss: 0.6787


Epoch 0:  86%|██████████████████████████████████████████████████████████████          | 201/233 [00:37<00:05,  5.41it/s]

Epoch 0, Step 200, Train Loss: 0.6205, Val Loss: 0.6787


Epoch 0:  91%|█████████████████████████████████████████████████████████████████▏      | 211/233 [00:38<00:04,  5.38it/s]

Epoch 0, Step 210, Train Loss: 0.6840, Val Loss: 0.6787


Epoch 0:  95%|████████████████████████████████████████████████████████████████████▎   | 221/233 [00:40<00:02,  5.41it/s]

Epoch 0, Step 220, Train Loss: 0.7095, Val Loss: 0.6787


Epoch 0:  99%|███████████████████████████████████████████████████████████████████████▍| 231/233 [00:42<00:00,  4.16it/s]

Epoch 0, Step 230, Train Loss: 0.6524, Val Loss: 0.6787


Epoch 0: 100%|████████████████████████████████████████████████████████████████████████| 233/233 [00:42<00:00,  5.51it/s]
