In [None]:
from importlib.metadata import PackageNotFoundError, version

# Distributions this notebook relies on; their installed versions are
# printed below for reproducibility.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",
    "pandas"
]

for p in pkgs:
    # Report a missing distribution instead of aborting the loop, so the
    # versions of the remaining packages are still printed.
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        print(f"{p} version: not installed")

In [None]:
import json
import os
import urllib

def download_and_load_file(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    Despite the name, nothing is downloaded here: the file must already
    exist locally.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.
    """
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
file_path = "instruction-data.json"
data = download_and_load_file(file_path)
len(data)

In [None]:
def format_input(entry):
    """Build the prompt text for one instruction entry.

    Args:
        entry: Mapping with an ``'instruction'`` key and an optional
            ``'input'`` key.

    Returns:
        The fixed preamble followed by the ``### Instruction:`` section and,
        only when the entry has a non-empty input, an ``### Input:`` section.
    """
    instruction_text = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # Use .get so entries that omit the optional 'input' key are treated the
    # same as entries whose input is empty, instead of raising KeyError.
    input_value = entry.get('input')
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""
    return instruction_text + input_text

In [None]:
# Contiguous (unshuffled) split of `data`: the first 85% is used for training,
# the next 10% for testing, and whatever remains for validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.10)

train_data = data[:train_portion]
test_data = data[train_portion: test_portion + train_portion]
val_data = data[train_portion + test_portion:]

# Display the sizes of the three splits.
len(train_data), len(test_data), len(val_data)

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [None]:
class InstructionDataset(Dataset):
    """Dataset whose items are the token ids of fully formatted entries.

    Every entry is rendered as prompt (via ``format_input``) plus a
    ``### Response:`` section holding ``entry['output']`` and is tokenized
    once, at construction time.
    """

    def __init__(self, data, tokenizer):
        """Tokenize every entry of ``data`` with ``tokenizer.encode``."""
        self.data = data
        # Encode everything up front so item access is a plain list lookup.
        self.encoded_text = [
            tokenizer.encode(
                format_input(record) + f"\n\n### Response:\n{record['output']}"
            )
            for record in data
        ]

    def __len__(self):
        """Number of encoded entries."""
        return len(self.encoded_text)

    def __getitem__(self, index):
        """Return the token ids stored at position ``index``."""
        return self.encoded_text[index]

In [None]:
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """First collate draft: right-pad every sequence to the batch maximum.

    Args:
        batch: List of token-id lists.
        pad_token_id: Token id used for padding.
        device: Device the stacked tensor is moved to.

    Returns:
        Tensor with one row per sequence, each as long as the longest
        sequence in the batch.
    """
    # Longest sequence plus one slot reserved for an extra padding token.
    batch_max_length = 1 + max(map(len, batch))
    print(batch_max_length)

    rows = []
    for sequence in batch:
        n_pad = batch_max_length - len(sequence)
        padded = sequence + [pad_token_id] * n_pad
        # Drop the reserved extra slot again; it only matters once targets
        # are derived from the same padded sequence.
        rows.append(torch.tensor(padded[:-1]))

    # Stack the rows into one tensor and move it to the target device.
    return torch.stack(rows).to(device)

In [None]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
out = custom_collate_draft_1([inputs_1, inputs_2, inputs_3])

In [None]:
out.shape; out
out = out[2:].squeeze()
out.shape

In [None]:
pad_token_id = 50256
mask = out == pad_token_id
mask

In [None]:
indices = torch.nonzero(mask).squeeze()
indices.shape

In [None]:
indices.numel()

In [None]:
indices[1:].shape

In [None]:
out[indices[1:]] = -100

In [None]:
out

In [None]:
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100, allowed_max_len=None, device='cpu'):
    """Collate token-id lists into padded input and target tensors.

    Each sequence is right-padded with ``pad_token_id`` to the longest
    sequence in the batch plus one. Inputs are the padded sequence without
    its last token; targets are the padded sequence shifted left by one.
    In the targets, every ``pad_token_id`` after the first one is replaced
    by ``ignore_index``.

    Args:
        batch: List of token-id lists.
        pad_token_id: Token id used for padding.
        ignore_index: Value written over surplus padding in the targets.
        allowed_max_len: Optional cap applied to both inputs and targets.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple ``(input_tensor, target_tensor)``.
    """
    padded_len = 1 + max(map(len, batch))
    input_rows, target_rows = [], []

    for sequence in batch:
        padded = sequence + [pad_token_id] * (padded_len - len(sequence))
        model_input = torch.tensor(padded[:-1])
        target = torch.tensor(padded[1:])

        # Running count of pad tokens from the left: a count above 1 marks
        # every pad token after the first, which is the one kept as-is.
        is_pad = target == pad_token_id
        surplus_pad = is_pad & (is_pad.long().cumsum(dim=0) > 1)
        target[surplus_pad] = ignore_index

        if allowed_max_len is not None:
            model_input = model_input[:allowed_max_len]
            target = target[:allowed_max_len]

        input_rows.append(model_input)
        target_rows.append(target)

    return torch.stack(input_rows).to(device), torch.stack(target_rows).to(device)

In [None]:
input_batch, target_batch = custom_collate_fn([inputs_1, inputs_2, inputs_3])
input_batch, target_batch

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
from functools import partial
customized_collate_func = partial(
    custom_collate_fn, device=device, allowed_max_len=1024
)

In [None]:
from torch.utils.data import DataLoader

# Fix torch's global seed before building the loaders.
torch.manual_seed(123)
batch_size = 8

train_dataset = InstructionDataset(train_data, tokenizer)
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed and discarded rather than displayed.
len(train_dataset)
# Training batches are shuffled, and a trailing incomplete batch is dropped.
train_loader = DataLoader(
    train_dataset,
    collate_fn=customized_collate_func,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0
)

In [None]:
a = next(iter(train_loader))
a[0].shape, a[1].shape

In [None]:
val_dataset = InstructionDataset(val_data, tokenizer)
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed and discarded rather than displayed.
len(val_dataset)
# Validation batches keep the dataset order and include the final partial batch.
val_loader = DataLoader(
    val_dataset,
    collate_fn=customized_collate_func,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

In [None]:
a = next(iter(val_loader))
a[0].shape, a[1].shape

In [None]:
test_dataset = InstructionDataset(test_data, tokenizer)
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed and discarded rather than displayed.
len(test_dataset)
# Test batches keep the dataset order and include the final partial batch.
test_loader = DataLoader(
    test_dataset,
    collate_fn=customized_collate_func,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

In [None]:
# Sanity-check the shapes of the first batch from the test loader (the
# validation loader already has its own check in an earlier cell).
a = next(iter(test_loader))
a[0].shape, a[1].shape

In [None]:
from utils.gpt_download import download_and_load_gpt2
from utils.gpt_model import (
    generate_text_simple,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    token_ids_to_text,
    generate
)

In [None]:
# Settings shared by every model variant; the variant-specific entries from
# `model_configs` are merged into this dict in a later cell.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Per-variant settings, keyed by a display name that ends with the size tag
# in parentheses (the tag is parsed out of the name in a later cell).
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Variant used for the rest of the notebook; must be a key of `model_configs`.
CHOOSE_MODEL = "gpt2-medium (355M)"


In [None]:
# Extract the size tag from the variant name,
# e.g. "gpt2-medium (355M)" -> "355M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# NOTE(review): download_and_load_gpt2 is a project helper not shown here;
# presumably it fetches the weights into `models_dir` -- confirm.
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

In [None]:
# Merge the chosen variant's settings into BASE_CONFIG (mutates it in place).
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# Switch to evaluation mode; the trailing ';' suppresses the cell output.
model.eval();

In [None]:
torch.manual_seed(123)
input_text = format_input(val_data[2])
print(input_text)
print(val_data[2])

In [None]:
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG['context_length'],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [None]:
print(generated_text)

In [None]:
print(generated_text[len(input_text):].replace("### Response:", "").strip())

In [None]:
from utils.gpt_model import (
    calc_loss_loader,
    train_model_simple
)

In [None]:
# Move the model to the selected device before evaluating it.
model.to(device)
torch.manual_seed(123)

# Losses over the first 5 batches of each loader, computed before the
# fine-tuning cell runs; gradients are disabled for the evaluation.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print(f"{train_loss=}, {val_loss=}")

In [None]:
model.pos_emb.weight.shape[0]

In [None]:
import time
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.1)
num_epochs = 2

# Time the whole fine-tuning run with wall-clock timestamps.
start_time = time.time()
# NOTE(review): train_model_simple is a project helper not shown here; its
# three return values are used by the plotting cells further down.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device, num_epochs=num_epochs,
    eval_freq=10, eval_iter=10, start_context=format_input(val_data[0]), tokenizer=tokenizer
)
end_time = time.time()
# Convert the elapsed seconds to minutes for the report.
execution_time_min = (end_time - start_time) / 60.0
print(f"Training took: {execution_time_min:.2f} min")

In [None]:
num_epochs, tokens_seen, len(train_losses), len(val_losses)

In [None]:
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
epochs_tensor

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

In [None]:
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss with a second x-axis in tokens seen.

    Args:
        epochs_seen: X values (epochs) for both loss curves.
        tokens_seen: X values (tokens) used for the secondary top axis.
        train_losses: Training loss values.
        val_losses: Validation loss values.
    """
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Loss curves against epochs on the primary (bottom) x-axis.
    ax1.plot(epochs_seen, train_losses, label="Training losses")
    ax1.plot(epochs_seen, val_losses, label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc='upper right')
    # Only show integer epoch ticks.
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Secondary x-axis sharing the same y-axis. The line is drawn fully
    # transparent: it exists only to give this axis its tick range, and a
    # visible copy would overlay an unlabeled duplicate of the training curve.
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel('Tokens seen')

    fig.tight_layout()
    plt.show()

In [None]:
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
# Generate and print responses for the first three test entries so they can
# be compared by eye with the reference outputs.
for entry in test_data[:3]:
    input_text = format_input(entry)
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer),
        max_new_tokens=256,
        context_size=BASE_CONFIG['context_length'],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    print(f"\n Raw generated text is: \n>> {generated_text}")
    # Skip the first len(input_text) characters (the prompt) and remove the
    # response header to keep only the model's answer.
    response = generated_text[len(input_text):].replace("### Response:", "").strip()

    print(f"\n Input text is: \n>> {input_text}")
    print(f"\nCorrect response: \n>> {entry['output']}")
    print(f"\nModel Response: \n>> {response}")
    print(100*'-')

In [None]:
from tqdm import tqdm

# Generate a response for every test entry and store it on the entry itself
# under 'model_response' (this mutates `test_data` in place).
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
    input_text = format_input(entry)
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer),
        max_new_tokens=256,
        context_size=BASE_CONFIG['context_length'],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Skip the first len(input_text) characters (the prompt) and remove the
    # response header to keep only the model's answer.
    response = generated_text[len(input_text):].replace("### Response:", "").strip()

    test_data[i]['model_response'] = response

# Write the augmented test set to disk; a later cell reads this file back.
with open('instruction-test-data-with-response.json', 'w') as f:
    json.dump(test_data, f, indent=4)

In [None]:
# save the model
file_path = 'gpt2-medium-sft.pth'
torch.save(model.state_dict(), file_path)

In [None]:
import psutil

def check_if_running(process_name):
    """Report whether a running process has ``process_name`` in its name.

    Args:
        process_name: Substring to look for in each process name.

    Returns:
        True if at least one process name contains ``process_name``,
        otherwise False.
    """
    for proc in psutil.process_iter(["name"]):
        # The name can be None when psutil could not read it (for example
        # on access-denied processes); `in None` would raise TypeError.
        name = proc.info['name']
        if name and process_name in name:
            return True

    return False

In [None]:
ollama_running = check_if_running('ollama')
ollama_running

In [None]:
import urllib.request
import json

def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
    """Send ``prompt`` as one user message to a chat endpoint and return the reply.

    The response body is read line by line; each line is parsed as a JSON
    object and the ``message.content`` pieces are concatenated.

    Args:
        prompt: Text of the user message.
        model: Model name placed in the request body.
        url: Chat endpoint that receives the POST request.

    Returns:
        The concatenated content of all response lines.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Fixed seed and zero temperature are sent with every request.
        "options": {"seed": 123, "temperature": 0},
    }
    request = urllib.request.Request(
        url,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method='POST',
    )

    pieces = []
    with urllib.request.urlopen(request) as response:
        # Iterating the response yields one line at a time until the body ends.
        for raw_line in response:
            chunk = json.loads(raw_line.decode('utf-8'))
            pieces.append(chunk['message']['content'])

    return "".join(pieces)

In [None]:
res = query_model('What do Llamas eat?')
print(res)

In [None]:
file_path = "instruction-test-data-with-response.json"

with open(file_path, "r") as file:
    test_data = json.load(file)

In [None]:
test_data[0]['model_response'].replace("### Response:", "").strip()

In [None]:
print(format_input(test_data[0]))

In [None]:
# Preview the scoring prompt on the first five test entries and print the
# raw reply returned by query_model for each.
for entry in test_data[:5]:
    # Remove any leftover response header from the stored model response.
    model_response = entry['model_response'].replace("### Response:", "").strip()
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{model_response}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
        f"Respond with the integer number only"
    )
    print(query_model(prompt))
    

In [None]:
def generate_model_scores(json_data, json_key, model='llama3'):
    """Ask a judge model to score the stored response of every entry.

    Args:
        json_data: Iterable of entry dicts. Each needs the keys read by
            ``format_input``, an ``'output'`` key and the ``json_key`` key.
        json_key: Key under which the response to be scored is stored.
        model: Model name forwarded to ``query_model``.

    Returns:
        List of integer scores. Entries whose reply cannot be converted to
        an integer are reported on stdout and left out, so the list can be
        shorter than ``json_data``.
    """
    scores = []
    for entry in tqdm(json_data, desc="Scoring entities"):
        # Read the response from the key chosen by the caller rather than a
        # hard-coded one, so the `json_key` argument takes effect.
        model_response = entry[json_key].replace("### Response:", "").strip()
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{model_response}`"
            " on a scale from 0 to 100, where 100 is the best score. "
            "Respond with the integer number only"
        )
        score = query_model(prompt, model)
        try:
            scores.append(int(score))
        except ValueError as e:
            # A non-integer reply is reported and skipped.
            print(f"Could not convert score to integer: {score}, {e}")
            continue

    return scores

In [None]:
scores = generate_model_scores(test_data, 'model_response', model='llama3.1')

In [None]:
len(scores)

In [None]:
sum(scores)

In [None]:
scores

In [None]:
sum(scores) / len(scores)