In [8]:
# ONLY FOR TESTING PURPOSES, MAY CONTAIN ERRORS/OUTDATED CODE

import torch
from dataset import PippaDataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model without downloading (assuming you have it locally)
model_name = 'openai-community/gpt2'  # Replace with your actual model name
# model_name = 'mistralai/Mistral-7B-v0.1'
# config = AutoConfig.from_pretrained(model_name)

quant_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto' if device == "cuda" else 'cpu',
    quantization_config=quant_config,
)

In [12]:
model.num_parameters() * 4 / 8 * 2

124439808.0

In [9]:
model.get_memory_footprint()

134060568

In [35]:
# Load the dataset
dataset_path = "data/pippa_deduped.jsonl"
dataset = PippaDataset(dataset_path)

# Load the tokenizer
tokenizer_inputs = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
)

tokenizer_outputs = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='right',
)

if tokenizer_inputs.pad_token is None:
    tokenizer_inputs.pad_token = tokenizer_inputs.eos_token  # Ensure pad token is set
    tokenizer_outputs.pad_token = tokenizer_outputs.eos_token  # Ensure pad token is set
    tokenizer_inputs.pad_token_id = tokenizer_inputs.eos_token_id  # Ensure pad token is set
    tokenizer_outputs.pad_token_id = tokenizer_outputs.eos_token_id  # Ensure pad token is set

dataset.set_chat_template_params('prompt_templates/vicuna_prompt_template.jinja', tokenizer_inputs)
# Prepare sample data
sample_size = 2  # Adjust as needed
sampled_data = dataset.sample_dataset(sample_size)

In [36]:
# contexts = ['The capital of France', 'The capital of Germany is']
# target_texts = [' is Paris', ' Berlin']
# unzip the sampled data
sampled_data = [
    ("What is the capital of France?", "\n\nParis is the capital of France.")
]
contexts, target_texts = zip(*sampled_data)

In [37]:
max_len = model.config.max_position_embeddings
max_len

32768

In [38]:
outputs = tokenizer_outputs(
    target_texts, 
    return_tensors='pt', 
    padding=True,
    add_special_tokens=False,
) # this will put padding to the right and add only the eos token

# get the max length of the input by subtracting the length of the targets from the max length
max_input_len = max_len - outputs['input_ids'].shape[1]

inputs = tokenizer_inputs(
    contexts, 
    return_tensors='pt', 
    padding=True, 
    truncation=True, 
    max_length=max_input_len,
    add_special_tokens=True,
)

# concatenate the inputs and targets and their attention masks
input_ids = torch.cat([inputs['input_ids'], outputs['input_ids']], dim=1).to(device)
attention_mask = torch.cat([inputs['attention_mask'], outputs['attention_mask']], dim=1).to(device)

# get the mask that only give us the output ids
output_ids_mask = torch.cat(
    [
        torch.zeros_like(inputs['attention_mask']), 
        outputs['attention_mask']
    ], dim=1
)

# shift the mask to the right by one
output_ids_mask = torch.cat(
    [
        torch.zeros_like(output_ids_mask[:, :1]), 
        output_ids_mask[:, :-1]
    ], dim=1
)

In [39]:
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

In [40]:
# shift the output.logits to the right by one
shifted_logits = torch.cat(
    [
        torch.zeros_like(outputs.logits[:, :1, :]), 
        outputs.logits[:, :-1, :]
    ], dim=1
)

In [41]:
# do softmax to get the probabilities
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

In [42]:
probabilities.max(dim=-1).values

tensor([[0.3063, 0.2089, 0.2340, 0.0815, 0.8290, 0.0931, 0.8701, 0.2490, 0.2989,
         0.5338, 0.1337, 0.9981, 0.3948, 0.6096, 0.8346, 0.8078, 0.9467, 0.5783,
         0.3961]], device='cuda:0')

In [43]:
# decode the the argmax of the probabilities
token_text = tokenizer_inputs.batch_decode(input_ids)

# print the first example, token_text is the input, token_output_text is the output
print(token_text[0])
print('#######################')
# get the argmax of the probabilities and decode it
output_ids = probabilities.argmax(dim=-1)
output_text = tokenizer_outputs.batch_decode(output_ids)
print(output_text[0])


<s> What is the capital of France? 

Paris is the capital of France.
#######################
# is the difference of the?
 Paris
Paris. the capital of France.



In [44]:
# Get the probabilities of input_ids by indexing the probabilities tensor
token_probabilities = probabilities.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1).to('cuda')

In [45]:
token_probabilities = token_probabilities * output_ids_mask.to('cuda')

In [46]:
token_probabilities.sum() / output_ids_mask.sum()

tensor(0.0536, device='cuda:0')

In [47]:
output_ids * output_ids_mask.cuda()

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,    13,
          3916,   278, 28723,   272,  5565,   302,  4843, 28723,    13]],
       device='cuda:0')

In [48]:
for i in range(len(output_ids[0])):
    print(tokenizer_outputs.decode(input_ids[0][i]), '->', tokenizer_outputs.decode(output_ids[0][i]))


<s> -> #
What -> is
is -> the
the -> difference
capital -> of
of -> the
France -> ?
? -> 

 -> Paris

 -> 


 -> Par
Par -> is
is -> .
is -> the
the -> capital
capital -> of
of -> France
France -> .
. -> 



In [30]:
logits = torch.randn(1, 2, 101)

logits = torch.cat(
    [
        torch.zeros_like(logits[:, :1, :]), 
        logits[:, :-1, :]
    ], dim=1
)

if torch.isnan(outputs.logits).any():
    raise ValueError("NaN values detected llm -> outputs.logits tensor")

# Only keep the top PROB_TOP_K scores by -inf the rest
# This will make the model only consider the top 100 tokens and make sure the models with higher vocab sizes are not penalized

# # get the top k logits and create a mask for them
top_k_logits, top_k_indices = logits.topk(100, dim=-1)
mask = torch.full_like(logits, float('-inf')).scatter(-1, top_k_indices, top_k_logits)