In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# load transformers model openai-community/gpt2
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
import torch

device = "cuda"

gen_config = GenerationConfig()

In [6]:
name = 'deepseek-ai/deepseek-math-7b-instruct'
# tokenizer = AutoTokenizer.from_pretrained(name)
# model = AutoModelForCausalLM.from_pretrained(name).to(device)
pipe = pipeline('text-generation', model=name, tokenizer=name, device_map="auto", torch_dtype=torch.bfloat16,
                batch_size=10)
pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
generate_config = dict(do_sample=True, temperature=0.8, top_p=0.95,
                       max_new_tokens=512, num_return_sequences=1, pad_token_id=pipe.tokenizer.pad_token_id)
tokenizer = pipe.tokenizer
model = pipe.model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
question = "What is the meaning of life?"
inputs = tokenizer(question, return_tensors="pt").input_ids
inputs

tensor([[100000,   2640,    317,    254,   4569,    280,   1728,     30]])

In [8]:
# Generate regular predictions
response = model.generate(inputs.to(device), max_length=50, no_repeat_ngram_size=2, do_sample=False)
print(tokenizer.decode(response[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


What is the meaning of life?
The meaning and purpose of your life is something that you have to find for yourself. It is not something you can be told or handed to you on a silver platter. The meaning is a journey that is


In [29]:
question = "What is the meaning of life? No one knows the meaning of life. It is a mystery."
inputs = tokenizer(question, return_tensors="pt").input_ids
response = model.generate(inputs.to(device), max_length=50, no_repeat_ngram_size=2, do_sample=False)
print(tokenizer.decode(response[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


What is the meaning of life? No one knows the meaning of life. It is a mystery. Some people believe that the purpose of our lives is to seek happiness, while others believe it is our duty to serve others or to fulfill a specific


In [4]:
from dataset import from_name
from prompt import generate_nshot_prompts

data = from_name('EleutherAI/hendrycks_math', subset='algebra')['data']
data = generate_nshot_prompts(data['train'], 3)
prompts = [d['question'] for d in data]

Found cached dataset hendrycks_math (/lfs/skampere1/0/kaif/.cache/huggingface/datasets/EleutherAI___hendrycks_math/algebra/0.0.1/170db4d4d7d6e523b75159227fc93bd5368d3025cf38d8be579c0b3d6199c952)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /lfs/skampere1/0/kaif/.cache/huggingface/datasets/EleutherAI___hendrycks_math/algebra/0.0.1/170db4d4d7d6e523b75159227fc93bd5368d3025cf38d8be579c0b3d6199c952/cache-3141018808f2bcbe.arrow
Loading cached processed dataset at /lfs/skampere1/0/kaif/.cache/huggingface/datasets/EleutherAI___hendrycks_math/algebra/0.0.1/170db4d4d7d6e523b75159227fc93bd5368d3025cf38d8be579c0b3d6199c952/cache-db6a524eab4515da.arrow


In [5]:
outputs = pipe(prompts[:10], **generate_config)
               # max_new_tokens=512, num_return_sequences=1,
               # do_sample=True, temperature=0.8, top_p=0.95)
#, pad_token_id=pipe.tokenizer.eos_token_id)

In [147]:
# Use teacher forcing to generate predictions for a prompt, solution pair
def generate_prediction(prompt, solution, model, tokenizer, debug=False):
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    solution_ids = tokenizer(solution, return_tensors="pt").input_ids
    response = model(torch.cat([prompt_ids, solution_ids], dim=-1))
    # get the response ids for the solution part of the input
    tf_ids = torch.argmax(response.logits, dim=-1)[0, prompt_ids.shape[1] - 1:]
    if debug:
        for i in range(len(tf_ids) - 1):
            print(f"Predicted:\t{tokenizer.decode([tf_ids[i]], skip_special_tokens=True).__repr__()}")
            print(f"Actual:   \t{tokenizer.decode(solution_ids[0, i], skip_special_tokens=True).__repr__()}")

    return tf_ids

In [148]:
prompt = "1 2 3 4 5 6 7"
solution = " 8 9 10 11 12 13"
tf_ids = generate_prediction(prompt, solution, model, tokenizer, debug=True)

Predicted:	' 8'
Actual:   	' 8'
Predicted:	' 9'
Actual:   	' 9'
Predicted:	' 10'
Actual:   	' 10'
Predicted:	' 11'
Actual:   	' 11'
Predicted:	' 12'
Actual:   	' 12'
Predicted:	' 13'
Actual:   	' 13'


In [199]:
??model.forward

In [173]:
# create a batched version of generate_prediction
def generate_predictions(prompts, solutions, model, tokenizer, debug=False, device="cuda"):
    input = tokenizer([f"{p}{s}" for p, s in zip(prompts, solutions)], return_tensors="pt", padding=True,
                      truncation=True)
    prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    # solution_tokens = tokenizer(solutions, return_tensors="pt", padding=True, truncation=True)
    response = model(input.input_ids.to(device), attention_mask=input.attention_mask.to(device))
    # get the response ids for the solution part of the input
    all_preds = torch.argmax(response.logits, dim=-1)

    tf_ids = []
    solution_ids = []
    for i in range(len(prompts)):
        # assumes attention mask is contiguous
        prompt_length = prompt_tokens.attention_mask[i].sum().item()
        solution_start = torch.nonzero(input.attention_mask[i, :] == 1, as_tuple=False)[0, 0].item() + prompt_length
        solution_length = input.attention_mask[i].sum().item() - prompt_length

        assert prompt_tokens.input_ids[i, -1] == input.input_ids[i, solution_start - 1]

        # print(input.attention_mask[i])
        # print(f'prompt_tokens[i]: {[tokenizer.decode([t]) for t in prompt_tokens.input_ids[i]]}')
        # print(f'input[i]: {[tokenizer.decode([t]) for t in input.input_ids[i]]}')
        # print(f'all_preds[i]: {[tokenizer.decode([t]) for t in all_preds[i]]}')
        # print(f'solution[i]: {[tokenizer.decode([t]) for t in input.input_ids[i, solution_start:solution_start + solution_length]]}')
        # print(f'predicted: {[tokenizer.decode([t]) for t in all_preds[i, solution_start - 1:solution_start - 1 + solution_length]]}')

        tf_ids.append(all_preds[i, solution_start - 1:solution_start - 1 + solution_length])
        solution_ids.append(input.input_ids[i, solution_start:solution_start + solution_length])

    if debug:
        for i in range(len(tf_ids)):
            print(f"Prompt: {prompts[i]}")
            assert len(tf_ids[i]) == len(solution_ids[i])
            for j in range(len(tf_ids[i])):
                print(f"Predicted:\t{tokenizer.decode([tf_ids[i][j]], skip_special_tokens=True).__repr__()}")
                print(f"Actual:   \t{tokenizer.decode(solution_ids[i][j], skip_special_tokens=True).__repr__()}")

    return tf_ids, solution_ids

In [174]:
prompts = ["12345", "123"]
solutions = ["678910", "45"]
tf_ids = generate_predictions(prompts, solutions, model, tokenizer, debug=True)

Prompt: 12345
Predicted:	'\n'
Actual:   	'6'
Predicted:	'7'
Actual:   	'7'
Predicted:	'8'
Actual:   	'8'
Predicted:	'9'
Actual:   	'9'
Predicted:	'1'
Actual:   	'1'
Predicted:	'0'
Actual:   	'0'
Prompt: 123
Predicted:	'4'
Actual:   	'4'
Predicted:	'5'
Actual:   	'5'
