In [1]:
# Identifier of the PEFT adapter and the revision that the loading cell passes
# to `from_pretrained`.
peft_model_id = "kreimben/CodeMind"
revision_id = "falcon-7b-20240404"

In [15]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read the adapter's config; it records which base checkpoint the adapter applies to.
config = PeftConfig.from_pretrained(peft_model_id, revision=revision_id)
base_checkpoint = config.base_model_name_or_path

# Load the base causal LM and wrap it with the adapter weights from the same revision.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_checkpoint),
    peft_model_id,
    revision=revision_id,
)
# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# No explicit device placement is done here (a `.to('cuda')` call was left disabled).

# Put the model in eval mode; as the cell's last expression this also displays the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): FalconForCausalLM(
      (transformer): FalconModel(
        (word_embeddings): Embedding(65024, 4544)
        (h): ModuleList(
          (0-31): 32 x FalconDecoderLayer(
            (self_attention): FalconAttention(
              (rotary_emb): FalconRotaryEmbedding()
              (query_key_value): lora.Linear(
                (base_layer): FalconLinear(in_features=4544, out_features=4672, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4544, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4672, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
             

In [16]:
# Display the generation defaults attached to the loaded model.
model.generation_config

GenerationConfig {
  "bos_token_id": 11,
  "eos_token_id": 11
}

In [17]:
from transformers import GenerationConfig
import time
import transformers
import torch

# Instruction text placed at the start of every prompt. The leading and
# trailing newlines of the triple-quoted literal are part of the value.
system_prompt = """
You are a teacher who teaches coding tests to users.
You should explain the algorithm for the coding test problem to the user in detail.
separate the steps but keep it detailed.
let's think step by step.
"""


def formatting_func(user_input):
    """Build the full prompt string for one user question.

    The module-level ``system_prompt`` is only read here, so no ``global``
    declaration is required.

    Args:
        user_input: The user's question; interpolated into the prompt as-is.

    Returns:
        A string of the form ``<human>: {system_prompt}\\n\\n{user_input}\\n<assistant>: ``.
    """
    prompt = f"<human>: {system_prompt}\n\n{user_input}\n<assistant>: "
    return prompt


def generate_response(user_input, max_new_tokens=256):
    """Generate a completion for ``user_input`` and time the generation call.

    The question is wrapped with ``formatting_func``, tokenized with the
    module-level ``tokenizer`` and fed to the module-level ``model``.

    Args:
        user_input: The user's question text.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 256, the limit that was effectively applied before
            (the keyword passed to ``generate`` took precedence over the 512
            set in the generation config).

    Returns:
        A dict with:
        - ``'time'``: seconds elapsed during the ``model.generate`` call.
        - ``'responses'``: the ``sequences`` tensor returned by ``generate``.
        - ``'text'``: list with the decoded text of each returned sequence.
    """
    prompt = formatting_func(user_input)

    encoded = tokenizer(prompt, return_tensors='pt')
    # Place the inputs on the same device as the model so the function works
    # unchanged whether or not the model has been moved to a GPU.
    input_ids = encoded['input_ids'].to(model.device)
    # pad_token_id equals eos_token_id below, so the mask cannot be inferred
    # from the ids; pass the tokenizer's mask explicitly.
    attention_mask = encoded['attention_mask'].to(model.device)

    generation_config = GenerationConfig(
        # NOTE(review): penalty_alpha is a contrastive-search setting; with
        # do_sample=True it looks like it is ignored -- confirm which decoding
        # strategy is intended and drop one of the two.
        penalty_alpha=.6,
        do_sample=True,
        top_k=5,
        temperature=.5,
        repetition_penalty=1.2,
        # Single source of truth for the length limit (it was previously
        # given twice with different values).
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # perf_counter is monotonic, which makes it the right clock for intervals.
    tic = time.perf_counter()
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
        # Needed so the result exposes `.sequences`; per-step scores are not
        # requested because nothing here consumes them.
        return_dict_in_generate=True,
    )
    elapsed = time.perf_counter() - tic

    sequences = generation_output.sequences
    return {
        'time': elapsed,
        'responses': sequences,
        'text': tokenizer.batch_decode(sequences, skip_special_tokens=True),
    }

In [18]:
import torch

# Run one sample question through generate_response with autograd disabled
# and keep the returned dict in `res`.
with torch.no_grad():
    res = generate_response('leetcode no.1 add sum. explain me')

In [19]:
# Inspect the raw token-id tensor stored under 'responses' (ids, not decoded text).
res['responses']

tensor([[   39, 15564, 48190,   204,   193,  1357,   362,   241,  6092,   569,
         15864, 16958,  5918,   271,  2826,    25,   193,  1357,   808,  5656,
           248, 12910,   312,   248, 16958,  1318,  1375,   271,   248,  2735,
           272,  3730,    25,   193, 28062,   375,   248,  4128,   480,  1244,
           334,  6626,    25,   193,  1025,    18,    94,   864,  2006,   431,
          2006,    25,  1001,   193,   274,   289,  5779,   658,    25,    28,
           737,  8370,    25,  5656,   454,   193,    39,   524,  7893, 48190,
           204,    70, 19426,    72,   258,   500,   204,    70, 19426,    72,
           258,   500,   204,    70, 19426,    72,   258,   500,   204,    70,
         19426,    72,   258,   500,   204,    70, 19426,    72,   258,   500,
           204,    70, 19426,    72,   258,   500,   204,    70, 19426,    72,
           258,   500,   204,    70, 19426,    72,   258,   500,   204,    70,
         19426,    72,   258,   500,   204,    70, 1

In [None]:
import transformers

# Checkpoint used for the comparison run: loaded directly through a
# text-generation pipeline, without wrapping it in the PEFT adapter.
raw_model_id = "tiiuae/falcon-7b"

# The tokenizer created in the loading cell is reused here.
# `trust_remote_code` is not passed.
pipeline = transformers.pipeline(
    task="text-generation",
    model=raw_model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
%%time
# Sampling settings for the pipeline call, gathered in one place.
sampling_kwargs = dict(
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# Ask the pipeline the same question that was given to generate_response.
sequences = pipeline("leetcode no.1 add sum. explain me", **sampling_kwargs)

# Print the generated text of every returned sequence.
for generated in sequences:
    print(f"Result: {generated['generated_text']}")