In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [2]:
HUGGINGFACE_API_KEY = os.environ.get('API_KEY')

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = 'ibm-granite/granite-8b-code-base-128k'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/", use_fast= True)

In [5]:
torch.cuda.empty_cache()

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, cache_dir="./models/", low_cpu_mem_usage=True)
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-35): 36 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=True)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=True)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=True)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=True)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=True)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )

In [7]:
my_pipeline = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=50,  # Limit the sequence length
    num_beams=1,  # Use greedy decoding
    do_sample=False,  # Disable sampling
    return_full_text=False  # Only return generated text
)

In [8]:
input_text = ["def fibonacci():", "def greatest_common_factor():"]

In [9]:
my_pipeline(input_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[[{'generated_text': '\n    a, b = 0, 1\n    while True:\n        yield a\n        a, b = b, a + b\n\ndef main():\n    for i in range(10):\n        print(next('}],
 [{'generated_text': '\n    """\n    Returns the greatest common factor of two numbers.\n    """\n    a = int(input("Enter the first number: "))\n    b = int(input("Enter the second number: "))\n    if a'}]]