In [11]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
from dotenv import load_dotenv
from huggingface_hub import login as hf_login
# Read variables from a local .env file into os.environ, then authenticate
# with the Hugging Face Hub using the token found there.
load_dotenv()
if not os.environ.get("HF_TOKEN"):
    # Fail with an actionable message instead of a bare KeyError.
    raise RuntimeError(
        "HF_TOKEN is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
hf_login(os.environ["HF_TOKEN"])
# Pick the compute backend once for the whole notebook, preferring CUDA,
# then Apple Silicon (MPS), and falling back to the CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

class Model:
    """Wrapper around a Hugging Face causal language model and its tokenizer.

    The tokenizer and weights are loaded once in ``__init__``; ``generate``
    turns a single prompt string into the decoded continuation.
    """

    def __init__(self, model_name):
        """Load tokenizer and weights for ``model_name`` and move the model to ``device``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.model_name = model_name

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Fall back to the EOS token when the tokenizer defines no pad token,
        # so generate() always has a pad_token_id to pass along.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Half precision on the accelerator backends, full precision on CPU.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if device in ["cuda", "mps"] else torch.float32,
            low_cpu_mem_usage=True,
        )
        self.model.to(device)

    def generate(self, prompt, max_new_tokens=100, temperature=.1):
        """Generate a continuation of ``prompt`` and return it as text.

        Args:
            prompt: Text passed to the tokenizer as-is.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature. A value of exactly ``0``
                switches to greedy decoding instead of sampling.

        Returns:
            The decoded tokens that follow the prompt, with special tokens
            skipped and surrounding whitespace stripped.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")

        # Follow the model's own device rather than a module-level global so
        # the inputs always end up where the weights are.
        target_device = self.model.device
        input_ids = inputs["input_ids"].to(target_device)
        attention_mask = inputs.get("attention_mask", None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(target_device)

        # Clear CUDA cache if using GPU
        if torch.device(target_device).type == "cuda":
            torch.cuda.empty_cache()

        generate_kwargs = {
            "max_new_tokens": max_new_tokens,
            "repetition_penalty": 1.4,
            "no_repeat_ngram_size": 3,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "use_cache": True,
        }
        if temperature == 0:
            # Sampling requires a strictly positive temperature; treat 0 as a
            # request for deterministic (greedy) decoding and leave out the
            # sampling-only knobs.
            generate_kwargs["do_sample"] = False
        else:
            generate_kwargs.update(
                {
                    "do_sample": True,
                    "temperature": temperature,
                    "top_p": 0.95,
                    "top_k": 50,
                }
            )
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask

        # The generation call itself must run without gradient tracking.
        with torch.no_grad():
            output = self.model.generate(input_ids, **generate_kwargs)

        # The output holds prompt + continuation; keep only the continuation.
        input_length = input_ids.shape[1]
        response = self.tokenizer.decode(output[0, input_length:], skip_special_tokens=True).strip()
        return response


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Load the tokenizer and weights for the instruction-tuned Llama checkpoint.
m = Model("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Sample a short completion; `r` holds the decoded continuation text.
r = m.generate(
    "Write a poem about New York City",
    max_new_tokens=100,
    temperature=0.1,
)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [5]:
# Notebook-level generation settings (the same values used in the
# m.generate(...) call).
prompt = "Write a poem about New York City"
temperature = 0.1
max_new_tokens = 100

In [8]:
# Show the generated text. `r` is the value returned by m.generate(...);
# `response` only exists as a local inside Model.generate, so reading it at
# notebook scope fails on a fresh kernel.
r

"The city that never sleeps, they say,\nA place where dreams are made and lost each day.\nSkyscrapers pierce the urban sky,\nConcrete jungle stretching far and high.\n\nIn Times Square's bright lights I roam,\nWhere billboards flash with endless tone.\nBroadway beckons to my soul,\nTo dance in theaters' grand control.\n\nCentral Park offers peaceful nest,\nGreen oasis amidst steel unrest.\nBrooklyn Bridge spans waters wide,\nConnecting boroughs side by side.\n\nFrom Harlem jazz"

In [9]:
# NOTE(review): no cell visible here assigns `output` at notebook scope; the
# only visible binding is a local variable inside Model.generate. Presumably
# left over from running that method's body inline — confirm, since this cell
# would raise NameError under Restart & Run All.
output

tensor([[128000,   8144,    264,  33894,    922,   1561,   4356,   4409,    198,
            791,   3363,    430,   2646,  72490,     11,    814,   2019,    345,
             32,   2035,   1405,  19226,    527,   1903,    323,   5675,   1855,
           1938,    627,  19847,   1065,  99821,    388,  22710,    346,    279,
          16036,  13180,    345,  84694,  45520,  42949,   3117,    323,   1579,
            382,    644,   8691,  15992,    596,  10107,  13001,    358,  76067,
            345,   9241,   4121,  19826,   8381,    449,  26762,  16630,    627,
          69424,   3195,  92186,   2439,    311,    856,  13836,    345,   1271,
          15612,    304,  44866,      6,   6800,   2585,    382,  44503,   5657,
           6209,  26733,  23634,    345,  20147,  86810,  65904,   9699,  59322,
            627,  27368,  83287,  20467,  45395,  21160,   7029,    345,  64024,
          66841,     82,   3185,    555,   3185,    382,   3915,  83852,  34997]],
       device='mps:0')