### python_39_env

In [None]:
import os
import json
import logging
from threading import Lock
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
import torch

# Model identifiers passed to the transformers ``from_pretrained`` loaders
CUSTOM_MODEL = "hyonbokan/BGPStream13-10k-cutoff-1024-max-2048"
LLAMA3_8B_INSTRUCT = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Emit INFO-and-above records and obtain this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Lazily populated by load_model(); model_lock guards the one-time load
model_lock = Lock()
model = tokenizer = None


def load_model():
    """Load the causal LM and its tokenizer once and return the cached pair.

    The first successful call loads ``CUSTOM_MODEL`` (authenticating with the
    ``HF_TOKEN`` environment variable when it is set), configures the
    tokenizer to pad with the EOS token and to pad/truncate on the left, and
    stores both objects in the module-level ``model`` / ``tokenizer`` globals.
    Later calls return the cached objects without reloading.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        Exception: whatever the underlying ``from_pretrained`` calls raise.
            The error is logged with its traceback and re-raised; the globals
            are not modified, so a later call retries from a clean state.
    """
    global model, tokenizer
    if model is None or tokenizer is None:
        with model_lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting to acquire it.
            if model is None or tokenizer is None:
                try:
                    model_id = CUSTOM_MODEL
                    hf_auth = os.environ.get('HF_TOKEN')

                    # ``token`` replaces the deprecated ``use_auth_token``.
                    model_config = AutoConfig.from_pretrained(
                        model_id,
                        token=hf_auth
                    )
                    # NOTE(review): trust_remote_code=True allows code shipped
                    # with the model repository to run locally -- confirm the
                    # repository is trusted.
                    loaded_model = AutoModelForCausalLM.from_pretrained(
                        model_id,
                        trust_remote_code=True,
                        config=model_config,
                        device_map='auto',
                        token=hf_auth
                    )
                    loaded_tokenizer = AutoTokenizer.from_pretrained(
                        model_id,
                        token=hf_auth
                    )

                    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
                    loaded_tokenizer.padding_side = "left"
                    loaded_tokenizer.truncation_side = "left"
                except Exception as e:
                    logger.exception("Failed to load the model: %s", e)
                    raise

                # Publish only after both objects are fully configured so the
                # lock-free check above never hands out a half-initialised
                # tokenizer, and a failed load leaves no partial state behind.
                tokenizer = loaded_tokenizer
                model = loaded_model
                logger.info("Model loaded successfully")
    return model, tokenizer

def generate_llm_response(query, **generation_overrides):
    """Generate a completion for *query* and print the decoded text.

    The query is tokenized (truncated to at most 1500 tokens), moved to the
    model's device, and passed to ``model.generate`` with sampling settings
    that can be adjusted per call.

    Args:
        query: Prompt text handed to the tokenizer.
        **generation_overrides: Optional keyword arguments merged over the
            default generation settings before calling ``model.generate``,
            e.g. ``temperature=0.7, repetition_penalty=1.2``.

    Returns:
        None. The decoded text is printed. Any exception raised while loading
        the model or generating is logged with its traceback and suppressed.
    """
    logger.info("User query: %s", query)
    try:
        # Ensure model and tokenizer are loaded (cached after the first call)
        model, tokenizer = load_model()

        # Tokenize the input query
        inputs = tokenizer(
            query,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=1500
        )

        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)

        # Default generation settings - best so far
        generation_kwargs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=712,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
        # Caller-supplied settings win over the defaults above
        generation_kwargs.update(generation_overrides)

        # Generate output
        generated_ids = model.generate(**generation_kwargs)

        # Decode and print the first returned sequence
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print("Generated text:")
        print(generated_text)

    except Exception as e:
        # Best-effort: report the failure (with traceback) instead of raising
        logger.exception("Error generating LLM response: %s", e)

# Load the model once at startup so later generate_llm_response() calls reuse
# the cached objects. As the final expression of the cell, the returned
# (model, tokenizer) tuple is also echoed in the cell output.
load_model()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:__main__:Model loaded successfully


(LlamaForCausalLM(
   (model): LlamaModel(
     (embed_tokens): Embedding(32000, 5120)
     (layers): ModuleList(
       (0-39): 40 x LlamaDecoderLayer(
         (self_attn): LlamaSdpaAttention(
           (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
           (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
           (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
           (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
           (rotary_emb): LlamaRotaryEmbedding()
         )
         (mlp): LlamaMLP(
           (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
           (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
           (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
           (act_fn): SiLU()
         )
         (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
         (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
 

In [None]:
# Example prompt: ask the model for a pybgpstream script that collects BGP
# update messages; generate_llm_response prints the decoded output.
generate_llm_response("Generate a Python script using `pybgpstream` to collect BGP update messages from route collectors. The script should retrieve updates from all available route collectors for a specific 10-minute time range. The output should display the timestamp, collector, and raw update message. The code must be written inside a code block using ```[code]```.")