# Cómo usar LLMs

In [1]:
# Optional: set the key directly for this session instead of loading it from a .env file.
# Leave this commented out and never commit a real key with the notebook.
# import os
# os.environ["NVIDIA_API_KEY"] = "nvapi-..."

[NVIDIA API key](https://build.nvidia.com)

In [16]:
import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv and confirm when it reports success.
dotenv_loaded = load_dotenv()
if dotenv_loaded:
    print("Dotenv loaded successfully")

Dotenv loaded successfully


## A pedal

Usando la librería `requests`

In [17]:
import requests

# Models endpoint to query (swap in the commented OpenAI URL/key to target OpenAI instead)
# invoke_url = "https://api.openai.com/v1/models"
invoke_url = "https://integrate.api.nvidia.com/v1/models"

headers = {
    "content-type": "application/json",
    "Authorization": f"Bearer {os.environ.get('NVIDIA_API_KEY')}",
    # "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(invoke_url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx (e.g. a missing or invalid key) instead of silently
# printing an empty model list.
response.raise_for_status()

print("Available Models:")
for model_entry in response.json().get("data", []):
    print(" -", model_entry.get("id"))


Available Models:
 - 01-ai/yi-large
 - abacusai/dracarys-llama-3.1-70b-instruct
 - adept/fuyu-8b
 - ai21labs/jamba-1.5-large-instruct
 - ai21labs/jamba-1.5-mini-instruct
 - aisingapore/sea-lion-7b-instruct
 - baai/bge-m3
 - baichuan-inc/baichuan2-13b-chat
 - bigcode/starcoder2-15b
 - bigcode/starcoder2-7b
 - bytedance/seed-oss-36b-instruct
 - databricks/dbrx-instruct
 - deepseek-ai/deepseek-coder-6.7b-instruct
 - deepseek-ai/deepseek-r1
 - deepseek-ai/deepseek-r1-0528
 - deepseek-ai/deepseek-r1-distill-llama-8b
 - deepseek-ai/deepseek-r1-distill-qwen-14b
 - deepseek-ai/deepseek-r1-distill-qwen-32b
 - deepseek-ai/deepseek-r1-distill-qwen-7b
 - deepseek-ai/deepseek-v3.1
 - deepseek-ai/deepseek-v3.1-terminus
 - google/codegemma-1.1-7b
 - google/codegemma-7b
 - google/deplot
 - google/gemma-2-27b-it
 - google/gemma-2-2b-it
 - google/gemma-2-9b-it
 - google/gemma-2b
 - google/gemma-3-12b-it
 - google/gemma-3-1b-it
 - google/gemma-3-27b-it
 - google/gemma-3-4b-it
 - google/gemma-3n-e2b-it
 -

In [19]:
## Chat-completions endpoint that the request payload below is meant for
invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions"

## The OpenAI variant is nearly identical (it additionally needs `from getpass import getpass`)
# if not os.environ.get("OPENAI_API_KEY", "").startswith("sk-"):
#     os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
# invoke_url = "https://api.openai.com/v1/chat/completions"

## Request headers: accepted response format, body encoding, and bearer-token auth
headers = {
    "accept": "text/event-stream",
    "content-type": "application/json",
    "Authorization": f"Bearer {os.getenv('NVIDIA_API_KEY')}",
    # "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}

## Request body: which model to call, the conversation so far, and sampling settings
user_turn = {"role": "user", "content": "Tell me hello in Japanese"}
payload = dict(
    model="meta/llama-3.3-70b-instruct",
    messages=[user_turn],
    temperature=0.5,
    top_p=1,
    max_tokens=1024,
    stream=True,
)

In [20]:
import requests
import json

## POST the headers and payload to the endpoint. stream=True lets the body be
## consumed incrementally through response.iter_lines().
response = requests.post(invoke_url, headers=headers, json=payload, stream=True)

## Surface HTTP errors (4xx/5xx) together with whatever detail the server sent back
try:
    response.raise_for_status()
except requests.HTTPError:
    ## An error body is not guaranteed to be JSON; fall back to the raw text so a
    ## decoding failure never masks the original HTTPError.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    raise  ## bare raise re-raises the original exception unchanged

## Custom utility to make life a bit easier
def get_stream_token(entry: bytes) -> str:
    """Extract ['choices'][0]['delta']['content'] from one line of the byte stream.

    Args:
        entry: A single raw line from the streamed response, e.g.
            b'data: {"choices":[{"delta":{"content":"Hi"}}]}'.

    Returns:
        The token text carried by the line, or "" when the line carries none:
        empty lines, lines that are not `data:` events, payloads that are not
        JSON objects (such as `data: [DONE]`), and chunks whose `choices`,
        `delta` or `content` is missing, empty or null.
    """
    if not entry:
        return ""
    text = entry.decode('utf-8')

    ## Anything that is not a data event (e.g. ": keep-alive" comments) has no token
    if not text.startswith('data:'):
        return ""
    try:
        event = json.loads(text[len('data:'):])  ## json.loads tolerates the leading space
    except ValueError:
        return ""
    if not isinstance(event, dict):
        return ""

    ## `choices` may be absent or an empty list; `delta` may be null
    choices = event.get('choices') or [{}]
    delta = (choices[0] or {}).get('delta') or {}
    return delta.get('content') or ""

## Once the POST has succeeded, the response body can be consumed line by line
for sse_line in response.iter_lines():

    ## Raw view (no processing): data: {"id":"...", ... "choices":[{"index":0,"delta":{"role":"assistant","content":""}...}...
    # if sse_line: print(sse_line.decode("utf-8"))

    ## Processed view: print each token right after the previous one, without newlines
    token_text = get_stream_token(sse_line)
    print(token_text, end="")

Konnichiwa! (Hello!)

## Usando clientes como OpenAI

In [21]:
## Using General OpenAI Client
from openai import OpenAI

## Point the generic OpenAI client at the NVIDIA endpoint
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY", ""),
)

completion = client.chat.completions.create(
    model="meta/llama-3.3-70b-instruct",
    # model="gpt-4-turbo-2024-04-09",
    messages=[{"role": "user", "content": "Tell me hello in Japanese"}],
    temperature=1,
    top_p=1,
    max_tokens=1024,
    stream=True,
)

## Streaming with Generator: Results come out as they're generated
for chunk in completion:
    ## A chunk may arrive with an empty `choices` list; indexing [0] on it would
    ## raise IndexError, so skip such chunks.
    if not chunk.choices:
        continue
    token = chunk.choices[0].delta.content
    if token is not None:
        print(token, end="")

(Konnichiwa) - This is a formal way of saying "hello" in Japanese. 

There are other ways to say hello in Japanese depending on the time of day:
- (Ohayou) - Good morning
- (Konbanwa) - Good evening

However, (Konnichiwa) is a general greeting that can be used at any time of day.

In [24]:
## Non-streaming: the server replies once, after the whole completion is ready
completion = client.chat.completions.create(
    model="meta/llama-3.3-70b-instruct",
    # model="gpt-4-turbo-2024-04-09",
    messages=[{"role": "user", "content": "Tell me hello in Japanese"}],
    temperature=1,
    top_p=1,
    max_tokens=1024,
    stream=False,
)

## The generated text is read from the first choice's message
reply_text = completion.choices[0].message.content
print(reply_text, end="")

(in Japanese), which is pronounced as "Konnichiwa". However, there are other ways to say hello in Japanese depending on the time of day:

* (Ohayou) - Good morning
* (Konnichiwa) - Good day (or hello)
* (Konbanwa) - Good evening

So, I'll say: (Konnichiwa)!

## Usando langchain

In [25]:
## Using ChatNVIDIA
from langchain_nvidia_ai_endpoints import ChatNVIDIA
#from langchain_openai import OpenAI

## Build the LangChain chat model; NVIDIA_API_KEY is pulled from the environment,
## so no key is passed explicitly here.
llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")

## Kept as the cell's final bare expression so the notebook displays the returned message
llm.invoke("Tell me hello in Japanese")

AIMessage(content='(Konnichiwa)\n\nThis is a formal way of saying "hello" in Japanese. If you want to be more casual, you can say:\n\n(Konnichiwa) is used during the day, but if you want to greet someone in the evening, you can say:\n\n(Konbanwa)\n\nAnd if you want to greet someone in the morning, you can say:\n\n(Ohayou)\n\nLet me know if you have any other questions.', additional_kwargs={}, response_metadata={'role': 'assistant', 'content': '(Konnichiwa)\n\nThis is a formal way of saying "hello" in Japanese. If you want to be more casual, you can say:\n\n(Konnichiwa) is used during the day, but if you want to greet someone in the evening, you can say:\n\n(Konbanwa)\n\nAnd if you want to greet someone in the morning, you can say:\n\n(Ohayou)\n\nLet me know if you have any other questions.', 'refusal': None, 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [], 'reasoning_content': None, 'token_usage': {'prompt_tokens': 40, 'total_tokens': 132, 'completion_tokens

In [27]:
from langchain_core.output_parsers import StrOutputParser

## Pipe the model into a string output parser, then print what the chain returns
chain = llm | StrOutputParser()
greeting = chain.invoke("Tell me hello in Japanese")
print(greeting, end="")

Konnichiwa! (Hello!)

In [28]:
## Inspect what the client recorded as its last inputs (URL, headers, JSON body).
## NOTE(review): `_client` is a private attribute, so this may break across library versions.
llm._client.last_inputs

{'url': 'https://integrate.api.nvidia.com/v1/chat/completions',
 'headers': {'Accept': 'application/json',
  'Authorization': 'Bearer **********',
  'User-Agent': 'langchain-nvidia-ai-endpoints',
  'X-BILLING-SOURCE': 'langchain-nvidia-ai-endpoints'},
 'json': {'messages': [{'role': 'user',
    'content': 'Tell me hello in Japanese'}],
  'model': 'meta/llama-3.3-70b-instruct',
  'max_tokens': 1024,
  'stream': False}}

In [29]:
## Show the JSON-decoded body of the response the client recorded last.
## Uncomment the next line instead to look at the response object itself.
## NOTE(review): `_client` is a private attribute, so this may break across library versions.
# llm._client.last_response
llm._client.last_response.json()

{'id': 'chatcmpl-18750fa82054479d9e8789c32ec2ad0a',
 'object': 'chat.completion',
 'created': 1762111077,
 'model': 'meta/llama-3.3-70b-instruct',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'Konnichiwa! (Hello!)',
    'refusal': None,
    'annotations': None,
    'audio': None,
    'function_call': None,
    'tool_calls': [],
    'reasoning_content': None},
   'logprobs': None,
   'finish_reason': 'stop',
   'stop_reason': None,
   'token_ids': None}],
 'service_tier': None,
 'system_fingerprint': None,
 'usage': {'prompt_tokens': 40,
  'total_tokens': 49,
  'completion_tokens': 9,
  'prompt_tokens_details': None},
 'prompt_logprobs': None,
 'prompt_token_ids': None,
 'kv_transfer_params': None}

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

model_list = ChatNVIDIA.get_available_models()

for model_card in model_list:
    model_name = model_card.id

    ## No need to go through EVERY model: keep ids containing "meta/llama",
    ## and leave out any id containing "405b" or "embed"
    is_llama = any(keyword in model_name for keyword in ("meta/llama",))
    is_excluded = any(marker in model_name for marker in ("405b", "embed"))
    if is_excluded or not is_llama:
        continue

    ## NOTE: this rebinds the notebook-level `llm` on every iteration
    llm = ChatNVIDIA(model=model_name)
    print(f"TRIAL: {model_name}")
    try:
        for token in llm.stream("Tell me about yourself! 2 sentences.", max_tokens=100):
            print(token.content, end="")
    except KeyboardInterrupt:
        ## Interrupting the kernel (the stop button) ends the whole loop
        print("Stopped manually")
        break
    except Exception as e:
        ## If some models fail, report it and move on to the next one
        print(f"EXCEPTION: {e}")
    print("\n\n" + "="*84)

TRIAL: meta/llama-4-maverick-17b-128e-instruct
I'm an AI assistant designed to provide helpful and informative responses to your questions and engage in conversation. I don't have a personal identity or emotions, but I'm here to assist you with any topics or tasks you'd like to discuss or accomplish!

TRIAL: meta/llama-guard-4-12b
safe

TRIAL: meta/llama3-8b-instruct
I'm LLaMA, a large language model trained by a team of researcher at Meta AI. I can understand and respond to human input in a conversational manner, capable of generating human-like text based on the input given to me.

TRIAL: meta/llama-3.2-11b-vision-instruct
I am an artificial intelligence language model designed to provide information and answer questions on a wide range of topics. I don't have personal experiences or emotions, but I can assist with tasks, offer suggestions, and engage in conversation to the best of my abilities.

TRIAL: meta/llama-3.3-70b-instruct
I'm an artificial intelligence language model, which 