# GLM-4.7-Flash RunPod Endpoint Test

테스트 항목:
1. Health Check
2. Model Info
3. Chat Completions
4. Generate Endpoint (SGLang Native)
5. Tokenizer Test

In [None]:
import os
import requests
from dotenv import load_dotenv

# Load environment variables via python-dotenv (typically from a local .env
# file) before reading the RunPod credentials below.
load_dotenv()

API_KEY = os.getenv("RUNPOD_API_KEY")
ENDPOINT_ID = os.getenv("ENDPOINT_ID")

# Fail fast: os.getenv returns None for an unset variable, which would
# otherwise be interpolated as the literal text "None" into the URL and the
# Authorization header, making every later request fail in a confusing way.
_missing_vars = [
    name
    for name, value in (("RUNPOD_API_KEY", API_KEY), ("ENDPOINT_ID", ENDPOINT_ID))
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# Base URL shared by every request cell in this notebook.
BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"

# Common headers: bearer-token auth plus a JSON request body.
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

print(f"Endpoint: {ENDPOINT_ID}")

## 1. Health Check

In [None]:
# Query the endpoint's /health route and print the HTTP status and JSON body.
# An explicit timeout is passed because requests waits indefinitely by
# default; with it, an unreachable endpoint raises requests.exceptions.Timeout
# instead of hanging the cell.
response = requests.get(f"{BASE_URL}/health", headers=headers, timeout=30)
print(f"Status: {response.status_code}")
print(response.json())

## 2. Model Info

In [None]:
# Ask the worker for its model list by routing an empty request to the
# OpenAI-compatible /v1/models path through the synchronous /runsync endpoint.
payload = {
    "input": {
        "openai_route": "/v1/models",
        "openai_input": {}
    }
}

# timeout=300 matches the other /runsync calls in this notebook; without a
# timeout, requests would block indefinitely if the endpoint never answers.
response = requests.post(f"{BASE_URL}/runsync", headers=headers, json=payload, timeout=300)
print(f"Status: {response.status_code}")
print(response.json())

## 3. Chat Completions

In [None]:
# Conversation sent to the model: a system prompt followed by one user turn.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "안녕! 자기소개 해줘."},
]

# Body of the OpenAI-style chat completion request.
chat_request = {
    "model": "glm-4.7-flash",
    "messages": chat_messages,
    "max_tokens": 256,
    "temperature": 0.7,
}

# Wrap the request in the "input" envelope, naming the route to call.
payload = {
    "input": {
        "openai_route": "/v1/chat/completions",
        "openai_input": chat_request,
    }
}

response = requests.post(
    f"{BASE_URL}/runsync",
    headers=headers,
    json=payload,
    timeout=300,
)
print(f"Status: {response.status_code}")
result = response.json()
print(result)

# When the result carries an "output" containing "choices", print the text of
# the first choice's message.
if "output" in result:
    output = result["output"]
    if "choices" in output:
        print("\n--- Response ---")
        first_choice = output["choices"][0]
        print(first_choice["message"]["content"])

## 4. Generate Endpoint (SGLang Native)

In [None]:
# Sampling settings sent with the /generate request.
generate_sampling = {
    "max_new_tokens": 128,
    "temperature": 0.5,
}

# Raw-text completion request: the prompt is the start of a Python function.
payload = {
    "input": {
        "openai_route": "/generate",
        "openai_input": {
            "text": "def fibonacci(n):",
            "sampling_params": generate_sampling,
        },
    }
}

response = requests.post(
    f"{BASE_URL}/runsync",
    headers=headers,
    json=payload,
    timeout=300,
)
print(f"Status: {response.status_code}")
result = response.json()
print(result)

## 5. Tokenizer Test

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the zai-org/GLM-4.7-Flash repository.
# NOTE(review): trust_remote_code=True lets transformers run tokenizer code
# shipped with the model repo; confirm the repo is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    "zai-org/GLM-4.7-Flash",
    trust_remote_code=True,
)

# Report the two tokenizer attributes of interest.
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Model max length: {tokenizer.model_max_length}")

In [None]:
# Round-trip check: encode a mixed Korean/ASCII sentence, then decode it back.
text = "안녕하세요! GLM-4.7-Flash 테스트입니다."
tokens = tokenizer.encode(text)

# Show the input, the encoded ids, and how many there are.
print(f"Text: {text}")
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Decode last so the earlier lines still print if decoding fails.
print(f"Decoded: {tokenizer.decode(tokens)}")

In [None]:
# Render a two-turn conversation through the tokenizer's chat template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# tokenize=False and add_generation_prompt=True are passed unchanged, so the
# rendered template is printed as-is below.
formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print("Chat template:")
print(formatted)