In [None]:
from transformers import pipeline
import torch

# Local Inference

In [None]:
# model_id = "openai/gpt-oss-20b" # too large
model_id = "unsloth/gpt-oss-20b-GGUF" # GGUF-quantized builds of the model; the loading cell picks the Q3_K_S file from this repo

In [None]:
from llama_cpp import Llama


def check_gpu_usage():
    """Print the name and device-wide memory usage of every visible CUDA GPU.

    Usage is derived from ``torch.cuda.mem_get_info`` (free/total bytes as
    reported by the CUDA driver) instead of ``torch.cuda.memory_allocated``.
    The latter only counts memory allocated through PyTorch, so memory taken
    by llama.cpp when the GGUF model is offloaded to the GPU would never
    show up in the report.

    Prints "No CUDA devices available" when PyTorch cannot see a CUDA device.
    Returns nothing; the report goes to stdout.
    """
    if not torch.cuda.is_available():
        print("No CUDA devices available")
        return

    gib = 1024**3
    for i in range(torch.cuda.device_count()):
        # (free, total) in bytes for the whole device, across all processes.
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory used: {(total_bytes - free_bytes) / gib:.2f} GB")
        print(f"  Memory total: {total_bytes / gib:.2f} GB")

# Baseline reading taken before any model is loaded, so it can be compared
# with the "After loading model" reading printed by the loading cell.
print("Before loading model:")
check_gpu_usage()


In [None]:
# Download the GGUF weights from the Hugging Face Hub and load them with llama.cpp.
# The repository comes from `model_id` (defined in the model-selection cell above)
# instead of repeating the literal, so switching models is a one-line change.
llm = Llama.from_pretrained(
    repo_id=model_id,
    filename="gpt-oss-20b-Q3_K_S.gguf",  # quantization variant to fetch; pick one that fits your VRAM
    verbose=True,  # keep llama.cpp's load log visible to see where layers end up
    n_gpu_layers=-1,  # -1 asks llama.cpp to offload all layers to the GPU
)

# Second reading, to compare against the baseline taken before loading
print("\nAfter loading model:")
check_gpu_usage()



In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the cells
# below can read HF_TOKEN from it.
# NOTE(review): presumably the .env file sits next to the notebook — confirm.
load_dotenv()

True

# Using OpenAI's GPT-OSS via the Hugging Face router

In [10]:
import os
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

# OpenAI-compatible client pointed at the Hugging Face router.
# HF_TOKEN is read with os.environ[...] so a missing token fails right here with
# a KeyError. With os.getenv the value would be None, and the OpenAI client then
# falls back to OPENAI_API_KEY, which would send that key to a third-party host.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# NOTE(review): the ":cerebras" suffix presumably selects the inference provider — confirm.
response = client.chat.completions.create(
    model="openai/gpt-oss-120b:cerebras",
    messages=[{"role": "user", "content": "Tell me a fun fact about the Eiffel Tower."}],
)

# Text of the first (and only requested) completion
print(response.choices[0].message.content)

Here’s a fun one: the Eiffel Tower actually **grows in the summer!**  

Because it’s made of iron, the metal expands when it gets hot. On a scorching July day, the tower can be up to **15 cm (about 6 inches) taller** than its winter height. So the iconic silhouette you see in photos is a little taller on a sunny, warm afternoon than it is when the temperature drops. 🌞🏗️


In [3]:
import os
from openai import OpenAI

# JSON-Schema description of the arguments the model may pass to the tool.
weather_parameters = {
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["location"],
}

# Chat Completions tool format: the function definition is nested under "function".
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": weather_parameters,
        },
    }
]

question = {"role": "user", "content": "What is the weather in Paris in Celsius?"}

# Reuses the `client` created in the previous cell.
response = client.chat.completions.create(
    model="openai/gpt-oss-120b:cerebras",
    messages=[question],
    tools=tools,
    tool_choice="auto",
)

# If the model decides to call the tool, the message carries a tool_calls entry
print(response.choices[0].message)

ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='c5b3d86be', function=Function(arguments='{\n  "location": "Paris, France",\n  "unit": "celsius"\n}', name='get_current_weather'), type='function')], reasoning='User wants current weather in Paris Celsius. Need to call function get_current_weather with location="Paris, France" (maybe just Paris). Use unit "celsius".')


In [6]:
import json
import os

from openai import OpenAI


# HF_TOKEN is read with os.environ[...] so a missing token fails right here with
# a KeyError. With os.getenv the value would be None, and the OpenAI client then
# falls back to OPENAI_API_KEY, which would send that key to a third-party host.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# Ask for output constrained to a JSON object matching the "person" schema
response = client.chat.completions.create(
    model="openai/gpt-oss-120b:cerebras",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant designed to output JSON.",
        },
        {
            "role": "user",
            "content": "Extract the name, city, and profession from the following sentence: 'Amélie is a chef who lives in Paris.'",
        },
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "person",
            "schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "city": {"type": "string"},
                    "profession": {"type": "string"},
                },
                "required": ["name", "city", "profession"],
            },
        },
    },
)

# The message content is a JSON string; parse it into a Python dict
output_json_string = response.choices[0].message.content
parsed_output = json.loads(output_json_string)

print(parsed_output)

{'name': 'Amélie', 'city': 'Paris', 'profession': 'chef'}


In [7]:
import os
from openai import OpenAI


# HF_TOKEN is read with os.environ[...] so a missing token fails right here with
# a KeyError. With os.getenv the value would be None, and the OpenAI client then
# falls back to OPENAI_API_KEY, which would send that key to a third-party host.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# stream=True makes the call return an iterable of typed events
# (e.g. 'response.created') instead of a single Response object
stream = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai",
    input="Tell me a short story about a robot who discovers music.",
    stream=True,
)

# Print each event as it arrives; the raw event reprs are verbose
for event in stream:
    print(event)

ResponseCreatedEvent(response=Response(id='resp_8e050cff626bf57ff9fd2badf9c91c079d64138a1e0cd97e', created_at=1754755082.0, error=None, incomplete_details=None, instructions=None, metadata=None, model='openai/gpt-oss-120b:fireworks-ai', object='response', output=[], parallel_tool_calls=None, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=None, safety_identifier=None, service_tier=None, status='in_progress', text=None, top_logprobs=None, truncation=None, usage=ResponseUsage(input_tokens=0, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=0, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=0), user=None), sequence_number=0, type='response.created')
ResponseInProgressEvent(response=Response(id='resp_8e050cff626bf57ff9fd2badf9c91c079d64138a1e0cd97e', created_at=1754755082.0, error=None, incomplete

In [8]:
from openai import OpenAI
import os

# HF_TOKEN is read with os.environ[...] so a missing token fails right here with
# a KeyError. With os.getenv the value would be None, and the OpenAI client then
# falls back to OPENAI_API_KEY, which would send that key to a third-party host.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# Responses API tool format: name/description/parameters sit at the top level of
# the tool entry, unlike the Chat Completions cell above, which nests them under "function".
tools = [
    {
        "type": "function",
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            # Both arguments are mandatory here, so the model has to choose a unit
            "required": ["location", "unit"],
        },
    }
]

response = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai",
    tools=tools,
    input="What is the weather like in Boston today?",
    tool_choice="auto",
)

# The printed Response lists its output items (reasoning, function call, ...)
print(response)

Response(id='resp_736c167d45d7affc4e52f4e91da83bd19c965e7424dfa12a', created_at=1754755109.0, error=None, incomplete_details=None, instructions=None, metadata=None, model='openai/gpt-oss-120b:fireworks-ai', object='response', output=[ResponseReasoningItem(id='rs_787c941b7fb78fea3c2e11361a13596b3cc2bbb29342fb30', summary=[], type='reasoning', content=[Content(text='The user asks for weather in Boston today. We should call function get_current_weather with location "Boston, MA" and unit maybe "fahrenheit".', type='reasoning_text')], encrypted_content=None, status='in_progress'), ResponseFunctionToolCall(arguments='{\n  "location": "Boston, MA",\n  "unit": "fahrenheit"\n}', call_id='call_3q9fmyD01ish79cB9YW0q3SG', name='get_current_weather', type='function_call', id='fc_5393efe30be37519f537752aab677fcb813f1732fee5297f', status='completed')], parallel_tool_calls=None, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='get_current_weather', parameters={'type': 'object', 'propert

In [9]:
import os
from openai import OpenAI

# HF_TOKEN is read with os.environ[...] so a missing token fails right here with
# a KeyError. With os.getenv the value would be None, and the OpenAI client then
# falls back to OPENAI_API_KEY, which would send that key to a third-party host.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# Register a remote MCP server as a tool for this request.
# NOTE(review): "require_approval": "never" presumably lets tool calls run
# without a confirmation round-trip — confirm before pointing at untrusted servers.
response = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai",
    input="What transport protocols are supported in the 2025-03-26 version of the MCP spec?",
    tools=[
        {
            "type": "mcp",
            "server_label": "deepwiki",
            "server_url": "https://mcp.deepwiki.com/mcp",
            "require_approval": "never",
        },
    ],
)

# The printed Response includes the MCP tool listing among its output items
print(response)

Response(id='resp_579936873ec66c7223d646eec33d38c9379e6d3ede82e445', created_at=1754755136.0, error=None, incomplete_details=None, instructions=None, metadata=None, model='openai/gpt-oss-120b:fireworks-ai', object='response', output=[McpListTools(id='mcpl_3f85f572bb352836662fcd1a3f0ca743f810c2b3bdbcc80b', server_label='deepwiki', tools=[McpListToolsTool(input_schema={'type': 'object', 'properties': {'repoName': {'type': 'string', 'description': 'GitHub repository: owner/repo (e.g. "facebook/react")'}}, 'required': ['repoName'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, name='read_wiki_structure', annotations=None, description='Get a list of documentation topics for a GitHub repository'), McpListToolsTool(input_schema={'type': 'object', 'properties': {'repoName': {'type': 'string', 'description': 'GitHub repository: owner/repo (e.g. "facebook/react")'}}, 'required': ['repoName'], 'additionalProperties': False, '$schema': 'http://json-schema.or