In [1]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# model_id = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=torch.float16,
    load_in_4bit=True,
    )
model.to(device)



The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


FalconH1ForCausalLM(
  (model): FalconH1Model(
    (embed_tokens): Embedding(65537, 1280, padding_idx=0)
    (layers): ModuleList(
      (0-65): 66 x FalconH1DecoderLayer(
        (feed_forward): FalconH1MLP(
          (gate_proj): Linear(in_features=1280, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1280, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1280, bias=False)
          (act_fn): SiLU()
        )
        (mamba): FalconH1Mixer(
          (act): SiLU()
          (conv1d): Conv1d(2048, 2048, kernel_size=(4,), stride=(1,), padding=(3,), groups=2048)
          (in_proj): Linear(in_features=1280, out_features=3608, bias=False)
          (norm): FalconH1RMSNormGated()
          (out_proj): Linear(in_features=1536, out_features=1280, bias=False)
        )
        (self_attn): FalconH1Attention(
          (q_proj): Linear(in_features=1280, out_features=768, bias=False)
          (k_proj): Linear(in_features=1280, 

In [None]:
# tokenizer.pad_token = tokenizer.eos_token
# model.config.pad_token_id = model.config.eos_token_id


In [3]:
messages = [
    {"role": "user", "content": "Who are you?"},
]

inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=40)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
Who are you?
assistant
I am an artificial intelligence digital assistant created by OpenAI. My purpose is to provide information, answer questions, and assist with a wide range of tasks through natural language processing. I don'


In [4]:
tokenizer.decode(inputs['input_ids'][0])

'<|begin_of_text|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n'

In [5]:
print(tokenizer.chat_template)

{{bos_token}}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "You are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\n<tools>\n" }}
    {%- for tool in tools %}[{{- tool | tojson }}]{%- endfor %}
    {{- "\n</tools>\nFor each function call, return a json object with function name and arguments within <tool_call> </tool_call> tags with the following schema:\n<tool_call>\n{'arguments': <args-dict>, 'name': <function-name>}\n</tool_call>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}{% for message in messages %}{%- if message.role != 'system' %}{{'<|im_start|>' + message['

In [None]:
# torch.cuda.empty_cache()

In [12]:
tools = [
    {
        "name": "get_weather",
        "description": "Get the current weather for a given location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "The location name"}
            },
            "required": ["location"]
        }
    }
]

In [13]:
messages = [
    {"role": "user", "content": "What’s the weather like in Paris?"}
]

In [14]:
prompt = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    tokenize=False,            # important: we want the raw prompt to see/use it
    add_generation_prompt=True
)

In [15]:
prompt

'<|begin_of_text|><|im_start|>system\nYou are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions.\n<tools>\n[{"name": "get_weather", "description": "Get the current weather for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location name"}}, "required": ["location"]}}]\n</tools>\nFor each function call, return a json object with function name and arguments within <tool_call> </tool_call> tags with the following schema:\n<tool_call>\n{\'arguments\': <args-dict>, \'name\': <function-name>}\n</tool_call>\n<|im_start|>user\nWhat’s the weather like in Paris?<|im_end|>\n<|im_start|>assistant\n'

In [16]:
def get_weather(location: str) -> str:
    """
    Get the current weather for a given location.

    Args:
        location: The name of the location to get the weather for.

    Returns:
        A description of the current weather in the specified location.
    """
    weather_prediction = f"The weather in {location} is sunny with low temperatures. \n"
    print(weather_prediction)
    return weather_prediction

In [17]:
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=60)

In [20]:
tokenizer.decode(output[0], skip_special_tokens=True)

'system\nYou are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions.\n<tools>\n[{"name": "get_weather", "description": "Get the current weather for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location name"}}, "required": ["location"]}}]\n</tools>\nFor each function call, return a json object with function name and arguments within   tags with the following schema:\n\n{\'arguments\': <args-dict>, \'name\': <function-name>}\n\nuser\nWhat’s the weather like in Paris?\nassistant\n'

In [6]:
messages=[
    {"role": "user", "content": "What is the capital of France?"},
]

In [7]:
def do_the_thing(messages):

    inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=40)
    
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return text_output

In [8]:
raw = tokenizer.apply_chat_template(messages, tokenize=False)
print(raw)

<|begin_of_text|><|im_start|>user
What is the capital of France?<|im_end|>



In [9]:
output_text = do_the_thing(messages)
print(output_text)

user
What is the capital of France?
assistant
The capital of France is Paris. Known for its rich history, iconic landmarks such as the Eiffel Tower, and vibrant culture, Paris is one of the most famous cities in the world


In [15]:
torch.cuda.empty_cache()

In [10]:
messages=[
    {"role": "user", "content": "What is the weather in London right now?"},
]

In [11]:
output_text = do_the_thing(messages)
print(output_text)

user
What is the weather in London right now?
assistant
As


In [39]:
torch.cuda.empty_cache()

In [12]:
type(tokenizer)

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [16]:
tokenizer.pad_token
tokenizer.eos_token

'<|end_of_text|>'

In [18]:
def get_mood(location: str) -> str:
    """
    Get the current mood of the populace for a given location.

    Args:
        location: The name of the location to get the mood of the local populace.

    Returns:
        A description of the current mood of the local populace in the specified location.
    """
    mood_report = f"The mood in {location} is extremely cranky. \n"
    print(mood_report)
    return mood_report

In [19]:
def get_weather(location: str) -> str:
    """
    Get the current weather for a given location.

    Args:
        location: The name of the location to get the weather for.

    Returns:
        A description of the current weather in the specified location.
    """
    weather_prediction = f"The weather in {location} is sunny with low temperatures. \n"
    print(weather_prediction)
    return weather_prediction

In [19]:
tools_list = [
    {
        "name": "get_weather",
        "description": "Get weather by location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string"},
            },
            "required": ["location"]
        }
    }
]

In [20]:
def weather_agent(messages):
        inputs = tokenizer.apply_chat_template(
                messages,
                tools=[get_weather, get_mood],
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                ).to(model.device)

        with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=40)
                text_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # torch.cuda.empty_cache()

        return text_output

In [21]:
output_text = weather_agent(messages)
print(output_text)

<|begin_of_text|><|im_start|>system
You are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
<tools>
[{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The name of the location to get the weather for."}}, "required": ["location"]}, "return": {"type": "string", "description": "A description of the current weather in the specified location."}}}][{"type": "function", "function": {"name": "get_mood", "description": "Get the current mood of the populace for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The name of the location to get the mood of the local populace."}}, "require

In [37]:
inputs = tokenizer.apply_chat_template(
                messages,
                tools=[get_weather],
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                ).to(model.device)

In [38]:
tokenizer.decode(inputs['input_ids'][0])

'<|user|>\nWhat is the weather in London right now?</s> \n<|assistant|>\n'

In [39]:
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=40)

In [40]:
tokenizer.decode(outputs[0])

'<|user|>\nWhat is the weather in London right now?</s> \n<|assistant|>\nAs of the time of writing, the weather in London is currently cloudy with a chance of rain. The temperature is currently 11°C (52°F) and there is a'

In [43]:
tokenizer.decode(outputs[0], skip_special_tokens=False)

'<|user|>\nWhat is the weather in London right now?</s> \n<|assistant|>\nAs of the time of writing, the weather in London is currently cloudy with a chance of rain. The temperature is currently 11°C (52°F) and there is a'

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

#model_id = "tiiuae/falcon-1b"  # or 1.5B
model_id = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


The capital of France is Paris.

**Answer:** Paris

#### 


In [40]:
get_weather('Tampa')

the weather in Tampa is sunny with low temperatures. 



'the weather in Tampa is sunny with low temperatures. \n'

In [23]:
messages2=[
    {"role": "user", "content": "What is the capital of France?"},
]
weather_agent(messages2)

'<|begin_of_text|><|im_start|>system\nYou are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions.\n<tools>\n[{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The name of the location to get the weather for."}}, "required": ["location"]}, "return": {"type": "string", "description": "A description of the current weather in the specified location."}}}][{"type": "function", "function": {"name": "get_mood", "description": "Get the current mood of the populace for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The name of the location to get the mood of the local populace."}}, "re

In [None]:
# Define tool metadata (not actual function — just schema info)
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
            },
            "required": ["location"],
        },
    }
}]

# User question
messages = [
    {"role": "user", "content": "What is the weather in Tokyo?"},
]

# Apply chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    tools=tools,
)


In [None]:
# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
)

In [None]:
# Decode result
output_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print("MODEL OUTPUT:\n", output_text)