# Prototype: LLM Tone Analysis

## Imports

In [8]:
# Util
import os
from loguru import logger

# AI
from datasets import Dataset, load_dataset
from openai import OpenAI
from lm_eval import simple_evaluate

## LLM

In [2]:
# Identifier of the hosted model; passed as `model=` on every chat-completion request
model = "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507"

In [3]:
# Credentials and endpoint come from the environment, never from the notebook.
# Each value is None when its variable is unset.
api_key, base_url = os.getenv("OPENAI_API_KEY"), os.getenv("OPENAI_API_BASE")

In [4]:
# OpenAI-SDK client bound to the endpoint and key configured above
fireworks_client = OpenAI(base_url=base_url, api_key=api_key)

In [5]:
def call_llm(message: str) -> str:
    """Send one user message to the chat model and return the reply text.

    Args:
        message: Text used as the content of the single ``user`` turn.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    # Single-turn conversation: only the caller's message, no system prompt
    chat_turns = [{"role": "user", "content": message}]

    completion = fireworks_client.chat.completions.create(
        model=model,
        messages=chat_turns,
    )

    # Only the first returned choice is used
    return completion.choices[0].message.content

In [6]:
# Smoke test: send one prompt through `call_llm` and print the returned text
print(call_llm("What's up? Tell me the result of 1+1."))

Okay, the user asked "What's up? Tell me the result of 1+1." Let me break this down.

First, "What's up?" is a casual greeting, so they're probably just being friendly or testing if I'm responsive. Then they want the sum of 1+1. That's really basic math, so they might be checking if I can handle simple calculations or just making small talk.

Hmm, why would someone ask 1+1 in a chat? Maybe they're new to using AI and want to verify if the assistant works. Or perhaps they're just messing around to see the reaction. Either way, it's a straightforward request.

I should keep it simple and friendly. No need for complex explanations since it's a math question. Just give the answer clearly and add a bit of warmth to match their casual tone. 

Wait—should I consider any possible tricks? Like, in some contexts 1+1 could mean something else (e.g., binary or wordplay), but they didn't hint at that. The safest bet is standard arithmetic. 

Also, they said "result," so I'll just state "2" without 

## LM-Eval
Using lm-eval (https://github.com/EleutherAI/lm-evaluation-harness) for evaluation. It supports many benchmarks and has prompting and post-processing code built-in.

In [15]:
from pprint import pprint
from lm_eval import simple_evaluate
import os

# Fail fast if the key is missing. The key is deliberately NOT interpolated
# into `model_args`: lm-eval logs that string verbatim, which writes the secret
# into the saved cell output. The `local-completions` backend picks the key up
# from the OPENAI_API_KEY environment variable on its own.
api_key = os.environ["OPENAI_API_KEY"]
# NOTE(review): this rebinds `base_url` to the raw completions endpoint, which
# differs from the value read from OPENAI_API_BASE for the OpenAI client.
base_url = "https://api.fireworks.ai/inference/v1/completions"
model = "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507"

results = simple_evaluate(
    model="local-completions",
    model_args=(
        f"model={model},"
        f"base_url={base_url},"
        "temperature=0,"
        # tell lm-eval which *HF* tokenizer to load locally
        # NOTE(review): this is a Qwen2.5 tokenizer while the served model is
        # Qwen3 — confirm it is an acceptable stand-in.
        "tokenizer=Qwen/Qwen2.5-7B-Instruct,"
        "tokenizer_backend=huggingface,"
        "trust_remote_code=True,"
        # QoL for API backends
        "max_retries=3,tokenized_requests=False,"
    ),
    tasks=["mmlu"],
    num_fewshot=5,
    batch_size=1,
    # Tiny sample for a quick end-to-end check; scores from this run are not
    # meaningful as benchmark numbers.
    limit=1,
    # IMPORTANT: do NOT apply a chat template for MMLU —
    # `apply_chat_template` is intentionally left unset (i.e. False).
)

pprint(results["results"])


pretrained=model=accounts/fireworks/models/qwen3-30b-a3b-thinking-2507,base_url=https://api.fireworks.ai/inference/v1/completions,api_key=[REDACTED],temperature=0,tokenizer=Qwen/Qwen2.5-7B-Instruct,tokenizer_backend=huggingface,trust_remote_code=True,max_retries=3,tokenized_requests=False,
        appears to be an instruct or chat variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally
        `fewshot_as_multiturn`).
Overwriting default num_fewshot of mmlu_elementary_mathematics from None to 5
Overwriting default num_fewshot of mmlu_college_biology from None to 5
Overwriting default num_fewshot of mmlu_computer_security from None to 5
Overwriting default num_fewshot of mmlu_high_school_statistics from None to 5
Overwriting default num_fewshot of mmlu_college_physics from None to 5
Overwriting default num_fewshot of mmlu_high_school_mathematics from None to 5
Overwriting default num_fewshot of mmlu_electrical_engineering from None 

{'mmlu': {'acc,none': 0.9298245614035088,
          'acc_stderr,none': 'N/A',
          'alias': 'mmlu'},
 'mmlu_abstract_algebra': {'acc,none': 1.0,
                           'acc_stderr,none': 'N/A',
                           'alias': '  - abstract_algebra'},
 'mmlu_anatomy': {'acc,none': 1.0,
                  'acc_stderr,none': 'N/A',
                  'alias': '  - anatomy'},
 'mmlu_astronomy': {'acc,none': 1.0,
                    'acc_stderr,none': 'N/A',
                    'alias': '  - astronomy'},
 'mmlu_business_ethics': {'acc,none': 1.0,
                          'acc_stderr,none': 'N/A',
                          'alias': '  - business_ethics'},
 'mmlu_clinical_knowledge': {'acc,none': 1.0,
                             'acc_stderr,none': 'N/A',
                             'alias': '  - clinical_knowledge'},
 'mmlu_college_biology': {'acc,none': 1.0,
                          'acc_stderr,none': 'N/A',
                          'alias': '  - college_biology'},
 'mmlu_col

## Benchmark

In [None]:
# Dataset identifier handed to `load_dataset` below
mmlu_pro = "TIGER-Lab/MMLU-Pro"

In [None]:
# Load the dataset named by `mmlu_pro`; the result is indexed by split name below
d = load_dataset(mmlu_pro)

In [None]:
# Inspect a single record (index 60) from the "validation" split
d["validation"][60]