In [10]:
# One-command quickstart (Option B: local GGUF via llama.cpp) for TruthTorchLM truthfulness check
# - Runs inference locally using two small models downloaded from Hugging Face; network access is needed only for the initial package/model downloads.
# - Edit CLAIM below and run this single cell.
# - First run will download models and may take a few minutes; subsequent runs are cached.

import sys, subprocess, importlib, importlib.util, inspect

# --- lightweight installer ----------------------------------------------------
def _ensure(pkg_import: str, pip_name: str | None = None):
    try:
        __import__(pkg_import)
        return
    except Exception:
        pass
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name or pkg_import])

# Install-on-demand for every third-party package this cell uses, as
# (import name, pip name or None when both are the same).
# llama_cpp and huggingface_hub serve the local GGUF path; the remaining
# entries are the Transformers stack for the generate_with_truth_value path.
for _import_name, _pip_name in (
    ('llama_cpp', 'llama-cpp-python'),
    ('huggingface_hub', None),
    ('transformers', None),
    ('torch', None),
    ('sentencepiece', None),
    ('accelerate', None),
):
    _ensure(_import_name, _pip_name)

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- pick and download two small GGUF models (<10 GB each) --------------------
# We try a few common repos/filenames and use the first that works for each.
_TINYLLAMA_GGUF_REPO = 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'
_QWEN_GGUF_REPO = 'Qwen/Qwen2.5-3B-Instruct-GGUF'

# Slot A: TinyLlama chat, one entry per quantization, tried in this order.
A_CANDIDATES = [
    (_TINYLLAMA_GGUF_REPO, f'tinyllama-1.1b-chat-v1.0.{_quant}.gguf')
    for _quant in ('Q4_K_M', 'Q5_K_M')
]

# Slot B: Qwen2.5 3B instruct quantizations first ...
B_CANDIDATES = [
    (_QWEN_GGUF_REPO, f'qwen2.5-3b-instruct-{_quant}.gguf')
    for _quant in ('q4_k_m', 'q5_k_m')
]
# ... then phi-2 as a fallback if the Qwen GGUF variant is not available.
B_CANDIDATES.append(('TheBloke/phi-2-GGUF', 'phi-2.Q4_K_M.gguf'))

def _first_available(cands):
    """Download the first candidate file that resolves and return its path.

    Args:
        cands: Iterable of ``(repo_id, filename)`` pairs, tried in order.

    Returns:
        The value returned by ``hf_hub_download`` for the first candidate
        that succeeds (callers use it as a local model path).

    Raises:
        RuntimeError: If no candidate could be downloaded, including when
            ``cands`` is empty. Chained to the last download error, if any.
    """
    # Initialised up front so that an empty candidate list ends in the
    # RuntimeError below rather than an unbound-variable error.
    last_err = None
    for repo_id, filename in cands:
        try:
            path = hf_hub_download(repo_id=repo_id, filename=filename)
        except Exception as e:
            # Any failure just means "try the next candidate".
            last_err = e
            continue
        print(f"Using model: {repo_id} :: {filename}")
        return path
    raise RuntimeError(
        f"Could not download any of the candidate models. Last error: {last_err}"
    ) from last_err

# Slot A is mandatory: a failure here propagates and stops the cell.
path_a = _first_available(A_CANDIDATES)
# Slot B is optional: start from slot A's file and replace it only if one of
# the slot-B candidates can actually be fetched.
path_b = path_a
try:
    path_b = _first_available(B_CANDIDATES)
except Exception:
    print('Second model unavailable; using the first model for both slots (still runs, less diversity).')

# --- minimal llama.cpp wrapper ------------------------------------------------
# Already-constructed Llama objects keyed by model file path, so each file is
# loaded at most once per session.
_LLAMS = {}


def _get_llama(path: str):
    """Return the cached ``Llama`` for ``path``, constructing it on first use."""
    cached = _LLAMS.get(path)
    if cached is not None:
        return cached
    # n_ctx can be tuned; n_gpu_layers>0 offloads on Apple Silicon builds
    fresh = Llama(
        model_path=path,
        n_ctx=4096,
        n_gpu_layers=0,
        logits_all=False,
        verbose=False,
    )
    _LLAMS[path] = fresh
    return fresh


def _gen_llama(prompt: str, path: str, max_new_tokens: int = 256, temperature: float = 0.1) -> str:
    """Complete ``prompt`` with the model stored at ``path``.

    Args:
        prompt: Text handed to the model as-is.
        path: Model file path; the model object is obtained via ``_get_llama``.
        max_new_tokens: Forwarded to the model call as ``max_tokens``.
        temperature: Forwarded to the model call unchanged.

    Returns:
        The ``text`` of the first entry in the response's ``choices``, with
        surrounding whitespace stripped.
    """
    completion = _get_llama(path)(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        stop=["</s>", "###"],
    )
    first_choice = completion['choices'][0]
    return first_choice['text'].strip()

# Two generator callables for the "multi-LLM" check
def gen_a(prompt):
    """Generate text for ``prompt`` with the model at ``path_a``."""
    return _gen_llama(prompt, path_a)


def gen_b(prompt):
    """Generate text for ``prompt`` with the model at ``path_b``."""
    return _gen_llama(prompt, path_b)

# --- pick a small Transformers model for generate_with_truth_value ------------
# Hugging Face model ids tried in order; the first whose tokenizer loads wins.
TF_MODEL_CANDIDATES = [
    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    'Qwen/Qwen2.5-1.5B-Instruct',
    'Qwen/Qwen2.5-0.5B-Instruct',
]
# Filled in below. TF_MODEL may stay None while TF_MODEL_ID is set, in which
# case later code passes the id string instead of a loaded model object.
TF_MODEL_ID = TF_TOKENIZER = TF_MODEL = None
try:
    from transformers import AutoTokenizer as _AutoTokenizer, AutoModelForCausalLM as _AutoModel
    import torch as _torch
    for _mid in TF_MODEL_CANDIDATES:
        try:
            TF_TOKENIZER = _AutoTokenizer.from_pretrained(_mid, use_fast=True)
            TF_MODEL_ID = _mid
            # Tokenizer is available; loading the weights is best-effort.
            try:
                # Half precision when CUDA or (where the attribute exists) MPS reports available.
                # NOTE(review): this block never moves the model to that device — confirm
                # whether float16 is intended while the weights remain where they were loaded.
                _has_accel = _torch.cuda.is_available() or (
                    hasattr(_torch.backends, 'mps') and _torch.backends.mps.is_available()
                )
                _dtype = _torch.float16 if _has_accel else _torch.float32
                TF_MODEL = _AutoModel.from_pretrained(
                    TF_MODEL_ID,
                    dtype=_dtype,
                    low_cpu_mem_usage=True,
                )
                print(f'Using Transformers model (loaded): {TF_MODEL_ID}')
            except Exception:
                TF_MODEL = None
                print(f'Using Transformers model id (lazy load in library): {TF_MODEL_ID}')
            break
        except Exception:
            # Tokenizer could not be loaded for this id; move on to the next one.
            TF_TOKENIZER = None
            continue
except Exception:
    # Import failure (or anything else unexpected): fall back to "nothing selected".
    TF_MODEL_ID = TF_TOKENIZER = TF_MODEL = None

# --- TruthTorchLM integration --------------------------------------------------
# Import TruthTorchLM module
# The package is looked up under its mixed-case name first; if no spec is
# found, the all-lowercase name is imported instead.
if importlib.util.find_spec('TruthTorchLM'):
    modname = 'TruthTorchLM'
else:
    modname = 'truthtorchlm'
ttlm = importlib.import_module(modname)

# Claim to evaluate (edit as you wish)
CLAIM = 'The capital city of Washington State is Seattle.'

# If the package exposes a callable attribute named `generation`, replace it so
# that internal generation calls are answered by the first local model.
if callable(getattr(ttlm, 'generation', None)):
    def _generation_local(prompt: str, *args, **kwargs):
        """Answer ``prompt`` with ``gen_a``; extra arguments are ignored."""
        return gen_a(prompt)

    ttlm.generation = _generation_local

# Locate a single-claim truth evaluation function first
# Prefer generator-based single-call functions for offline/local use
# Generator-style entry points: single call, suited to offline/local use.
GEN_TRUTH_CANDIDATES = [
    'generate_with_truth_value',
    'generate_with_truthfulness',
    'long_form_generation_with_truth_value',
]
# evaluate_* style entry points that may need a dataset/model.
EVAL_CANDIDATES = [
    'evaluate_truth_method',
    'evaluate_truthfulness',
    'evaluate_truth',
    'truth_evaluate',
]

eval_fn = None
used_name = None
# One pass over both lists: generator-style names are listed first and
# therefore win; the evaluate_* names act only as a fallback.
for _candidate_name in GEN_TRUTH_CANDIDATES + EVAL_CANDIDATES:
    _candidate = getattr(ttlm, _candidate_name, None)
    if callable(_candidate):
        eval_fn, used_name = _candidate, _candidate_name
        break

if eval_fn is None:
    # Nothing usable was found: list attribute names that look related so the
    # user can see what the installed package offers.
    _hints = ('truth', 'check', 'verify', 'generate')
    near = [
        _attr
        for _attr in dir(ttlm)
        if any(_hint in _attr.lower() for _hint in _hints)
    ]
    print('Could not locate a single-claim truth evaluation function in TruthTorchLM.')
    print('Package exposes similar attributes:', near)
else:
    # Keyword arguments are assembled later from the parameter names of the
    # chosen function; start empty and introspect the signature.
    kwargs = {}
    try:
        sig = inspect.signature(eval_fn)
    except Exception:
        # No introspectable signature: continue without one.
        sig = None
    print(f'Using function: {used_name} with signature: {sig}')

    # Select valid truth method objects (not booleans or enum names)
    def _select_truth_methods(mod):
        """Return up to two truth-method objects found in ``mod.truth_methods``.

        Only dict values or list/tuple items are considered, and only those
        exposing at least one ``REQUIRES_SAMPLED_*`` attribute are kept.
        Prints a warning when nothing qualifies.
        """
        flag_names = (
            'REQUIRES_SAMPLED_TEXT',
            'REQUIRES_SAMPLED_LOGITS',
            'REQUIRES_SAMPLED_LOGPROBS',
            'REQUIRES_SAMPLED_ATTENTIONS',
            'REQUIRES_SAMPLED_ACTIVATIONS',
        )
        registry = getattr(mod, 'truth_methods', None)
        if isinstance(registry, dict):
            pool = list(registry.values())
        elif isinstance(registry, (list, tuple)):
            pool = list(registry)
        else:
            # NOTE(review): any other kind of attribute (e.g. a submodule) yields
            # no candidates here — confirm what the installed package exposes.
            pool = []
        chosen = [
            item for item in pool
            if any(hasattr(item, flag) for flag in flag_names)
        ]
        if not chosen:
            print('Warning: could not locate valid truth method objects in ttlm.truth_methods; proceeding with none.')
        return chosen[:2]

    if sig:
        # Fill `kwargs` by matching each parameter NAME of the chosen function
        # against known patterns. Parameters that match nothing are left out so
        # the function's own defaults apply.
        tm_list = _select_truth_methods(ttlm)
        sys_prompt = getattr(ttlm, 'GOOGLE_CHECK_QUERY_SYSTEM_PROMPT', None)
        # Build a simple 2-message chat compatible with many truth scorers
        msgs = [
            {'role': 'system', 'content': sys_prompt or 'You are a helpful, truthful assistant.'},
            {'role': 'user', 'content': f'Determine if the following claim is true or false and explain briefly: {CLAIM}'},
        ]
        dataset_like = ('dataset', 'data', 'claims', 'questions', 'texts', 'samples')
        variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
        for pname, param in sig.parameters.items():
            # *args / **kwargs collectors are not real keyword names; passing
            # their name as a keyword would inject a bogus argument.
            if param.kind in variadic:
                continue
            p = pname.lower()
            # Required chat messages for generate_with_truth_value
            if 'messages' == p:
                kwargs[pname] = msgs
            # Dataset-like argument (list of claims). Checked before the
            # singular names: 'claims'/'questions' contain 'claim'/'question'
            # and would otherwise never reach this branch.
            elif any(k in p for k in dataset_like):
                kwargs[pname] = [CLAIM]
            # Single-claim direct argument
            elif any(k in p for k in ('claim', 'statement')):
                kwargs[pname] = CLAIM
            # Question field commonly used by generate_with_truth_value
            elif 'question' in p:
                kwargs[pname] = CLAIM
            # Method selectors (plural): pass the selected list, if any
            elif 'truth_methods' in p or 'methods' == p:
                if tm_list:
                    kwargs[pname] = tm_list
            # Method selector (singular): pass the first selected method. When
            # none was selected the later branches still get a chance to match.
            elif 'method' in p and tm_list:
                kwargs[pname] = tm_list[0]
            elif 'tokenizer' in p:
                # Prefer the loaded tokenizer; None lets the library resolve one
                kwargs[pname] = globals().get('TF_TOKENIZER')
            # Model/generator hooks
            elif any(k in p for k in ('generator', 'callable')):
                kwargs[pname] = gen_a
            elif 'model' in p or 'llm' in p:
                # For generator-style function, pass HF model object if loaded, else model id
                if used_name in GEN_TRUTH_CANDIDATES:
                    loaded_model = globals().get('TF_MODEL')
                    kwargs[pname] = loaded_model if loaded_model is not None else TF_MODEL_ID
                else:
                    kwargs[pname] = gen_a
            # Avoid passing provider model strings since we are offline; rely on monkey-patched generation

    # Helper to normalize the result
    def _extract_score(obj):
        """Best-effort search for a numeric truth score inside ``obj``.

        Search order:
          1. A bare number is returned as ``float``.
          2. In a dict, well-known score keys are tried first; their value may
             be a number, a numeric string, or a container that is searched
             recursively.
          3. Otherwise dict values / list or tuple items are searched in order,
             depth first. Boolean items are skipped in this generic scan so
             that an unrelated flag is not reported as a score of 1.0 or 0.0.

        Returns:
            The first score found as ``float``, or ``None`` if there is none.
        """
        # 'normalized_truth_values' matches the key seen in the
        # generate_with_truth_value result — NOTE(review): confirm the exact
        # spelling against the installed library version.
        preferred_keys = (
            'truth_score', 'truth', 'score', 'truthfulness', 'truth_value',
            'normalized_truth_values',
        )

        def _as_float(value):
            # Containers and non-numeric strings are simply "not a number".
            try:
                return float(value)
            except (TypeError, ValueError, OverflowError, RuntimeError):
                return None

        try:
            if isinstance(obj, (int, float)):
                return _as_float(obj)
            if isinstance(obj, dict):
                for key in preferred_keys:
                    if key not in obj:
                        continue
                    direct = _as_float(obj[key])
                    if direct is not None:
                        return direct
                    nested = _extract_score(obj[key])
                    if nested is not None:
                        return nested
                children = obj.values()
            elif isinstance(obj, (list, tuple)):
                children = obj
            else:
                return None
            for child in children:
                if isinstance(child, bool):
                    continue
                found = _extract_score(child)
                if found is not None:
                    return found
        except RecursionError:
            # Self-referential or extremely deep structure: give up quietly.
            return None
        return None

    # Execute: call the chosen function, trying progressively simpler calling
    # conventions. `used_name_effective` records which function actually
    # produced `result`; `call_error` keeps the most recent swallowed error.
    used_name_effective = used_name
    result = None
    call_error = None
    if used_name == 'evaluate_truth_method':
        # The calling convention is not known up front, so try a grid of
        # dataset shapes x truth-method shapes x model forms and keep the
        # first combination that does not raise.
        claim_datasets = (
            [CLAIM],
            [{'claim': CLAIM}],
            [{'question': CLAIM}],
            [{'text': CLAIM}],
        )
        # Use the selected truth-method objects directly.
        tm_list = _select_truth_methods(ttlm)
        tm_obj = getattr(ttlm, 'truth_methods', None)
        tm_values = []
        if tm_list:
            tm_values.append(tm_list)
        if tm_obj is not None:
            tm_values.append(tm_obj)
        if not tm_values:
            tm_values.append(tm_list)
        # Different model forms: string label, callable, and None
        model_values = ('local-gguf', gen_a, None)
        patterns = [
            {'dataset': ds, 'truth_methods': tm, 'model': mdl}
            for ds in claim_datasets
            for tm in tm_values
            for mdl in model_values
        ]
        for kw in patterns:
            try:
                result = eval_fn(**kw)
                break
            except Exception as e:
                call_error = e
                continue
    if result is None:
        try:
            result = eval_fn(**kwargs) if kwargs else eval_fn(CLAIM)
        except TypeError:
            # Signature mismatch: retry with the claim as the only positional input
            try:
                result = eval_fn(CLAIM)
            except Exception as e:
                if used_name in GEN_TRUTH_CANDIDATES:
                    # No further fallback
                    raise
                call_error = e
                # Try a generator truth function if available
                for n in GEN_TRUTH_CANDIDATES:
                    fn = getattr(ttlm, n, None)
                    if not callable(fn):
                        continue
                    # Build minimal kwargs for the generator function
                    try:
                        sig2 = inspect.signature(fn)
                    except (TypeError, ValueError):
                        sig2 = None
                    kwargs2 = {}
                    if sig2:
                        for pname in sig2.parameters:
                            p = pname.lower()
                            if any(k in p for k in ['prompt', 'question', 'text', 'instruction', 'claim', 'statement']):
                                kwargs2[pname] = CLAIM
                            elif any(k in p for k in ['generator', 'callable']):
                                kwargs2[pname] = gen_a
                    try:
                        result = fn(**kwargs2) if kwargs2 else fn(CLAIM)
                    except Exception as e2:
                        call_error = e2
                        continue
                    # Only report a fallback function once it has succeeded.
                    used_name_effective = n
                    break
    if result is None and call_error is not None:
        # Surface the reason instead of silently ending up with no result.
        print(f'All call attempts failed; last error: {call_error!r}')

    # Normalize and print: reduce the raw result to one score (if any), map it
    # to a label, and show both next to the raw output.
    score = _extract_score(result)
    label = None
    if isinstance(score, (int, float)):
        # Heuristic mapping: >=0.5 => true, <0.5 => false
        label = 'true' if score >= 0.5 else 'false'
    print(f'Used function: {used_name_effective}')
    print('Claim:', CLAIM)
    if score is not None:
        print('Truth score:', score, f'-> label: {label}')
    else:
        print('Truth score: (not found in output)')
    print('Raw result:')
    print(result)

    # Soft smoke check for this specific claim (expecting false).
    # Written as an explicit branch rather than `assert`: assert statements
    # are removed when Python runs with -O, which would make the "passed"
    # line print regardless of the score.
    # NOTE(review): the expectation is tied to the default CLAIM; if CLAIM is
    # edited to a true statement this warning no longer means anything.
    if score is not None:
        if score < 0.5:
            print('Smoke test passed: score indicates false as expected.')
        else:
            print('Smoke test warning:', f'Expected a false-ish score (<0.5) for this false claim, got {score}')


Using model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF :: tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
Using model: Qwen/Qwen2.5-3B-Instruct-GGUF :: qwen2.5-3b-instruct-q4_k_m.gguf
Using Transformers model (loaded): TinyLlama/TinyLlama-1.1B-Chat-v1.0
Using function: generate_with_truth_value with signature: (model: Union[transformers.modeling_utils.PreTrainedModel, str], messages: list, question: str = None, truth_methods: list = [], tokenizer: Union[transformers.tokenization_utils.PreTrainedTokenizer, transformers.tokenization_utils_fast.PreTrainedTokenizerFast] = None, generation_seed=None, batch_generation=True, add_generation_prompt=True, continue_final_message=False, context: str = '', **kwargs) -> dict
Used function: generate_with_truth_value
Claim: The capital city of Washington State is Seattle.
Truth score: (not found in output)
Raw result:
{'generated_text': 'The claim "The capital city of Washington State is Seattle" is true. Seattle is the capital city of Washington State.', 'normaliz