In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import inseq
import torch
import torch.nn.functional as F

In [None]:
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
ATTRIBUTION_METHOD = "integrated_gradients"

In [30]:
model = inseq.load_model(
    MODEL_ID,
    ATTRIBUTION_METHOD,
    device='cpu',
    # Add quantization or other loading args if needed, e.g., for large models:
    # model_kwargs={"load_in_8bit": True} # requires bitsandbytes
)

# Access the underlying Hugging Face model and tokenizer
hf_model = model.model
tokenizer = model.tokenizer
hf_model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [01:36<00:00, 48.44s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [37]:
CONTEXT = "This african guy is very" 
inputs = tokenizer(CONTEXT, return_tensors="pt").to(hf_model.device)

with torch.no_grad():
    outputs = hf_model(**inputs)
    # Logits shape: [batch_size, sequence_length, vocab_size]
    # We need the logits for the *last* token in the input sequence
    next_token_logits = outputs.logits[:, -1, :]
    # Apply softmax to get probabilities (optional, just for info)
    probs = torch.softmax(next_token_logits, dim=-1)

    # Get the top 2 tokens and their logit values (or probabilities)
    top_k_logits, top_k_indices = torch.topk(probs, 5, dim=-1)

generated_token_id = top_k_indices[0, 0].item()
contrast_token_id = top_k_indices[0, 1].item()

generated_token = tokenizer.decode(generated_token_id)
contrast_token = tokenizer.decode(contrast_token_id)

print(f"  - Top predicted token: '{generated_token}' (ID: {generated_token_id})")
print(f"  - Second-best token: '{contrast_token}' (ID: {contrast_token_id})")

  - Top predicted token: 'good' (ID: 1781)
  - Second-best token: 'hand' (ID: 1361)


In [None]:
top_k_indices

In [None]:
tokenizer.decode(521)

In [16]:
attribution_result.show()

Unnamed: 0_level_0,▁dig → ▁b
<s>,0.423
▁Can,0.281
▁you,0.122
▁stop,0.063
▁the,0.033
▁dog,0.041
▁from,0.038
▁dig → ▁b,Unnamed: 1_level_8
contrast_prob_diff,0.827


In [15]:
attribution_result = model.attribute(
    input_texts=CONTEXT,
    true_answer=generated_token,
    attributed_fn="contrast_prob_diff",
    contrast_targets=contrast_token,
    step_scores=["contrast_prob_diff"], # Score comparing prob(target) - prob(contrast)
    generation_args={"max_new_tokens": 1}, # Ensure only the first token is generated/attributed
    internal_batch_size=1 # Consider lowering if you hit memory issues
    # n_steps=50 # Default for Integrated Gradients, adjust if needed
)

attribute_target parameter is set to True, but will be ignored (not an encoder-decoder).
Unused arguments during attribution: {'true_answer': 'b'}
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Attributing with integrated_gradients...: 100%|██████████| 8/8 [08:16<00:00, 496.63s/it]


In [None]:
attribution_result

In [17]:
# attribution_result is an AttributionResult object.
# It contains a list of SequenceAttribution objects (one for each input).
# We only have one input here.
seq_attr = attribution_result.sequence_attributions[0]

# Print the contrastive probability difference score for the step
# This score should be positive if the generated token was indeed more probable
# than the contrast token.
# Check if the score exists before accessing it
if "contrast_prob_diff" in seq_attr.step_scores and len(seq_attr.step_scores["contrast_prob_diff"]) > 0:
    contrast_prob_diff_score = seq_attr.step_scores["contrast_prob_diff"][0] # Index 0 for the first step
    print(f"\nContrastive Probability Difference (P(gen) - P(contrast)) at step 0: {contrast_prob_diff_score:.4f}")
else:
    print("\nContrastive Probability Difference score not found in results.")

# Show the source attribution scores: how much each input token contributed
# to the choice of the generated token *over* the contrast token.
# Positive scores favor the generated token, negative scores favor the contrast token.

# --- CORRECTED SECTION ---
# Access the source tokens (list of Token objects) and scores directly
source_tokens = seq_attr.source
source_scores = seq_attr.source_attributions # This is usually a numpy array or tensor

# Ensure we have scores to display
if source_scores is not None and len(source_tokens) == len(source_scores):
    print("\nSource Attribution Scores (Contribution of each input token):")
    for token_obj, score in zip(source_tokens, source_scores):
        # Extract the text from the Token object and the score value
        # Use .item() if score is a single-element tensor/numpy array, otherwise access appropriately
        score_val = score.item() if hasattr(score, 'item') else score
        print(f"  - '{token_obj.text}': {score_val:.4f}")
elif source_scores is None:
     print("\nSource attribution scores were not computed or found.")
else:
     print(f"\nMismatch between number of source tokens ({len(source_tokens)}) and scores ({len(source_scores)}). Cannot display.")


Contrastive Probability Difference (P(gen) - P(contrast)) at step 0: 0.8270

Source attribution scores were not computed or found.


In [None]:
seq_attr

In [None]:
# 1. Load the instruction‑tuned model & tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()
device = next(model.parameters()).device
docs = [
        "Geothermal energy provides a constant power supply, unlike solar or wind which are intermittent.", # Should be important
        "Geothermal power plants have a small physical footprint compared to other power plants.", # Should be important
        "The initial exploration and drilling costs for geothermal energy can be significant.", # Less relevant to advantages
        "Geothermal systems emit very low levels of greenhouse gases.", # Should be important
        "Solar panels convert sunlight directly into electricity using photovoltaic cells." # Irrelevant
    ]
context ="".join(docs)
# 2. Prepare the instruction prompt
question = "What are the primary advantages of using geothermal energy?"
prompt = f"""
Answer the following question concisely and shortly using the context {context}.

{question}
Answer:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(device)
prompt_len = inputs.input_ids.shape[1]

# 3. Generation config
max_new = 20    # how many tokens to generate
top_k   = 5     # number of top candidates per position
gen_config = GenerationConfig(
    max_new_tokens=max_new,
    do_sample=False,       # greedy generation for the chosen output
    temperature=0.0,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
)

# 4. Generate with scores
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        generation_config=gen_config,
        return_dict_in_generate=True,
        output_scores=True,
    )

sequences = outputs.sequences[0]         # (prompt_len + max_new,)
scores    = outputs.scores              # list of length max_new, each (1, vocab_size)

In [None]:
# 5. Decode full generated text
full_text = tokenizer.decode(sequences, skip_special_tokens=True)
print("=== Full Generated Text ===\n")
print(full_text, "\n")

In [None]:
# 6. For each generated position, get top-k
print(f"=== Top {top_k} Candidates per Generated Position ===\n")
for idx, step_logits in enumerate(scores):
    # step_logits: shape (1, vocab_size)
    logits = step_logits[0]                       # (vocab_size,)
    probs  = F.softmax(logits, dim=-1)            # (vocab_size,)
    top_probs, top_ids = torch.topk(probs, k=top_k)
    tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())
    
    position = prompt_len + idx  # absolute position in sequence
    gen_tok  = tokenizer.convert_ids_to_tokens([sequences[position]])[0]
    
    print(f"Position {idx+1} (generated token {idx+1!r}):")
    for tok, p in zip(tokens, top_probs.tolist()):
        print(f"  {tok!r:>10} : {p:.4f}")
    print()

In [None]:
model1=inseq.load_model( "google/flan-t5-base", "integrated_gradients" )

In [None]:
attr_out = model1.attribute( prompt, attribute_target=True )
attr_out.show()

In [None]:
# The model is loaded in 8-bit on available GPUs
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

In [None]:
# GPT-2 XL is a Transformer model with 48 layers
for layer in range(48):
    attrib_model = inseq.load_model(
        model,
        "layer_gradient_x_activation",
        tokenizer="meta-llama/Llama-3.1-8B-Instruct",
        target_layer=model.transformer.h[layer].mlp,
    )
    # e.g. "The capital of Spain is"
    prompt = "What is the capital of Uzbekistan?"
    # e.g. "The capital of Spain is Madrid"
    true_answer = model.generate(prompt)
    # e.g. "The capital of Spain is Paris"
    false_answer = "samarkand"
    # Contrastive attribution of true vs false answer
    out = attrib_model.attribute(
        prompt,
        true_answer,
        attributed_fn="contrast_prob_diff",
        contrast_targets=false_answer,
        step_scores=["contrast_prob_diff"],
        show_progress=False,
    )
    # Save aggregated attributions to disk
    out = out.aggregate()


In [None]:
model = inseq.load_model(
    "gpt2-medium",
    "layer_gradient_x_activation",
    keep_top_n=5,
    stopping_condition_top_k=3,
    replacing_ratio=0.3,
    max_probe_steps=3000,
    num_probes=8
)
out = model.attribute("Can you stop the dog from", attr_pos_end=10, output_generated_only=True, skip_special_tokens=True)
out.show()

In [None]:
model.generate('Can you stop the dog from', output_generated_only=True, output_scores=True, skip_special_tokens=True)