In [1]:
!pip3 install transformers --upgrade

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB

Paper: https://arxiv.org/pdf/2210.15097

Let $x_{\text{pre}} = x_1, \ldots, x_n$ denote a prompt of $n$ tokens $x_i \in \mathcal{V}$ from a vocabulary $\mathcal{V}$,
and let $x_{\text{cont}} = x_{n+1}, \ldots, x_{n + m}$ denote the continuation generated from the prompt. An autoregressive language
model decodes a single token at a time with a next-token probability distribution $p_{\text{LM}}(x_i|x_{<i})$, so that

$$
p_{\text{LM}}(x_\text{cont}|x_{\text{pre}}) = \prod_{i=n+1}^{n+m} p_{\text{LM}}(x_i|x_{<i})
$$

Small models (the amateur, AMA) give higher probability to tokens producing undesirable patterns (e.g. repetitions) than larger
models (the expert, EXP). Contrastive decoding combines a small and a larger model using an objective function

$$
\mathcal{L}_{\text{CD}}(x_\text{cont}, x_{\text{pre}}) = \log p_{\text{EXP}}(x_\text{cont}|x_{\text{pre}})
- \log p_{\text{AMA}}(x_\text{cont}|x_{\text{pre}})
$$

that penalizes patterns favored by the smaller model while rewarding patterns favored by the larger model.
This should improve the overall output. The continuation tokens $x_\text{cont}$ are picked from a truncated
vocabulary $\mathcal{V}_{\text{head}}(x_{<i})$ defined as 

$$
\mathcal{V}_{\text{head}}(x_{<i}) = 
\{x_i \in \mathcal{V} : p_{\text{EXP}}(x_i|x_{<i}) \geq \alpha \max_{w} p_{\text{EXP}}(w|x_{<i}) \}
$$

where $\alpha = 0.1$. The truncated vocabulary at the $i$-th step contains only tokens whose probability
is at least $\alpha$ times the probability of the most probable token. The truncation limits
false positives (implausible tokens with large difference between $p_{\text{EXP}}$ and $p_{\text{AMA}}$)
and false negatives (obviously correct tokens with high $p_{\text{EXP}}$ and $p_{\text{AMA}}$)
by keeping a pool of only high probability candidates.

Token-level $\text{CD-score}$ is

$$
\text{CD-score}(x_i, x_{<i}) = \log \frac{p_{\text{EXP}}(x_i|x_{<i})}{p_{\text{AMA}}(x_i|x_{<i})},
\forall x_i \in \mathcal{V}_{\text{head}}(x_{<i})
$$

In [3]:
import transformers as tr
import torch
import tqdm.notebook as tqdm

# Contrastive decoding pairs a small "amateur" checkpoint with a larger "expert" one;
# both are Qwen2.5-Coder instruct checkpoints of different sizes.
amateur_path, expert_path = (
    'Qwen/Qwen2.5-Coder-0.5B-Instruct',
    'Qwen/Qwen2.5-Coder-1.5B-Instruct',
)

# Both models are placed on the CPU.
device = "cpu"

# A single tokenizer, loaded from the amateur checkpoint, is used for the whole notebook.
tokenizer = tr.AutoTokenizer.from_pretrained(amateur_path)

# Load each causal LM, then move it to the chosen device.
amateur = tr.AutoModelForCausalLM.from_pretrained(amateur_path)
amateur = amateur.to(device)
expert = tr.AutoModelForCausalLM.from_pretrained(expert_path)
expert = expert.to(device)

In [4]:
# Text of the user turn: asks for a very brief docstring for a JavaScript function that
# updates Elo-style scores. The snippet is wrapped in a ``` fence (written with explicit \n
# escapes) and keeps its tab indentation, so the literal must not be re-indented.
user_message = """Give a very very brief docstring for the following function:\n```\nfunction updateEloScores(
	scores,
	results,
	kFactor = 4,
) {
	for (const result of results) {
		const { first, second, outcome } = result;
		const firstScore = scores[first] ?? 1000;
		const secondScore = scores[second] ?? 1000;

		const expectedScoreFirst = 1 / (1 + Math.pow(10, (secondScore - firstScore) / 400));
		const expectedScoreSecond = 1 / (1 + Math.pow(10, (firstScore - secondScore) / 400));
		let sa = 0.5;
		if (outcome === 1) {
			sa = 1;
		} else if (outcome === -1) {
			sa = 0;
		}
		scores[first] = firstScore + kFactor * (sa - expectedScoreFirst);
		scores[second] = secondScore + kFactor * (1 - sa - expectedScoreSecond);
	}
	return scores;
}\n```"""

# Conversation to render: a fixed system instruction followed by the user request.
chat_messages = [
    dict(role='system', content='You are a helpful assistant'),
    dict(role='user', content=user_message),
]

# Render the conversation with the tokenizer's chat template. The result is kept as text
# (tokenize=False) and the generation prompt is appended so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(
    chat_messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [5]:
def contrastive_decoding(amateur, expert, tokenizer, prompt, max_tokens=100, alpha=0.1):
    """Greedily extend `prompt` using contrastive decoding between two language models.

    At each step the next token is the one that maximizes the token-level CD score
    ``log p_expert(x) - log p_amateur(x)``, restricted to the "plausible" tokens whose
    expert probability is at least ``alpha`` times the probability of the expert's most
    probable token. Decoding stops after `max_tokens` steps or as soon as the tokenizer's
    end-of-sequence token is produced.

    Args:
        amateur: Smaller causal LM. Called as ``amateur(input_ids)``; the result must expose
            ``.logits`` indexed as (batch, position, vocabulary).
        expert: Larger causal LM with the same calling convention and vocabulary.
        tokenizer: Callable as ``tokenizer(prompt, return_tensors="pt")`` returning an object
            with ``.input_ids``; must also provide ``eos_token_id`` and ``decode``.
        prompt: Prompt text. A single prompt (batch size 1) is expected, because the
            end-of-sequence check reads the chosen token with ``.item()``.
        max_tokens: Maximum number of tokens to generate.
        alpha: Truncation factor in [0, 1]; 0 disables the plausibility truncation.

    Returns:
        str: The decoded prompt followed by the generated continuation, with special
        tokens removed.

    Raises:
        ValueError: If `alpha` is outside [0, 1]. With ``alpha > 1`` no token could pass the
            plausibility test and the argmax would silently return an arbitrary token.
    """
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be in [0, 1], got {alpha}")

    # tensor[int], shape (1, n_tokens)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Run each model on the device its weights live on; the tokenizer always returns CPU
    # tensors, so without this a model moved to an accelerator fails with a device mismatch.
    expert_device = getattr(expert, "device", input_ids.device)
    amateur_device = getattr(amateur, "device", expert_device)
    input_ids = input_ids.to(expert_device)

    with torch.no_grad():
        for _ in tqdm.tqdm(range(max_tokens)):
            # next-token scores (before softmax), shape (1, vocab_size); upcast so that
            # half-precision models do not lose precision in the log-softmax
            expert_logits = expert(input_ids).logits[:, -1, :].float()
            amateur_logits = amateur(input_ids.to(amateur_device)).logits[:, -1, :]
            amateur_logits = amateur_logits.to(expert_logits.device).float()

            # convert raw scores to log probabilities, shape (1, vocab_size)
            expert_log_probs = torch.log_softmax(expert_logits, dim=-1)
            amateur_log_probs = torch.log_softmax(amateur_logits, dim=-1)

            # plausibility mask: keep tokens whose expert probability is at least
            # alpha * (probability of the expert's most probable next token)
            expert_probs = expert_log_probs.exp()
            threshold = alpha * expert_probs.max(dim=-1, keepdim=True).values
            plausible = expert_probs >= threshold

            # CD score for plausible tokens; every other token gets -inf so that it can
            # never win the argmax below
            contrastive_scores = (expert_log_probs - amateur_log_probs).masked_fill(
                ~plausible, float("-inf")
            )

            # select the token with the highest CD score from the truncated vocabulary
            best_token = torch.argmax(contrastive_scores, dim=-1)
            # append it to the running sequence and continue with the next step
            input_ids = torch.cat([input_ids, best_token.unsqueeze(-1)], dim=-1)

            # stop once the end-of-sequence token has been generated
            if best_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids.squeeze(), skip_special_tokens=True)

In [6]:
# Short demo run: generate at most 10 new tokens and print the decoded text.
output = contrastive_decoding(
    amateur=amateur,
    expert=expert,
    tokenizer=tokenizer,
    prompt=prompt,
    max_tokens=10,
)
print(output)

  0%|          | 0/10 [00:00<?, ?it/s]

system
You are a helpful assistant
user
Give a very very brief docstring for the following function:
```
function updateEloScores(
	scores,
	results,
	kFactor = 4,
) {
	for (const result of results) {
		const { first, second, outcome } = result;
		const firstScore = scores[first] ?? 1000;
		const secondScore = scores[second] ?? 1000;

		const expectedScoreFirst = 1 / (1 + Math.pow(10, (secondScore - firstScore) / 400));
		const expectedScoreSecond = 1 / (1 + Math.pow(10, (firstScore - secondScore) / 400));
		let sa = 0.5;
		if (outcome === 1) {
			sa = 1;
		} else if (outcome === -1) {
			sa = 0;
		}
		scores[first] = firstScore + kFactor * (sa - expectedScoreFirst);
		scores[second] = secondScore + kFactor * (1 - sa - expectedScoreSecond);
	}
	return scores;
}
```
assistant
This function `updateEloScores` takes three
