In [1]:
# Dependencies:

#!pip install transformers ipywidgets IPython

# If running on Colab:
#!git clone https://github.com/sam-paech/antislop-sampler.git
#!mv antislop-sampler/src .
#!mv antislop-sampler/slop_phrase_prob_adjustments.json .

In [2]:
import importlib.util
import os

# Enable efficient transfer for Hugging Face models.
# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER when it is first imported, so the
# flag has to be exported *before* transformers is imported or it is ignored.
# It is only set when the optional `hf_transfer` package is installed, because
# downloads fail if the flag is on and the package is missing.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = "1"

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from src.antislop_generate import generate_antislop, chat_antislop

# Set the device to 'cuda' if available, else 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify the model name (replace with your preferred model)
model_name = "unsloth/Llama-3.2-1B-Instruct"
#model_name = "unsloth/Llama-3.2-3B-Instruct"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): trust_remote_code=True lets the model repository run its own Python
# code at load time. Only keep it for repositories you trust; it is presumably not
# required for stock Llama checkpoints -- confirm before removing.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
model.to(device)
print('Model loaded')

Model loaded


In [3]:
import json


def load_slop_phrase_adjustments(path='banned_tokens.json', limit=5000):
    """Load slop-phrase probability adjustments from a JSON file.

    The file is expected to hold a JSON list of ``[phrase, adjustment]`` pairs;
    only the first ``limit`` pairs are kept.

    Args:
        path: Location of the JSON file.
        limit: Maximum number of leading entries to keep.

    Returns:
        A dict built from the kept pairs, or an empty dict when ``path`` does
        not exist, so that later cells always find the variable defined.
    """
    if not os.path.exists(path):
        print(f"'{path}' not found; continuing without slop phrase adjustments")
        return {}
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    return dict(entries[:limit])


# NOTE(review): the setup cell fetches slop_phrase_prob_adjustments.json, while this
# cell reads banned_tokens.json -- confirm which file is intended.
slop_phrase_prob_adjustments = load_slop_phrase_adjustments()

In [4]:
# Story prompt; it is sent as the user message of the chat request below.
# The trailing space is part of the prompt text.
prompt = (
    "Write a story about Elara, the weaver of tapestries in future Technopolis. "
    "In the bustling city, a group of "
)

In [5]:
# Chat generation with streaming.
# Tokens yielded by chat_antislop are accumulated and the whole sequence is
# re-decoded on every step; only the newly appeared suffix is printed.
messages = [
    {"role": "user", "content": prompt}
]
tokens = []
text = ''
for token in chat_antislop(
    model=model,
    tokenizer=tokenizer,
    messages=messages,
    max_new_tokens=50,
    # Antislop sampling may be less reliable at low temperatures.
    #temperature=1,    
    #min_p=0.1,
    temperature=0.01,
    # The adjustment_strength param scales how strongly the probability adjustments are applied.
    # A value of 1 means the values in slop_phrase_prob_adjustments (or the defaults) are used unmodified.
    # Reasonable values are 0 (disabled) thru 100+ (effectively banning the list).
    adjustment_strength=100.0,
    # Optional: Provide a list of slop phrases and probability adjustments
    slop_phrase_prob_adjustments=slop_phrase_prob_adjustments,
    enforce_json=False,
    # NOTE(review): antislop is switched off here while ban_slop_first_tokens is on --
    # confirm this combination is what the demo is meant to show.
    antislop_enabled=False,
    ban_slop_first_tokens=True,
    streaming=True,
    stream_smoothing=True, # On by default; this will smooth out the stutters from backtracking.
):
    tokens.append(token)
    full_text = tokenizer.decode(tokens, skip_special_tokens=True)
    # A character whose bytes are split across tokens decodes to U+FFFD until its
    # last token arrives. Printing now would emit the placeholder and then skip
    # the real character, so hold the output back until the text is complete.
    if full_text.endswith('\ufffd'):
        continue
    new_text = full_text[len(text):]
    text = full_text
    print(new_text, end='', flush=True)

# Flush anything that was still being held back when the stream ended.
if tokens:
    full_text = tokenizer.decode(tokens, skip_special_tokens=True)
    new_text = full_text[len(text):]
    text = full_text
    print(new_text, end='', flush=True)

{'symphony': {24738, 328, 63306, 18923, 8045, 11629, 5837, 6705, 82, 50, 29012, 274, 16466, 35767, 23707}, 'testament to': {259, 1028, 29584, 1296, 2323, 3475, 40214, 668, 2722, 23971, 75048, 51, 1985, 2505, 83, 2392, 13916, 350, 18793, 51309, 61300, 6777, 10238}, 'kaleidoscope': {23109, 36454, 74, 42, 16909, 97842, 597, 63577, 27930, 735}, 'delve': {67, 451, 35, 294, 7462, 39432, 423, 24711, 16939, 1611, 3467, 1170, 9783, 1624, 409, 82845, 1951}, 'delving': {67, 451, 35, 294, 7462, 39432, 423, 24711, 16939, 1611, 3467, 1170, 9783, 1624, 409, 1951}, 'delved': {67, 451, 35, 294, 7462, 39432, 423, 24711, 16939, 1611, 3467, 1170, 9783, 1624, 409, 1951}, 'elara': {384, 2818, 68, 36, 4072, 17705, 301, 658, 469, 6719}, 'tapestry': {259, 2629, 9637, 15559, 31147, 64620, 15596, 24172, 37234, 51, 83, 39991, 17401, 58586, 350}, 'tapestries': {259, 9637, 15559, 15596, 51, 83, 39991, 17401, 350}, 'weave': {68608, 289, 12739, 1687, 584, 906, 1226, 468, 54, 86, 20255}, 'wove': {289, 468, 54, 86, 235

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


**The Weaver's Gift**

It was the height

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


KeyboardInterrupt: 