In [10]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, BitsAndBytesConfig
import torch

# Repository id passed to both `from_pretrained` calls below.
repo_name = "fristrup/flan-t5-semantic-tagger-small-4bit"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(repo_name)

# Load-time options gathered in one place so they are easy to inspect or tweak.
model_load_kwargs = dict(
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="cpu",
)
model = T5ForConditionalGeneration.from_pretrained(repo_name, **model_load_kwargs)



In [11]:
import pandas as pd

In [12]:
# Location of the quotes CSV; the `hf://` scheme is resolved by pandas through
# an fsspec-compatible filesystem (presumably huggingface_hub -- confirm it is installed).
QUOTES_CSV_URI = "hf://datasets/jstet/quotes-500k/quotes.csv"
df = pd.read_csv(QUOTES_CSV_URI)

In [13]:
def predict_tags(model, tokenizer, text: str, max_length: int = 128) -> list:
    """
    Generate tags for a given text using the trained model.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable that encodes ``text`` into tensors and exposes
            ``decode(...)``.
        text: Prompt to generate tags for. It is truncated to 512 tokens
            before being passed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The comma-separated pieces of the decoded output, stripped of
        surrounding whitespace, with empty pieces and repeated tags removed.
        First-occurrence order is preserved.
    """
    # Prepare the input
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Move tensors to the same device as the model
    device = model.device
    generate_kwargs = {
        "input_ids": encoded.input_ids.to(device),
        "max_length": max_length,
    }
    # Forward the attention mask when the tokenizer provides one, so padded
    # positions (if any) are ignored instead of being treated as real tokens.
    attention_mask = getattr(encoded, "attention_mask", None)
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    # NOTE: `early_stopping` is deliberately not passed. It only affects
    # beam-based decoding; with single-beam decoding it is a no-op that makes
    # transformers emit a validation warning. (Assumes the model's generation
    # config does not enable beam search -- confirm if that ever changes.)
    with torch.no_grad():
        outputs = model.generate(**generate_kwargs)

    # Decode the generated tokens
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Split on commas, trim whitespace, and drop empty pieces
    stripped = (piece.strip() for piece in decoded.split(","))

    # dict.fromkeys keeps first-seen order while removing duplicates in O(n)
    return list(dict.fromkeys(tag for tag in stripped if tag))

In [14]:
def create_input_text(quote, author):
    """Build the tagging prompt for a quote and its author.

    The quote is wrapped in double quotes and followed by the author, then an
    instruction asking for comma-separated tags.
    """
    question = f'What tags or categories would best describe this quote: "{quote}" by {author}?'
    instruction = "Provide comma-separated tags."
    return " ".join((question, instruction))

In [15]:
# Number of leading rows to run through the tagger as a quick sanity check.
NUM_PREVIEW_ROWS = 10

# head() slices once and simply yields fewer rows when the frame is shorter
# than NUM_PREVIEW_ROWS, instead of raising IndexError on a positional lookup.
preview_rows = df.head(NUM_PREVIEW_ROWS)
for quote, author in zip(preview_rows["quote"], preview_rows["author"]):
    print(predict_tags(model, tokenizer, create_input_text(quote, author)))

['selfish', 'impatient', 'insecure']
['love', 'love-quotes']
['dreams', 'dream-quotes']
['love', 'friend']
['darkness', 'hate', 'love']
['love', 'love-of-belief']
['love', 'happiness', 'life']
['love', 'hate']
['love', 'fall', 'sleep']
['indifference', 'love']
