In [26]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face Hub repository that both the model weights and tokenizer are pulled from.
repo_name = "fristrup/flan-t5-semantic-tagger-base-4bit"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in float16.
# NOTE(review): the repo name suggests the checkpoint may already be stored in
# 4-bit form — confirm whether passing this config at load time is still needed.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the seq2seq model; device_map="auto" delegates device placement to the library.
model = T5ForConditionalGeneration.from_pretrained(
    repo_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

# Tokenizer comes from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(repo_name)

In [27]:
import pandas as pd

In [34]:
def predict_tags(model, tokenizer, text, max_length=128):
    """
    Generate a de-duplicated list of tags for a given text using the trained model.

    Args:
        model: Generation model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer paired with ``model``; it is called on ``text`` and
            its ``decode`` method turns the generated ids back into a string.
        text: The input string to tag. ``None`` or a blank/whitespace-only
            string returns ``[]`` without running the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        list[str]: The decoded output split on commas, each piece stripped,
        with empty pieces and duplicates removed (first occurrence wins).
    """
    # Nothing to tag: avoid tokenizing/generating on empty input.
    if text is None or (isinstance(text, str) and not text.strip()):
        return []

    # Prepare the input; the encoder input is truncated to 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Move tensors to the same device as the model.
    device = model.device
    generate_kwargs = {"input_ids": inputs["input_ids"].to(device)}

    # Forward the tokenizer's attention mask when one is provided, instead of
    # leaving generate() to infer it from the input ids.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    # Generate tags without tracking gradients.
    with torch.no_grad():
        outputs = model.generate(
            **generate_kwargs,
            max_length=max_length,
            # NOTE(review): per the transformers docs this flag only applies to
            # beam-based generation; kept so behaviour is unchanged if the
            # model's generation config enables beams — confirm.
            early_stopping=True,
        )

    # Decode the first (only) generated sequence.
    predicted_tags = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Split on commas, strip whitespace, drop empty pieces.
    stripped = (piece.strip() for piece in predicted_tags.split(","))

    # dict keys keep insertion order, so this removes duplicates while
    # preserving first-seen order in linear time.
    return list(dict.fromkeys(tag for tag in stripped if tag))

In [29]:
# Read the quotes CSV directly from its hf:// dataset URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer="hf://datasets/jstet/quotes-500k/quotes.csv")

In [33]:
# Tag a single sample: the quote at positional row 1 (the second row) of `df`.
predicted_tags = predict_tags(model=model, tokenizer=tokenizer, text=df["quote"].iloc[1])

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [31]:
# Show the tags produced by the prediction cell.
# NOTE(review): this cell's execution count (31) is lower than that of the
# prediction cell (33), so the output shown below appears to come from an
# earlier run — re-run in order to confirm.
print(predicted_tags)

["You've gotta dance like there's nobody watching", "Love like you'll never be hurt", 'Sing like no one listening', "And live like it' is heaven on earth."]
