In [None]:
import os
import torch
from dotenv import load_dotenv
from huggingface_hub import login, snapshot_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Gemma3ForConditionalGeneration
)
from sentence_transformers import SentenceTransformer
torch._dynamo.config.disable = True

embed_model_id = "Qwen/Qwen3-Embedding-0.6B"
reranker_model_id = "Qwen/Qwen3-Reranker-0.6B"
llm_model_id = "google/gemma-3-4b-it"
image_gen_model_id = "stabilityai/sdxl-turbo"
DEVICE = "cuda:0"
DOWNLOAD_REPOS = False

if DOWNLOAD_REPOS:
    # Saved to ~/.cache/huggingface/hub
    load_dotenv()
    login(token=os.getenv("HF_TOKEN"))
    snapshot_download(embed_model_id)
    snapshot_download(reranker_model_id)
    snapshot_download(llm_model_id)
    snapshot_download(image_gen_model_id)

### Embedding Model

In [None]:
# Options forwarded to the underlying transformer loader: half precision,
# FlashAttention-2 and automatic device placement.
embed_model_kwargs = {
    "torch_dtype": torch.float16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto",
}
embed_model = SentenceTransformer(
    embed_model_id,
    model_kwargs=embed_model_kwargs,
    tokenizer_kwargs=dict(padding_side="left"),
)

queries = ["What is the capital of China?", "Explain gravity"]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other.",
]

# Queries are encoded with the "query" prompt; documents are encoded without a prompt.
query_embeddings = embed_model.encode(queries, prompt_name="query")
document_embeddings = embed_model.encode(documents)

# Compare the query embeddings with the document embeddings and show the result.
similarity = embed_model.similarity(query_embeddings, document_embeddings)
print(similarity)

### Reranker

In [None]:
# Left padding keeps each sequence's last real token at the final position of a
# padded batch, which is where the next-token logits are read for scoring. It
# also matches the padding side configured for the embedding tokenizer.
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_id, padding_side="left")

# Causal LM used as the reranker: fp16 weights, automatic device placement,
# FlashAttention-2, switched to eval mode for inference.
reranker_model = AutoModelForCausalLM.from_pretrained(
    reranker_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2"
).eval()

def get_rerank_scores(tokenizer, model, pairs):
    """Return the model's next-token logits for each (query, document) pair.

    The pairs are tokenized as one padded batch. The attention mask is requested
    and passed to the model so padding tokens are masked out, and the logits are
    taken at the last *real* token of every row, which is correct for both left-
    and right-padded batches.

    Args:
        tokenizer: Callable tokenizer; called with ``pairs`` and must return a
            mapping containing ``input_ids`` and ``attention_mask`` tensors.
        model: Callable model exposing ``device``; its output must have a
            ``logits`` tensor indexed as (batch, position, vocab).
        pairs: List of ``[query, document]`` string pairs.

    Returns:
        A tensor with one row of vocabulary logits per pair, i.e. the same
        ``(batch, vocab)`` layout as ``logits[:, -1, :]``.

    NOTE(review): these are raw logits over the whole vocabulary, not a single
    relevance score per pair; confirm how callers reduce them.
    """
    encoded = tokenizer(
        pairs,
        padding=True,
        truncation='longest_first',
        return_tensors='pt',
        return_attention_mask=True,  # required so padding is masked out
        max_length=4096,
    )
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Position of the last non-padding token in each row: flip the mask along the
    # sequence axis and locate the first 1 (argmax returns the first maximum).
    attention_mask = inputs["attention_mask"]
    seq_len = attention_mask.shape[1]
    last_real = seq_len - 1 - attention_mask.flip(dims=[1]).long().argmax(dim=1)

    with torch.no_grad():
        logits = model(**inputs).logits

    rows = torch.arange(logits.shape[0], device=logits.device)
    return logits[rows, last_real.to(logits.device)]

# One query scored against two candidate passages; each pair is [query, passage].
rerank_query = "What is the capital of China?"
rerank_candidates = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other.",
]
pairs = [[rerank_query, candidate] for candidate in rerank_candidates]

scores = get_rerank_scores(reranker_tokenizer, reranker_model, pairs)
print(scores)

### Language Model

In [None]:
llm_processor = AutoProcessor.from_pretrained(llm_model_id)

# Loader options: bfloat16 weights, automatic device placement, SDPA attention.
llm_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "attn_implementation": "sdpa",
}
llm_model = Gemma3ForConditionalGeneration.from_pretrained(llm_model_id, **llm_load_kwargs)
# Switch to eval mode for inference.
llm_model = llm_model.eval()

def generate_completion(processor, model, messages, max_new_tokens=400):
    """Run one greedy chat completion and return only the newly generated text.

    Args:
        processor: Object providing ``apply_chat_template`` and ``decode``.
        model: Model providing ``device``, ``dtype`` and ``generate``.
        messages: Chat messages in the structure accepted by the processor's
            chat template.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 400).

    Returns:
        The decoded continuation of the first sequence in the batch, with
        special tokens skipped and the prompt excluded.
    """
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        add_generation_prompt=True,
    # Cast with the model's own dtype rather than a fixed one, so the inputs
    # match however the model was loaded.
    ).to(model.device, dtype=model.dtype)

    prompt_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        generated = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Drop the first `prompt_len` positions so only the continuation is decoded.
    completion_ids = generated[0][prompt_len:]
    return processor.decode(completion_ids, skip_special_tokens=True)

# Text-only chat: a system persona followed by a single user question.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a pirate chatbot who always responds in pirate speak!"}],
}
user_turn = {
    "role": "user",
    "content": [{"type": "text", "text": "Who are you?"}],
}
messages = [system_turn, user_turn]

response = generate_completion(llm_processor, llm_model, messages)
print(response)

In [None]:
# Multimodal chat: the user turn pairs an image (given by URL) with a text instruction.
image_part = {
    "type": "image",
    "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
}
text_part = {"type": "text", "text": "Summarize this image."}
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [image_part, text_part]},
]

response = generate_completion(llm_processor, llm_model, messages)
print(response)

### Image Generation Model

In [None]:
from diffusers import AutoPipelineForText2Image
import torch

# Load the fp16 weight variant of the text-to-image pipeline and move it to the
# device chosen in the configuration cell.
pipe = AutoPipelineForText2Image.from_pretrained(
    image_gen_model_id,
    torch_dtype=torch.float16,
    variant="fp16"
)
pipe.to(DEVICE)

prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."
# Single inference step with guidance scale 0.0; take the first returned image.
image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]

# End the cell with the image so the notebook renders it inline. `image.show()`
# hands the file to an external OS viewer instead, which does nothing useful on a
# remote or headless kernel and leaves no output in the notebook.
# (presumably a PIL image, which Jupyter can display via its rich repr)
image