In [None]:
import os
import torch
from dotenv import load_dotenv
from huggingface_hub import login, snapshot_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from sentence_transformers import SentenceTransformer

# Hugging Face Hub repository ids for every model this notebook loads.
embed_model_id = "BAAI/bge-m3"
reranker_model_id = "BAAI/bge-reranker-v2-m3"
llm_model_id = "microsoft/Phi-3-mini-128k-instruct"
vision_llm_model_id = "microsoft/Phi-3-vision-128k-instruct"
image_gen_model_id = "stabilityai/sdxl-turbo"

# Use the first CUDA GPU when one is visible; otherwise fall back to CPU so the
# notebook does not fail on the first device placement on a machine without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Set to True to pre-fetch every repository listed above before loading models.
DOWNLOAD_REPOS = False

if DOWNLOAD_REPOS:
    # Saved to ~/.cache/huggingface/hub
    # Read HF_TOKEN from the environment (after loading any .env file) and log in
    # before fetching the repositories one by one, in the order they are declared.
    load_dotenv()
    login(token=os.getenv("HF_TOKEN"))
    for repo_id in (
        embed_model_id,
        reranker_model_id,
        llm_model_id,
        vision_llm_model_id,
        image_gen_model_id,
    ):
        snapshot_download(repo_id)

### Embedding Model

In [None]:
# Hand the device to the constructor so SentenceTransformer itself records it as
# the target device, instead of loading on its default device and moving the
# module afterwards with `.to()`.
embed_model = SentenceTransformer(embed_model_id, device=DEVICE)

def get_embedding(model, text):
    """Encode ``text`` with ``model`` and return the first resulting embedding.

    Args:
        model: Object exposing ``encode(list_of_texts)``.
        text: A single text, or a list of texts. A non-list value is wrapped
            in a one-element list before encoding.

    Returns:
        Element 0 of ``model.encode(...)``. When a list with several texts is
        given, only the embedding of the first one is returned.
    """
    batch = text if isinstance(text, list) else [text]
    vectors = model.encode(batch)
    return vectors[0]

# Smoke test: embed a single sentence and print the length of the vector that
# `get_embedding` returns for it.
sentence = "This framework generates embeddings for each input sentence"
embedding = get_embedding(embed_model, sentence)
print(len(embedding))

### Reranker

In [None]:
# Load the reranker's tokenizer, then the classification model: move it to the
# configured device and switch it to evaluation mode in one chained expression.
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_id)
reranker_model = (
    AutoModelForSequenceClassification
    .from_pretrained(reranker_model_id)
    .to(DEVICE)
    .eval()
)

def get_rerank_scores(tokenizer, model, pairs, max_length=512):
    """Score text pairs with a sequence-classification reranker.

    Args:
        tokenizer: Callable tokenizer; invoked on ``pairs`` with padding and
            truncation enabled and PyTorch tensors requested.
        model: Model whose output exposes ``.logits``; the tokenized batch is
            moved to ``model.device`` before the forward pass.
        pairs: List of ``[query, passage]`` pairs to score.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            512, the value this helper previously hard-coded.

    Returns:
        The model's logits flattened into a plain list of floats, in input
        order. An empty ``pairs`` yields an empty list without calling the
        tokenizer or the model.
    """
    # Nothing to score: skip tokenisation and the forward pass entirely.
    if len(pairs) == 0:
        return []

    # Inference only, so gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        inputs = tokenizer(
            pairs,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(model.device)
        logits = model(**inputs, return_dict=True).logits
        # Flatten to 1-D and upcast so `tolist()` yields ordinary Python floats.
        scores = logits.view(-1).float()
    return scores.tolist()

# Smoke test: score two candidate passages ("hi" and a sentence about pandas)
# against the same query and print the raw scores returned by the reranker.
pairs = [["what is a panda?", "hi"], ["what is a panda?", "The giant panda is a bear species endemic to China."]]
scores = get_rerank_scores(reranker_tokenizer, reranker_model, pairs)
print(scores)

### Language Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Tokenizer and causal LM for the chat model, placed on the configured device.
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map=DEVICE,
    # Eager attention is used here; "flash_attention_2" is the alternative
    # that was left disabled.
    attn_implementation="eager",
)

# Wrap model + tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments forwarded to every pipeline call made through `generate_text`.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.6,
    do_sample=True,
)

def generate_text(pipeline, generation_args, messages):
    """Run one chat-style generation and return the generated text.

    Args:
        pipeline: Callable text-generation pipeline object; it is invoked as
            ``pipeline(messages, **generation_args)``.
        generation_args: Keyword arguments forwarded to the pipeline call.
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts.

    Returns:
        The ``"generated_text"`` entry of the first result returned by the
        pipeline.
    """
    # Call the pipeline that was passed in, not the module-level `pipe`: that
    # global is rebound by the image-generation cell, which would otherwise
    # silently change what this helper runs.
    output = pipeline(messages, **generation_args)
    return output[0]["generated_text"]

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"}
]

# Pass the constructed pipeline object (`pipe`), not the imported `pipeline`
# factory function.
response = generate_text(pipe, generation_args, messages)
print(response)

### Vision Language Model

In [None]:
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
from transformers.image_utils import load_image

# Processor and causal LM for the vision-language model.
# NOTE: this rebinds `model` and `generation_args`, which the Language Model
# cell also assigns.
processor = AutoProcessor.from_pretrained(vision_llm_model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    vision_llm_model_id,
    torch_dtype="auto",
    trust_remote_code=True,
    device_map=DEVICE,
    # Eager attention is used here; "flash_attention_2" is the alternative
    # that was left disabled.
    _attn_implementation="eager",
)

# Keyword arguments forwarded to `model.generate` by `generate_vision_text`.
generation_args = dict(
    max_new_tokens=500,
    temperature=0.6,
    do_sample=True,
)

def generate_vision_text(processor, model, generation_args, messages, images):
    """Generate a reply for a chat that references one or more images.

    Args:
        processor: Object exposing ``tokenizer.apply_chat_template``,
            ``tokenizer.eos_token_id``, ``batch_decode`` and being callable as
            ``processor(prompt, images, return_tensors="pt")``.
        model: Model exposing ``device`` and ``generate``.
        generation_args: Keyword arguments forwarded to ``model.generate``.
            An ``eos_token_id`` entry here overrides the tokenizer's default.
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts.
        images: Images passed to the processor together with the prompt.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens stripped off.
    """
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Place the inputs on the device of the model that was passed in, rather
    # than on a module-level constant that may not match it.
    inputs = processor(prompt, images, return_tensors="pt").to(model.device)

    # Default the stop token to the tokenizer's EOS, but let the caller override
    # it via `generation_args` instead of hitting a duplicate-keyword TypeError.
    generate_kwargs = {
        "eos_token_id": processor.tokenizer.eos_token_id,
        **generation_args,
    }
    generate_ids = model.generate(**inputs, **generate_kwargs)

    # Drop the leading prompt tokens so only the new tokens are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response

# Demo conversation: a first user turn about image 1 with a canned assistant
# reply, followed by a second user turn asking about image 2. The
# `<|image_N|>` placeholders appear in the order the images are listed below.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
    {"role": "user", "content": "<|image_2|>\nAnd how about this image?"}
] 
# Both images are fetched from remote URLs, so this cell needs network access.
image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
images = [image1, image2]

# Generate the model's answer to the last user turn and print it.
response = generate_vision_text(processor, model, generation_args, messages, images)

print(response)

### Image Generation Model

In [None]:
from diffusers import AutoPipelineForText2Image
import torch

# Load the text-to-image pipeline in half precision and move it to the
# configured device in one chained expression.
# NOTE: this rebinds `pipe`, which the Language Model cell also assigns.
pipe = AutoPipelineForText2Image.from_pretrained(
    image_gen_model_id,
    variant="fp16",
    torch_dtype=torch.float16,
).to(DEVICE)

# Single-step generation with guidance disabled; keep the first returned image.
prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."
image = pipe(
    prompt=prompt,
    guidance_scale=0.0,
    num_inference_steps=1,
).images[0]
image.show()