In [1]:
# DeepSeek-VL2 Interactive Notebook
# This notebook allows you to interact with DeepSeek-VL2 directly in Jupyter Notebook.
# It loads the model from a local directory and supports uploading images.

# 📌 Required Libraries
import os
import torch
import PIL.Image
import ipywidgets as widgets
from IPython.display import display
from transformers import AutoModelForCausalLM
from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox

A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python39\lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
ModuleNotFoundError: No module named 'triton'


In [2]:
# 🛠 Set the LOCAL MODEL PATH
# Set the DEEPSEEK_VL2_MODEL_PATH environment variable before starting the kernel
# to point at a different snapshot; otherwise the directory below is used.
LOCAL_MODEL_PATH = os.environ.get(
    "DEEPSEEK_VL2_MODEL_PATH",
    r"C:\Users\LENOVO\.cache\huggingface\hub\models--deepseek-ai--deepseek-vl2-tiny\snapshots\66c54660eae7e90c9ba259bfdf92d07d6e3ce8aa",
)

# (processor, tokenizer, model) triples that were already loaded, keyed by
# (model_path, dtype). Re-running this cell resets the cache.
_LOADED_MODELS = {}

# ✅ Function to Load Model from Local Path
def load_model(model_path=LOCAL_MODEL_PATH, dtype=torch.bfloat16):
    """Load the DeepSeek-VL2 model from a local directory.

    The result is cached per ``(model_path, dtype)``, so calling this function
    repeatedly (as every generation request does) only reads the weights once.

    Args:
        model_path: Directory holding the processor and model files. Both are
            loaded with ``local_files_only=True``.
        dtype: ``torch_dtype`` handed to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        Tuple ``(vl_chat_processor, tokenizer, vl_gpt)`` where ``tokenizer`` is
        ``vl_chat_processor.tokenizer`` and ``vl_gpt`` is the model in eval mode.
    """
    cache_key = (str(model_path), dtype)
    cached = _LOADED_MODELS.get(cache_key)
    if cached is not None:
        return cached

    print(f"🔄 Loading model from {model_path}...")

    vl_chat_processor = DeepseekVLV2Processor.from_pretrained(
        model_path, local_files_only=True
    )
    tokenizer = vl_chat_processor.tokenizer

    # Use the GPU when one is visible; otherwise stay on CPU instead of raising.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
        print("⚠️ CUDA is not available; loading the model on CPU.")

    vl_gpt = AutoModelForCausalLM.from_pretrained(
        model_path, trust_remote_code=True, torch_dtype=dtype, local_files_only=True
    ).to(device).eval()

    print("✅ Model loaded successfully!")
    loaded = (vl_chat_processor, tokenizer, vl_gpt)
    _LOADED_MODELS[cache_key] = loaded
    return loaded

In [3]:
def _upload_field(record, field):
    """Read ``field`` from an upload record that is either a mapping or an object."""
    if isinstance(record, dict):
        return record[field]
    return getattr(record, field)


# ✅ Function to Save Uploaded Images
def save_uploaded_images(uploaded_files, temp_folder="temp_images"):
    """Save uploaded images and return their file paths.

    Args:
        uploaded_files: The ``value`` of a ``FileUpload`` widget. Either an
            iterable of records exposing ``name`` and ``content`` (attribute or
            key access), or a dict mapping filename -> record with a
            ``"content"`` entry (the layout older ipywidgets releases used).
        temp_folder: Directory the files are written into; created if missing.

    Returns:
        List of written file paths, in upload order. Empty input gives ``[]``.
    """
    os.makedirs(temp_folder, exist_ok=True)  # Ensure the folder exists

    if isinstance(uploaded_files, dict):
        entries = [(name, record["content"]) for name, record in uploaded_files.items()]
    else:
        entries = [
            (_upload_field(record, "name"), _upload_field(record, "content"))
            for record in uploaded_files
        ]

    saved_paths = []
    for index, (raw_name, content) in enumerate(entries):
        # The filename comes from the browser: keep only its last component so a
        # name such as "../../x.png" cannot be written outside temp_folder.
        safe_name = os.path.basename(str(raw_name).replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            safe_name = f"upload_{index}"

        filepath = os.path.join(temp_folder, safe_name)
        with open(filepath, "wb") as f:
            f.write(content)
        saved_paths.append(filepath)

    return saved_paths

# ✅ Function to Load PIL Images
def load_pil_images(image_paths):
    """Open each path with PIL and return the images, converted to RGB, in input order."""
    return [PIL.Image.open(path).convert("RGB") for path in image_paths]

In [None]:
# ✅ Function to Generate Model Response
def generate_response(text, images, temperature=0.4, top_p=0.9, repetition_penalty=1.1):
    """Generate a response using the DeepSeek-VL2 model.

    Args:
        text: User prompt. Image placeholders are prepended automatically when
            the prompt contains fewer of them than there are images.
        images: List of image file paths attached to the user turn.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        repetition_penalty: Repetition penalty passed to ``generate``.

    Returns:
        The decoded answer string (special tokens are kept, since decoding uses
        ``skip_special_tokens=False``). The answer is also printed, and a
        grounding image is displayed when ``parse_ref_bbox`` returns one.
    """
    vl_chat_processor, tokenizer, vl_gpt = load_model()

    # NOTE(review): the upstream DeepSeek-VL2 examples put one image placeholder
    # in the prompt per attached image; add whatever the user did not type so a
    # plain prompt still lines up with the uploaded images.
    image_token = getattr(vl_chat_processor, "image_token", "<image>")
    missing_placeholders = len(images) - text.count(image_token)
    if missing_placeholders > 0:
        text = f"{image_token}\n" * missing_placeholders + text

    # Prepare conversation
    conversation = [
        {"role": "<|User|>", "content": text, "images": images},
        {"role": "<|Assistant|>", "content": ""},
    ]
    pil_images = load_pil_images(images)

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device)

    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_cache=True,
        )

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = len(prepare_inputs.input_ids[0])
    answer = tokenizer.decode(outputs[0][prompt_length:].cpu().tolist(), skip_special_tokens=False)
    print(f"🗨️ Assistant:\n{answer}")

    # Display visual grounding image if applicable (uses the last uploaded image)
    vg_image = parse_ref_bbox(answer, image=pil_images[-1]) if pil_images else None
    if vg_image is not None:
        display(vg_image)

    return answer

# ✅ Create Interactive Widgets
# Multi-line prompt box: full width, 100px tall.
text_input = widgets.Textarea(
    layout=widgets.Layout(height="100px", width="100%"),
    placeholder="Enter your text prompt...",
)

# Upload control filtered to "image/*"; several files can be picked at once.
image_uploader = widgets.FileUpload(multiple=True, accept="image/*")

# Output pane for messages and results, plus the button that triggers generation.
output_area = widgets.Output()
generate_button = widgets.Button(description="Generate Response")

# ✅ Define Button Click Function
def on_generate_clicked(b):
    """Handle a click on the generate button.

    Clears ``output_area``, checks that at least one image was uploaded, saves
    the uploads to disk and runs ``generate_response`` on them. Everything
    printed or displayed while handling the click lands in ``output_area``.

    Args:
        b: The ``Button`` instance that was clicked (supplied by ``on_click``).
    """
    with output_area:
        output_area.clear_output()
        text = text_input.value  # Get user input text

        # Ensure images are uploaded before proceeding
        if not image_uploader.value:
            print("⚠️ No images uploaded. Please upload an image before submitting.")
            return

        # Generation can take a while: disable the button so repeated clicks do
        # not queue up further runs, and always re-enable it afterwards.
        b.disabled = True
        try:
            # Save images and get file paths
            image_paths = save_uploaded_images(image_uploader.value)

            # generate_response prints the answer itself, so it is not printed
            # a second time here.
            generate_response(text, image_paths)
        finally:
            b.disabled = False

# ✅ Attach Function to Button Click
generate_button.on_click(on_generate_clicked)

# ✅ Display UI Elements
display(text_input, image_uploader, generate_button, output_area)

Textarea(value='', layout=Layout(height='100px', width='100%'), placeholder='Enter your text prompt...')

FileUpload(value=(), accept='image/*', description='Upload', multiple=True)

Button(description='Generate Response', style=ButtonStyle())

Output()

In [10]:
# ✅ Function to Generate Model Response
def generate_response(text, images, temperature=0.4, top_p=0.9, repetition_penalty=1.1):
    """Generate a response using the DeepSeek-VL2 model.

    Args:
        text: User prompt. Image placeholders are prepended automatically when
            the prompt contains fewer of them than there are images.
        images: List of image file paths attached to the user turn.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        repetition_penalty: Repetition penalty passed to ``generate``.

    Returns:
        The decoded answer string (special tokens are kept, since decoding uses
        ``skip_special_tokens=False``). The answer is also printed, and a
        grounding image is displayed when ``parse_ref_bbox`` returns one.
    """
    vl_chat_processor, tokenizer, vl_gpt = load_model()

    # NOTE(review): the upstream DeepSeek-VL2 examples put one image placeholder
    # in the prompt per attached image; add whatever the user did not type so a
    # plain prompt still lines up with the uploaded images.
    image_token = getattr(vl_chat_processor, "image_token", "<image>")
    missing_placeholders = len(images) - text.count(image_token)
    if missing_placeholders > 0:
        text = f"{image_token}\n" * missing_placeholders + text

    # Prepare conversation
    conversation = [
        {"role": "<|User|>", "content": text, "images": images},
        {"role": "<|Assistant|>", "content": ""},
    ]
    pil_images = load_pil_images(images)

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device)

    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_cache=True,
        )

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = len(prepare_inputs.input_ids[0])
    answer = tokenizer.decode(outputs[0][prompt_length:].cpu().tolist(), skip_special_tokens=False)
    print(f"🗨️ Assistant:\n{answer}")

    # Display visual grounding image if applicable (uses the last uploaded image)
    vg_image = parse_ref_bbox(answer, image=pil_images[-1]) if pil_images else None
    if vg_image is not None:
        display(vg_image)

    return answer

# ✅ Create Interactive Widgets
# Multi-line prompt box: full width, 100px tall.
text_input = widgets.Textarea(
    layout=widgets.Layout(height="100px", width="100%"),
    placeholder="Enter your text prompt...",
)

# Upload control filtered to "image/*"; several files can be picked at once.
image_uploader = widgets.FileUpload(multiple=True, accept="image/*")

# Output pane for messages and results, plus the button that triggers generation.
output_area = widgets.Output()
generate_button = widgets.Button(description="Generate Response")

# ✅ Define Button Click Function
def on_generate_clicked(b):
    """Handle a click on the generate button.

    Clears ``output_area``, checks that at least one image was uploaded, saves
    the uploads to disk and runs ``generate_response`` on them. Everything
    printed or displayed while handling the click lands in ``output_area``.

    Args:
        b: The ``Button`` instance that was clicked (supplied by ``on_click``).
    """
    with output_area:
        output_area.clear_output()
        text = text_input.value  # Get user input text

        # Ensure images are uploaded before proceeding
        if not image_uploader.value:
            print("⚠️ No images uploaded. Please upload an image before submitting.")
            return

        # Generation can take a while: disable the button so repeated clicks do
        # not queue up further runs, and always re-enable it afterwards.
        b.disabled = True
        try:
            # Save images and get file paths
            image_paths = save_uploaded_images(image_uploader.value)

            # generate_response prints the answer itself, so it is not printed
            # a second time here.
            generate_response(text, image_paths)
        finally:
            b.disabled = False

# ✅ Attach Function to Button Click
generate_button.on_click(on_generate_clicked)

# ✅ Display UI Elements
display(text_input, image_uploader, generate_button, output_area)

Textarea(value='', layout=Layout(height='100px', width='100%'), placeholder='Enter your text prompt...')

FileUpload(value=(), accept='image/*', description='Upload', multiple=True)

Button(description='Generate Response', style=ButtonStyle())

Output()