In [45]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [46]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Pre-quantized 4-bit checkpoints supported by Unsloth (4x faster downloading + no OOMs).
# This list is a reference catalogue only; it is not passed to the loader call below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",  # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    # Pixtral (fits in 16GB!) and its base model
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",

    # Qwen2 VL support
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    # Any Llava variant works!
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]

# Loader options, grouped so they are easy to tweak in one place.
load_options = dict(
    load_in_4bit = True,                     # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

# Returns the model together with the object this notebook calls `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    **load_options,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.11: Fast Mllama vision patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

In [47]:
# Which parts of the network are fine-tuned; set an entry to False to leave it out.
finetune_targets = dict(
    finetune_vision_layers     = True,  # vision layers
    finetune_language_layers   = True,  # language layers
    finetune_attention_modules = True,  # attention layers
    finetune_mlp_modules       = True,  # MLP layers
)

# LoRA hyper-parameters.
lora_settings = dict(
    r = 16,               # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,      # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

# NOTE(review): `model` is rebound to the result, so re-running this cell passes the
# previous result back in — reload the base model first if the cell must be re-run.
model = FastVisionModel.get_peft_model(model, **finetune_targets, **lora_settings)

# Test Inference

In [3]:
from competitive_sketch_agent.sketch_generator import SketchApp, call_argparse

In [1]:
# Environment check: print the path of the `python` executable the shell resolves.
!which python


/usr/local/bin/python


In [2]:
# Environment check: print the version of the shell's `python` interpreter.
!python --version


Python 3.10.12


In [3]:
# Environment check: list installed packages whose name contains "torch", with versions.
!pip list | grep torch


torch                              2.5.1+cu121
torchaudio                         2.5.1+cu121
torchsummary                       1.5.1
torchvision                        0.20.1+cu121


In [24]:
!git clone https://github.com/heng2j/CompetitiveSketchAgent.git

Cloning into 'CompetitiveSketchAgent'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 123 (delta 54), reused 79 (delta 27), pack-reused 0 (from 0)[K
Receiving objects: 100% (123/123), 361.38 KiB | 10.32 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [25]:
%cd CompetitiveSketchAgent
!git checkout EDA_with_Llama_vision
!pip install -e .
!pip list | grep competitive
!pip install cairosvg

/content/CompetitiveSketchAgent
Branch 'EDA_with_Llama_vision' set up to track remote branch 'EDA_with_Llama_vision' from 'origin'.
Switched to a new branch 'EDA_with_Llama_vision'


Collecting cairosvg
  Downloading CairoSVG-2.7.1-py3-none-any.whl.metadata (2.7 kB)
Collecting cairocffi (from cairosvg)
  Downloading cairocffi-1.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting cssselect2 (from cairosvg)
  Downloading cssselect2-0.7.0-py3-none-any.whl.metadata (2.9 kB)
Downloading CairoSVG-2.7.1-py3-none-any.whl (43 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cairocffi-1.7.1-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cssselect2-0.7.0-py3-none-any.whl (15 kB)
Installing collected packages: cssselect2, cairocffi, cairosvg
Successfully installed cairocffi-1.7.1 cairosvg-2.7.1 cssselect2-0.7.0


In [36]:
from competitive_sketch_agent import utils
from competitive_sketch_agent.prompts import sketch_first_prompt, system_prompt, gt_example
from competitive_sketch_agent.sketch_generator import SketchApp


In [31]:
# Absolute path of the default YAML config inside the cloned repository
# (the literal is split in two only for readability).
filename = (
    "/content/CompetitiveSketchAgent"
    "/competitive_sketch_agent/configs/default_config.yaml"
)

# Load the YAML file through the project's helper.
config = utils.load_yaml_config(filename=filename)

In [38]:
# Build the sketching application from the loaded configuration; later cells read its
# `input_prompt`, `res`, `init_canvas` and `init_canvas_str` attributes.
sketch_app = SketchApp(config=config)

In [39]:
# Display the user prompt held by the app (the drawing task plus worked stroke examples).
sketch_app.input_prompt

"I provide you with a blank grid. Your goal is to produce a visually appealing sketch of a car.\nHere are a few examples:\n<examples>\n\n<example>\nTo draw a house, start by drawing the front of the house:\n<concept>House</concept>\n<strokes>\n    <s1>\n        <points>'x13y27', 'x24y27', 'x24y27', 'x24y11', 'x24y11', 'x13y11', 'x13y11', 'x13y27'</points>\n        <t_values>0.00,0.3,0.25,0.5,0.5,0.75,0.75,1.00</t_values>\n        <id>house base front rectangle</id>\n    </s1>\n    <s2>\n        <points>'x13y27', 'x18y37','x18y37', 'x24y27'</points>\n        <t_values>0.00,0.55,0.5,1.00</t_values>\n        <id>roof front triangle</id>\n    </s2>\n</strokes>\n\nNext we add the house's right section:\n<concept>House</concept>\n<strokes>\n    <s1>\n        <points>'x13y27', 'x24y27', 'x24y27', 'x24y11', 'x24y11', 'x13y11', 'x13y11', 'x13y27'</points>\n        <t_values>0.00,0.3,0.25,0.5,0.5,0.75,0.75,1.00</t_values>\n        <id>house base front rectangle</id>\n    </s1>\n    <s2>\n       

In [40]:
# Display the imported system prompt template; the `{res}` placeholders in it are
# still unformatted at this point.
system_prompt

'You are an expert artist specializing in drawing sketches that are visually appealing, expressive, and professional.\nYou will be provided with a blank grid. Your task is to specify where to place strokes on the grid to create a visually appealing sketch of the given textual concept.\nThe grid uses numbers (1 to {res}) along the bottom (x axis) and numbers (1 to {res}) along the left edge (y axis) to reference specific locations within the grid. Each cell is uniquely identified by a combination of the corresponding x axis numbers and y axis number (e.g., the bottom-left cell is \'x1y1\', the cell to its right is \'x2y1\').\nYou can draw on this grid by specifying where to draw strokes. You can draw multiple strokes to depict the whole object, where different strokes compose different parts of the object. \nTo draw a stroke on the grid, you need to specify the following:\nStarting Point: Specify the starting point by giving the grid location (e.g., \'x1y1\' for column 1, row 1).\nEndin

In [41]:
# Display the app's `res` value — presumably the grid resolution that fills the
# `{res}` placeholder in the system prompt; TODO confirm.
sketch_app.res

50

In [53]:
# Start from an empty conversation and use the app's `input_prompt` as the message.
msg_history = []
# NOTE(review): this local is never passed on — the call below reads
# `sketch_app.init_canvas_str` instead. Confirm which one is intended.
init_canvas_str = None
msg = sketch_app.input_prompt

# Build the chat payload that the app would send to an LLM, then display it.
other_msg = sketch_app.define_input_to_llm(
    msg_history,
    sketch_app.init_canvas_str,
    msg,
)
other_msg


[{'role': 'user',
  'content': [{'type': 'image',
    'source': {'type': 'base64',
     'media_type': 'image/jpeg',
     'data': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAJkAmQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwC74Z0bRYvBWlfbNEt21K9tLXyZL3S7NkKyTRRNKh

In [56]:
# Inspect the Python type of the app's initial canvas, which a later cell passes to
# the processor as the image input.
type(sketch_app.init_canvas)

In [54]:
# Switch the model into Unsloth's inference mode before generating.
FastVisionModel.for_inference(model)

# The conversation sent to the model is the payload built by `define_input_to_llm`.
messages = other_msg

# NOTE(review): `instruction` is assigned but not referenced by the visible generation
# cells, which send `messages` instead — confirm whether it can be dropped.
# image = dataset[0]["image"]
instruction = "You are an expert radiographer. Describe accurately what you see in this image."


In [59]:
from transformers import TextStreamer

# Inputs: the app's initial canvas image plus the chat-templated conversation text.
image = sketch_app.init_canvas
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

# Encode image and text together, then move the batch to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream tokens to the cell output as they are produced, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

# Generation settings for this run (high temperature).
generation_kwargs = dict(
    streamer = text_streamer,
    max_new_tokens = 3000,
    use_cache = True,
    temperature = 1.5,
    min_p = 0.1,
)
response = model.generate(**inputs, **generation_kwargs)

**Step 1: Concept Analysis**

The car is a large, four-wheeled vehicle with a body and wheels. The front part is pointed, indicating a front-end. We need to think about the shapes we'll use to draw this and their ordering.

**Step 2: Stroke Creation**

We should first focus on creating the outline of the car body as a basic structure.

1.  **Step 1.1: Draw the Front of the Car**
    <formatting>
    <concept>The front end of the car.</concept>
    <strokes>
        <s1>
            <points>'x100y10', 'x90y30', 'x90y30', 'x70y50', 'x70y50', 'x55y60', 'x55y60', 'x30y85'</points>
            <t_values>0.00,0.2,0.4,0.6,0.7,0.85,0.9,1.00</t_values>
            <id>front half car</id>
        </s1>
    </strokes>
    </formatting>

2.  **Step 1.2: Draw the Side Profile**

    <formatting>
    <concept>The side profile of the car.</concept>
    <strokes>
        <s2>
            <points>'x30y85', 'x70y50', 'x70y50', 'x55y60', 'x55y60', 'x70y45', 'x70y45', 'x100y10'</points>
            <t_val

In [62]:
# IPython help lookup for `model.generate` (interactive exploration only; it does not
# change any state).
model.generate?

In [64]:
# Re-run generation with a lower temperature (0.5) on the same conversation and canvas.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
image = sketch_app.init_canvas

# Encode image and text together, then move the batch to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream tokens to the cell output as they are produced, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
response = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 3000,
                   use_cache = True, temperature = 0.5, min_p = 0.1)

# `response` holds raw token ids for the prompt followed by the completion. Slice off
# the prompt and decode the rest so the generated text is available as a string
# (e.g. for parsing the strokes) instead of only being streamed to the output.
prompt_length = inputs["input_ids"].shape[-1]
response_text = tokenizer.decode(response[0, prompt_length:], skip_special_tokens = True)

**Step 1: Concept Analysis**

The concept depicted in the sketch is a car. To create a recognizable sketch of a car, we need to identify the key parts of the vehicle and their relative positions on the grid.

**Step 2: Stroke Analysis**

We will start by sketching the car's body, which can be represented by a rectangular shape. The points for this stroke are:

* 'x20y20', 'x30y20', 'x30y30', 'x20y30'

The corresponding t-values are:

* 0.00, 0.25, 0.5, 0.75

The id for this stroke is "car body rectangle".

**Step 3: Additional Strokes**

Next, we will add the car's wheels. The points for this stroke are:

* 'x20y20', 'x30y20', 'x30y30', 'x20y30'
* 'x25y25', 'x35y25', 'x35y35', 'x25y35'

The corresponding t-values are:

* 0.00, 0.25, 0.5, 0.75
* 0.00, 0.25, 0.5, 0.75

The ids for these strokes are "wheel 1" and "wheel 2", respectively.

**Step 4: Final Stroke**

Finally, we will add the car's front grille. The points for this stroke are:

* 'x25y25', 'x35y25', 'x35y35', 'x25y35'

The co

In [65]:
# Display the raw output of `model.generate`: a tensor of token ids, not decoded text.
response

tensor([[128000, 128006,    882,  ...,   9399,     29, 128009]],
       device='cuda:0')