In [10]:
!pip install --upgrade modal-client






In [12]:
import os

# Set Modal Token ID and Secret as environment variables
os.environ["MODAL_TOKEN_ID"] = #Token ID here
os.environ["MODAL_TOKEN_SECRET"] = #Token secret here






In [13]:
pip install modal



In [None]:
%%writefile app.py
import modal
from PIL import Image

# Create the image with all required dependencies
image = modal.Image.debian_slim().pip_install(
    "diffusers",
    "transformers",
    "accelerate",
    "torch",
    "fastapi",
    "pillow"
)

# Define stub (app) after image definition
app = modal.Stub(
    "sdxl-turbo-with-caption",
    image=image
)

@app.cls(gpu="A100")
class Model:
    def __init__(self):
        self.pipe = None
        self.caption_model = None
        self.caption_processor = None

    @modal.enter()
    def load_weights(self):
        from diffusers import AutoPipelineForText2Image
        from transformers import BlipForConditionalGeneration, BlipProcessor
        import torch

        # Load SDXL Turbo
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            "stabilityai/sdxl-turbo",
            torch_dtype=torch.float16,
            variant="fp16"
        )
        self.pipe.to("cuda")

        # Load BLIP captioning model
        self.caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

    def generate_caption(self, image):
        inputs = self.caption_processor(images=image, return_tensors="pt").to("cuda")
        generated_ids = self.caption_model.generate(**inputs, max_new_tokens=50)
        caption = self.caption_processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption

    @modal.web_endpoint(method="GET")
    def generate(self, prompt: str = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."):
        import io
        from fastapi import Response
        import json

        print(f"Generating image for prompt: {prompt}")

        # Generate image
        image = self.pipe(
            prompt=prompt,
            num_inference_steps=1,
            guidance_scale=0.0
        ).images[0]

        # Generate caption
        caption = self.generate_caption(image)
        print(f"Generated caption: {caption}")

        # Convert image to bytes
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG")
        buffer.seek(0)
        image_bytes = buffer.getvalue()

        # Create response with both image and caption
        response = Response(
            content=json.dumps({
                "image_bytes": image_bytes.hex(),
                "caption": caption
            }),
            media_type="application/json"
        )
        response.headers["Access-Control-Allow-Origin"] = "*"
        return response



Overwriting app.py


In [15]:
## Optimized ###################
%%writefile app.py

import modal
from PIL import Image
import torch

# Define the models to be downloaded during image creation
SDXL_MODEL_ID = "stabilityai/sdxl-turbo"
BLIP_MODEL_ID = "Salesforce/blip-image-captioning-large"

def download_models():
    from diffusers import AutoPipelineForText2Image
    from transformers import BlipForConditionalGeneration, BlipProcessor

    # Download models during image building
    AutoPipelineForText2Image.from_pretrained(
        SDXL_MODEL_ID,
        torch_dtype=torch.float16,
        variant="fp16"
    )
    BlipProcessor.from_pretrained(BLIP_MODEL_ID)
    BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_ID)

# Create the image with optimized dependencies
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "diffusers>=0.24.0",
        "transformers>=4.36.0",
        "accelerate>=0.26.0",
        "torch>=2.1.0",
        "fastapi>=0.109.0",
        "pillow>=10.0.0",
        "safetensors>=0.4.0",
    )
    .pip_install("xformers", pre=True)
    .run_function(download_models)
)

# Create volume for persistent cache
volume = modal.Volume.from_name("model-cache-vol", create_if_missing=True)
CACHE_PATH = "/cache"

# Changed from stub to app
app = modal.Stub("sdxl-turbo-with-caption", image=image)

@app.cls(
    gpu="A100",
    container_idle_timeout=300,
    allow_concurrent_inputs=2,
    volumes={CACHE_PATH: volume}
)
class Model:
    def __init__(self):
        self.pipe = None
        self.caption_model = None
        self.caption_processor = None

    @modal.enter()
    def load_weights(self):
        import os
        from diffusers import AutoPipelineForText2Image
        from transformers import BlipForConditionalGeneration, BlipProcessor
        import torch

        # Set cache directory
        os.environ['TRANSFORMERS_CACHE'] = f"{CACHE_PATH}/transformers"
        os.environ['DIFFUSERS_CACHE'] = f"{CACHE_PATH}/diffusers"

        # Enable optimization flags
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        # Load SDXL Turbo with optimizations
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            SDXL_MODEL_ID,
            torch_dtype=torch.float16,
            variant="fp16",
            use_safetensors=True,
            cache_dir=f"{CACHE_PATH}/diffusers"
        )
        self.pipe.enable_xformers_memory_efficient_attention()
        self.pipe.to("cuda")

        # Load BLIP with optimizations
        self.caption_processor = BlipProcessor.from_pretrained(
            BLIP_MODEL_ID,
            cache_dir=f"{CACHE_PATH}/transformers"
        )
        self.caption_model = BlipForConditionalGeneration.from_pretrained(
            BLIP_MODEL_ID,
            torch_dtype=torch.float16,
            cache_dir=f"{CACHE_PATH}/transformers"
        ).to("cuda")

    def generate_caption(self, image):
        with torch.inference_mode():
            inputs = self.caption_processor(images=image, return_tensors="pt").to("cuda", torch.float16)
            generated_ids = self.caption_model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=1,
                length_penalty=1.0
            )
            caption = self.caption_processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption

    @modal.web_endpoint(method="GET")
    async def generate(self, prompt: str = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."):
        import io
        from fastapi import Response
        import json

        try:
            with torch.inference_mode():
                image = self.pipe(
                    prompt=prompt,
                    num_inference_steps=1,
                    guidance_scale=0.0,
                ).images[0]

            caption = self.generate_caption(image)

            buffer = io.BytesIO()
            image.save(buffer, format="JPEG", quality=90, optimize=True)
            image_bytes = buffer.getvalue()

            response = Response(
                content=json.dumps({
                    "image_bytes": image_bytes.hex(),
                    "caption": caption,
                    "status": "success"
                }),
                media_type="application/json"
            )
            response.headers["Access-Control-Allow-Origin"] = "*"
            return response

        except Exception as e:
            return Response(
                content=json.dumps({
                    "status": "error",
                    "error": str(e)
                }),
                media_type="application/json",
                status_code=500
            )

# Optional: Add keepalive function to prevent cold starts
@app.function(schedule=modal.Period(minutes=4))
def keepalive():
    resp = Model().generate.remote("test prompt")
    print("Keeping container warm:", resp)

# Optional: Add warmup function
@app.function(schedule=modal.Cron("30 7 * * *"))
def warmup():
    Model().generate.remote("warmup prompt")

Overwriting app.py


In [16]:
!modal deploy app.py

[33m│[0m The use of "Stub" has been deprecated in favor of "App". This is a pure name change with no      [33m│[0m
[33m│[0m other implications.                                                                              [33m│[0m
[33m│[0m                                                                                                  [33m│[0m
[33m│[0m Source: /content/app.py:44                                                                       [33m│[0m
[33m│[0m   app = modal.Stub("sdxl-turbo-with-caption", image=image)                                       [33m│[0m
[33m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯[0m
[2K[34m⠸[0m Creating objects...
[2K[1A[2K[34m⠦[0m Creating objects...
[37m├── [0m🔨 Created mount /content/app.py
[2K[1A[2K[1A[2K[34m⠏[0m Creating objects...
[37m├── [0m🔨 Created mount /content/app.py
[37m├── [0m🔨 Created function download_models.
[37m├── [0m🔨 Created func