In [None]:
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image
import torch

In [None]:
# Load the SDXL base text-to-image pipeline in half precision and move it to the GPU.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")

# Attach the SDXL IP-Adapter weights (ViT-H variant).
# `image_encoder_folder=None` skips loading a CLIP image encoder: this notebook
# feeds precomputed embeddings through `ip_adapter_image_embeds`, so an encoder
# would only cost download time and GPU memory.
# NOTE(review): the default folder under `sdxl_models` is presumably not the
# ViT-H encoder this weight file was trained with -- confirm against the
# h94/IP-Adapter repository layout before ever passing raw images instead.
pipe.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name="ip-adapter_sdxl_vit-h.safetensors",
    image_encoder_folder=None,
    torch_dtype=torch.float16,
)

# Scale applied to the IP-Adapter (image-embedding) conditioning.
pipe.set_ip_adapter_scale(1)

Here we assume that the image embedding (`combined_embeds`) is obtained as follows:
1. Pass fMRI through the fMRI encoder to get `fmri_embeds`
2. Pass EEG through the EEG encoder to get `eeg_embeds`
3. Pass `fmri_embeds` and `eeg_embeds` through the fusion module to get `combined_embeds`

`combined_embeds` must have the same dimensionality as the CLIP ViT-H/14 image embedding, i.e. 1024.

In [None]:
# Placeholder for the fused fMRI/EEG embedding: here it is just a random vector.
# It is drawn from a local, seeded generator so that Restart & Run All always
# produces the same input without touching the global RNG state.
EMBED_DIM = 1024  # must match the CLIP ViT-H/14 embedding size stated above
EMBED_SEED = 0
combined_embeds = torch.randn(
    1, EMBED_DIM, generator=torch.Generator().manual_seed(EMBED_SEED)
).to(torch.float16)  # shape (1, 1024), float16

In [None]:
# Build the IP-Adapter conditioning tensor for classifier-free guidance:
# an all-zero "negative" embedding stacked on top of the real embedding along
# dim 0, then a singleton axis inserted at position 1.
# With `combined_embeds` of shape (1, 1024) the result is (2, 1, 1024).
negative_embeds = torch.zeros_like(combined_embeds)
ip_adapter_image_embeds = torch.cat([negative_embeds, combined_embeds]).unsqueeze(1)

# Fixed-seed generator so the sampled latents -- and therefore the image -- are
# the same on every Restart & Run All.
SAMPLING_SEED = 0

# Empty text prompt: generation is driven by the image embedding only.
image = pipe(
    prompt='',
    ip_adapter_image_embeds=[ip_adapter_image_embeds],
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator(device="cpu").manual_seed(SAMPLING_SEED),
).images[0]

image