In [None]:
#@markdown # Set worker info here (highly encouraged!)

# Base URL of the horde server this worker talks to
horde_url = "https://stablehorde.net"
# Give a cool name to your instance
horde_name = "My Awesome Instance"
# The api_key identifies a unique user in the horde
# Visit https://stablehorde.net/register to create one before you can join
horde_api_key = "0000000000"
# Usernames whose prompts you want to prioritize.
# The owner's username is always included, so you don't need to add it here unless you want it to have lower priority than another user
horde_priority_usernames = []
# The amount of power your system can handle
# 8 means 512*512. Each increase increases the possible resolution by 64 pixels
# So if you set this to 2 (the minimum), your SD can only generate 64x64 pixels
# If you set this to 32, it is equivalent to 1024x1024 pixels
# NOTE(review): this value is not read by the request code in this notebook
# (getPop sends a fixed "max_pixels" of 1024*1024) — confirm whether it should be wired in.
horde_max_power = 8
# Set this to False if you do not want your worker to receive requests for NSFW generations
horde_nsfw = True
# A list of words which you do not want your worker to accept
horde_blacklist = []
# A list of words for which you always want to allow the NSFW censor filter, even when this worker is in NSFW mode
# NOTE(review): not referenced anywhere else in this notebook as far as visible — confirm.
horde_censorlist = []

%env USE_MEMORY_EFFICIENT_ATTENTION 1

In [None]:
#@markdown # Install dependencies
!nvidia-smi
# enable the following line if you want to speed up everything seems broken rn
# !pip install "git+https://github.com/facebookresearch/xformers"

!pip install diffusers==0.3.0
!pip install transformers scipy ftfy
!pip install gradio datasets tqdm
!pip3 install Cython
!mkdir /root/.huggingface
!echo -n "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" > /root/.huggingface/token
!git clone https://github.com/sberbank-ai/Real-ESRGAN
!pip install -r Real-ESRGAN/requirements.txt
# download model weights
# x2 
!gdown https://drive.google.com/uc?id=1pG2S3sYvSaO0V0B8QPOl1RapPHpUGOaV -O Real-ESRGAN/weights/RealESRGAN_x2.pth
!gdown https://drive.google.com/uc?id=1SGHdZAln4en65_NQeQY9UjchtkEF9f5F -O Real-ESRGAN/weights/RealESRGAN_x4.pth
!gdown https://drive.google.com/uc?id=1mT9ewx86PSrc43b-ax47l1E2UzR7Ln4j -O Real-ESRGAN/weights/RealESRGAN_x8.pth


from tensorflow.python.client import device_lib
import sys
import re

import inspect
import warnings
from typing import List, Optional, Union

import torch
from torch import autocast
from tqdm.auto import tqdm

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DiffusionPipeline,
    PNDMScheduler,
    UNet2DConditionModel,
)

from diffusers import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
%cd Real-ESRGAN/
from RealESRGAN import RealESRGAN
%cd ..
from PIL import Image
import numpy as np
import torch


In [None]:
#@markdown # Get an available GPU plus the system version
def get_available_gpus():
    """Return the names of the GPUs TensorFlow reports on this machine.

    The name is taken from the second comma-separated field of each device's
    `physical_device_desc`, i.e. the text after ": " in that field.
    """
    gpu_names = []
    for device_proto in device_lib.list_local_devices():
        if device_proto.device_type != 'GPU':
            continue
        name_field = device_proto.physical_device_desc.split(",")[1]
        gpu_names.append(name_field.split(": ")[1])
    return gpu_names
# Name of the first GPU; raises IndexError when no GPU is available.
gpuname = get_available_gpus()[0]

# "major.minor" of the running interpreter. Built from version_info because
# slicing sys.version to 3 characters yields "3.1" on Python 3.10 and later.
sysversion = f"{sys.version_info.major}.{sys.version_info.minor}"

In [None]:
#@markdown # Initialize img2img pipeline
class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
    """Stable Diffusion pipeline that handles both text-to-image and image-to-image.

    When `init_image` is passed to `__call__`, the image is encoded to latents,
    noised according to `strength` and then denoised. Otherwise one random
    latent per seed is generated and denoised from scratch. No safety checker
    is registered or run by this pipeline.
    """

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler],
        feature_extractor: CLIPFeatureExtractor,
    ):
        """Register the model components; the scheduler is switched to the "pt" format."""
        super().__init__()
        scheduler = scheduler.set_format("pt")
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
        Enable sliced attention computation.
        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
        in several steps. This is useful to save some memory in exchange for a small speed decrease.
        Args:
            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                `attention_head_dim` must be a multiple of `slice_size`.
        """
        if slice_size == "auto":
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = self.unet.config.attention_head_dim // 2
        self.unet.set_attention_slice(slice_size)

    def disable_attention_slicing(self):
        r"""
        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
        back to computing attention in one step.
        """
        # set slice_size = `None` to disable `attention slicing`
        self.enable_attention_slicing(None)

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        init_image: Optional[torch.FloatTensor] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        eta: Optional[float] = 0.0,
        generator: List[str] = [],
        output_type: Optional[str] = "pil",
        height: Optional[int] = 512,
        width: Optional[int] = 512,
    ):
        """Generate images for `prompt`, optionally starting from `init_image`.

        Args:
            prompt: A prompt or a list of prompts (the list length is the batch size).
            init_image: Optional image tensor. When given, image-to-image is run
                and `height`/`width` are not used to size the latents.
            strength: Value in [0, 1] controlling how many of the scheduler
                steps are applied to `init_image`.
            num_inference_steps: Number of scheduler steps.
            guidance_scale: Classifier-free guidance weight; guidance is only
                applied when this is greater than 1.
            eta: Passed to the scheduler step when it accepts an `eta` argument.
            generator: Seeds, each convertible with `int()`. Text-to-image needs
                exactly one seed per prompt; image-to-image uses only the first
                seed. A single `str`/`int` is treated as a one-element list.
            output_type: `"pil"` converts the result to PIL images; any other
                value returns the numpy array.
            height, width: Output size for text-to-image; must be divisible by 8.

        Returns:
            A dict `{"sample": images}`.

        Raises:
            ValueError: On an invalid `prompt` type, `strength`, size, or seed list.
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
            raise ValueError(f'The value of strength should in [0.0, 1.0] but is {strength}')

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # Normalise the seeds into a fresh list. A bare string such as "12345"
        # must not be indexed, otherwise only its first character would be used
        # as the seed. The (shared) default list is never mutated.
        if generator is None:
            seeds = []
        elif isinstance(generator, (str, int)):
            seeds = [generator]
        else:
            seeds = list(generator)
        if not seeds:
            raise ValueError("`generator` has to provide at least one seed.")
        if init_image is None and len(seeds) != batch_size:
            raise ValueError(
                f"`generator` has to provide one seed per prompt but got {len(seeds)} seeds for {batch_size} prompts."
            )

        # get prompt text embeddings
        text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]

        # set timesteps; some schedulers take an `offset` argument
        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {}
        offset = 0
        if accepts_offset:
            offset = 1
            extra_set_kwargs["offset"] = 1
        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

        if init_image is not None:
            # encode the init image into latents and scale the latents
            init_latents = self.vae.encode(init_image.to(self.device)).latent_dist.sample()
            init_latents = 0.18215 * init_latents

            # repeat the init latents once per prompt
            init_latents = torch.cat([init_latents] * batch_size)

            # get the original timestep using init_timestep
            init_timestep = int(num_inference_steps * strength) + offset
            init_timestep = min(init_timestep, num_inference_steps)
            timesteps = self.scheduler.timesteps[-init_timestep]
            timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)

            # add noise to latents using the timesteps; the RNG lives on the
            # pipeline's own device instead of a hard-coded "cuda"
            noise_rng = torch.Generator(device=self.device).manual_seed(int(seeds[0]))
            noise = torch.randn(init_latents.shape, generator=noise_rng, device=self.device)
            latents = self.scheduler.add_noise(init_latents, noise, timesteps)
        else:
            init_timestep = 0
            # one independently seeded latent per prompt
            seeded_latents = []
            for seed in seeds:
                latent_rng = torch.Generator(device=self.device).manual_seed(int(seed))
                seeded_latents.append(
                    torch.randn(
                        (1, self.unet.in_channels, height // 8, width // 8),
                        generator=latent_rng,
                        device=self.device,
                    )
                )
            latents = torch.cat(seeded_latents, dim=0)

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            max_length = text_input.input_ids.shape[-1]
            uncond_input = self.tokenizer(
                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
            )
            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # image-to-image skips the early timesteps that `strength` rules out
        t_start = max(num_inference_steps - init_timestep + offset, 0) if init_image is not None else 0
        for t in tqdm(self.scheduler.timesteps[t_start:]):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            # predict the noise residual
            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]

        # scale and decode the image latents with vae
        latents = 1 / 0.18215 * latents
        image = self.vae.decode(latents).sample

        # map from [-1, 1] to [0, 1] and move channels last
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        return {"sample": image}
device = "cuda"
model_path = "CompVis/stable-diffusion-v1-4"

# DDIMScheduler is used here as an example; PNDMScheduler works as well.
# To switch, uncomment the following line and drop the DDIMScheduler below.

# scheduler = PNDMScheduler.from_config(model_path, subfolder="scheduler", use_auth_token=True)

scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
)

# Load the half-precision ("fp16" revision) weights, then move the pipeline to the GPU.
_pretrained_options = dict(
    scheduler=scheduler,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
)
img2imgpipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_path, **_pretrained_options).to(device)
# `pipe` is an alias of the same object, used for text-to-image calls.
pipe = img2imgpipe


In [None]:
#@markdown # Enable attention slicing
# Compute attention in slices (default "auto" slice size) to save memory in
# exchange for a small speed decrease, then show the GPU status.
pipe.enable_attention_slicing()
!nvidia-smi

In [None]:
#@markdown # Load pipelines
def _load_upscaler(scale):
    """Create a Real-ESRGAN model on the GPU for `scale` and load its downloaded weights."""
    upscaler = RealESRGAN("cuda", scale = scale)
    upscaler.load_weights(f'Real-ESRGAN/weights/RealESRGAN_x{scale}.pth')
    return upscaler

# One upscaler per supported factor (loaded in the order x2, x8, x4).
model2 = _load_upscaler(2)
model8 = _load_upscaler(8)
model4 = _load_upscaler(4)
def upscaleFunc(image,mult):
    """Upscale `image` with the Real-ESRGAN model matching `mult`.

    `mult` is compared against the strings "2", "4" and "8"; for any other
    value the image is returned unchanged.
    """
    if mult == "2":
        upscaler = model2
    elif mult == "4":
        upscaler = model4
    elif mult == "8":
        upscaler = model8
    else:
        # unknown factor: pass the input through untouched
        return image
    return upscaler.predict(np.array(image))

In [None]:
#@markdown # Initialize Stable Horde functions
import urllib3, json 
http = urllib3.PoolManager()
# horde stuff
DEFAULT_HEADERS = {'content-type': 'text/plain'}
# Sentinel returned by makeHttpRequest instead of a response when a request fails.
REQUEST_FAILED = "Request failed"


def makeHttpRequest(type, url, body = None, headers = DEFAULT_HEADERS):
    """Send a GET or POST request through the shared urllib3 pool.

    Args:
        type: HTTP method, either "GET" or "POST".
        url: Target URL.
        body: Request body; only sent for POST.
        headers: Request headers; only sent for POST. The default dict is
            shared between calls and is never modified here.

    Returns:
        The urllib3 response, or `REQUEST_FAILED` when the request raised or
        `type` is not a supported method.
    """
    try:
        if type == "GET":
            return http.request("GET", url)
        elif type == "POST":
            return http.request("POST", url, body=body, headers=headers)
    except Exception as e:
        # Report why the request failed instead of swallowing the error silently.
        print(f"{type} request to {url} raised: {e!r}")
        return REQUEST_FAILED
    # Previously an unknown method fell through and returned None, which the
    # callers would then dereference; report it as a failed request instead.
    print(f"Unsupported HTTP method: {type!r}")
    return REQUEST_FAILED

def getPop():
    """Ask the horde for the next generation job for this worker.

    Posts the worker name, priority usernames, NSFW flag and blacklist to
    `/api/v2/generate/pop` and returns whatever `makeHttpRequest` returns.
    """
    # NOTE(review): "max_pixels" is fixed at 1024*1024 and does not take
    # `horde_max_power` into account — confirm this is intended.
    pop_payload = {
        "name": horde_name,
        "priority_usernames": horde_priority_usernames,
        "nsfw": horde_nsfw,
        "blacklist": horde_blacklist,
        "max_pixels": 1024*1024,
    }
    pop_headers = {'content-type': 'application/json', "apikey": horde_api_key}
    return makeHttpRequest(
        "POST",
        horde_url+"/api/v2/generate/pop",
        body=json.dumps(pop_payload),
        headers=pop_headers,
    )
  
def push(base64encodedimage,seed,pid):
    """Submit one finished generation for job `pid` to the horde.

    Posts the base64-encoded image and its seed to `/api/v2/generate/submit`
    and returns whatever `makeHttpRequest` returns.
    """
    submit_payload = {"id": pid, "generation": base64encodedimage, "seed": seed}
    submit_headers = {"Content-Type": "application/json", "apikey": horde_api_key}
    return makeHttpRequest(
        "POST",
        f"{horde_url}/api/v2/generate/submit",
        body=json.dumps(submit_payload),
        headers=submit_headers,
    )
    

In [None]:
#@markdown # Start Stable Horde Worker
# Colab form toggle: when True, the worker loop prints every prompt it receives.
print_prompts = False #@param {type:"boolean"}
import torch
import random
import re
# NOTE(review): `version` and `batch` are not referenced anywhere else in this
# notebook as far as visible — confirm before relying on them.
version = "1.6"
batch = 4
from torch import autocast
from torchvision import transforms

import os
import base64
import time
from io import BytesIO
from PIL import Image

from io import BytesIO
import torch
import time
from tqdm import tqdm, trange
from PIL import Image
import numpy as np
import base64

def my_preprocess(image, mask):
    """Convert a PIL image into a normalised tensor for the pipeline.

    The image is first resized to 512x512, then converted to a float32 tensor
    with a leading batch axis and channels moved in front of height/width,
    with values rescaled from [0, 255] to [-1, 1].

    Args:
        image: PIL image to convert.
        mask: Unused; kept so existing callers keep working.
    """
    #find a way to make it work with 512-1024 dimensions, problem arises with the mask tensor
    image = image.resize((512, 512))

    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
    # Only `Image` is imported from PIL in this notebook, so the filter has to be
    # referenced as Image.LANCZOS (PIL.Image.LANCZOS raised NameError).
    image = image.resize((w, h), resample=Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2.0 * image - 1.0

def my_preprocess_mask(mask):
    """Convert a PIL mask into a (1, 4, 64, 64) float32 tensor with values in [0, 1].

    The mask is converted to greyscale, resized to 64x64, scaled to [0, 1] and
    repeated 4 times along a new leading axis before the batch axis is added.
    """
    mask = mask.convert("L")
    # Only `Image` is imported from PIL in this notebook, so the filter has to be
    # referenced as Image.LANCZOS (PIL.Image.LANCZOS raised NameError).
    mask = mask.resize((64,64), resample=Image.LANCZOS)
    mask = np.array(mask).astype(np.float32) / 255.0
    mask = np.tile(mask,(4,1,1))
    # Add the batch axis. The former transpose(0, 1, 2, 3) kept every axis in
    # place, so it was a no-op and has been dropped.
    mask = mask[None]
    mask = torch.from_numpy(mask)
    return mask

def load_img_pil(base64text):
    """Decode a base64-encoded image and return it as an RGB PIL image."""
    raw_bytes = base64.b64decode(base64text)
    buffer = BytesIO()
    buffer.write(raw_bytes)
    decoded = Image.open(buffer,"r")
    return decoded.convert("RGB")
def load_img(base64text, h0, w0):
    """Decode a base64 image and return it as a normalised tensor.

    The image is resized to (`w0`, `h0`) when both are given, otherwise its own
    size is kept; in either case each side is rounded down to a multiple of 32.
    The result is a float32 tensor with a leading batch axis, channels in front
    of height/width, and values rescaled from [0, 255] to [-1, 1].

    Args:
        base64text: Base64-encoded image data.
        h0: Target height, or None to keep the image's own size.
        w0: Target width, or None to keep the image's own size.
    """
    temp = BytesIO()
    temp.write(base64.b64decode(base64text))

    image = Image.open(temp,"r").convert("RGB")
    w, h = image.size

    if(h0 is not None and w0 is not None):
        h, w = h0, w0

    # Round the size actually selected above. Using (w0, h0) here ignored the
    # fallback to the image's own size and raised TypeError when either was None.
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

    print(f"New image size ({w}, {h})")
    image = image.resize((w, h), resample = Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2.*image - 1.


def image_grid(imgs, rows, cols):
    """Paste `imgs` row by row into a single `rows` x `cols` RGB grid image.

    Every cell has the size of the first image. The number of images must be
    exactly `rows*cols` (checked with an assert).
    """
    assert len(imgs) == rows*cols

    tile_w, tile_h = imgs[0].size
    canvas = Image.new('RGB', size=(cols*tile_w, rows*tile_h))

    for index, tile in enumerate(imgs):
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col*tile_w, row*tile_h))
    return canvas

def image_grid_rec(imgs):
    """Build 3x3 grids from consecutive groups of 9 images, then a 3x3 grid of those grids.

    NOTE(review): `image_grid` asserts that it receives exactly rows*cols
    images, so this only succeeds when every group has 9 images and there are
    exactly 9 groups — confirm that is the intended contract.
    """
    pages = [image_grid(imgs[start:start + 9],3,3) for start in range(0, len(imgs), 9)]
    return image_grid(pages,3,3)

# Worker main loop: poll the horde for a job, generate (or upscale) the images
# and submit each result. Runs until the kernel is interrupted.
while True:
    print("starting listen")
    fetchedNewPromptFromServer = False
    while not fetchedNewPromptFromServer:
        try:
            serverData = getPop()
        except Exception as err:
            # `except Exception` (not a bare except) so that interrupting the
            # kernel still stops the worker.
            print("error getting pop")
            print(err)
            time.sleep(10)
            continue
        if serverData == REQUEST_FAILED:
            print("failed")
            time.sleep(5)
            continue

        try:
            serverResponse = json.loads(serverData.data.decode("utf-8"))
        except (AttributeError, ValueError) as err:
            # e.g. an error page instead of JSON: wait and poll again rather
            # than crashing the worker
            print(f"could not decode server response: {err}")
            time.sleep(5)
            continue
        if not isinstance(serverResponse, dict) or serverResponse.get("id") is None:
            # no job available right now
            time.sleep(5)
            continue
        if not isinstance(serverResponse.get("payload"), dict):
            print("server response has no usable payload")
            time.sleep(5)
            continue

        pid = serverResponse["id"]
        serverResponse = serverResponse["payload"]
        if print_prompts:
            print(f"Using prompt:{serverResponse['prompt']}")
        fetchedNewPromptFromServer = True

    images = []
    origiseed = []
    try:
        width = serverResponse.get("width", 512)
        height = serverResponse.get("height", 512)
        iterations = 1
        cfg = float(serverResponse.get("cfg_scale", 7.5))
        prompt = [serverResponse["prompt"]] * iterations
        # keep the requested step count within [10, 150]
        steps = min(max(int(serverResponse["ddim_steps"]), 10), 150)

        # Resolve the seed once so that every job type (txt2img, img2img,
        # upscale) has a seed to report back; fall back to a random one when
        # the payload has none or it is not numeric.
        try:
            seed = int(serverResponse["seed"])
        except (KeyError, TypeError, ValueError):
            seed = random.randint(100,1000000)
        seedd = [seed + i for i in range(iterations)]
        origiseed = list(seedd)

        inputimg = None
        if "input" in serverResponse:
            # keywords make sure height/width land in the right parameters
            inputimg = load_img(serverResponse["input"], h0=height, w0=width)
        upscale = None
        if "upscale" in serverResponse:
            upscale = serverResponse["upscale"]
            inputimg = load_img_pil(serverResponse["input"])
        mask = None
        if "mask" in serverResponse:
            # NOTE(review): the mask is prepared here but is not passed to the
            # pipeline call below — confirm whether inpainting is expected.
            mask = my_preprocess_mask(load_img_pil(serverResponse["mask"]))
            inputimg = my_preprocess(load_img_pil(serverResponse["input"]),mask)

        # larger images are generated in smaller batches
        batches = 9
        wid = max(width,height)
        if wid > 128:
            batches = 6
        if wid > 256:
            batches = 3
        if wid > 512:
            batches = 1

        with autocast("cuda"):
            if inputimg is not None:
                if upscale is not None:
                    images = [upscaleFunc(inputimg,upscale)]
                else:
                    # img2img honours the requested steps and cfg scale, and
                    # receives the seed as a one-element list
                    images = img2imgpipe(
                        prompt=prompt,
                        init_image=inputimg,
                        strength=float(serverResponse["strength"]),
                        num_inference_steps=steps,
                        guidance_scale=cfg,
                        generator=[seed],
                    )["sample"]
            else:
                while len(prompt) > 0:
                    print(f"Batch size:{batches}")
                    images = images + pipe(
                        prompt[:batches],
                        num_inference_steps=steps,
                        generator=seedd[:batches],
                        height=height,
                        guidance_scale=cfg,
                        width=width,
                    )["sample"] # images here are in PIL format
                    prompt = prompt[batches:]
                    seedd = seedd[batches:]
    except Exception as err:
        # A bad or failing job must not kill the worker: report it and go back
        # to polling. Images finished before the failure are still submitted.
        print(f"job {pid} failed: {err!r}")

    # Encode every image as webp/base64 and submit it together with its seed.
    for index, image in enumerate(images):
        try:
            temp = BytesIO()
            image.save(temp,"webp")
            encoded = base64.b64encode(temp.getvalue()).decode('utf-8')
            # reuse the last known seed if there are more images than seeds
            usedSeed = origiseed[min(index, len(origiseed) - 1)]
            saveFailed = push(encoded,usedSeed,pid)
        except Exception as err:
            print(f"could not submit image {index} of job {pid}: {err!r}")
            continue
        if saveFailed == REQUEST_FAILED:
            print("dammit")

    print("Finish generating images")