diff --git a/docs/assets/inpainting/000019.curly.hair.deselected.png b/docs/assets/inpainting/000019.curly.hair.deselected.png new file mode 100644 index 00000000000..54f2285550c Binary files /dev/null and b/docs/assets/inpainting/000019.curly.hair.deselected.png differ diff --git a/docs/assets/inpainting/000019.curly.hair.masked.png b/docs/assets/inpainting/000019.curly.hair.masked.png new file mode 100644 index 00000000000..a221c522f3e Binary files /dev/null and b/docs/assets/inpainting/000019.curly.hair.masked.png differ diff --git a/docs/assets/inpainting/000019.curly.hair.selected.png b/docs/assets/inpainting/000019.curly.hair.selected.png new file mode 100644 index 00000000000..e25bb4340c5 Binary files /dev/null and b/docs/assets/inpainting/000019.curly.hair.selected.png differ diff --git a/docs/assets/inpainting/000024.801380492.png b/docs/assets/inpainting/000024.801380492.png new file mode 100644 index 00000000000..9c72eb06b8b Binary files /dev/null and b/docs/assets/inpainting/000024.801380492.png differ diff --git a/docs/features/CLI.md b/docs/features/CLI.md index 1219a2bed7c..78f035b8717 100644 --- a/docs/features/CLI.md +++ b/docs/features/CLI.md @@ -503,6 +503,16 @@ invoke> !search surreal This clears the search history from memory and disk. Be advised that this operation is irreversible and does not issue any warnings! +## Other ! Commands + +### !mask + +This command takes an image and a text prompt, and uses the `clipseg` +algorithm to automatically generate a mask of the area that matches +the text prompt. It is useful for debugging the text masking process +prior to inpainting with the `--text_mask` argument. See +[INPAINTING.md](INPAINTING.md) for details. + ## Command-line editing and completion The command-line offers convenient history tracking, editing, and diff --git a/docs/features/INPAINTING.md b/docs/features/INPAINTING.md index ac558917e78..226144de95b 100644 --- a/docs/features/INPAINTING.md +++ b/docs/features/INPAINTING.md @@ -74,6 +74,60 @@ up at all! 
invoke> a baseball -I /path/to/breakfast.png -tm orange 0.6 ~~~ +The `!mask` command may be useful for debugging problems with the +text2mask feature. The syntax is `!mask /path/to/image.png -tm <text> +<threshold>` + +It will generate three files: + +- The image with the selected area highlighted. +- The image with the un-selected area highlighted. +- The image with the selected area converted into a black and white + image according to the threshold level. + +Note that none of these images are intended to be used as the mask +passed to invoke via `-M` and may give unexpected results if you try +to use them this way. Instead, use `!mask` for testing that you are +selecting the right mask area, and then do inpainting using the +best selection term and threshold. + +Here is an example of how `!mask` works: + +``` +invoke> !mask ./test-pictures/curly.png -tm hair 0.5 +>> generating masks from ./test-pictures/curly.png +>> Initializing clipseg model for text to mask inference +Outputs: +[941.1] outputs/img-samples/000019.curly.hair.deselected.png: !mask ./test-pictures/curly.png -tm hair 0.5 +[941.2] outputs/img-samples/000019.curly.hair.selected.png: !mask ./test-pictures/curly.png -tm hair 0.5 +[941.3] outputs/img-samples/000019.curly.hair.masked.png: !mask ./test-pictures/curly.png -tm hair 0.5 +``` + +**Original image "curly.png"** + + +**000019.curly.hair.selected.png** + + +**000019.curly.hair.deselected.png** + + +**000019.curly.hair.masked.png** + + +It looks like we selected the hair pretty well at the 0.5 threshold +(which is the default, so we didn't actually have to specify it), so +let's have some fun: + +``` +invoke> medusa with cobras -I ./test-pictures/curly.png -tm hair 0.5 -C20 +>> loaded input image of size 512x512 from ./test-pictures/curly.png +... 
+Outputs: +[946] outputs/img-samples/000024.801380492.png: "medusa with cobras" -s 50 -S 801380492 -W 512 -H 512 -C 20.0 -I ./test-pictures/curly.png -A k_lms -f 0.75 +``` + + ### Inpainting is not changing the masked region enough! diff --git a/environment-mac.yml b/environment-mac.yml index 9b43fd09789..14509ccff79 100644 --- a/environment-mac.yml +++ b/environment-mac.yml @@ -57,7 +57,7 @@ dependencies: - -e git+https://github.com/openai/CLIP.git@main#egg=clip - -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion - -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan - - -e git+https://github.com/invoke-ai/clipseg.git#egg=clipseg + - -e git+https://github.com/invoke-ai/clipseg.git@models-rename#egg=clipseg - -e . variables: PYTORCH_ENABLE_MPS_FALLBACK: 1 diff --git a/environment.yml b/environment.yml index 72468067e4c..820f940608b 100644 --- a/environment.yml +++ b/environment.yml @@ -37,5 +37,5 @@ dependencies: - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers - -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion - -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan - - -e git+https://github.com/invoke-ai/clipseg.git#egg=clipseg + - -e git+https://github.com/invoke-ai/clipseg.git@models-rename#egg=clipseg - -e . diff --git a/ldm/generate.py b/ldm/generate.py index 7fb68dec0ad..fa72a72d388 100644 --- a/ldm/generate.py +++ b/ldm/generate.py @@ -805,6 +805,23 @@ def upscale_and_reconstruct(self, else: r[0] = image + def apply_textmask(self, image_path:str, prompt:str, callback, threshold:float=0.5): + assert os.path.exists(image_path), '** "{image_path}" not found. 
Please enter the name of an existing image file to mask **' + basename,_ = os.path.splitext(os.path.basename(image_path)) + if self.txt2mask is None: + self.txt2mask = Txt2Mask(device = self.device) + segmented = self.txt2mask.segment(image_path,prompt) + trans = segmented.to_transparent() + inverse = segmented.to_transparent(invert=True) + mask = segmented.to_mask(threshold) + + path_filter = re.compile(r'[<>:"/\\|?*]') + safe_prompt = path_filter.sub('_', prompt)[:50].rstrip(' .') + + callback(trans,f'{safe_prompt}.deselected',use_prefix=basename) + callback(inverse,f'{safe_prompt}.selected',use_prefix=basename) + callback(mask,f'{safe_prompt}.masked',use_prefix=basename) + # to help WebGUI - front end to generator util function def sample_to_image(self, samples): return self._make_base().sample_to_image(samples) diff --git a/ldm/invoke/readline.py b/ldm/invoke/readline.py index 0a798940296..df2e1a1e5bb 100644 --- a/ldm/invoke/readline.py +++ b/ldm/invoke/readline.py @@ -56,6 +56,7 @@ '--png_compression','-z', '--text_mask','-tm', '!fix','!fetch','!history','!search','!clear', + '!mask', '!models','!switch','!import_model','!edit_model' ) MODEL_COMMANDS = ( @@ -71,6 +72,7 @@ IMG_FILE_COMMANDS=( '!fix', '!fetch', + '!mask', '--init_img[=\s]','-I', '--init_mask[=\s]','-M', '--init_color[=\s]', diff --git a/ldm/invoke/txt2mask.py b/ldm/invoke/txt2mask.py index 01d93546e35..bc8251abde2 100644 --- a/ldm/invoke/txt2mask.py +++ b/ldm/invoke/txt2mask.py @@ -29,9 +29,9 @@ import torch import numpy as np -from models.clipseg import CLIPDensePredT +from clipseg_models.clipseg import CLIPDensePredT from einops import rearrange, repeat -from PIL import Image +from PIL import Image, ImageOps from torchvision import transforms CLIP_VERSION = 'ViT-B/16' @@ -50,9 +50,14 @@ def to_mask(self,threshold:float=0.5)->Image: discrete_heatmap = self.heatmap.lt(threshold).int() return self._rescale(Image.fromarray(np.uint8(discrete_heatmap*255),mode='L')) - def 
to_transparent(self)->Image: + def to_transparent(self,invert:bool=False)->Image: transparent_image = self.image.copy() - transparent_image.putalpha(self.to_grayscale()) + gs = self.to_grayscale() + # The following line looks like a bug, but isn't. + # For img2img, we want the selected regions to be transparent, + # but to_grayscale() returns the opposite. + gs = ImageOps.invert(gs) if not invert else gs + transparent_image.putalpha(gs) return transparent_image # unscales and uncrops the 352x352 heatmap so that it matches the image again @@ -79,7 +84,7 @@ def __init__(self,device='cpu'): self.model.load_state_dict(torch.load(CLIPSEG_WEIGHTS, map_location=torch.device('cpu')), strict=False) @torch.no_grad() - def segment(self, image:Image, prompt:str) -> SegmentedGrayscale: + def segment(self, image, prompt:str) -> SegmentedGrayscale: ''' Given a prompt string such as "a bagel", tries to identify the object in the provided image and returns a SegmentedGrayscale object in which the brighter @@ -94,6 +99,10 @@ def segment(self, image:Image, prompt:str) -> SegmentedGrayscale: transforms.Resize((CLIPSEG_SIZE, CLIPSEG_SIZE)), # must be multiple of 64... ]) + if type(image) is str: + image = Image.open(image).convert('RGB') + + image = ImageOps.exif_transpose(image) img = self._scale_and_crop(image) img = transform(img).unsqueeze(0) diff --git a/requirements-linux-arm64.txt b/requirements-linux-arm64.txt index 3a6ab888c9b..5ee4df23993 100644 --- a/requirements-linux-arm64.txt +++ b/requirements-linux-arm64.txt @@ -22,5 +22,5 @@ transformers==4.21.3 -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers -e git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan --3 git+https://github.com/invoke-ai/clipseg.git#egg=clipseg +-e git+https://github.com/invoke-ai/clipseg.git@models-rename#egg=clipseg -e . 
diff --git a/requirements.txt b/requirements.txt index 3517e48177a..2e85166841c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,4 @@ realesrgan git+https://github.com/openai/CLIP.git@main#egg=clip git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k-diffusion git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan -git+https://github.com/invoke-ai/clipseg.git#egg=clipseg +git+https://github.com/invoke-ai/clipseg.git@models-rename#egg=clipseg diff --git a/scripts/invoke.py b/scripts/invoke.py index 8ea71c0de56..754ee1fad06 100644 --- a/scripts/invoke.py +++ b/scripts/invoke.py @@ -225,9 +225,13 @@ def main_loop(gen, opt, infile): os.makedirs(opt.outdir) current_outdir = opt.outdir - # write out the history at this point + # Write out the history at this point. + # TODO: Fix the parsing of command-line parameters + # so that !operations don't need to be stripped and readded if operation == 'postprocess': completer.add_history(f'!fix {command}') + elif operation == 'mask': + completer.add_history(f'!mask {command}') else: completer.add_history(command) @@ -247,13 +251,28 @@ def image_writer(image, seed, upscaled=False, first_seed=None, use_prefix=None): # when the -v switch is used to generate variations nonlocal prior_variations nonlocal prefix - if use_prefix is not None: - prefix = use_prefix path = None if opt.grid: grid_images[seed] = image + + elif operation == 'mask': + filename = f'{prefix}.{use_prefix}.{seed}.png' + tm = opt.text_mask[0] + th = opt.text_mask[1] if len(opt.text_mask)>1 else 0.5 + formatted_dream_prompt = f'!mask {opt.prompt} -tm {tm} {th}' + path = file_writer.save_image_and_prompt_to_png( + image = image, + dream_prompt = formatted_dream_prompt, + metadata = {}, + name = filename, + compress_level = opt.png_compression, + ) + results.append([path, formatted_dream_prompt]) + else: + if use_prefix is not None: + prefix = use_prefix postprocessed = upscaled if upscaled else operation=='postprocess' filename, 
formatted_dream_prompt = prepare_image_metadata( opt, @@ -292,7 +311,7 @@ def image_writer(image, seed, upscaled=False, first_seed=None, use_prefix=None): results.append([path, formatted_dream_prompt]) # so that the seed autocompletes (on linux|mac when -S or --seed specified - if completer: + if completer and operation == 'generate': completer.add_seed(seed) completer.add_seed(first_seed) last_results.append([path, seed]) @@ -310,6 +329,10 @@ def image_writer(image, seed, upscaled=False, first_seed=None, use_prefix=None): print(f'>> fixing {opt.prompt}') opt.last_operation = do_postprocess(gen,opt,image_writer) + elif operation == 'mask': + print(f'>> generating masks from {opt.prompt}') + do_textmask(gen, opt, image_writer) + if opt.grid and len(grid_images) > 0: grid_img = make_grid(list(grid_images.values())) grid_seeds = list(grid_images.keys()) @@ -355,6 +378,10 @@ def do_command(command:str, gen, opt:Args, completer) -> tuple: command = command.replace('!fix ','',1) operation = 'postprocess' + elif command.startswith('!mask'): + command = command.replace('!mask ','',1) + operation = 'mask' + elif command.startswith('!switch'): model_name = command.replace('!switch ','',1) gen.set_model(model_name) @@ -363,6 +390,7 @@ def do_command(command:str, gen, opt:Args, completer) -> tuple: elif command.startswith('!models'): gen.model_cache.print_models() + completer.add_history(command) operation = None elif command.startswith('!import'): @@ -494,6 +522,19 @@ def write_config_file(conf_path, gen, model_name, new_config, clobber=False): os.rename(tmpfile,conf_path) return True +def do_textmask(gen, opt, callback): + image_path = opt.prompt + assert os.path.exists(image_path), '** "{image_path}" not found. 
Please enter the name of an existing image file to mask **' + assert opt.text_mask is not None and len(opt.text_mask) >= 1, '** Please provide a text mask with -tm **' + tm = opt.text_mask[0] + threshold = float(opt.text_mask[1]) if len(opt.text_mask) > 1 else 0.5 + gen.apply_textmask( + image_path = image_path, + prompt = tm, + threshold = threshold, + callback = callback, + ) + def do_postprocess (gen, opt, callback): file_path = opt.prompt # treat the prompt as the file pathname if os.path.dirname(file_path) == '': #basename given @@ -670,7 +711,7 @@ def load_face_restoration(opt): print(traceback.format_exc(), file=sys.stderr) print('>> You may need to install the ESRGAN and/or GFPGAN modules') return gfpgan,codeformer,esrgan - + def make_step_callback(gen, opt, prefix): destination = os.path.join(opt.outdir,'intermediates',prefix) os.makedirs(destination,exist_ok=True) diff --git a/scripts/preload_models.py b/scripts/preload_models.py index 97b79e1845f..1b0ad80e5c9 100644 --- a/scripts/preload_models.py +++ b/scripts/preload_models.py @@ -107,25 +107,27 @@ print(traceback.format_exc()) print('...success') -print('Loading clipseq model for text-based masking...',end='') +print('Loading clipseg model for text-based masking...',end='') try: model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download' model_dest = 'src/clipseg/clipseg_weights.zip' - if not os.path.exists(model_dest): + weights_dir = 'src/clipseg/weights' + if not os.path.exists(weights_dir): os.makedirs(os.path.dirname(model_dest), exist_ok=True) urllib.request.urlretrieve(model_url,model_dest) with zipfile.ZipFile(model_dest,'r') as zip: zip.extractall('src/clipseg') os.rename('src/clipseg/clipseg_weights','src/clipseg/weights') - from models.clipseg import CLIPDensePredT + os.remove(model_dest) + from clipseg_models.clipseg import CLIPDensePredT model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, ) model.eval() model.load_state_dict( - 
torch.load('src/clipseg/weights/rd64-uni-refined.pth'), - model.load_state_dict(torch.load('src/clipseg/weights/rd64-uni-refined.pth'), - map_location=torch.device('cpu'), - strict=False, - ) + torch.load( + 'src/clipseg/weights/rd64-uni-refined.pth', + map_location=torch.device('cpu') + ), + strict=False, ) except Exception: print('Error installing clipseg model:')