## 試行錯誤編

In [None]:
!pip install git+https://github.com/facebookresearch/segment-anything.git

In [None]:
!pip install opencv-python



In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import os
import urllib
from functools import lru_cache
from random import randint
from typing import Any, Callable, Dict, List, Tuple

import clip
import cv2
import numpy as np
import PIL
import torch
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

# Local cache directory and file name for the SAM checkpoint.
CHECKPOINT_PATH = os.path.join(os.path.expanduser("~"), ".cache", "SAM")
CHECKPOINT_NAME = "sam_vit_h_4b8939.pth"
# Download location used by load_mask_generator() when the checkpoint is not cached.
CHECKPOINT_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
# Key looked up in sam_model_registry.
MODEL_TYPE = "default"
# Upper bound applied to the longer image side by adjust_image_size().
MAX_WIDTH = MAX_HEIGHT = 1024
# filter_masks() only considers the TOP_K_OBJ largest-area masks.
TOP_K_OBJ = 100
# NOTE(review): THRESHOLD is not referenced by any code visible in this file.
THRESHOLD = 0.85
# Torch device shared by SAM and CLIP: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@lru_cache
def load_mask_generator() -> SamAutomaticMaskGenerator:
    """Return the SAM automatic mask generator, creating it on first use.

    The checkpoint is downloaded to ``CHECKPOINT_PATH`` when it is not already
    cached there. ``lru_cache`` makes later calls return the same instance.

    Returns:
        A ``SamAutomaticMaskGenerator`` wrapping the SAM model on ``device``.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so it
    # is imported explicitly here.
    import urllib.request

    os.makedirs(CHECKPOINT_PATH, exist_ok=True)
    checkpoint = os.path.join(CHECKPOINT_PATH, CHECKPOINT_NAME)
    if not os.path.exists(checkpoint):
        # Download under a temporary name and rename afterwards, so that an
        # interrupted download is never mistaken for a complete checkpoint.
        partial = checkpoint + ".part"
        urllib.request.urlretrieve(CHECKPOINT_URL, partial)
        os.replace(partial, checkpoint)
    sam = sam_model_registry[MODEL_TYPE](checkpoint=checkpoint).to(device)
    return SamAutomaticMaskGenerator(sam)


@lru_cache
def load_clip(
    name: str = "ViT-B/32",
) -> Tuple[torch.nn.Module, Callable[[PIL.Image.Image], torch.Tensor]]:
    """Load a CLIP model and its image preprocessing transform.

    Results are cached per ``name`` by ``lru_cache``.

    Args:
        name: CLIP model name passed to ``clip.load``.

    Returns:
        ``(model, preprocess)`` with the model placed on ``device``.
    """
    clip_model, transform = clip.load(name, device=device)
    clip_model = clip_model.to(device)
    return clip_model, transform


def adjust_image_size(image: np.ndarray) -> np.ndarray:
    """Shrink ``image`` so its longer side does not exceed the configured maximum.

    The aspect ratio is kept. Images already within ``MAX_HEIGHT`` /
    ``MAX_WIDTH`` are passed through ``cv2.resize`` at their current size.
    """
    rows, cols = image.shape[:2]
    is_portrait = rows > cols
    if is_portrait and rows > MAX_HEIGHT:
        rows, cols = MAX_HEIGHT, int(MAX_HEIGHT / rows * cols)
    elif not is_portrait and cols > MAX_WIDTH:
        rows, cols = int(MAX_WIDTH / cols * rows), MAX_WIDTH
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(image, (cols, rows))


@torch.no_grad()
def get_score(crop: PIL.Image.Image, texts: List[str]) -> torch.Tensor:
    """Return the CLIP probability that ``crop`` matches the first entry of ``texts``.

    The image logits are soft-maxed over all ``texts``; only the value for
    ``texts[0]`` is returned (as a 0-d CPU tensor).
    """
    clip_model, transform = load_clip()
    image_batch = transform(crop).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(texts).to(device)
    image_logits, _text_logits = clip_model(image_batch, text_tokens)
    probabilities = image_logits.softmax(-1).cpu()
    return probabilities[0, 0]


def crop_image(image: np.ndarray, mask: Dict[str, Any]) -> PIL.Image.Image:
    """Cut the masked object out of ``image`` and pad it to a square.

    Pixels outside ``mask["segmentation"]`` are zeroed, the result is cropped
    to ``mask["bbox"]`` (x, y, w, h) and then padded with black on the shorter
    axis so that width equals height.

    Args:
        image: Source image array.
        mask: Mask record providing the ``"bbox"`` and ``"segmentation"`` keys.

    Returns:
        The square, padded crop as a PIL image.
    """
    x, y, w, h = mask["bbox"]
    masked = image * np.expand_dims(mask["segmentation"], -1)
    crop = masked[y : y + h, x : x + w]

    # Split the padding so both sides add up to the full difference; using
    # ``diff // 2`` on both sides would leave odd differences one pixel short.
    diff = abs(h - w)
    lead, trail = diff // 2, diff - diff // 2
    if h > w:
        top, bottom, left, right = 0, 0, lead, trail
    else:
        top, bottom, left, right = lead, trail, 0, 0

    crop = cv2.copyMakeBorder(
        crop,
        top,
        bottom,
        left,
        right,
        cv2.BORDER_CONSTANT,
        value=(0, 0, 0),
    )
    return PIL.Image.fromarray(crop)


def get_texts(query: str) -> List[str]:
    """Build the CLIP prompts: the queried object first, then a background prompt."""
    target_prompt = f"a picture of {query}"
    background_prompt = "a picture of background"
    return [target_prompt, background_prompt]


def filter_masks(
    image: np.ndarray,
    masks: List[Dict[str, Any]],
    predicted_iou_threshold: float,
    stability_score_threshold: float,
    query: str,
    clip_threshold: float,
) -> List[Dict[str, Any]]:
    """Keep the masks that pass the quality checks and, optionally, the CLIP check.

    Only the ``TOP_K_OBJ`` largest masks (by ``"area"``) are considered, in
    ascending area order. The CLIP score is computed only when ``query`` is
    non-empty and the cheaper checks have passed.
    """
    candidates = sorted(masks, key=lambda entry: entry["area"])[-TOP_K_OBJ:]
    kept: List[Dict[str, Any]] = []

    for candidate in candidates:
        if candidate["predicted_iou"] < predicted_iou_threshold:
            continue
        if candidate["stability_score"] < stability_score_threshold:
            continue
        if image.shape[:2] != candidate["segmentation"].shape[:2]:
            continue
        if query:
            score = get_score(crop_image(image, candidate), get_texts(query))
            if score < clip_threshold:
                continue
        kept.append(candidate)

    return kept


def remove_small_segments(segmentation: np.ndarray) -> np.ndarray:
    """Keep only the largest connected foreground component of a mask.

    Args:
        segmentation: Boolean mask.

    Returns:
        A boolean array that is True only on the largest connected component.
        If the mask has no foreground at all, an all-False array of the same
        shape is returned.
    """
    # OpenCV cannot work on boolean arrays directly: True -> 1, False -> 0.
    segmentation_int = segmentation.astype(np.uint8)

    # Label every connected component; label 0 is the background.
    num_labels, labels, stats, _centroids = cv2.connectedComponentsWithStats(
        segmentation_int
    )

    # With only the background label present there is nothing to keep, and
    # np.argmax would raise on the empty slice below.
    if num_labels < 2:
        return np.zeros(segmentation.shape, dtype=bool)

    # Component areas live in the CC_STAT_AREA column; skip the background row.
    largest_label = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
    return labels == largest_label

def remove_contained_masks(masks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop every mask whose segmentation lies entirely inside another mask.

    Args:
        masks: Mask records, each with a boolean ``"segmentation"`` array.

    Returns:
        The masks that are not fully contained in any other mask, in their
        original order. When several masks are identical, the first one is
        kept (they contain each other, so a plain containment test would drop
        all of them).
    """
    remaining_masks = []
    for i, mask_i in enumerate(masks):
        seg_i = mask_i["segmentation"]
        fully_contained = False
        for j, mask_j in enumerate(masks):
            if i == j:
                continue
            seg_j = mask_j["segmentation"]
            if not np.all(seg_i <= seg_j):
                continue
            # Strict containment always removes mask i; for duplicates only
            # the later copies are removed.
            if j < i or not np.array_equal(seg_i, seg_j):
                fully_contained = True
                break
        if not fully_contained:
            remaining_masks.append(mask_i)
    return remaining_masks

def remove_overlapping_masks(
    masks: List[Dict[str, Any]], overlap_threshold: float = 0.8
) -> List[Dict[str, Any]]:
    """Remove the smaller mask of every heavily overlapping pair.

    Two masks overlap heavily when their intersection covers more than
    ``overlap_threshold`` of the smaller mask's area.

    Args:
        masks: Mask records, each with a boolean ``"segmentation"`` array.
        overlap_threshold: Fraction of the smaller mask that must be covered.

    Returns:
        The surviving masks in their original order. Empty masks are never
        compared (their overlap ratio is undefined) and are passed through.
    """
    segmentations = [mask["segmentation"] for mask in masks]
    # Areas do not change during the pairwise scan, so compute them once.
    areas = [int(np.sum(segmentation)) for segmentation in segmentations]
    removed_indices = set()

    for i, seg_i in enumerate(segmentations):
        if i in removed_indices:
            continue  # already discarded by an earlier comparison

        for j, seg_j in enumerate(segmentations):
            if i == j or j in removed_indices:
                continue

            min_area = min(areas[i], areas[j])
            if min_area == 0:
                continue  # avoid 0/0 for empty masks

            intersection_area = int(np.sum(np.logical_and(seg_i, seg_j)))
            if intersection_area / min_area > overlap_threshold:
                if areas[i] < areas[j]:
                    removed_indices.add(i)
                    break  # mask i is gone; move on to the next mask
                # Mask j is the smaller (or equal) one; keep scanning for mask i.
                removed_indices.add(j)

    return [mask for i, mask in enumerate(masks) if i not in removed_indices]


def draw_masks(
    image: np.ndarray, masks: List[Dict[str, Any]], alpha: float = 0.7
) -> Tuple[np.ndarray, List[np.ndarray]]:
    """Outline the masks on ``image`` and highlight the quadrilateral ones.

    Heavily overlapping masks are removed first. For each remaining mask only
    its largest connected component is used, and masks covering less than 1%
    of the image are skipped. Every contour is outlined; contours that
    approximate to four vertices are additionally drawn and filled on an
    overlay that is blended into the result.

    Note: contours are drawn on ``image`` in place.

    Args:
        image: Image to annotate (modified in place).
        masks: Mask records, each with a boolean ``"segmentation"`` array.
        alpha: Weight of the annotated image in the final blend; the
            quadrilateral overlay gets ``1 - alpha``.

    Returns:
        ``(blended_image, surfaces)`` where ``surfaces`` holds one ``(4, 2)``
        vertex array per detected quadrilateral.
    """
    masks = remove_overlapping_masks(masks)
    surfaces = []
    quad_overlay = np.zeros_like(image)

    for mask in masks:
        segmentation = remove_small_segments(mask["segmentation"])

        # Skip masks smaller than 1% of the frame.
        if mask["segmentation"].size * 0.01 > np.sum(segmentation):
            continue

        contours, _ = cv2.findContours(
            np.uint8(segmentation), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        cv2.drawContours(image, contours, -1, (0, 0, 255), 2)

        for contour in contours:
            # Approximate the contour with a tolerance of 2% of its perimeter.
            epsilon = 0.02 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            if len(approx) != 4:
                continue
            # Four vertices: treat the shape as a quadrilateral surface.
            cv2.drawContours(image, [approx], 0, (255, 0, 0), 2)
            cv2.drawContours(quad_overlay, [approx], 0, (255, 0, 0), -1)
            surfaces.append(np.squeeze(approx, axis=1))

    blended = cv2.addWeighted(quad_overlay, 1 - alpha, image, alpha, 0)
    return blended, surfaces

def crop_and_affine_transform_quadrilateral(original_image: np.ndarray, src_pts: np.ndarray) -> np.ndarray:
    """Rectify a parallelogram region of ``original_image`` with an affine warp.

    ``src_pts`` is assumed to describe a parallelogram whose four corners are
    listed in order around its outline, so corners 1 and 3 are the neighbours
    of corner 0.

    Args:
        original_image: Image to crop from.
        src_pts: Four corner points, shaped ``(4, 1, 2)`` or ``(4, 2)``.

    Returns:
        The warped crop of size ``(max_width, max_height)``.
    """
    corners = np.asarray(src_pts, dtype=np.float64).reshape(-1, 2)
    p0, p1, p2, p3 = corners[:4]

    # Edge p0-p1 (and its opposite p2-p3) defines the width; edge p0-p3 (and
    # its opposite p1-p2) defines the height.
    max_width = max(int(np.linalg.norm(p0 - p1)), int(np.linalg.norm(p2 - p3)))
    max_height = max(int(np.linalg.norm(p0 - p3)), int(np.linalg.norm(p1 - p2)))

    # Destination corners: top-left, top-right, bottom-left.
    dst_pts = np.array([[0, 0], [max_width - 1, 0], [0, max_height - 1]], dtype='float32')

    # The bottom-left destination belongs to p3, the corner adjacent to p0
    # along the height edge; p2 is the diagonally opposite corner.
    M = cv2.getAffineTransform(np.float32([p0, p1, p3]), dst_pts)

    return cv2.warpAffine(original_image, M, (max_width, max_height))

def crop_test(img, points):
    """Perspective-warp the quadrilateral ``points`` of ``img`` to an upright rectangle.

    The four points are reordered to top-left, top-right, bottom-right,
    bottom-left before the transform is computed.
    """
    # Split into the two upper and two lower points, then order each pair by x
    # (the lower pair right-to-left so the outline is walked in one direction).
    by_y = sorted(points, key=lambda p: p[1])
    upper = sorted(by_y[:2], key=lambda p: p[0])
    lower = sorted(by_y[2:], key=lambda p: p[0], reverse=True)
    quad = np.array(upper + lower, dtype='float32')

    def span(axis):
        # Larger of the two diagonal extents along ``axis``, scaled by sqrt(2).
        first = np.sqrt(((quad[0][axis] - quad[2][axis]) ** 2) * 2)
        second = np.sqrt(((quad[1][axis] - quad[3][axis]) ** 2) * 2)
        return max(first, second)

    width = span(0)
    height = span(1)

    dst = np.array(
        [
            [0, 0],
            [width - 1, 0],
            [width - 1, height - 1],
            [0, height - 1],
        ],
        np.float32,
    )

    # Homography mapping the ordered source corners onto the rectangle.
    trans = cv2.getPerspectiveTransform(quad, dst)
    return cv2.warpPerspective(img, trans, (int(width), int(height)))

def normalize_surface(surface, image_width, image_height):
    """Convert pixel vertices to float32 coordinates relative to the image size.

    Column 0 (x) is divided by ``image_width`` and column 1 (y) by
    ``image_height``.
    """
    scaled = np.zeros_like(surface, dtype=np.float32)
    for column, extent in enumerate((image_width, image_height)):
        scaled[:, column] = surface[:, column] / extent
    return scaled

def denormalize_approx(surface, image_width, image_height):
    """Convert relative vertices back to rounded int32 pixel coordinates.

    Column 0 (x) is multiplied by ``image_width`` and column 1 (y) by
    ``image_height``.
    """
    pixels = np.zeros_like(surface, dtype=np.int32)
    for column, extent in enumerate((image_width, image_height)):
        pixels[:, column] = np.round(surface[:, column] * extent).astype(np.int32)
    return pixels

def segment_frame(predicted_iou_threshold: float,
    stability_score_threshold: float,
    clip_threshold: float,
    frame: np.ndarray,
    query: str,
    mask_generator,
) -> np.ndarray:
    """Segment one video frame and return it with the detected masks drawn.

    Args:
        predicted_iou_threshold: Minimum ``predicted_iou`` for a mask to be kept.
        stability_score_threshold: Minimum ``stability_score`` for a mask to be kept.
        clip_threshold: Minimum CLIP score when ``query`` is non-empty.
        frame: Frame image array; it is resized before segmentation.
        query: Text describing the wanted object; empty disables the CLIP check.
        mask_generator: Object whose ``generate(image)`` returns the mask records.

    Returns:
        The annotated, resized frame. If any step raises, the error is printed
        and the resized frame is returned instead.
    """
    # The generator is passed in by the caller instead of being loaded here.
    # mask_generator = load_mask_generator()
    # NOTE(review): no BGR->RGB conversion is applied to the frame — confirm
    # this is intended for frames read with OpenCV.
    # ori_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # reduce the size to save gpu memory
    image = adjust_image_size(frame)
    try:
      masks = mask_generator.generate(image)
      masks = filter_masks(
          image,
          masks,
          predicted_iou_threshold,
          stability_score_threshold,
          query,
          clip_threshold,
      )
      masked_image, _ = draw_masks(image, masks)
      # masked_image = PIL.Image.fromarray(masked_image)
      return masked_image
    except Exception as e:
      # Best effort: one bad frame must not abort the whole video. draw_masks
      # draws on `image` in place, so the frame returned here may already
      # carry partial outlines.
      print(e)
      print("error occur")
      return image

def segment(
    predicted_iou_threshold: float,
    stability_score_threshold: float,
    clip_threshold: float,
    image_path: str,
    query: str,
) -> Tuple[PIL.Image.Image, List[np.ndarray]]:
    """Segment an image file and return the annotated image and its surfaces.

    Args:
        predicted_iou_threshold: Minimum ``predicted_iou`` for a mask to be kept.
        stability_score_threshold: Minimum ``stability_score`` for a mask to be kept.
        clip_threshold: Minimum CLIP score when ``query`` is non-empty.
        image_path: Path of the image to read.
        query: Text describing the wanted object; empty disables the CLIP check.

    Returns:
        ``(annotated_image, surfaces)``: the annotated image as a PIL image and
        the quadrilateral vertex arrays, in pixel coordinates of the resized
        image.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    mask_generator = load_mask_generator()
    ori_image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if ori_image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    ori_image = cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB)

    # reduce the size to save gpu memory
    image = adjust_image_size(ori_image)
    masks = mask_generator.generate(image)
    masks = filter_masks(
        image,
        masks,
        predicted_iou_threshold,
        stability_score_threshold,
        query,
        clip_threshold,
    )
    masked_image, surfaces = draw_masks(image, masks)

    return PIL.Image.fromarray(masked_image), surfaces

In [None]:
# Run segment() on a sample image with IoU/stability thresholds of 0.8,
# a CLIP threshold of 0.96 and the query "display".
filename = "/content/samples/demo01.jpg"

image, surfaces = segment(0.8, 0.8, 0.96, filename, "display")

# Bare expression so the notebook shows the returned image as the cell output.
image

In [None]:
from tqdm import tqdm

video_path = "/content/samples/MOVIE.mp4"

cap = cv2.VideoCapture(video_path)

# Frame rate of the input video, reused for the output file.
fps = cap.get(cv2.CAP_PROP_FPS)

# The first frame is read only to learn the output size after resizing.
ret, first_frame = cap.read()
if not ret:
    cap.release()
    raise RuntimeError(f"Could not read a frame from {video_path}")

first_frame = adjust_image_size(first_frame.copy())
height, width = first_frame.shape[:2]

# Number of frames corresponding to 20 seconds of video.
frames_to_process = int(20 * fps)

# Prepare the output video file.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output_video.mp4', fourcc, fps, (width, height))

# Number of frames processed so far.
frame_count = 0

try:
    mask_generator = load_mask_generator()

    # Skip the first 5 seconds of the video.
    for _ in range(int(5 * fps)):
        cap.read()

    with tqdm(total=100) as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret or frame_count >= frames_to_process:
                break

            # Run segmentation on every frame.
            detected_frame = segment_frame(0.8, 0.8, 0.96, frame.copy(), "display", mask_generator)

            # Append the annotated frame to the output video.
            out.write(detected_frame)

            pbar.update(100 / frames_to_process)
            frame_count += 1
finally:
    # Release the capture and finalize the output file even when a frame fails.
    cap.release()
    out.release()

## FAST SAM

### image

In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.1.27-py3-none-any.whl (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.2/721.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.1.27


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import cv2
from google.colab.patches import cv2_imshow
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt

import os

# Define an inference source
source = '/content/drive/MyDrive/未踏/prototype/pdemo.jpg'

# Create a FastSAM model (FastSAM-s.pt is the smaller variant)
model = FastSAM('FastSAM-x.pt')

# Run inference on an image
everything_results = model(source, device='cuda', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)

# Prepare a Prompt Process object
prompt_process = FastSAMPrompt(source, everything_results, device='cuda')

# Everything prompt
ann = prompt_process.everything_prompt()

# Bbox default shape [0,0,0,0] -> [x1,y1,x2,y2]
# ann = prompt_process.box_prompt(bbox=[200, 200, 300, 300])

# Text prompt
# ann = prompt_process.text_prompt(text='the screen and laptop screen')

# Point prompt
# points default [[0,0]] [[x1,y1],[x2,y2]]
# point_label default [0] [1,0] 0:background, 1:foreground
# ann = prompt_process.point_prompt(points=[[200, 200]], pointlabel=[1])
output_dir = './result'
prompt_process.plot(annotations=ann, output=output_dir)

# plot() saves the figure under the source file's base name, so derive the
# result path from `source` instead of hard-coding another file name.
result_path = os.path.join(output_dir, os.path.basename(source))
result = cv2.imread(result_path, cv2.IMREAD_UNCHANGED)
if result is None:
    raise FileNotFoundError(f"Could not read plotted result: {result_path}")
cv2_imshow(result)


In [None]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [None]:
!pip install numba

from numba import cuda
# Fetch the current CUDA device through numba and reset it.
# Note: this rebinds the global name `device`, which the first cell uses for
# the torch device.
device = cuda.get_current_device()
device.reset()



In [None]:
import cv2
from google.colab.patches import cv2_imshow
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt
import numpy as np
import torch
from PIL import Image


class CustomFastSAMPrompt(FastSAMPrompt):
    """FastSAM prompt helper that keeps quadrilateral masks matching a text prompt.

    An instance is created without a source or results; those are supplied per
    call to ``custom_filter_mask`` so that one CLIP model can be reused across
    many images or video frames.
    """

    def __init__(self, device='cuda'):
        """Load CLIP onto ``device``.

        ``FastSAMPrompt.__init__`` is intentionally not called: it expects a
        source and results, which this class receives later.
        """
        try:
            import clip
        except ImportError:
            from ultralytics.utils.checks import check_requirements

            check_requirements("git+https://github.com/openai/CLIP.git")
            import clip
        self.clip = clip
        self.device = device
        # Filled in by custom_filter_mask().
        self.results = None
        self.source = None
        with torch.no_grad():
            self.clip_model, self.preprocess = self.clip.load("ViT-B/32", device=device)

    @staticmethod
    def _no_quads():
        """Return an empty result with the same trailing shape as real results."""
        return np.empty((0, 4, 2), dtype=np.float32)

    @torch.no_grad()
    def pre_make_text_features(self, search_text: str, device) -> torch.Tensor:
        """Encode ``search_text`` with CLIP and return the L2-normalized features.

        Computing this once lets ``custom_filter_mask`` skip the text encoding
        on every call.
        """
        tokenized_text = self.clip.tokenize([search_text]).to(device)
        text_features = self.clip_model.encode_text(tokenized_text)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        return text_features

    def _crop_image(self, format_results):
        """Crop every mask larger than 100 pixels from ``self.results[0]``.

        Returns:
            ``(cropped_boxes, cropped_images, not_crop, filter_id, annotations)``
            where ``cropped_boxes`` holds the cropped images, ``cropped_images``
            the matching bounding boxes and ``filter_id`` the indices of the
            skipped masks.
        """
        annotations = format_results
        cropped_boxes = []
        cropped_images = []
        not_crop = []
        filter_id = []
        if not annotations:
            return cropped_boxes, cropped_images, not_crop, filter_id, annotations

        image = Image.fromarray(cv2.cvtColor(self.results[0].orig_img, cv2.COLOR_BGR2RGB))
        ori_w, ori_h = image.size
        mask_h, mask_w = annotations[0]["segmentation"].shape
        if ori_w != mask_w or ori_h != mask_h:
            image = image.resize((mask_w, mask_h))

        for index, mask in enumerate(annotations):
            if np.sum(mask["segmentation"]) <= 100:
                filter_id.append(index)
                continue
            bbox = self._get_bbox_from_mask(mask["segmentation"])  # bbox from mask
            cropped_boxes.append(self._segment_image(image, bbox))  # save cropped image
            cropped_images.append(bbox)  # save cropped image bbox

        return cropped_boxes, cropped_images, not_crop, filter_id, annotations

    @torch.no_grad()
    def fast_retrieve(self, model, preprocess, elements, text_features: torch.Tensor, device) -> torch.Tensor:
        """Score ``elements`` against precomputed text features.

        Args:
            model: CLIP model used to encode the images.
            preprocess: CLIP preprocessing transform.
            elements: Images to score.
            text_features: Normalized text features from ``pre_make_text_features``.
            device: Device the images are moved to.

        Returns:
            One score per element: the similarity to the text, soft-maxed
            across all elements. Empty when ``elements`` is empty.
        """
        if not elements:
            # torch.stack() raises on an empty list.
            return torch.empty(0, device=device)
        preprocessed_images = [preprocess(image).to(device) for image in elements]
        stacked_images = torch.stack(preprocessed_images)
        image_features = model.encode_image(stacked_images)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        probs = 100.0 * image_features @ text_features.T
        return probs[:, 0].softmax(dim=0)

    @staticmethod
    def _normalize_mask(mask, width, height):
        """Return ``mask`` vertices as float32 with x divided by width and y by height."""
        normalized_coordinates = np.copy(mask).astype(np.float32)
        normalized_coordinates[:, 0] /= width
        normalized_coordinates[:, 1] /= height
        return normalized_coordinates

    def _custom_crop_image(self, format_results, orig_img):
        """Collect crops and normalized corners for display-like (quadrilateral) masks.

        Masks covering at most 1% of the mask area are skipped. For the others,
        every external contour that approximates to four vertices produces one
        crop and one normalized ``(4, 2)`` corner array.

        Args:
            format_results: Mask records with a ``"segmentation"`` array each.
            orig_img: Original BGR image the masks belong to.

        Returns:
            ``(cropped_boxes, cropped_images, not_crop, filter_id, annotations)``
            where ``cropped_boxes`` holds the cropped images, ``cropped_images``
            the normalized corners and ``filter_id`` the indices of masks that
            were too small or yielded no quadrilateral.
        """
        annotations = format_results
        cropped_boxes = []
        cropped_images = []
        not_crop = []
        filter_id = []
        if not annotations:
            return cropped_boxes, cropped_images, not_crop, filter_id, annotations

        image = Image.fromarray(cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB))
        ori_w, ori_h = image.size
        mask_h, mask_w = annotations[0]["segmentation"].shape
        if ori_w != mask_w or ori_h != mask_h:
            image = image.resize((mask_w, mask_h))

        min_area = mask_w * mask_h * 0.01
        for index, mask in enumerate(annotations):
            segmentation = mask["segmentation"]

            if np.sum(segmentation) <= min_area:
                filter_id.append(index)
                continue

            # The hierarchy gets its own name so the mask index is not overwritten.
            contours, _hierarchy = cv2.findContours(
                np.uint8(segmentation), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            found_quad = False
            for contour in contours:
                epsilon = 0.02 * cv2.arcLength(contour, True)
                approx = cv2.approxPolyDP(contour, epsilon, True)
                if len(approx) != 4:
                    continue
                found_quad = True
                corners = np.squeeze(approx, axis=1)
                bbox = [
                    corners[:, 0].min(),
                    corners[:, 1].min(),
                    corners[:, 0].max(),
                    corners[:, 1].max(),
                ]
                cropped_boxes.append(self._segment_image(image, bbox))
                cropped_images.append(self._normalize_mask(corners, mask_w, mask_h))

            if not found_quad:
                filter_id.append(index)

        return cropped_boxes, cropped_images, not_crop, filter_id, annotations

    @torch.no_grad()
    def custom_filter_mask(self, text, source, results, threshold=0.5, text_features=None, device="cuda"):
        """Return the normalized quadrilaterals whose crop matches the text prompt.

        Args:
            text: Prompt text, used when ``text_features`` is not given.
            source: Inference source; stored on the instance.
            results: FastSAM results, one entry per image.
            threshold: Minimum score for a quadrilateral to be kept.
            text_features: Optional precomputed features from
                ``pre_make_text_features``; avoids re-encoding the text.
            device: Device used for CLIP scoring.

        Returns:
            For a single result, an array of shape ``(k, 4, 2)`` with normalized
            corner coordinates (``k`` may be 0). For several results, a list
            with one such array per result, in the same order as ``results``.
        """
        self.results = results
        self.source = source
        self.device = device

        new_results = []
        for result in self.results:
            if result.masks is None:
                # Keep one entry per result so outputs stay aligned with inputs.
                new_results.append(self._no_quads())
                continue

            format_results = self._format_results(result, 0)
            cropped_boxes, cropped_images, *_unused = self._custom_crop_image(
                format_results, result.orig_img
            )
            if not cropped_boxes:
                # No quadrilateral found: nothing to score.
                new_results.append(self._no_quads())
                continue

            if text_features is None:
                scores = self.retrieve(self.clip_model, self.preprocess, cropped_boxes, text, device=self.device)
            else:
                scores = self.fast_retrieve(self.clip_model, self.preprocess, cropped_boxes, text_features, device=self.device)

            scores = scores.cpu().numpy()
            valid_indices = np.where(scores >= threshold)[0]
            new_results.append(np.array(cropped_images)[valid_indices])

        if len(new_results) == 1:
            return new_results[0]
        return new_results


In [None]:
import cv2
from google.colab.patches import cv2_imshow
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt

# Define an inference source
source = '/content/drive/MyDrive/未踏/prototype/pdemo.jpg'

# Create a FastSAM model and run everything below without gradient tracking
with torch.no_grad():
  model = FastSAM('FastSAM-x.pt')  # or FastSAM-x.pt

  # Run inference on an image
  everything_results = model(source, device='cuda', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)

  # The segmentation model is dropped before CLIP is loaded.
  del model
  prompt_process = CustomFastSAMPrompt(device='cuda')

  # Encode the prompt once, then keep the quadrilaterals scoring at least 0.1

  prompt = 'operation system, windows, macos, browser, screen with text'
  text_features = prompt_process.pre_make_text_features(search_text=prompt, device='cuda')
  masks = prompt_process.custom_filter_mask(text=prompt, source=source, results=everything_results, text_features=text_features, threshold=0.1)

del prompt_process

# Number of quadrilaterals kept (shown as the cell output).
len(masks)




OutOfMemoryError: CUDA out of memory. Tried to allocate 7.56 GiB. GPU 0 has a total capacty of 14.75 GiB of which 4.75 GiB is free. Process 72869 has 9.99 GiB memory in use. Of the allocated memory 9.79 GiB is allocated by PyTorch, and 25.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
def denomarize_mask(mask, width, height):
    """Scale normalized (x, y) points to pixel coordinates, rounded to int32."""
    pixel_scale = np.array([width, height])
    scaled = mask * pixel_scale
    return np.round(scaled).astype(np.int32)

def denomarize_masks(masks, width, height):
    """Scale a batch of normalized (x, y) points to int32 pixel coordinates."""
    pixel_scale = np.array([[width, height]])
    scaled = masks * pixel_scale
    return np.round(scaled).astype(np.int32)

def draw_mask_result(masks, ori_image):
    """Fill every normalized polygon in ``masks`` on ``ori_image`` (in place).

    Returns the same image object for convenience.
    """
    img_height, img_width = ori_image.shape[0], ori_image.shape[1]
    for polygon in denomarize_masks(masks, img_width, img_height):
        # Thickness -1 fills the polygon.
        cv2.drawContours(ori_image, [polygon], 0, (255, 0, 0), -1)
    return ori_image

In [None]:
# Reload the source image, fill the filtered quadrilaterals on it and show it.
ori_image = cv2.imread(source, cv2.IMREAD_COLOR)
ori_image = draw_mask_result(masks, ori_image)

cv2_imshow(ori_image)

In [None]:
def order_points_clockwise(pts):
    """Order four points as top-left, top-right, bottom-right, bottom-left.

    Args:
        pts: Array of shape ``(4, 2)`` holding (x, y) points.

    Returns:
        A float32 array of shape ``(4, 2)`` in the order described above.
    """
    # The two points with the smallest x form the left side.
    by_x = pts[pts[:, 0].argsort()]
    left_pair, right_pair = by_x[:2], by_x[2:]

    # On the left side, the smaller y is the top-left corner.
    tl, bl = left_pair[left_pair[:, 1].argsort()]

    # Of the right-side points, the one farther from top-left is bottom-right.
    distance_to_tl = np.linalg.norm(right_pair - tl, axis=1)
    br, tr = right_pair[distance_to_tl.argsort()[::-1]]

    return np.array([tl, tr, br, bl], dtype="float32")

def calculate_output_size(src_coords):
    """Return ``(max_height, max_width)`` of a quadrilateral, truncated to int.

    Width is measured along edges 0-1 and 2-3, height along edges 0-3 and 1-2
    of ``src_coords``.
    """
    def edge_length(a, b):
        return np.linalg.norm(src_coords[a] - src_coords[b])

    max_width = int(max(edge_length(0, 1), edge_length(2, 3)))
    max_height = int(max(edge_length(0, 3), edge_length(1, 2)))

    return (max_height, max_width)

def crop_and_affine_display(mask, ori_image):
    """Cut a quadrilateral out of ``ori_image`` and warp it to an upright rectangle.

    Args:
        mask: Four normalized (x, y) corner points of the region.
        ori_image: Image to crop from.

    Returns:
        The rectified region. Its size is taken from the longest horizontal
        and vertical edges of the quadrilateral, and is at least 1x1.
    """
    pixel_quad = denomarize_mask(mask, ori_image.shape[1], ori_image.shape[0])

    # Order the corners first so that width and height are measured on known
    # edges, independent of the vertex order of the incoming contour.
    src_coords = order_points_clockwise(np.array(pixel_quad, dtype=np.float32))
    tl, tr, br, bl = src_coords

    out_width = max(int(max(np.linalg.norm(tr - tl), np.linalg.norm(br - bl))), 1)
    out_height = max(int(max(np.linalg.norm(bl - tl), np.linalg.norm(br - tr))), 1)

    # Destination corners in the same order as src_coords.
    dst_coords = np.array(
        [[0, 0], [out_width, 0], [out_width, out_height], [0, out_height]],
        dtype=np.float32,
    )
    transform_matrix = cv2.getPerspectiveTransform(src_coords, dst_coords)

    # warpPerspective takes the output size as (width, height).
    return cv2.warpPerspective(ori_image, transform_matrix, (out_width, out_height))

In [None]:
!sudo apt -y install tesseract-ocr tesseract-ocr-jpn libtesseract-dev libleptonica-dev tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert

In [None]:
!pip install pytesseract pyocr

In [None]:
import cv2
import pytesseract
from pytesseract import Output
import pyocr
import pyocr.builders
from PIL import Image

def add_text_detection_boxes(image, lang='jpn'):
    """Draw a green rectangle around every word that the OCR tool detects.

    Args:
        image: Image array to analyse and draw on.
        lang: OCR language code passed to the tool.

    Returns:
        The image with the rectangles drawn. If no OCR tool is available, a
        message is printed and the image is returned unchanged, so callers can
        always use the result as an image.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        return image
    tool = tools[0]  # use the first available tool

    pil_image = Image.fromarray(image)

    # Detect the word regions.
    boxes = tool.image_to_string(
        pil_image,
        lang=lang,
        builder=pyocr.builders.WordBoxBuilder(tesseract_layout=1)
    )

    # Outline each detected region.
    for box in boxes:
        top_left = box.position[0]
        bottom_right = box.position[1]
        image = cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    return image

In [None]:
!pip install google-cloud-vision



In [None]:
from google.colab import auth
# Authenticate the Colab user; the project id is left empty here.
auth.authenticate_user(project_id="")

In [None]:
from google.cloud import vision
from PIL import Image
import io
from google.cloud.vision_v1 import types

def add_text_detection_boxes_google(image):
    """Outline the text that Google Cloud Vision detects in ``image``.

    Args:
        image: Image array; it is JPEG-encoded for the request and the
            outlines are drawn on it in place.

    Returns:
        The image with a green polygon around every detected text element.

    Raises:
        RuntimeError: If the image cannot be encoded or the API reports an error.
    """
    client = vision.ImageAnnotatorClient()

    success, encoded_image = cv2.imencode('.jpg', image)
    if not success:
        raise RuntimeError("Image encoding failed")

    # Build the request image from the encoded bytes.
    g_image = types.Image(content=encoded_image.tobytes())

    # Run OCR.
    response = client.text_detection(image=g_image)
    if response.error.message:
        raise RuntimeError(f"Vision API error: {response.error.message}")

    # The first annotation covers the whole text, so skip it.
    for text in response.text_annotations[1:]:
        # cv2.polylines requires int32 points; the default integer dtype of
        # np.array is typically int64.
        vertices = np.array(
            [(vertex.x, vertex.y) for vertex in text.bounding_poly.vertices],
            dtype=np.int32,
        )
        cv2.polylines(image, [vertices], True, (0, 255, 0), 2)

    return image

In [None]:
def overlay_on_four_points(base_image, overlay_image, dst_points):
    """Paste ``overlay_image`` onto a quadrilateral region of ``base_image``.

    Args:
        base_image: Background image.
        overlay_image: Image to paste on top.
        dst_points: Four normalized (x, y) corners on the background image
            (array of shape ``(4, 2)``).

    Returns:
        A new image with the warped overlay replacing the quadrilateral region.
    """
    canvas_h, canvas_w = base_image.shape[0], base_image.shape[1]
    patch_h, patch_w = overlay_image.shape[0], overlay_image.shape[1]

    # Overlay corners: top-left, top-right, bottom-right, bottom-left.
    src_points = np.float32([[0, 0], [patch_w, 0], [patch_w, patch_h], [0, patch_h]])
    quad = order_points_clockwise(denomarize_mask(dst_points, canvas_w, canvas_h))

    # Warp the overlay into the coordinate frame of the background.
    homography = cv2.getPerspectiveTransform(src_points, quad)
    warped = cv2.warpPerspective(overlay_image, homography, (canvas_w, canvas_h))

    # Mask of the target region: all channels set inside the quadrilateral.
    region = np.zeros_like(base_image, dtype=np.uint8)
    cv2.fillConvexPoly(region, np.int32(quad), (255,) * base_image.shape[2])

    # Clear the region in the background, then add the warped overlay there.
    background = cv2.bitwise_and(base_image, cv2.bitwise_not(region))
    foreground = cv2.bitwise_and(warped, region)
    return cv2.add(background, foreground)


def render_ocr_result(image, boxes):
    """Annotate each quadrilateral in ``boxes`` with OCR boxes and paste it back.

    Every region is rectified, annotated with text-detection boxes and then
    warped back onto the image; each step works on the result of the previous one.
    """
    composed = image
    for quad in boxes:
        rectified = crop_and_affine_display(quad, composed)
        annotated = add_text_detection_boxes(rectified)
        composed = overlay_on_four_points(composed, annotated, quad)
    return composed

### 映像処理

In [None]:
from tqdm import tqdm
import cv2
from google.colab.patches import cv2_imshow
from ultralytics import FastSAM
from ultralytics.models.fastsam import FastSAMPrompt

import logging


# Silence everything below ERROR on the root logger.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

video_path = "/content/samples/demo03.mp4"

cap = cv2.VideoCapture(video_path)

# Frame rate of the input video, reused for the output file.
fps = cap.get(cv2.CAP_PROP_FPS)

# The first frame is read only to learn the frame size.
ret, first_frame = cap.read()
if not ret:
    cap.release()
    raise RuntimeError(f"Could not read a frame from {video_path}")

height, width = first_frame.shape[:2]

# Number of frames corresponding to 60 seconds of video.
frames_to_process = int(60 * fps)

# Prepare the output video file.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output_video-demo03.mp4', fourcc, fps, (width, height))

# Number of frames processed so far.
frame_count = 0
# Reserved for batched inference; the loop below processes one frame at a time.
batch_frame_size = 2

try:
    with torch.no_grad():
        prompt_process = CustomFastSAMPrompt(device='cuda')
        model = FastSAM('FastSAM-x.pt')
        prompt = 'operation system, windows, macos, browser, display word'
        # Encode the prompt once instead of once per frame.
        text_features = prompt_process.pre_make_text_features(search_text=prompt, device='cuda')

        with tqdm(total=100) as pbar:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame_count >= frames_to_process:
                    break

                image = frame.copy()
                everything_results = model(image, device='cuda', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)
                masks = prompt_process.custom_filter_mask(
                    text=prompt,
                    source="",
                    results=everything_results,
                    text_features=text_features,
                    threshold=0.1,
                )

                # image = draw_mask_result(masks, image)
                image = render_ocr_result(image, masks)

                out.write(image)
                pbar.update(100 / frames_to_process)
                frame_count += 1

    del model
    del prompt_process
finally:
    # Release the capture and finalize the output file even when a frame fails.
    cap.release()
    out.release()

### tracking

In [None]:
import json

import cv2
import numpy as np


def denomarize_mask(mask, width, height):
    """Scale normalized (x, y) coordinates to integer pixel coordinates.

    Each coordinate pair in ``mask`` is multiplied by ``(width, height)``,
    rounded to the nearest integer and returned as an ``int32`` array.
    """
    frame_size = np.array([width, height])
    pixel_coords = np.round(mask * frame_size)
    return pixel_coords.astype(np.int32)


# Load the polygons recorded for the first frame; the first polygon seeds the tracker.
with open("/content/samples/outputs/demo04/mask_data.jsonl") as f:
    first_mask = json.loads(f.readline())
    masks = first_mask["masks"]

# Open the input video.
video_path = "/content/samples/outputs/demo04/demo04.mp4"
video = cv2.VideoCapture(video_path)

# Output video settings: same frame rate and frame size as the input.
fps = int(video.get(cv2.CAP_PROP_FPS))
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
output = cv2.VideoWriter("output_tracker_video_04.mp4", fourcc, fps, (width, height))

try:
    # The first frame is used to initialize the tracker.
    # NOTE: this frame is not written to the output, so the result is one frame shorter.
    ret, frame = video.read()
    if not ret:
        # Without this check a missing/unreadable video surfaces later as an
        # obscure OpenCV error inside tracker.init(None, ...).
        raise RuntimeError(f"Could not read the first frame from {video_path}")

    mask = denomarize_mask(masks[0], width, height)

    # The mask is a polygon (four corner points); use its axis-aligned extent as
    # the bounding box. Convert to plain ints so OpenCV receives a regular
    # (x, y, w, h) tuple instead of NumPy scalars.
    x, y = (int(v) for v in mask.min(axis=0))
    w, h = (int(v) for v in mask.max(axis=0) - mask.min(axis=0))

    bbox = (x, y, w, h)

    # Initialize the tracker (MIL here; OpenCV offers several alternatives).
    tracker = cv2.TrackerMIL_create()
    tracker.init(frame, bbox)

    # Track the object through the rest of the video.
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            break  # end of the video

        # Update the tracker with the new frame.
        success, bbox = tracker.update(frame)

        if success:
            x, y, w, h = [int(v) for v in bbox]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2, 1)
        else:
            cv2.putText(
                frame,
                "Tracking failure detected",
                (100, 80),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.75,
                (0, 0, 255),
                2,
            )

        # Append the annotated frame to the output video.
        output.write(frame)
finally:
    # Release the capture and finalize the output file even if tracking failed.
    video.release()
    output.release()


In [None]:
!pip install -U opencv-python

In [None]:
# Download the raw ONNX weights; the /blob/ URL returns GitHub's HTML viewer page, not the model file.
!wget https://github.com/opencv/opencv_zoo/raw/main/models/object_tracking_vittrack/object_tracking_vittrack_2023sep.onnx

In [None]:
# multi object tracking
import json

import clip
import cv2
import numpy as np
import torch
from PIL import Image


@torch.no_grad()
def fast_retrieve(
    model, preprocess, elements, text_features: torch.Tensor, device
) -> torch.Tensor:
    """Score a batch of images against pre-computed text features.

    Args:
        model: Object exposing ``encode_image(batch)`` (e.g. a CLIP model).
        preprocess: Callable turning one image into a tensor.
        elements: Sequence of images to score.
        text_features: Text embeddings; only the first row (prompt) is used
            for the returned scores.
        device: Device the preprocessed images are moved to.

    Returns:
        A 1-D tensor with one score per image. The softmax is taken across
        the images, so the scores sum to 1 and are relative to the other
        images in the same call. An empty tensor is returned for empty input.
    """
    if len(elements) == 0:
        # torch.stack() raises on an empty list; nothing to score.
        return text_features.new_zeros((0,))

    batch = torch.stack([preprocess(image).to(device) for image in elements])
    image_features = model.encode_image(batch)
    # L2-normalize so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    similarity = 100.0 * image_features @ text_features.T
    return similarity[:, 0].softmax(dim=0)


def crop_boxes(orig_img, boxes):
    """Cut each ``(x, y, w, h)`` box out of a BGR frame.

    The frame is converted from BGR to RGB once, wrapped in a PIL image, and
    one PIL crop per box is returned in the same order as ``boxes``.
    """
    rgb_frame = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
    pil_frame = Image.fromarray(rgb_frame)
    return [
        pil_frame.crop((left, top, left + box_w, top + box_h))
        for left, top, box_w, box_h in boxes
    ]


def filter_trackers(
    trackers,
    frame,
    clip_model,
    clip_preprocess,
    text_features,
    threshold=0.5,
    device="cuda",
):
    """Keep only the trackers whose current box still matches the text prompt.

    Args:
        trackers: List of dicts, each with a ``"bbox"`` entry ``(x, y, w, h)``.
        frame: Current BGR frame the boxes refer to.
        clip_model: Image/text model passed to ``fast_retrieve``.
        clip_preprocess: Preprocessing callable passed to ``fast_retrieve``.
        text_features: Pre-computed text embeddings of the prompt.
        threshold: Minimum score a box must exceed to be kept.
        device: Device used for the image batch.

    Returns:
        A new list containing the surviving tracker dicts.

    Note:
        ``fast_retrieve`` applies a softmax across the boxes, so each score is
        relative to the other boxes of the same frame.
    """
    # Discard trackers whose box is too small (area of 100 px or less).
    candidates = [
        tracker for tracker in trackers if tracker["bbox"][2] * tracker["bbox"][3] > 100
    ]
    # Check emptiness AFTER the size filter: scoring an empty batch would
    # crash in torch.stack().
    if not candidates:
        return []

    boxes = [tracker["bbox"] for tracker in candidates]
    box_images = crop_boxes(frame, boxes)

    # One similarity score per cropped box.
    similarities = fast_retrieve(
        clip_model, clip_preprocess, box_images, text_features, device
    )

    # Drop trackers whose similarity to the prompt is too low.
    new_trackers = []
    for tracker, similarity in zip(candidates, similarities):
        if similarity > threshold:
            new_trackers.append(tracker)
        else:
            print("similarity is low")

    return new_trackers


def random_color():
    """Return a random color as a list of three ints, each drawn from [0, 255)."""
    channels = np.random.randint(low=0, high=255, size=3)
    return [int(channel) for channel in channels]


def denomarize_masks(masks, width, height):
    """Scale a batch of normalized polygons to integer pixel coordinates.

    Args:
        masks: Sequence of polygons, each a sequence of normalized ``(x, y)``
            points (all polygons with the same number of points).
        width: Frame width in pixels.
        height: Frame height in pixels.

    Returns:
        An ``int32`` array of the same layout holding pixel coordinates.
        For empty input an empty ``(0, 0, 2)`` array is returned.
    """
    points = np.asarray(masks, dtype=np.float64)
    if points.size == 0:
        # An empty list becomes shape (0,), which cannot broadcast against
        # (1, 2) and would raise; return an empty, iterable result instead.
        return np.empty((0, 0, 2), dtype=np.int32)
    denomarized_mask = np.round(points * np.array([[width, height]])).astype(np.int32)
    return denomarized_mask


def detect_objects_from_file(f, width, height):
    """Read the next JSONL record from ``f`` and return its masks as bounding boxes.

    Each call consumes exactly one line, so calling it once per video frame
    keeps the file in step with the video.

    Args:
        f: Open text file with one JSON object per line; the object carries a
            ``"masks"`` list of normalized polygons.
        width: Frame width in pixels.
        height: Frame height in pixels.

    Returns:
        A list of ``(x, y, w, h)`` tuples of plain Python ints, one per
        polygon. Empty when the file is exhausted, the line is blank, or the
        record has no masks.

    Raises:
        ValueError: If ``f`` is not seekable.
    """
    if not f.seekable():
        raise ValueError("File must be seekable")

    data = f.readline()
    # End of file, or a blank line: nothing detected for this frame.
    if not data or not data.strip():
        return []

    first_mask = json.loads(data)
    if not first_mask:
        return []
    masks = first_mask["masks"]
    # A frame without detections must not reach the scaling step, which
    # cannot broadcast an empty list.
    if len(masks) == 0:
        return []
    masks = denomarize_masks(masks, width, height)

    # Each mask is a polygon (four corner points); use its axis-aligned
    # extent (min x/y and max x/y) as the bounding box.
    bboxes = []
    for mask in masks:
        try:
            x, y = mask.min(axis=0)
            w, h = mask.max(axis=0) - mask.min(axis=0)
        except (ValueError, TypeError) as e:
            # Best effort: skip a malformed polygon rather than abort the frame.
            print(e)
            continue
        # Plain ints, so the boxes can be handed to OpenCV directly.
        bboxes.append((int(x), int(y), int(w), int(h)))

    return bboxes  # [(x1, y1, w1, h1), (x2, y2, w2, h2), ...]


# 重複をチェックする関数
def is_overlapping(new_box, existing_boxes, threshold=0.5):
    """Tell whether ``new_box`` substantially overlaps any of ``existing_boxes``.

    Boxes are ``(x, y, w, h)``. Two boxes count as overlapping when their
    intersection area exceeds ``threshold`` times the area of either box.
    """
    nx, ny, nw, nh = new_box
    new_area = nw * nh
    for ox, oy, ow, oh in existing_boxes:
        # Width and height of the intersection rectangle (negative: disjoint).
        overlap_w = min(nx + nw, ox + ow) - max(nx, ox)
        overlap_h = min(ny + nh, oy + oh) - max(ny, oy)
        if overlap_w < 0 or overlap_h < 0:
            continue
        shared_area = overlap_w * overlap_h
        # Exceeding the smaller of the two limits means exceeding at least one.
        if shared_area > min(threshold * new_area, threshold * (ow * oh)):
            return True
    return False


# Open the input video.
video_path = "/content/samples/outputs/demo02/demo02.mp4"
video = cv2.VideoCapture(video_path)

# Output video settings: same frame rate and frame size as the input.
fps = int(video.get(cv2.CAP_PROP_FPS))
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
output = cv2.VideoWriter(
    "output_tracker_video_02_multi.mp4", fourcc, fps, (width, height)
)

# Each entry: {"tracker": cv2 tracker, "color": [b, g, r], "bbox": (x, y, w, h)}.
trackers = []

vit_params = cv2.TrackerVit_Params()
vit_params.net = "/content/samples/object_tracking_vittrack_2023sep.onnx"

clip_model, clip_preprocess = clip.load("ViT-B/32", device="cuda")
# Encode the prompt once. no_grad avoids building an autograd graph for a
# tensor that is only ever used for inference.
with torch.no_grad():
    text_features = clip_model.encode_text(
        clip.tokenize(["operation system, windows, macos, browser, display word"]).to(
            "cuda"
        )
    )
    text_features /= text_features.norm(dim=-1, keepdim=True)

frame_count = 0
target_secods = 10
# Process at most this many frames (the first `target_secods` seconds).
max_frames = fps * target_secods

# One JSONL record per frame; read in step with the video.
mask_data_file = open("/content/samples/outputs/demo02/mask_data.jsonl")
try:
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            break

        # Stop after the target duration. (The former chained comparison
        # `frame_count > (fps * target_secods) == 0` could never be true for
        # a real video, so the limit was silently ignored.)
        if frame_count >= max_frames:
            break

        # Drop trackers whose box no longer matches the prompt.
        trackers = filter_trackers(
            trackers, frame, clip_model, clip_preprocess, text_features, threshold=0.1
        )
        # Boxes that are already being tracked.
        existing_boxes = [tracker["bbox"] for tracker in trackers]

        new_objects = detect_objects_from_file(mask_data_file, width, height)

        # Start a tracker for every detection that is not already tracked.
        for new_box in new_objects:
            if not is_overlapping(new_box, existing_boxes):
                print("new object")
                # Plain ints so OpenCV receives a regular (x, y, w, h) tuple.
                new_box = tuple(int(v) for v in new_box)
                tracker = cv2.TrackerVit.create(vit_params)
                tracker.init(frame, new_box)
                new_color = random_color()
                tracker_item = {"tracker": tracker, "color": new_color, "bbox": new_box}
                trackers.append(tracker_item)

        # Update every tracker and draw its box. Survivors are collected in a
        # new list: removing from `trackers` while iterating over it would
        # skip the element following each removed one.
        surviving_trackers = []
        for tracker_item in trackers:
            success, bbox = tracker_item["tracker"].update(frame)
            if not success:
                # Tracking failed: drop this tracker.
                print("lost object")
                continue
            x, y, w, h = [int(v) for v in bbox]
            cv2.rectangle(frame, (x, y), (x + w, y + h), tracker_item["color"], 3, 1)
            # Remember the latest box for the next frame's filtering.
            tracker_item["bbox"] = (x, y, w, h)
            surviving_trackers.append(tracker_item)
        trackers = surviving_trackers

        # Append the annotated frame to the output video.
        output.write(frame)
        frame_count += 1

        if frame_count % fps == 0:
            print(f"{frame_count} frames processed")
finally:
    # Release everything even if a frame fails mid-way.
    video.release()
    output.release()
    mask_data_file.close()


## 下記テスト

In [None]:
ann[0]

In [None]:
from PIL import Image
def denomarize_mask(mask, width, height):
    """Scale the normalized coordinates in ``mask.xyn`` to integer pixel coordinates.

    ``mask.xyn`` is multiplied by ``(width, height)``, rounded to the nearest
    integer and returned as an ``int32`` array.
    """
    frame_size = np.array([width, height])
    pixel_coords = np.round(mask.xyn * frame_size)
    return pixel_coords.astype(np.int32)

def segment_image(image, bbox):
    """Return an RGB image that shows ``image`` inside ``bbox`` and white elsewhere.

    ``bbox`` is ``(x1, y1, x2, y2)``; the result has the same size as ``image``.
    """
    left, top, right, bottom = bbox
    pixels = np.array(image)

    # Copy of the source that is zeroed everywhere except inside the box.
    kept = np.zeros_like(pixels)
    kept[top:bottom, left:right] = pixels[top:bottom, left:right]

    # Single-channel paste mask: 255 inside the box, 0 outside.
    rows, cols = pixels.shape[0], pixels.shape[1]
    paste_mask = np.zeros((rows, cols), dtype=np.uint8)
    paste_mask[top:bottom, left:right] = 255

    # White canvas; only the masked (in-box) pixels are pasted onto it.
    canvas = Image.new("RGB", image.size, (255, 255, 255))
    canvas.paste(
        Image.fromarray(kept),
        mask=Image.fromarray(paste_mask, mode="L"),
    )
    return canvas

def filter_mask(image, masks, clip_model, preprocess):
    """Collect four-cornered (display-like) contours from ``masks``.

    For each mask, external contours are approximated by polygons; polygons
    with exactly four vertices are appended to local lists.

    NOTE(review): this function looks unfinished:
      * it has no ``return`` statement, so the collected lists are discarded;
      * ``results``, ``not_crop``, ``clip_model`` and ``preprocess`` are never used;
      * see the inline notes for further inconsistencies.
    """
    results = []
    height, width = image.shape[:2]

    mask_h, mask_w = masks[0].cpu().numpy().shape[:2]
    if mask_h != height or mask_w != width:
          # NOTE(review): ``image.shape`` above suggests a NumPy array, whose
          # ``resize`` method works in place and returns None, so this
          # assignment would likely lose the image. Confirm; cv2.resize may be
          # what was intended.
          image = image.resize((mask_w, mask_h))

    cropped_boxes = []
    cropped_images = []
    not_crop = []
    # filter_id = []

    for mask in masks:
      # Skip tiny masks.
      # NOTE(review): the mask is indexed like a dict here but used like a
      # tensor (``mask.cpu()``) just below; only one of the two can be right.
      if np.sum(mask["segmentation"]) <= 100:
          continue

      segmentation = mask.cpu().numpy()

      # Detect display-like (four-cornered) shapes.
      contours, _ = cv2.findContours(
          np.uint8(segmentation), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
      )

      for contour in contours:
            perimeter = cv2.arcLength(contour, True)
            # Approximation tolerance: 2% of the contour perimeter.
            epsilon = 0.02 * perimeter
            approx = cv2.approxPolyDP(contour, epsilon, True)
            if len(approx) == 4:
                # NOTE(review): segment_image unpacks its second argument as
                # (x1, y1, x2, y2), but this passes a list of four [x, y]
                # points. Confirm the intended box format.
                cropped_boxes.append(segment_image(image, approx.squeeze().tolist()))
                cropped_images.append(approx)


def draw_fastsam_masks(
    image: np.ndarray, masks: List[torch.Tensor], alpha: float = 0.7
) -> Tuple[np.ndarray, List[np.ndarray]]:
    """Outline every four-cornered contour found in ``masks`` on ``image``.

    Args:
        image: Image to draw on. It is modified IN PLACE.
        masks: Iterable of mask tensors (each must support ``.cpu().numpy()``),
            e.g. the rows of a stacked mask tensor.
        alpha: Currently unused; kept so existing callers keep working.

    Returns:
        ``(image, surfaces)`` where ``image`` is the same array that was
        passed in and ``surfaces`` holds one ``(4, 2)`` point array per
        outlined quadrilateral.
    """
    surfaces = []

    for mask in masks:
        segmentation = mask.cpu().numpy()

        # Outer contours only; holes inside a mask are ignored.
        contours, _ = cv2.findContours(
            np.uint8(segmentation), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        for contour in contours:
            # Approximate the contour by a polygon with a tolerance of 2% of
            # its perimeter.
            perimeter = cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)
            # Keep quadrilaterals only (shapes close to a rectangle).
            if len(approx) != 4:
                continue
            cv2.drawContours(image, [approx], 0, (255, 0, 0), 2)
            surfaces.append(np.squeeze(approx, axis=1))

    return image, surfaces


In [None]:
# Load the source image as a 3-channel BGR array.
ori_image = cv2.imread(source, cv2.IMREAD_COLOR)
if ori_image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {source}")

# Outline the quadrilateral masks (drawn in place on ori_image) and show the result.
image, surfaces = draw_fastsam_masks(ori_image, ann[0].masks.data)

cv2_imshow(image)

In [None]:
ann[0].masks.data.shape

torch.Size([132, 3072, 4080])