<a href="https://colab.research.google.com/github/haddybhaiya/sem-i-con/blob/main/synthetic_wafer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Root folder of the base images on the mounted Drive. The generation loop
# joins this path with each class name, so it is expected to contain one
# sub-folder per class. NOTE(review): the configuration cell further down
# assigns the same value again, so this cell looks redundant.
BASE_DATASET = "/content/drive/MyDrive/dataset/base"


In [None]:
!pip install albumentations opencv-python pillow tqdm




In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import albumentations as A

# Input root: one sub-folder per class, each holding the base images.
BASE_DATASET = "/content/drive/MyDrive/dataset/base"
# Output root: the generation loop creates one sub-folder per class here.
OUT_DATASET = "/content/drive/MyDrive/synthetic_dataset"

# Class names; used as input/output sub-folder names and as the filename
# prefix of every generated image.
CLASSES = ["clean","bridge","cmp","crack","open","ler","via","other"]


TARGET_PER_CLASS = 300   # number of synthetic images written per class
IMG_SIZE = 224           # every image is resized to IMG_SIZE x IMG_SIZE pixels (edge-friendly size)


In [None]:
def _var_to_std_range(var_limit):
    """Convert a noise variance range in 0-255 pixel units to a std range.

    Newer albumentations releases no longer accept `var_limit` on GaussNoise
    (this cell used to emit a warning for both GaussNoise calls) and take
    `std_range` instead, where the standard deviation is expressed as a
    fraction of the maximum pixel value (255 for uint8 images).

    Args:
        var_limit: (low, high) variance bounds in squared pixel units.

    Returns:
        (low, high) standard-deviation bounds as fractions of 255.
    """
    low, high = var_limit
    return (low ** 0.5 / 255.0, high ** 0.5 / 255.0)


# very safe augmentations (geometry only)
geom_aug = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.03,
        scale_limit=0.08,
        rotate_limit=10,
        p=0.5
    )
])

# mild texture changes (NO particles)
mild_aug = A.Compose([
    A.RandomBrightnessContrast(p=0.3),
    A.GaussianBlur(blur_limit=3, p=0.15)
])

# edge roughness (for LER only)
ler_aug = A.Compose([
    A.GaussianBlur(blur_limit=3, p=0.2),
    # variance 5-20 in pixel units, i.e. std of roughly 2.2-4.5 grey levels
    A.GaussNoise(std_range=_var_to_std_range((5, 20)), p=0.2)
])

# heavy junk / OOD augmentations (ONLY for other)
other_aug = A.Compose([
    # variance 20-80 in pixel units, i.e. std of roughly 4.5-8.9 grey levels
    A.GaussNoise(std_range=_var_to_std_range((20, 80)), p=0.6),
    A.MotionBlur(p=0.3),
    A.GridDistortion(p=0.3),
    A.ElasticTransform(p=0.3),
    A.RandomBrightnessContrast(p=0.5)
])


  original_init(self, **validated_kwargs)
  A.GaussNoise(var_limit=(5,20), p=0.2)
  A.GaussNoise(var_limit=(20,80), p=0.6),


In [None]:
def particle_noise(img):
    """Darken randomly placed blob-shaped regions of an image.

    Draws between 10 and 39 filled circles (radius 2-9 px) at random positions
    on an empty mask, spreads the mask with a 9x9 Gaussian blur, and multiplies
    every pixel under the resulting mask by a single random factor drawn from
    [0.5, 0.8).

    Args:
        img: image array of shape (H, W) or (H, W, C). It is not modified.

    Returns:
        A copy of `img` (same shape and dtype) with the masked pixels darkened.
    """
    # Only the two leading axes are spatial; this also accepts colour images.
    h, w = img.shape[:2]
    blobs = np.zeros((h, w), dtype=np.uint8)

    for _ in range(np.random.randint(10, 40)):
        x, y = np.random.randint(0, w), np.random.randint(0, h)
        r = np.random.randint(2, 10)
        cv2.circle(blobs, (x, y), r, 255, -1)

    # The blur is only used to widen the blobs: any non-zero pixel counts.
    blobs = cv2.GaussianBlur(blobs, (9, 9), 0)
    mask = blobs > 0

    factor = np.random.uniform(0.5, 0.8)
    out = img.copy()
    # Explicit cast back to the image dtype (truncates for integer images).
    out[mask] = (out[mask] * factor).astype(out.dtype)
    return out


In [None]:
# Generate TARGET_PER_CLASS augmented images per class from the base images.
# Each class folder under BASE_DATASET is read once, every base image is
# decoded and resized a single time, and the loop then cycles through the
# cached images applying the class-specific augmentation recipe.
_IMAGE_EXTS = ('.png','.jpg','.jpeg','.bmp','.tif','.tiff','.webp')

os.makedirs(OUT_DATASET, exist_ok=True)

for cls in CLASSES:
    in_path = os.path.join(BASE_DATASET, cls)
    out_path = os.path.join(OUT_DATASET, cls)
    os.makedirs(out_path, exist_ok=True)

    if not os.path.isdir(in_path):
        print(f"[WARN] Input folder missing for {cls}: {in_path}")
        continue

    # Sorted so the cycling order does not depend on the filesystem.
    images = sorted(f for f in os.listdir(in_path)
                    if f.lower().endswith(_IMAGE_EXTS))

    # Decode + resize each base image once instead of on every iteration.
    base_images = []
    for name in images:
        img = cv2.imread(os.path.join(in_path, name), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"[WARN] Could not read {name} for {cls}; skipping")
            continue
        base_images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))

    if not base_images:
        print(f"[WARN] No images found for {cls}")
        continue

    # The context manager closes the bar even if an iteration raises.
    with tqdm(total=TARGET_PER_CLASS, desc=f"Generating {cls}") as pbar:
        for count in range(TARGET_PER_CLASS):
            # Work on a copy so the cached base image can never be altered.
            img = base_images[count % len(base_images)].copy()

            # geometry is applied to every class
            aug_img = geom_aug(image=img)["image"]

            # class specific
            if cls in ["bridge", "cmp", "open", "via"]:
                pass  # geometry only (preserve morphology)

            elif cls in ["crack", "clean"]:
                aug_img = mild_aug(image=aug_img)["image"]

            elif cls == "ler":
                aug_img = ler_aug(image=aug_img)["image"]

            elif cls == "other":
                aug_img = other_aug(image=aug_img)["image"]
                aug_img = particle_noise(aug_img)

            save_path = os.path.join(out_path, f"{cls}_{count}.png")
            # cv2.imwrite reports failure through its return value only.
            if not cv2.imwrite(save_path, aug_img):
                raise OSError(f"Failed to write {save_path}")

            pbar.update(1)

print("\n Rectified synthetic dataset generated successfully")


Generating clean: 100%|██████████| 300/300 [01:41<00:00,  2.95it/s]
Generating bridge: 100%|██████████| 300/300 [01:27<00:00,  3.43it/s]
Generating cmp: 100%|██████████| 300/300 [01:30<00:00,  3.32it/s]
Generating crack: 100%|██████████| 300/300 [01:34<00:00,  3.17it/s]
Generating open: 100%|██████████| 300/300 [01:22<00:00,  3.62it/s]
Generating ler: 100%|██████████| 300/300 [01:24<00:00,  3.55it/s]
Generating via: 100%|██████████| 300/300 [01:25<00:00,  3.51it/s]
Generating other: 100%|██████████| 300/300 [01:31<00:00,  3.28it/s]


 Rectified synthetic dataset generated successfully





In [None]:
# Report how many base images each class folder contains.
BASE = "/content/drive/MyDrive/dataset/base"
_SUFFIXES = ('.png','.jpg','.jpeg','.bmp','.tif','.tiff','.webp')

for cls in CLASSES:
    path = os.path.join(BASE, cls)
    files = []
    for entry in os.listdir(path):
        # case-insensitive match on the file extension
        if entry.lower().endswith(_SUFFIXES):
            files.append(entry)
    print(f"{cls} -> {len(files)}")



clean -> 17
bridge -> 5
cmp -> 5
crack -> 4
open -> 4
ler -> 4
via -> 4
other -> 15
