# 12 Convolution / Block Benchmark (Params + MACs + Latency + (Optional) CUDA Memory)

**Amaç:** Şu 12 blok için tek bir model iskeletinde karşılaştırma:
1) Depthwise Separable  
2) Pointwise (1×1)  
3) Inverted Bottleneck (MBConv)  
4) Ghost  
5) Dilated  
6) Deformable *(torchvision varsa gerçek; yoksa fallback)*  
7) Shift  
8) Octave  
9) Dynamic Conv  
10) RepVGG *(train mode ölçülür; deploy için `deploy=True`)*  
11) Coordinate Conv (CoordConv)  
12) Group Conv

> Accuracy yok. Sadece: **parametre**, **yaklaşık MACs**, **latency** (ms/iter), **(varsa) CUDA peak memory**.

## Notlar (kısa)
- **MACs (approx)**: Pool/upsample/shift gibi non-conv ops’lar ihmal veya yaklaşık alınmıştır.  
- **Deformable**: torchvision `DeformConv2d` yoksa otomatik **fallback** yapar (standard conv).  
- **RepVGG**: burada **train mode** (3×3 + 1×1 dalları, `deploy=False`) ölçülür; tek 3×3'lük deploy sürümünü ölçmek için `deploy=True` verin.

In [None]:
import time, math
import torch
import torch.nn as nn
import torch.nn.functional as F

# -------------------------
# Profil: Params / MACs / Latency / Peak Mem
# -------------------------
def count_params(m: nn.Module) -> int:
    """Return the total element count of the trainable parameters of ``m``.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in m.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

class MacCounter:
    """Accumulate an approximate multiply-accumulate (MAC) count via forward hooks.

    ``hook`` has the forward-hook signature ``(module, inputs, output)`` and
    adds the MACs of one ``nn.Conv2d`` or ``nn.Linear`` call to ``self.macs``.
    Any other module type is ignored, and bias additions are not counted.
    """

    def __init__(self):
        # Running MAC total over every hooked call seen so far.
        self.macs = 0

    def hook(self, module, inp, out):
        """Add the MACs of a single forward call of ``module`` to the total.

        Args:
            module: the module that was just executed.
            inp: tuple of positional inputs passed to the module.
            out: the module output (first element is used for tuple/list outputs).
        """
        if isinstance(out, (tuple, list)):
            out = out[0]

        if isinstance(module, nn.Conv2d):
            kernel = module.kernel_size
            k_h, k_w = kernel if isinstance(kernel, tuple) else (kernel, kernel)
            cin_per_group = module.in_channels // module.groups
            # out.numel() already equals batch * Cout * Hout * Wout, and it stays
            # correct for unbatched (C, H, W) inputs where shape[0] is not a batch.
            self.macs += int(out.numel() * cin_per_group * k_h * k_w)
            return

        if isinstance(module, nn.Linear):
            x = inp[0]
            # Every leading dimension is a separate row through the layer, so
            # count rows from the element count instead of only shape[0].
            rows = x.numel() // module.in_features if module.in_features else 0
            self.macs += int(rows * module.in_features * module.out_features)
            return

def benchmark_latency(m: nn.Module, x: torch.Tensor, iters=200, warmup=50) -> float:
    """Return the mean wall-clock time of ``m(x)`` in milliseconds per iteration.

    The module is switched to eval mode (and left in it) and run under
    ``torch.no_grad()``. ``warmup`` untimed calls are made first, then ``iters``
    timed calls.

    Args:
        m: module to time; expected to already live on ``x``'s device.
        x: input tensor passed unchanged to every call.
        iters: number of timed forward passes (must be non-zero).
        warmup: number of untimed forward passes run before timing.

    Returns:
        Average latency in milliseconds per forward pass.
    """
    on_cuda = x.device.type == "cuda"
    m.eval()
    with torch.no_grad():
        for _ in range(warmup):
            m(x)
        if on_cuda:
            # Synchronize the device that holds x, which is not necessarily the
            # current CUDA device, so queued warmup kernels are not timed.
            torch.cuda.synchronize(x.device)
        start = time.perf_counter()
        for _ in range(iters):
            m(x)
        if on_cuda:
            # CUDA launches are asynchronous; wait for the work to finish.
            torch.cuda.synchronize(x.device)
        elapsed = time.perf_counter() - start
    return elapsed * 1000.0 / iters

def profile_block(name: str, m: nn.Module, x: torch.Tensor):
    """Profile one block and print a single summary line.

    Reports trainable parameter count, approximate MACs, mean latency and,
    on CUDA, peak allocated memory.

    NOTE: MACs are gathered only from ``nn.Conv2d`` and ``nn.Linear``
    submodules; work done through functional calls or other module types is
    not counted. The peak-memory reading is taken after the latency benchmark,
    so it covers both the counting pass and the timed runs.

    Args:
        name: label printed in the first column.
        m: block to profile; it is moved to ``x``'s device and put in eval mode.
        x: input tensor used for the counting pass and the benchmark.
    """
    m = m.to(x.device).eval()
    on_cuda = x.device.type == "cuda"

    mac_counter = MacCounter()
    hooks = [
        mod.register_forward_hook(mac_counter.hook)
        for mod in m.modules()
        if isinstance(mod, (nn.Conv2d, nn.Linear))
    ]

    # Peak memory is tracked on the device that actually holds x.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(x.device)

    try:
        with torch.no_grad():
            y = m(x)
    finally:
        # Always detach the hooks, even if the forward pass raises, so the
        # module is not left with stale MAC-counting hooks.
        for h in hooks:
            h.remove()

    params = count_params(m)
    macs = mac_counter.macs
    ms = benchmark_latency(m, x, iters=200, warmup=50)

    peak_mb = 0.0
    if on_cuda:
        peak_mb = torch.cuda.max_memory_allocated(x.device) / (1024**2)

    print(f"{name:18s} | params={params:10d} | macs={macs/1e6:10.2f} M | lat={ms:7.3f} ms | peak={peak_mb:7.1f} MB | out={tuple(y.shape)}")


# ----------------------------------------------------------------------------------------------------
class PointwiseConv(nn.Module):
    """1x1 convolution followed by BatchNorm and ReLU.

    Args:
        cin: number of input channels.
        cout: number of output channels.
    """

    def __init__(self, cin, cout):
        super().__init__()
        layers = [
            nn.Conv2d(cin, cout, kernel_size=1, bias=False),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the conv -> BN -> ReLU stack."""
        return self.net(x)

# ----------------------------------------------------------------------------------------------------
class DepthwiseSeparableConv(nn.Module):
    """Depthwise 3x3 conv followed by a pointwise 1x1 conv, each with BN + ReLU.

    Args:
        cin: number of input channels (also the depthwise group count).
        cout: number of output channels produced by the pointwise conv.
    """

    def __init__(self, cin, cout):
        super().__init__()
        # Spatial filtering: one 3x3 filter per input channel.
        depthwise = [
            nn.Conv2d(cin, cin, kernel_size=3, padding=1, groups=cin, bias=False),
            nn.BatchNorm2d(cin),
            nn.ReLU(inplace=True),
        ]
        # Channel mixing: 1x1 conv from cin to cout.
        pointwise = [
            nn.Conv2d(cin, cout, kernel_size=1, bias=False),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*depthwise, *pointwise)

    def forward(self, x):
        """Apply the depthwise stage and then the pointwise stage to ``x``."""
        return self.net(x)

# ----------------------------------------------------------------------------------------------------
class InvertedBottleneck(nn.Module):
    """Expand (1x1) -> depthwise (3x3) -> project (1x1), each with BN + ReLU.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        expand: multiplier giving the hidden width ``cin * expand``.
    """

    def __init__(self, cin, cout, expand=4):
        super().__init__()
        hidden = cin * expand
        expand_stage = [
            nn.Conv2d(cin, hidden, kernel_size=1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
        ]
        depthwise_stage = [
            nn.Conv2d(hidden, hidden, kernel_size=3, padding=1, groups=hidden, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
        ]
        project_stage = [
            nn.Conv2d(hidden, cout, kernel_size=1, bias=False),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*expand_stage, *depthwise_stage, *project_stage)

    def forward(self, x):
        """Run ``x`` through the three stages in order."""
        return self.net(x)

# ----------------------------------------------------------------------------------------------------
class GhostConv(nn.Module):
    """Ghost convolution: a 1x1 primary conv plus a cheap depthwise 3x3 branch.

    The primary branch produces ``ceil(cout / ratio)`` channels. The cheap
    branch derives the remaining channels from the primary output with a
    grouped 3x3 conv, and the two are concatenated.

    Args:
        cin: number of input channels.
        cout: number of output channels returned by ``forward``.
        ratio: roughly ``cout / primary_channels``; the default 2 creates half
            of the output channels with the cheap branch.
    """

    def __init__(self, cin, cout, ratio=2):
        super().__init__()
        init = math.ceil(cout / ratio)
        missing = cout - init
        self.primary = nn.Sequential(
            nn.Conv2d(cin, init, 1, bias=False),
            nn.BatchNorm2d(init),
            nn.ReLU(inplace=True),
        )
        # When the primary branch already yields every channel there is nothing
        # left for the cheap branch (a Conv2d with 0 output channels is invalid).
        self.cheap = None
        if missing > 0:
            # A conv with groups=init needs an output width that is a multiple
            # of init, so round the missing channel count up; forward() trims
            # the surplus. Whenever `missing` is already a multiple of init
            # (e.g. ratio=2 with even cout) this equals `missing` exactly.
            new = math.ceil(missing / init) * init
            self.cheap = nn.Sequential(
                nn.Conv2d(init, new, 3, padding=1, groups=init, bias=False),
                nn.BatchNorm2d(new),
                nn.ReLU(inplace=True),
            )
        self.cout = cout

    def forward(self, x):
        """Return a tensor with exactly ``cout`` channels."""
        y = self.primary(x)
        if self.cheap is not None:
            y = torch.cat([y, self.cheap(y)], dim=1)
        # Drop any surplus channels introduced by rounding up.
        return y[:, :self.cout]

# ----------------------------------------------------------------------------------------------------
class DilatedConv(nn.Module):
    """Dilated 3x3 convolution followed by BatchNorm and ReLU.

    The padding is set equal to the dilation rate.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        dilation: dilation rate of the 3x3 kernel (also used as the padding).
    """

    def __init__(self, cin, cout, dilation=2):
        super().__init__()
        conv = nn.Conv2d(
            cin,
            cout,
            kernel_size=3,
            padding=dilation,
            dilation=dilation,
            bias=False,
        )
        self.net = nn.Sequential(conv, nn.BatchNorm2d(cout), nn.ReLU(inplace=True))

    def forward(self, x):
        """Run ``x`` through the dilated conv -> BN -> ReLU stack."""
        return self.net(x)

# Deformable conv: use torchvision's DeformConv2d when it can be imported.
# Any failure while importing it is treated as "not available"; in that case
# TVDeformConv2d is None and _HAS_DEFORM is False.
try:
    from torchvision.ops import DeformConv2d as TVDeformConv2d
    _HAS_DEFORM = True
except Exception:
    TVDeformConv2d = None
    _HAS_DEFORM = False

# ----------------------------------------------------------------------------------------------------
class DeformableConv(nn.Module):
    """Deformable conv + BN + ReLU, with a standard-conv fallback.

    When ``_HAS_DEFORM`` is true, an ordinary conv predicts ``2*k*k`` offset
    channels which drive ``TVDeformConv2d``. Otherwise a plain
    conv -> BN -> ReLU stack with the same ``k``/``padding`` is used.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        k: (integer) kernel size of the main convolution.
        padding: padding of the main convolution.
    """

    def __init__(self, cin, cout, k=3, padding=1):
        super().__init__()
        self.fallback = not _HAS_DEFORM
        if self.fallback:
            self.net = nn.Sequential(
                nn.Conv2d(cin, cout, k, padding=padding, bias=False),
                nn.BatchNorm2d(cout),
                nn.ReLU(inplace=True),
            )
        else:
            # The offset map needs the same spatial size as the deformable conv
            # output, so the offset conv shares its kernel size and padding
            # (a hard-coded 3x3/pad-1 only lines up for some (k, padding) pairs).
            # 2*k*k channels: one (dy, dx) pair per kernel tap.
            self.offset = nn.Conv2d(cin, 2 * k * k, k, padding=padding)
            self.conv = TVDeformConv2d(cin, cout, k, padding=padding, bias=False)
            self.bn = nn.BatchNorm2d(cout)
            self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply the deformable (or fallback) convolution block to ``x``."""
        if self.fallback:
            return self.net(x)
        off = self.offset(x)
        y = self.conv(x, off)
        return self.act(self.bn(y))

# ----------------------------------------------------------------------------------------------------
def shift2d(x, dirs=((0,0),(1,0),(-1,0),(0,1),(0,-1))):
    """Shift channel groups of a 4-D tensor spatially, zero-filling vacated cells.

    The channel axis is split into ``len(dirs)`` consecutive groups (the last
    group absorbs any remainder). Group ``i`` is moved by ``dirs[i] = (dx, dy)``:
    ``dx`` along the last axis and ``dy`` along the second-to-last axis. Values
    that would wrap around are replaced by zeros. The input is not modified.

    Args:
        x: tensor with four dimensions; channels are on axis 1.
        dirs: sequence of ``(dx, dy)`` integer shifts, one per channel group.

    Returns:
        Tensor of the same shape as ``x``.
    """
    _, channels, _, _ = x.shape
    n_groups = len(dirs)
    per_group = channels // n_groups
    split_sizes = [per_group] * (n_groups - 1)
    split_sizes.append(channels - per_group * (n_groups - 1))

    shifted = []
    for part, (dx, dy) in zip(x.split(split_sizes, dim=1), dirs):
        if dx != 0 or dy != 0:
            # roll returns a new tensor, so the in-place zeroing below never
            # touches the caller's data.
            part = torch.roll(part, shifts=(dy, dx), dims=(-2, -1))
            # Blank out the rows/columns that wrapped around.
            if dy > 0:
                part[..., :dy, :] = 0
            elif dy < 0:
                part[..., dy:, :] = 0
            if dx > 0:
                part[..., :, :dx] = 0
            elif dx < 0:
                part[..., :, dx:] = 0
        shifted.append(part)
    return torch.cat(shifted, dim=1)

class ShiftConv(nn.Module):
    """Parameter-free spatial shift followed by a 1x1 conv, BN and ReLU.

    Args:
        cin: number of input channels.
        cout: number of output channels.
    """

    def __init__(self, cin, cout):
        super().__init__()
        pointwise = [
            nn.Conv2d(cin, cout, kernel_size=1, bias=False),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
        ]
        self.pw = nn.Sequential(*pointwise)

    def forward(self, x):
        """Shift the channel groups of ``x`` with ``shift2d``, then mix channels."""
        shifted = shift2d(x)
        return self.pw(shifted)

# ----------------------------------------------------------------------------------------------------
class OctaveConv(nn.Module):
    """Octave-style convolution that takes and returns a single full-res tensor.

    The input channels are split into a high-frequency group (kept at full
    resolution) and a low-frequency group (average-pooled by 2). Four 3x3 convs
    exchange information between the two groups (hh, hl, lh, ll). The
    low-frequency output is upsampled back to the input resolution and
    concatenated after the high-frequency output, so the result has ``cout``
    channels at the input's spatial size.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        alpha: fraction of channels assigned to the low-frequency group.
            NOTE(review): values that leave either group with zero channels
            (e.g. 0 or 1) are presumably unsupported -- confirm before use.
    """

    def __init__(self, cin, cout, alpha=0.5):
        super().__init__()
        self.alpha = alpha
        ch_h_in = int(round(cin * (1 - alpha)))
        ch_l_in = cin - ch_h_in
        ch_h_out = int(round(cout * (1 - alpha)))
        ch_l_out = cout - ch_h_out

        self.pool = nn.AvgPool2d(2)
        # Kept as an attribute for backward compatibility; forward() resizes to
        # the exact input size instead so odd H/W also work.
        self.up = nn.Upsample(scale_factor=2, mode="nearest")

        self.hh = nn.Conv2d(ch_h_in, ch_h_out, 3, padding=1, bias=False)
        self.hl = nn.Conv2d(ch_h_in, ch_l_out, 3, padding=1, bias=False)
        self.lh = nn.Conv2d(ch_l_in, ch_h_out, 3, padding=1, bias=False)
        self.ll = nn.Conv2d(ch_l_in, ch_l_out, 3, padding=1, bias=False)

        self.bn_h = nn.BatchNorm2d(ch_h_out)
        self.bn_l = nn.BatchNorm2d(ch_l_out)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return a tensor with ``cout`` channels at the spatial size of ``x``."""
        _, C, H, W = x.shape
        ch_h = int(round(C * (1 - self.alpha)))
        xh = x[:, :ch_h]                 # high-frequency group, H x W
        xl = self.pool(x[:, ch_h:])      # low-frequency group, H/2 x W/2

        # Resize to the exact (H, W): a fixed x2 upsample yields 2*floor(H/2),
        # which is one short of H when H is odd. For even sizes the result is
        # the same nearest-neighbour x2 upsample.
        lh_up = F.interpolate(self.lh(xl), size=(H, W), mode="nearest")
        yh = self.hh(xh) + lh_up
        yl = self.ll(xl) + self.hl(self.pool(xh))

        yh = self.act(self.bn_h(yh))
        yl = self.act(self.bn_l(yl))

        yl_up = F.interpolate(yl, size=(H, W), mode="nearest")
        return torch.cat([yh, yl_up], dim=1)

# ----------------------------------------------------------------------------------------------------
class DynamicConv2d(nn.Module):
    """Mixture-of-kernels convolution: each sample gets its own blended kernel.

    A small router (global average pool -> 1x1 conv -> ReLU -> 1x1 conv)
    produces ``K`` logits per sample. Their softmax blends ``K`` learned
    kernels and biases, and every sample is convolved with its own mixture.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        k: spatial kernel size of the expert kernels.
        padding: padding used by the convolution.
        K: number of expert kernels.
        reduction: the router hidden width is ``max(1, cin // reduction)``.
        temperature: divisor applied to the router logits before the softmax.
    """

    def __init__(self, cin, cout, k=3, padding=1, K=4, reduction=4, temperature=1.0):
        super().__init__()
        self.K = K
        self.temperature = temperature
        squeeze = max(1, cin // reduction)
        router_layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(cin, squeeze, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(squeeze, K, kernel_size=1),
        ]
        self.router = nn.Sequential(*router_layers)
        # K expert kernels, each shaped (cout, cin, k, k), and K expert biases.
        self.weight = nn.Parameter(torch.randn(K, cout, cin, k, k) * 0.02)
        self.bias = nn.Parameter(torch.zeros(K, cout))
        self.padding = padding

    def forward(self, x):
        """Convolve every sample of the 4-D input with its own mixed kernel."""
        _, _, _, _ = x.shape  # input must be 4-D
        gate = torch.softmax(self.router(x).flatten(1) / self.temperature, dim=1)  # (B, K)

        # Blend the experts per sample:
        # (B, K) x (K, cout, cin, kh, kw) -> (B, cout, cin, kh, kw)
        kernels = torch.einsum("bk,kocij->bocij", gate, self.weight)
        biases = torch.einsum("bk,ko->bo", gate, self.bias)

        # One convolution per sample, because each sample has a different kernel.
        per_sample = [
            F.conv2d(sample, kernel, bias, padding=self.padding)
            for sample, kernel, bias in zip(x.split(1, dim=0), kernels, biases)
        ]
        return torch.cat(per_sample, dim=0)

# ----------------------------------------------------------------------------------------------------
class RepVGGBlock(nn.Module):
    """RepVGG-style block with a multi-branch form and a single-conv form.

    With ``deploy=False`` the block sums a 3x3 conv+BN branch, a 1x1 conv+BN
    branch and, when ``cin == cout`` and ``stride == 1``, a BN-only identity
    branch, then applies ReLU. With ``deploy=True`` it is a single biased 3x3
    conv followed by ReLU. No conversion between the two forms is defined here.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        stride: stride shared by all convolutions.
        deploy: build the single-conv form instead of the multi-branch form.
    """

    def __init__(self, cin, cout, stride=1, deploy=False):
        super().__init__()
        self.deploy = deploy
        self.act = nn.ReLU(inplace=True)
        self.cin = cin
        self.cout = cout
        self.stride = stride

        if deploy:
            self.rbr = nn.Conv2d(cin, cout, 3, stride=stride, padding=1, bias=True)
            return

        self.rbr3 = nn.Sequential(
            nn.Conv2d(cin, cout, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(cout),
        )
        self.rbr1 = nn.Sequential(
            nn.Conv2d(cin, cout, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(cout),
        )
        # The identity branch only makes sense when the shape is preserved.
        shape_preserved = cin == cout and stride == 1
        self.idbn = nn.BatchNorm2d(cout) if shape_preserved else None

    def forward(self, x):
        """Apply the block in whichever form it was constructed."""
        if self.deploy:
            return self.act(self.rbr(x))
        branches = self.rbr3(x) + self.rbr1(x)
        if self.idbn is not None:
            branches = branches + self.idbn(x)
        return self.act(branches)

# ----------------------------------------------------------------------------------------------------
def cords(b, h, w, device, dtype, add_rad=True):
    """Build normalized coordinate channels for a batch of feature maps.

    Both axes run linearly from -1 to 1. Channel 0 holds the x (column)
    coordinate, channel 1 the y (row) coordinate and, when ``add_rad`` is true,
    channel 2 the distance ``sqrt(x**2 + y**2)`` from the centre.

    Args:
        b: batch size.
        h: map height.
        w: map width.
        device: device for the created tensors.
        dtype: dtype for the created tensors.
        add_rad: append the radius channel.

    Returns:
        Tensor of shape ``(b, 3, h, w)``, or ``(b, 2, h, w)`` without radius.
    """
    rows = torch.linspace(-1.0, 1.0, steps=h, device=device, dtype=dtype)
    cols = torch.linspace(-1.0, 1.0, steps=w, device=device, dtype=dtype)

    # Broadcasting the 1-D axes gives the same grids as an "ij"-indexed
    # meshgrid: x varies along the last axis, y along the second-to-last.
    x_ch = cols.view(1, 1, 1, w).expand(b, 1, h, w)
    y_ch = rows.view(1, 1, h, 1).expand(b, 1, h, w)

    channels = [x_ch, y_ch]
    if add_rad:
        channels.append(torch.sqrt(x_ch**2 + y_ch**2))
    return torch.cat(channels, dim=1)

class CoordConv(nn.Module):
    """3x3 conv + BN + ReLU applied to the input with coordinate channels appended.

    Args:
        cin: number of input channels (before the coordinate channels).
        cout: number of output channels.
        with_r: append a radius channel in addition to the x and y channels.
    """

    def __init__(self, cin, cout, with_r=True):
        super().__init__()
        self.with_r = with_r
        # x and y are always appended; the radius channel is optional.
        n_coord = 2
        if with_r:
            n_coord = 3
        conv = nn.Conv2d(cin + n_coord, cout, kernel_size=3, padding=1, bias=False)
        self.net = nn.Sequential(conv, nn.BatchNorm2d(cout), nn.ReLU(inplace=True))

    def forward(self, x):
        """Append coordinate channels matching ``x`` and run the conv stack."""
        batch, _, height, width = x.shape
        grid = cords(batch, height, width, x.device, x.dtype, add_rad=self.with_r)
        augmented = torch.cat([x, grid], dim=1)
        return self.net(augmented)

# ----------------------------------------------------------------------------------------------------
class GroupConv(nn.Module):
    """Grouped 3x3 convolution followed by BatchNorm and ReLU.

    Args:
        cin: number of input channels.
        cout: number of output channels.
        groups: number of channel groups passed to the 3x3 conv.
    """

    def __init__(self, cin, cout, groups=4):
        super().__init__()
        grouped = nn.Conv2d(
            cin,
            cout,
            kernel_size=3,
            padding=1,
            groups=groups,
            bias=False,
        )
        self.net = nn.Sequential(grouped, nn.BatchNorm2d(cout), nn.ReLU(inplace=True))

    def forward(self, x):
        """Run ``x`` through the grouped conv -> BN -> ReLU stack."""
        return self.net(x)

# ----------------------------------------------------------------------------------------------------
# -------------------------
# RUN: compare the 12 convolution blocks
# -------------------------
# Use the GPU when one is available, otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
x = torch.randn(16, 64, 56, 56, device=device)

# Label -> block. Every block maps 64 input channels to 128 output channels,
# and the insertion order is the order in which the rows are printed.
blocks = dict([
    ("DepthwiseSep", DepthwiseSeparableConv(64, 128)),
    ("Pointwise", PointwiseConv(64, 128)),
    ("InvBottleneck", InvertedBottleneck(64, 128, expand=4)),
    ("Ghost", GhostConv(64, 128, ratio=2)),
    ("Dilated", DilatedConv(64, 128, dilation=2)),
    ("Deformable", DeformableConv(64, 128, k=3, padding=1)),
    ("Shift", ShiftConv(64, 128)),
    ("Octave", OctaveConv(64, 128, alpha=0.5)),
    # DynamicConv2d has no norm/activation of its own, so wrap it like the rest.
    ("Dynamic", nn.Sequential(
        DynamicConv2d(64, 128, k=3, padding=1, K=4),
        nn.BatchNorm2d(128),
        nn.ReLU(True),
    )),
    ("RepVGG(train)", RepVGGBlock(64, 128, stride=1, deploy=False)),
    ("CoordConv", CoordConv(64, 128, with_r=True)),
    ("GroupConv", GroupConv(64, 128, groups=4)),
])

print("device:", device, "| input:", tuple(x.shape), "| deformable_real:", _HAS_DEFORM)
for name, blk in blocks.items():
    profile_block(name, blk, x)


device: cpu | input: (16, 64, 56, 56) | deformable_real: True
DepthwiseSep       | params=      9152 | macs=    439.94 M | lat= 38.447 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Pointwise          | params=      8448 | macs=    411.04 M | lat= 25.804 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
InvBottleneck      | params=     52736 | macs=   2581.86 M | lat=132.369 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Ghost              | params=      4928 | macs=    234.42 M | lat= 31.326 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Dilated            | params=     73984 | macs=   3699.38 M | lat= 34.826 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Deformable         | params=     84370 | macs=    520.22 M | lat=479.308 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Shift              | params=      8448 | macs=    411.04 M | lat= 28.294 ms | peak=    0.0 MB | out=(16, 128, 56, 56)
Octave             | params=     73984 | macs=   1618.48 M | lat= 54.476 ms | peak=    0.0 MB | out=(16, 128, 56, 56)

In [None]:
import csv

def export_profiles_to_csv(
    blocks: dict,
    x: torch.Tensor,
    csv_path: str = "conv_profiles.csv",
):
    """Profile every block and write one CSV row per block.

    Each block is profiled with ``profile_block_return_row`` (which also prints
    its summary line), and the returned dicts are written with a header row.

    Args:
        blocks: mapping from a label to the module to profile.
        x: input tensor fed to every block.
        csv_path: destination file; it is overwritten if it exists.
    """
    rows = [profile_block_return_row(name, blk, x) for name, blk in blocks.items()]

    # Header used when there is nothing to profile. It lists every column a
    # populated file has, so empty and non-empty exports share one schema.
    default_fields = [
        "name", "params", "macs", "macs_m", "lat_ms", "peak_mb", "out_shape",
        "device", "batch", "in_shape",
    ]
    fieldnames = list(rows[0].keys()) if rows else default_fields

    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)

    print(f"[OK] CSV yazıldı: {csv_path}")

def profile_block_return_row(name: str, m: nn.Module, x: torch.Tensor):
    """Profile one block, print its summary line and return the numbers as a dict.

    NOTE: MACs are gathered only from ``nn.Conv2d`` and ``nn.Linear``
    submodules; work done through functional calls or other module types is
    not counted. The peak-memory reading is taken after the latency benchmark,
    so it covers both the counting pass and the timed runs.

    Args:
        name: label of the block.
        m: block to profile; it is moved to ``x``'s device and put in eval mode.
        x: input tensor used for the counting pass and the benchmark.

    Returns:
        Dict with keys ``name``, ``params``, ``macs``, ``macs_m``, ``lat_ms``,
        ``peak_mb``, ``out_shape``, ``device``, ``batch`` and ``in_shape``.
    """
    m = m.to(x.device).eval()
    on_cuda = x.device.type == "cuda"

    mac_counter = MacCounter()
    hooks = [
        mod.register_forward_hook(mac_counter.hook)
        for mod in m.modules()
        if isinstance(mod, (nn.Conv2d, nn.Linear))
    ]

    # Peak memory is tracked on the device that actually holds x.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(x.device)

    try:
        with torch.no_grad():
            y = m(x)
    finally:
        # Always detach the hooks, even if the forward pass raises, so the
        # module is not left with stale MAC-counting hooks.
        for h in hooks:
            h.remove()

    params = count_params(m)
    macs = mac_counter.macs
    lat_ms = benchmark_latency(m, x, iters=200, warmup=50)

    peak_mb = 0.0
    if on_cuda:
        peak_mb = torch.cuda.max_memory_allocated(x.device) / (1024**2)

    out_shape = tuple(y.shape)

    print(
        f"{name:18s} | params={params:10d} | macs={macs/1e6:10.2f} M | "
        f"lat={lat_ms:7.3f} ms | peak={peak_mb:7.1f} MB | out={out_shape}"
    )

    return {
        "name": name,
        "params": int(params),
        "macs": int(macs),
        "macs_m": float(macs / 1e6),
        "lat_ms": float(lat_ms),
        "peak_mb": float(peak_mb),
        "out_shape": str(out_shape),
        "device": str(x.device),
        "batch": int(x.shape[0]),
        "in_shape": str(tuple(x.shape)),
    }

### hangi durumda hangi conv mantıklı?

| Conv türü                        | Ne işe yarar (öz)                         | Ne zaman mantıklı                                              | Artı                                      | Eksi / Risk                                                |
| -------------------------------- | ----------------------------------------- | -------------------------------------------------------------- | ----------------------------------------- | ---------------------------------------------------------- |
| **Pointwise (1×1)**              | Kanal karıştırır, boyutu değiştirir       | Blok içinde “kanal artır/azalt”, bottleneck                    | Çok hızlı, ucuz                           | Tek başına uzamsal (spatial) desen yakalamaz               |
| **Depthwise Separable**          | Uzamsal + kanal işlemini ayırır           | Mobil/edge, hızlı backbone (MobileNet tarzı)                   | MAC/param çok düşer                       | Aynı FLOP’ta bazen doğruluk düşebilir                      |
| **Inverted Bottleneck (MBConv)** | Genişlet → depthwise → sıkıştır           | MobileNetV2/V3, EfficientNet benzeri                           | Verim/accuracy dengesi iyi                | Tasarım parametreleri (expand, SE vs) hassas               |
| **GhostConv**                    | “Gerçek” kanalı az üret, kalanı ucuz üret | Hız/param çok kritikse                                         | Çok ucuz; pratikte hızlı                  | Bazı görevlerde temsil gücü düşebilir                      |
| **Dilated Conv**                 | Daha geniş receptive field                | Segmentation, context isteyen işler                            | Downsample etmeden geniş görüş            | “Gridding” artefact; küçük objelerde zarar                 |
| **Deformable Conv**              | Örnekleme noktaları kayar (adaptif)       | Nesne şekli değişken: detection/segmentation                   | Zor geometrilerde güçlü                   | Daha yavaş/karmaşık; her ortamda stabil değil              |
| **ShiftConv**                    | Öğrenmesiz kaydırma + 1×1                 | Çok ucuz uzamsal etki istiyorsan                               | Neredeyse bedava spatial                  | Öğrenme kapasitesi sınırlı; doğruluk riski                 |
| **OctaveConv**                   | Frekans ayrımı (high/low)                 | Büyük feature map’lerde verim                                  | Bellek/MAC düşebilir                      | Uygulaması/entegrasyonu daha “özel”, her yerde kazandırmaz |
| **DynamicConv (mixture)**        | Girdiye göre kernel karışımı              | Veri çeşitliliği yüksekse, adaptif istenirse                   | Kapasite artar (aynı katmanda adaptasyon) | Latency artar; batch başına döngü pahalı                   |
| **RepVGG (train)**               | Train’de çok dal, deploy’da tek conv      | Üretimde hız isterken train’de güçlü yapı                      | Deploy’da çok hızlı                       | Re-parameterize adımı gerekir (deploy sürümü)              |
| **CoordConv**                    | X-Y (ve r) koordinat kanalı ekler         | Konum kritik: lokasyon/ısı haritası, keypoint, basit detection | “Konum öğrenmeyi” kolaylaştırır           | Translational invariance azalır; her yerde iyi değil       |
| **GroupConv**                    | Kanalları gruplara böler                  | Param/MAC azaltmak, ResNeXt tarzı                              | Verim iyi, kontrol edilebilir             | Gruplar artarsa kanal etkileşimi azalır                    |
