## Adım 1-) 

In [1]:
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Channel attention gate used by the baseline CBAM block.

    The input is pooled to 1x1 with both average and max pooling, each
    descriptor is passed through the same 1x1-conv bottleneck, the two
    results are summed, squashed by a sigmoid and multiplied onto the input.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
    """

    def __init__(self, channels: int, reduction: int = 16, min_hidden: int = 4):
        super().__init__()
        bottleneck = max(min_hidden, channels // reduction)

        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx = nn.AdaptiveMaxPool2d(1)

        # The same bottleneck is applied to both pooled descriptors.
        squeeze = nn.Conv2d(channels, bottleneck, kernel_size=1, bias=True)
        excite = nn.Conv2d(bottleneck, channels, kernel_size=1, bias=True)
        self.mlp = nn.Sequential(squeeze, nn.ReLU(inplace=True), excite)

        self.gate = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the per-channel attention mask."""
        avg_logits = self.mlp(self.avg(x))
        max_logits = self.mlp(self.mx(x))
        mask = self.gate(avg_logits + max_logits)
        return x * mask

class SpatialAttention(nn.Module):
    """Spatial attention gate used by the baseline CBAM block.

    The mean and max over dim 1 are concatenated into a two-channel map,
    convolved down to one channel, passed through a sigmoid and multiplied
    onto the input.

    Args:
        kernel_size: Side length of the conv kernel. Even values are
            incremented by one so the ``kernel_size // 2`` padding keeps the
            spatial size unchanged.
    """

    def __init__(self, kernel_size: int = 7):
        super().__init__()
        odd_kernel = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size
        self.conv = nn.Conv2d(
            2,
            1,
            kernel_size=odd_kernel,
            padding=odd_kernel // 2,
            bias=False,
        )
        self.gate = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the single-channel spatial mask."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        descriptor = torch.cat((channel_mean, channel_max), dim=1)
        mask = self.gate(self.conv(descriptor))
        return x * mask

class CBAM(nn.Module):
    """Baseline CBAM block: channel attention followed by spatial attention.

    Args:
        channels: Channel count forwarded to ``ChannelAttention``.
        reduction: Bottleneck divisor forwarded to ``ChannelAttention``.
        sa_kernel: Kernel size forwarded to ``SpatialAttention``.
    """

    def __init__(self, channels: int, reduction: int = 16, sa_kernel: int = 7):
        super().__init__()
        self.ca = ChannelAttention(channels, reduction=reduction)
        self.sa = SpatialAttention(kernel_size=sa_kernel)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the channel gate, then the spatial gate."""
        channel_refined = self.ca(x)
        return self.sa(channel_refined)

## Adım 2-) ChannelAttention’i sağlamlaştıralım: gate + act seçimi

* Bazı durumlarda ReLU yerine SiLU daha stabil olabiliyor; ayrıca sigmoid/hardsigmoid seçimi mobil vb. ortamlar için işe yarar.



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttentionV1(nn.Module):
    """Channel attention with a selectable hidden activation and gate.

    Average- and max-pooled 1x1 descriptors go through the same
    ``fc1 -> act -> fc2`` bottleneck, are summed, gated and multiplied onto
    the input.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        act: Hidden activation, ``"relu"`` or ``"silu"`` (case-insensitive).
        gate: Gate function, ``"sigmoid"`` or ``"hardsigmoid"``
            (case-insensitive).
        bias: Whether the two 1x1 convolutions use a bias term.

    Raises:
        ValueError: If ``act`` or ``gate`` is not a supported name.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        act: str = "relu",          # "relu" | "silu"
        gate: str = "sigmoid",      # "sigmoid" | "hardsigmoid"
        bias: bool = True,
    ):
        super().__init__()
        # Reject unknown activations instead of silently falling back to SiLU.
        act_name = act.lower()
        if act_name not in ("relu", "silu"):
            raise ValueError("act 'relu' veya 'silu' olmalı.")

        hidden = max(min_hidden, channels // reduction)

        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=bias)
        self.act = nn.ReLU(inplace=True) if act_name == "relu" else nn.SiLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=bias)

        g = gate.lower()
        if g == "sigmoid":
            self.gate_fn = torch.sigmoid
        elif g == "hardsigmoid":
            self.gate_fn = F.hardsigmoid
        else:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the per-channel attention mask."""
        a = self.fc2(self.act(self.fc1(self.avg(x))))
        m = self.fc2(self.act(self.fc1(self.mx(x))))
        w = self.gate_fn(a + m)
        return x * w

## Adım 3-) Avg/Max birleşimini “öğrenilebilir” yapalım (fusion)

* Şu an avg+max “sabit” bir kural. Bunu öğrenilebilir yaparsak bazı modellerde daha iyi olur.

In [None]:
class ChannelAttentionV2(nn.Module):
    """Channel attention with a learnable avg/max fusion.

    Average- and max-pooled 1x1 descriptors go through the same
    ``fc1 -> act -> fc2`` bottleneck. They are then either summed
    (``fusion="sum"``) or mixed with softmax-normalised learnable weights
    (``fusion="softmax"``) before the gate.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        act: Hidden activation, ``"relu"`` or ``"silu"`` (case-insensitive).
        gate: Gate function, ``"sigmoid"`` or ``"hardsigmoid"``
            (case-insensitive).
        fusion: ``"sum"`` or ``"softmax"``.
        bias: Whether the two 1x1 convolutions use a bias term.

    Raises:
        ValueError: If ``act``, ``gate`` or ``fusion`` is not supported.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        act: str = "relu",
        gate: str = "sigmoid",
        fusion: str = "softmax",    # "sum" | "softmax"
        bias: bool = True,
    ):
        super().__init__()
        # Validate up front so a bad option never leaves a half-built module
        # and an unknown activation no longer silently becomes SiLU.
        act_name = act.lower()
        if act_name not in ("relu", "silu"):
            raise ValueError("act 'relu' veya 'silu' olmalı.")
        if fusion not in ("sum", "softmax"):
            raise ValueError("fusion 'sum' veya 'softmax' olmalı.")

        hidden = max(min_hidden, channels // reduction)

        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=bias)
        self.act = nn.ReLU(inplace=True) if act_name == "relu" else nn.SiLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=bias)

        g = gate.lower()
        if g == "sigmoid":
            self.gate_fn = torch.sigmoid
        elif g == "hardsigmoid":
            self.gate_fn = F.hardsigmoid
        else:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")

        self.fusion = fusion

        # Registered for both fusion modes so the state_dict keys do not
        # depend on ``fusion``; it is only read in the "softmax" branch.
        self.fusion_logits = nn.Parameter(torch.zeros(2))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the per-channel attention mask."""
        a = self.fc2(self.act(self.fc1(self.avg(x))))
        m = self.fc2(self.act(self.fc1(self.mx(x))))

        if self.fusion == "sum":
            z = a + m
        else:
            # Zero-initialised logits give a 0.5 / 0.5 mix at the start.
            w2 = torch.softmax(self.fusion_logits, dim=0)
            z = w2[0] * a + w2[1] * m

        w = self.gate_fn(z)
        return x * w

## Adım 4-) SpatialAttention’i genişlet: kernel + farklı “pool” kombinasyonları

* Bazı senaryolarda avg+max yerine farklı birleştirme (örn. sum veya concat) denenebilir. Biz standart concatı koruyup kontrol ekliyoruz.

In [None]:
class SpatialAttentionV1(nn.Module):
    """Spatial attention with a selectable gate and optional conv bias.

    The mean and max over dim 1 are concatenated, convolved to a single
    channel, gated and multiplied onto the input.

    Args:
        kernel_size: Side length of the conv kernel; even values are
            incremented by one.
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        bias: Whether the convolution uses a bias term.

    Raises:
        ValueError: If ``gate`` is not a supported name.
    """

    def __init__(
        self,
        kernel_size: int = 7,
        gate: str = "sigmoid",      # "sigmoid" | "hardsigmoid"
        bias: bool = False,
    ):
        super().__init__()
        odd_kernel = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size
        self.conv = nn.Conv2d(
            2, 1, kernel_size=odd_kernel, padding=odd_kernel // 2, bias=bias
        )

        available_gates = {"sigmoid": torch.sigmoid, "hardsigmoid": F.hardsigmoid}
        gate_name = gate.lower()
        if gate_name not in available_gates:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")
        self.gate_fn = available_gates[gate_name]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the single-channel spatial mask."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        descriptor = torch.cat((channel_mean, channel_max), dim=1)
        mask = self.gate_fn(self.conv(descriptor))
        return x * mask

## Adım 5-) CBAM’e “residual scaling” ekleyelim
Bu, attention’ı “tam çarpma” yerine daha stabil hale getirir:

* Normal: x * w

* Residual: x + alpha * (x * w - x)  → alpha = 0 iken çıktı x (identity), alpha = 1 iken klasik x * w

alpha küçükse, model bozulmadan attention öğrenmeye başlar.

In [None]:
class CBAMV2(nn.Module):
    """CBAM block with an optional residual blend controlled by ``alpha``.

    Runs ``ChannelAttentionV2`` then ``SpatialAttentionV1``. With
    ``residual=True`` the result is ``x + alpha * (y - x)``, where ``y`` is
    the attended tensor; otherwise ``y`` is returned directly.

    Args:
        channels: Channel count forwarded to the channel attention.
        reduction: Bottleneck divisor forwarded to the channel attention.
        sa_kernel: Kernel size forwarded to the spatial attention.
        ca_act: Activation name forwarded to the channel attention.
        ca_gate: Gate name forwarded to the channel attention.
        sa_gate: Gate name forwarded to the spatial attention.
        ca_fusion: Fusion mode forwarded to the channel attention.
        residual: Whether to blend the attended output with the input.
        alpha_init: Initial value of ``alpha`` (only used if ``residual``).
        learnable_alpha: Store ``alpha`` as a parameter instead of a buffer.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        sa_kernel: int = 7,
        ca_act: str = "relu",
        ca_gate: str = "sigmoid",
        sa_gate: str = "sigmoid",
        ca_fusion: str = "softmax",     # "sum" | "softmax"
        residual: bool = True,
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
    ):
        super().__init__()
        self.ca = ChannelAttentionV2(
            channels,
            reduction=reduction,
            act=ca_act,
            gate=ca_gate,
            fusion=ca_fusion,
        )
        self.sa = SpatialAttentionV1(kernel_size=sa_kernel, gate=sa_gate)
        self.residual = residual

        # ``alpha`` only exists when the residual blend is enabled.
        if not residual:
            return
        alpha = torch.tensor(float(alpha_init))
        if learnable_alpha:
            self.alpha = nn.Parameter(alpha)
        else:
            self.register_buffer("alpha", alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply both attention stages and the optional residual blend."""
        attended = self.sa(self.ca(x))
        if not self.residual:
            return attended
        return x + self.alpha * (attended - x)


## Adım 6-) “Çıktıyla beraber maskeleri de döndürelim” (debug/analiz için)

Training’de kapalı tutarız, ama görselleştirme/analiz için çok iyi.

In [None]:
class CBAMV3(nn.Module):
    """Self-contained CBAM block that can also return its attention masks.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the channel
            bottleneck (never smaller than 4).
        sa_kernel: Kernel size of the spatial conv; even values are
            incremented by one.
        return_maps: If true, ``forward`` returns ``(out, ca, sa)`` instead
            of just ``out``.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        sa_kernel: int = 7,
        return_maps: bool = False,
    ):
        super().__init__()
        self.return_maps = return_maps

        # --- channel attention pieces ---
        bottleneck = max(4, channels // reduction)
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx = nn.AdaptiveMaxPool2d(1)
        self.fc1 = nn.Conv2d(channels, bottleneck, kernel_size=1, bias=True)
        self.act = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(bottleneck, channels, kernel_size=1, bias=True)

        # --- spatial attention pieces ---
        odd_kernel = sa_kernel + 1 if sa_kernel % 2 == 0 else sa_kernel
        self.sa_conv = nn.Conv2d(
            2, 1, kernel_size=odd_kernel, padding=odd_kernel // 2, bias=False
        )

    def _shared_mlp(self, pooled: torch.Tensor) -> torch.Tensor:
        """Run one pooled descriptor through ``fc1 -> act -> fc2``."""
        return self.fc2(self.act(self.fc1(pooled)))

    def forward(self, x: torch.Tensor):
        """Return the attended tensor, plus both masks if ``return_maps``."""
        ca = torch.sigmoid(
            self._shared_mlp(self.avg(x)) + self._shared_mlp(self.mx(x))
        )
        y = x * ca

        # The spatial descriptor is built from the channel-refined tensor.
        channel_mean = y.mean(dim=1, keepdim=True)
        channel_max = y.max(dim=1, keepdim=True).values
        descriptor = torch.cat((channel_mean, channel_max), dim=1)
        sa = torch.sigmoid(self.sa_conv(descriptor))
        z = y * sa

        return (z, ca, sa) if self.return_maps else z


---
---
----

# Şimdi ise yukarıdaki 6 adımı tek bir modele entegre edelim...

----
----
---

## CBAM - MODEL - ATTENTION

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttention(nn.Module):
    """Channel attention that also returns its mask and fusion weights.

    Average- and max-pooled 1x1 descriptors go through the same
    ``fc1 -> act -> fc2`` bottleneck. They are then summed
    (``fusion="sum"``) or mixed with softmax-normalised learnable weights
    (``fusion="softmax"``) before the gate.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        act: Hidden activation, ``"relu"`` or ``"silu"`` (case-insensitive).
        gate: Gate function, ``"sigmoid"`` or ``"hardsigmoid"``
            (case-insensitive).
        fusion: ``"sum"`` or ``"softmax"``.
        bias: Whether the two 1x1 convolutions use a bias term.

    Raises:
        ValueError: If ``act``, ``gate`` or ``fusion`` is not supported.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        act: str = "relu",              # "relu" | "silu"
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        fusion: str = "softmax",        # "sum" | "softmax"
        bias: bool = True,
    ):
        super().__init__()
        # Validate up front; an unknown activation used to become SiLU silently.
        act_name = act.lower()
        if act_name not in ("relu", "silu"):
            raise ValueError("act 'relu' veya 'silu' olmalı.")
        if fusion not in ("sum", "softmax"):
            raise ValueError("fusion 'sum' veya 'softmax' olmalı.")

        hidden = max(min_hidden, channels // reduction)

        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=bias)
        self.act = nn.ReLU(inplace=True) if act_name == "relu" else nn.SiLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=bias)

        g = gate.lower()
        if g == "sigmoid":
            self.gate_fn = torch.sigmoid
        elif g == "hardsigmoid":
            self.gate_fn = F.hardsigmoid
        else:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")

        self.fusion = fusion

        # Registered for both fusion modes so the state_dict keys do not
        # depend on ``fusion``; it is only read in the "softmax" branch.
        self.fusion_logits = nn.Parameter(torch.zeros(2))

    def forward(self, x: torch.Tensor):
        """Return ``(y, ca, w2)``.

        ``y`` is the gated input, ``ca`` the channel mask and ``w2`` the
        softmax fusion weights (``None`` when ``fusion == "sum"``).
        """
        a = self.fc2(self.act(self.fc1(self.avg(x))))
        m = self.fc2(self.act(self.fc1(self.mx(x))))

        if self.fusion == "sum":
            z = a + m
            w2 = None
        else:
            w2 = torch.softmax(self.fusion_logits, dim=0)
            z = w2[0] * a + w2[1] * m

        ca = self.gate_fn(z)
        y = x * ca
        return y, ca, w2


class SpatialAttention(nn.Module):
    """Spatial attention that returns both the gated tensor and its mask.

    Args:
        kernel_size: Side length of the conv kernel; even values are
            incremented by one.
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        bias: Whether the convolution uses a bias term.

    Raises:
        ValueError: If ``gate`` is not a supported name.
    """

    def __init__(
        self,
        kernel_size: int = 7,
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        bias: bool = False,
    ):
        super().__init__()
        if kernel_size % 2 == 0:
            kernel_size = kernel_size + 1
        self.conv = nn.Conv2d(
            2, 1, kernel_size=kernel_size, padding=kernel_size // 2, bias=bias
        )

        gate_table = {"sigmoid": torch.sigmoid, "hardsigmoid": F.hardsigmoid}
        gate_key = gate.lower()
        if gate_key not in gate_table:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")
        self.gate_fn = gate_table[gate_key]

    def forward(self, x: torch.Tensor):
        """Return ``(y, sa)``: the gated input and the spatial mask."""
        pooled = torch.cat(
            (
                x.mean(dim=1, keepdim=True),
                x.max(dim=1, keepdim=True).values,
            ),
            dim=1,
        )
        sa = self.gate_fn(self.conv(pooled))
        return x * sa, sa


class CBAM(nn.Module):
    """Integrated CBAM block: fusion, gate choice, residual blend, map output.

    Runs ``ChannelAttention`` then ``SpatialAttention``. With
    ``residual=True`` the output is ``x + alpha * (y - x)``, where ``y`` is
    the attended tensor. With ``return_maps=True`` the masks and fusion
    weights are returned alongside the output.

    Args:
        channels: Channel count forwarded to the channel attention.
        reduction: Bottleneck divisor forwarded to the channel attention.
        min_hidden: Minimum bottleneck width forwarded to the channel attention.
        ca_act: Activation name forwarded to the channel attention.
        ca_gate: Gate name forwarded to the channel attention.
        ca_fusion: Fusion mode forwarded to the channel attention.
        sa_kernel: Kernel size forwarded to the spatial attention.
        sa_gate: Gate name forwarded to the spatial attention.
        residual: Whether to blend the attended output with the input.
        alpha_init: Initial value of ``alpha`` (only used if ``residual``).
        learnable_alpha: Store ``alpha`` as a parameter instead of a buffer.
        return_maps: If true, ``forward`` returns
            ``(out, ca, sa, fusion_w)`` instead of ``out``.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        ca_act: str = "relu",
        ca_gate: str = "sigmoid",
        ca_fusion: str = "softmax",     # "sum" | "softmax"
        sa_kernel: int = 7,
        sa_gate: str = "sigmoid",
        residual: bool = True,
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
        return_maps: bool = False,
    ):
        super().__init__()
        self.return_maps = return_maps
        self.residual = residual

        channel_kwargs = dict(
            channels=channels,
            reduction=reduction,
            min_hidden=min_hidden,
            act=ca_act,
            gate=ca_gate,
            fusion=ca_fusion,
        )
        self.ca = ChannelAttention(**channel_kwargs)
        self.sa = SpatialAttention(kernel_size=sa_kernel, gate=sa_gate)

        # ``alpha`` only exists when the residual blend is enabled.
        if not residual:
            return
        alpha = torch.tensor(float(alpha_init))
        if learnable_alpha:
            self.alpha = nn.Parameter(alpha)
        else:
            self.register_buffer("alpha", alpha)

    def forward(self, x: torch.Tensor):
        """Apply both attention stages and the optional residual blend."""
        attended, ca, fusion_w = self.ca(x)
        attended, sa = self.sa(attended)

        out = x + self.alpha * (attended - x) if self.residual else attended

        return (out, ca, sa, fusion_w) if self.return_maps else out


if __name__ == "__main__":
    # Smoke test: build the integrated CBAM with every option spelled out and
    # print the shapes of the output, both masks and the fusion weights.
    x = torch.randn(2, 64, 56, 56)
    cbam_kwargs = {
        "channels": 64,
        "reduction": 16,
        "min_hidden": 4,
        "ca_act": "silu",
        "ca_gate": "sigmoid",
        "ca_fusion": "softmax",
        "sa_kernel": 7,
        "sa_gate": "sigmoid",
        "residual": True,
        "alpha_init": 1.0,
        "learnable_alpha": True,
        "return_maps": True,
    }
    m = CBAM(**cbam_kwargs)
    y, ca, sa, fw = m(x)
    print(
        "x:", x.shape,
        "y:", y.shape,
        "ca:", ca.shape,
        "sa:", sa.shape,
        "fusion_w:", fw.shape if fw is not None else None,
    )


x: torch.Size([2, 64, 56, 56]) y: torch.Size([2, 64, 56, 56]) ca: torch.Size([2, 64, 1, 1]) sa: torch.Size([2, 1, 56, 56]) fusion_w: torch.Size([2])


---
---
---

# Adım 1 — Residual CBAM (Alpha ile)
### Ne problemi çözüyor?

Klasik CBAM:

* out = x * mask yaptığı için,

- attention daha baştan agresif davranırsa bazı kanalları/konumları gereğinden fazla baskılayıp eğitimi bozabilir.

- Özellikle detection/segmentation gibi büyük sistemlerde “train instability” ve “performans dalgalanması” yaratabilir.

Residual form:

- CBAM etkisini “yumuşatır” ve modeli başta identity’e yakın tutar.

* Attention öğrenirken backbone’u kırmaz.

In [6]:
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Plain channel attention used by the residual CBAM variant.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
    """

    def __init__(self, channels: int, reduction: int = 16, min_hidden: int = 4):
        super().__init__()
        width = max(min_hidden, channels // reduction)
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx = nn.AdaptiveMaxPool2d(1)
        layers = [
            nn.Conv2d(channels, width, kernel_size=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(width, channels, kernel_size=1, bias=True),
        ]
        self.mlp = nn.Sequential(*layers)
        self.gate = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the sigmoid of the summed descriptors."""
        # Both pooled descriptors share the same bottleneck weights.
        summed = self.mlp(self.avg(x)) + self.mlp(self.mx(x))
        return x * self.gate(summed)

class SpatialAttention(nn.Module):
    """Plain spatial attention used by the residual CBAM variant.

    Args:
        kernel_size: Side length of the conv kernel; even values are
            incremented by one before the padding is derived.
    """

    def __init__(self, kernel_size: int = 7):
        super().__init__()
        is_even = kernel_size % 2 == 0
        k = kernel_size + 1 if is_even else kernel_size
        self.conv = nn.Conv2d(2, 1, kernel_size=k, padding=k // 2, bias=False)
        self.gate = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the single-channel spatial mask."""
        mean_over_channels = torch.mean(x, dim=1, keepdim=True)
        max_over_channels = torch.max(x, dim=1, keepdim=True)[0]
        stacked = torch.cat((mean_over_channels, max_over_channels), dim=1)
        return x * self.gate(self.conv(stacked))

class CBAMResidual(nn.Module):
    """CBAM whose output is blended with the input through ``alpha``.

    The output is ``x + alpha * (y - x)``, where ``y`` is the result of the
    channel gate followed by the spatial gate.

    Args:
        channels: Channel count forwarded to ``ChannelAttention``.
        reduction: Bottleneck divisor forwarded to ``ChannelAttention``.
        sa_kernel: Kernel size forwarded to ``SpatialAttention``.
        alpha_init: Initial value of ``alpha``.
        learnable_alpha: Store ``alpha`` as a parameter instead of a buffer.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        sa_kernel: int = 7,
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
    ):
        super().__init__()
        self.ca = ChannelAttention(channels, reduction=reduction)
        self.sa = SpatialAttention(kernel_size=sa_kernel)

        alpha = torch.tensor(float(alpha_init))
        if learnable_alpha:
            self.alpha = nn.Parameter(alpha)
        else:
            # A buffer follows the module across devices but is not trained.
            self.register_buffer("alpha", alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the alpha-weighted blend of the input and attended tensor."""
        attended = self.sa(self.ca(x))
        delta = attended - x
        return x + self.alpha * delta


---
---

## Adım 2 — Learnable Fusion + Residual CBAM

Bu adımın çözmeye çalıştığı problem şu:

* Klasik CBAM’de avg ve max daima eşit ağırlıkla toplanıyor.

* Ama bazı katmanlarda avg daha faydalı (genel bağlam), bazı katmanlarda max daha faydalı (pik/nesne sinyali).

* Sabit toplama bazen maskeyi “yanlış yönde” sertleştirebiliyor.

Bu yüzden fusion’ı iki modlu yapıyoruz:

* fusion="sum" → klasik davranış

* fusion="softmax" → öğrenilebilir ağırlık ile avg/max karışımı

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttentionFusion(nn.Module):
    """Channel attention with an optional learnable avg/max fusion.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        fusion: ``"sum"`` adds the two descriptor logits; ``"softmax"``
            mixes them with two softmax-normalised learnable weights.

    Raises:
        ValueError: If ``fusion`` is not ``"sum"`` or ``"softmax"``.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        fusion: str = "softmax",   # "sum" | "softmax"
    ):
        super().__init__()
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if fusion not in ("sum", "softmax"):
            raise ValueError("fusion 'sum' veya 'softmax' olmalı.")
        self.fusion = fusion

        hidden = max(min_hidden, channels // reduction)
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=True)
        self.act = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=True)

        self.gate = nn.Sigmoid()

        # The mixing logits only exist in "softmax" mode.
        if fusion == "softmax":
            self.fusion_logits = nn.Parameter(torch.zeros(2))
        else:
            self.fusion_logits = None

    def _mlp(self, s: torch.Tensor) -> torch.Tensor:
        """Run one pooled descriptor through ``fc1 -> act -> fc2``."""
        return self.fc2(self.act(self.fc1(s)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the per-channel attention mask."""
        a = self._mlp(self.avg(x))
        m = self._mlp(self.mx(x))

        if self.fusion == "sum":
            z = a + m
        else:
            w2 = torch.softmax(self.fusion_logits, dim=0)  # (2,)
            z = w2[0] * a + w2[1] * m

        w = self.gate(z)
        return x * w


class SpatialAttention(nn.Module):
    """Plain spatial attention paired with the fusion channel attention.

    Args:
        kernel_size: Side length of the conv kernel; an even value is
            replaced by the next odd one.
    """

    def __init__(self, kernel_size: int = 7):
        super().__init__()
        # ``x % 2 == 0`` is exactly the "even" case that gets bumped by one.
        kernel = kernel_size + int(kernel_size % 2 == 0)
        half = kernel // 2
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel, padding=half, bias=False)
        self.gate = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the single-channel spatial mask."""
        max_map = x.max(dim=1, keepdim=True).values
        avg_map = x.mean(dim=1, keepdim=True)
        logits = self.conv(torch.cat((avg_map, max_map), dim=1))
        return x * self.gate(logits)


class CBAMResidualV2(nn.Module):
    """Residual CBAM whose channel stage uses ``ChannelAttentionFusion``.

    The output is ``x + alpha * (y - x)``, where ``y`` is the result of the
    channel gate followed by the spatial gate.

    Args:
        channels: Channel count forwarded to the channel attention.
        reduction: Bottleneck divisor forwarded to the channel attention.
        min_hidden: Minimum bottleneck width forwarded to the channel attention.
        sa_kernel: Kernel size forwarded to ``SpatialAttention``.
        ca_fusion: Fusion mode forwarded to the channel attention.
        alpha_init: Initial value of ``alpha``.
        learnable_alpha: Store ``alpha`` as a parameter instead of a buffer.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        sa_kernel: int = 7,
        ca_fusion: str = "softmax",   # "sum" | "softmax"
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
    ):
        super().__init__()
        self.ca = ChannelAttentionFusion(
            channels=channels, reduction=reduction, min_hidden=min_hidden, fusion=ca_fusion
        )
        self.sa = SpatialAttention(kernel_size=sa_kernel)

        alpha = torch.tensor(float(alpha_init))
        if learnable_alpha:
            self.alpha = nn.Parameter(alpha)
        else:
            self.register_buffer("alpha", alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the alpha-weighted blend of the input and attended tensor."""
        attended = self.sa(self.ca(x))
        return x + self.alpha * (attended - x)


#### Ne değişti? (kodda net fark)

Klasik CA:
```python
z = mlp(avg) + mlp(max)
```


Yeni CA:
```python
w2 = softmax([p1, p2])
z = w2[0]*mlp(avg) + w2[1]*mlp(max)
```


Bu w2 iki parametreyle öğreniliyor:

* Bir katmanda avg daha iyi ise, ağırlık avg tarafına kayıyor.

* Başka bir katmanda max daha iyi ise, max tarafına kayıyor.

----
---


## Adım 3 — Temperature + Gate + Residual + Fusion
Bu adımın çözdüğü problemler:

* Mask çok keskinleşip (0’a yakın / 1’e yakın) feature’ları boğabiliyor → eğitim dalgalanır.

* Bazı modellerde sigmoid “yumuşak” kalıyor, bazılarında tam tersi “aşırı agresif” oluyor.

* Özellikle detection/segmentation’da bu “stability” meselesi sık görülür.

Çözüm:

* Gate’i seçilebilir yap (sigmoid / hardsigmoid)

Maskeyi üretmeden önce logit’i temperature ile ölçekle:

* T > 1 → daha yumuşak maske

* T < 1 → daha keskin maske

* İstersek T learnable da olabilir.

Aşağıda Adım 2’nin üstüne eklenmiş hali mevcut.



In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttentionFusionT(nn.Module):
    """Channel attention with fusion, selectable gate and temperature.

    The fused logits are divided by a temperature ``T`` before the gate.
    ``T`` is either a fixed buffer or, when ``learnable_temperature`` is
    set, ``softplus(t_raw) + eps`` with ``t_raw`` a trainable parameter.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        fusion: ``"sum"`` or ``"softmax"``.
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        temperature: Initial (or fixed) temperature; must be positive.
        learnable_temperature: Whether the temperature is trained.
        eps: Offset added to the softplus output of the learnable temperature.

    Raises:
        ValueError: If ``fusion`` or ``gate`` is unsupported, or
            ``temperature`` is not positive.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        fusion: str = "softmax",        # "sum" | "softmax"
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        temperature: float = 1.0,
        learnable_temperature: bool = False,
        eps: float = 1e-6,
    ):
        super().__init__()
        # Explicit raises instead of ``assert`` so the checks survive ``-O``.
        if fusion not in ("sum", "softmax"):
            raise ValueError("fusion 'sum' veya 'softmax' olmalı.")
        if temperature <= 0:
            raise ValueError("temperature pozitif olmalı.")
        self.fusion = fusion
        self.eps = eps

        hidden = max(min_hidden, channels // reduction)
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=True)
        self.act = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=True)

        g = gate.lower()
        if g == "sigmoid":
            self.gate_fn = torch.sigmoid
        elif g == "hardsigmoid":
            self.gate_fn = F.hardsigmoid
        else:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")

        if fusion == "softmax":
            self.fusion_logits = nn.Parameter(torch.zeros(2))
        else:
            self.fusion_logits = None

        self.learnable_temperature = learnable_temperature
        if learnable_temperature:
            self.t_raw = nn.Parameter(self._inverse_softplus(temperature, eps))
        else:
            self.register_buffer("T", torch.tensor(float(temperature)))

    @staticmethod
    def _inverse_softplus(value: float, eps: float) -> torch.Tensor:
        """Return ``r`` such that ``softplus(r) + eps`` is close to ``value``.

        Uses ``y + log(-expm1(-y))``, which stays finite for large ``y``
        where ``log(exp(y) - 1)`` overflows.
        """
        # softplus is strictly positive, so the target is clamped above zero.
        target = max(float(value) - float(eps), 1e-12)
        y = torch.tensor(target, dtype=torch.float64)
        raw = y + torch.log(-torch.expm1(-y))
        return raw.to(torch.get_default_dtype())

    def _get_T(self) -> torch.Tensor:
        """Return the current temperature as a tensor."""
        if self.learnable_temperature:
            return F.softplus(self.t_raw) + self.eps
        return self.T

    def _mlp(self, s: torch.Tensor) -> torch.Tensor:
        """Run one pooled descriptor through ``fc1 -> act -> fc2``."""
        return self.fc2(self.act(self.fc1(s)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the temperature-scaled channel mask."""
        a = self._mlp(self.avg(x))
        m = self._mlp(self.mx(x))

        if self.fusion == "sum":
            z = a + m
        else:
            w2 = torch.softmax(self.fusion_logits, dim=0)
            z = w2[0] * a + w2[1] * m

        T = self._get_T()
        w = self.gate_fn(z / T)
        return x * w


class SpatialAttentionT(nn.Module):
    """Spatial attention with a selectable gate and temperature.

    The conv logits are divided by a temperature ``T`` before the gate.
    ``T`` is either a fixed buffer or, when ``learnable_temperature`` is
    set, ``softplus(t_raw) + eps`` with ``t_raw`` a trainable parameter.

    Args:
        kernel_size: Side length of the conv kernel; even values are
            incremented by one.
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        temperature: Initial (or fixed) temperature; must be positive.
        learnable_temperature: Whether the temperature is trained.
        eps: Offset added to the softplus output of the learnable temperature.

    Raises:
        ValueError: If ``gate`` is unsupported or ``temperature`` is not
            positive.
    """

    def __init__(
        self,
        kernel_size: int = 7,
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        temperature: float = 1.0,
        learnable_temperature: bool = False,
        eps: float = 1e-6,
    ):
        super().__init__()
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if temperature <= 0:
            raise ValueError("temperature pozitif olmalı.")
        self.eps = eps

        if kernel_size % 2 == 0:
            kernel_size += 1
        p = kernel_size // 2
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel_size, padding=p, bias=False)

        g = gate.lower()
        if g == "sigmoid":
            self.gate_fn = torch.sigmoid
        elif g == "hardsigmoid":
            self.gate_fn = F.hardsigmoid
        else:
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")

        self.learnable_temperature = learnable_temperature
        if learnable_temperature:
            self.t_raw = nn.Parameter(self._inverse_softplus(temperature, eps))
        else:
            self.register_buffer("T", torch.tensor(float(temperature)))

    @staticmethod
    def _inverse_softplus(value: float, eps: float) -> torch.Tensor:
        """Return ``r`` such that ``softplus(r) + eps`` is close to ``value``.

        Uses ``y + log(-expm1(-y))``, which stays finite for large ``y``
        where ``log(exp(y) - 1)`` overflows.
        """
        # softplus is strictly positive, so the target is clamped above zero.
        target = max(float(value) - float(eps), 1e-12)
        y = torch.tensor(target, dtype=torch.float64)
        raw = y + torch.log(-torch.expm1(-y))
        return raw.to(torch.get_default_dtype())

    def _get_T(self) -> torch.Tensor:
        """Return the current temperature as a tensor."""
        if self.learnable_temperature:
            return F.softplus(self.t_raw) + self.eps
        return self.T

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled by the temperature-scaled spatial mask."""
        avg_map = torch.mean(x, dim=1, keepdim=True)
        max_map, _ = torch.max(x, dim=1, keepdim=True)
        s = torch.cat([avg_map, max_map], dim=1)

        z = self.conv(s)
        T = self._get_T()
        w = self.gate_fn(z / T)
        return x * w


class CBAMResidualV3(nn.Module):
    """Residual CBAM with fusion, selectable gates and temperatures.

    Runs ``ChannelAttentionFusionT`` then ``SpatialAttentionT`` and returns
    ``x + alpha * (y - x)``, where ``y`` is the attended tensor.

    Args:
        channels: Channel count forwarded to the channel attention.
        reduction: Bottleneck divisor forwarded to the channel attention.
        min_hidden: Minimum bottleneck width forwarded to the channel attention.
        sa_kernel: Kernel size forwarded to the spatial attention.
        ca_fusion: Fusion mode forwarded to the channel attention.
        ca_gate: Gate name forwarded to the channel attention.
        sa_gate: Gate name forwarded to the spatial attention.
        ca_temperature: Temperature forwarded to the channel attention.
        sa_temperature: Temperature forwarded to the spatial attention.
        learnable_temperature: Forwarded to both attention stages.
        alpha_init: Initial value of ``alpha``.
        learnable_alpha: Store ``alpha`` as a parameter instead of a buffer.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        sa_kernel: int = 7,
        ca_fusion: str = "softmax",
        ca_gate: str = "sigmoid",
        sa_gate: str = "sigmoid",
        ca_temperature: float = 1.0,
        sa_temperature: float = 1.0,
        learnable_temperature: bool = False,
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
    ):
        super().__init__()
        channel_kwargs = dict(
            channels=channels,
            reduction=reduction,
            min_hidden=min_hidden,
            fusion=ca_fusion,
            gate=ca_gate,
            temperature=ca_temperature,
            learnable_temperature=learnable_temperature,
        )
        spatial_kwargs = dict(
            kernel_size=sa_kernel,
            gate=sa_gate,
            temperature=sa_temperature,
            learnable_temperature=learnable_temperature,
        )
        self.ca = ChannelAttentionFusionT(**channel_kwargs)
        self.sa = SpatialAttentionT(**spatial_kwargs)

        alpha = torch.tensor(float(alpha_init))
        if learnable_alpha:
            self.alpha = nn.Parameter(alpha)
        else:
            self.register_buffer("alpha", alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the alpha-weighted blend of the input and attended tensor."""
        attended = self.sa(self.ca(x))
        return x + self.alpha * (attended - x)


---
---

## Adım 4 -) Dynamic Spatial CBAM

#### 1) Bu sürümde CBAM’in temel akışı ne?

Hâlâ aynı sıralama var:

* Channel Attention (CA)

* Spatial Attention (SA)

* Residual karışım

#### 2) Channel Attention tarafında neler var?
###### 2.1 Avg + Max Squeeze

Kanal önemini çıkarırken iki tür global özet alıyoruz:

* avgpool → genel enerji

* maxpool → pik enerji

##### 2.2 Fusion (sum / softmax)

* Klasikte avg+max sabit toplama.
Biz şunu ekledik:

* sum: klasik

* softmax: model iki ağırlık öğreniyor → avg mi max mı daha önemli?

* Bu sayede her blok kendi ihtiyacına göre “avg ağırlıklı” veya “max ağırlıklı” davranabiliyor.

##### 2.3 Temperature + Gate

Maskeyi üretmeden önce logit’i T ile ölçekliyoruz:

* T > 1: daha yumuşak (stabil)

* T < 1: daha keskin

Gate olarak da:

* sigmoid veya

* hardsigmoid (daha hızlı)

seçebiliyoruz.

#### 3) Dynamic Spatial Attention ne demek?

Klasik CBAM’de spatial attention şu:

* avg_map + max_map çıkar

* tek bir Conv2d(2→1, kernel=7) ile maske üret

* Bu “tek ölçek” demek.

Dynamic SA’de yaptığımız şey:

* Birden fazla spatial branch tanımladık:

>Branch 1: 3×3 (lokal detay)

>Branch 2: 7×7 (orta ölçek bağlam)

>Branch 3 (opsiyonel): 7×7 dilated (d=2) (geniş bağlam / pseudo-global)

* Her branch aynı inputtan mask logiti üretir:

>z1, z2, z3 gibi.

* Sonra bir “router” ile bu branch’leri ağırlıklandırıyoruz:

>Router inputu: SA inputunun kendisi (avg_map + max_map, yani (B,2,H,W))

>Router outputu: her branch için ağırlıklar (B, num_branches)

* Bu yüzden “dinamik”:

>Bazı görüntülerde küçük detay önemli → 3×3 ağırlığı artar

>Bazı görüntülerde büyük bağlam önemli → dilated veya 7×7 ağırlığı artar

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class ChannelAttentionFusionT(nn.Module):
    """Channel attention with fusion, gate, temperature and mask output.

    The fused logits are divided by a temperature ``T`` before the gate.
    ``T`` is either a fixed buffer or, when ``learnable_temperature`` is
    set, ``softplus(t_raw) + eps`` with ``t_raw`` a trainable parameter.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
        min_hidden: Lower bound for the bottleneck width.
        fusion: ``"sum"`` or ``"softmax"``.
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        temperature: Initial (or fixed) temperature; must be positive.
        learnable_temperature: Whether the temperature is trained.
        eps: Offset added to the softplus output of the learnable temperature.
        act: Hidden activation, ``"relu"`` or ``"silu"``.
        bias: Whether the two 1x1 convolutions use a bias term.
        return_fusion_weights: If true, ``forward`` returns a 3-tuple whose
            last element is the fusion weights (``None`` for ``"sum"``).

    Raises:
        ValueError: If ``fusion``, ``gate`` or ``act`` is unsupported, or
            ``temperature`` is not positive.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        fusion: str = "softmax",        # "sum" | "softmax"
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        temperature: float = 1.0,
        learnable_temperature: bool = False,
        eps: float = 1e-6,
        act: str = "relu",              # "relu" | "silu"
        bias: bool = True,
        return_fusion_weights: bool = False,
    ):
        super().__init__()
        if fusion not in ("sum", "softmax"):
            raise ValueError("fusion 'sum' veya 'softmax' olmalı.")
        if gate.lower() not in ("sigmoid", "hardsigmoid"):
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")
        if temperature <= 0:
            raise ValueError("temperature pozitif olmalı.")
        if act not in ("relu", "silu"):
            raise ValueError("act 'relu' veya 'silu' olmalı.")

        self.fusion = fusion
        self.return_fusion_weights = return_fusion_weights
        self.eps = eps

        hidden = max(min_hidden, channels // reduction)

        self.avg = nn.AdaptiveAvgPool2d(1)
        self.mx  = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=bias)
        self.act = nn.ReLU(inplace=True) if act == "relu" else nn.SiLU(inplace=True)
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=bias)

        if gate.lower() == "sigmoid":
            self.gate_fn = torch.sigmoid
        else:
            self.gate_fn = F.hardsigmoid

        if fusion == "softmax":
            self.fusion_logits = nn.Parameter(torch.zeros(2))
        else:
            self.fusion_logits = None

        self.learnable_temperature = learnable_temperature
        if learnable_temperature:
            self.t_raw = nn.Parameter(self._inverse_softplus(temperature, eps))
        else:
            self.register_buffer("T", torch.tensor(float(temperature)))

    @staticmethod
    def _inverse_softplus(value: float, eps: float) -> torch.Tensor:
        """Return ``r`` such that ``softplus(r) + eps`` is close to ``value``.

        Uses ``y + log(-expm1(-y))``, which stays finite for large ``y``
        where ``log(exp(y) - 1)`` overflows.
        """
        # softplus is strictly positive, so the target is clamped above zero.
        target = max(float(value) - float(eps), 1e-12)
        y = torch.tensor(target, dtype=torch.float64)
        raw = y + torch.log(-torch.expm1(-y))
        return raw.to(torch.get_default_dtype())

    def _get_T(self) -> torch.Tensor:
        """Return the current temperature as a tensor."""
        if self.learnable_temperature:
            return F.softplus(self.t_raw) + self.eps
        return self.T

    def _mlp(self, s: torch.Tensor) -> torch.Tensor:
        """Run one pooled descriptor through ``fc1 -> act -> fc2``."""
        return self.fc2(self.act(self.fc1(s)))

    def forward(self, x: torch.Tensor):
        """Return ``(y, ca)`` or, with ``return_fusion_weights``, ``(y, ca, fusion_w)``.

        ``y`` is the gated input and ``ca`` the channel mask. ``fusion_w``
        holds the softmax fusion weights, or ``None`` in ``"sum"`` mode.
        """
        a = self._mlp(self.avg(x))
        m = self._mlp(self.mx(x))

        fusion_w = None
        if self.fusion == "sum":
            z = a + m
        else:
            fusion_w = torch.softmax(self.fusion_logits, dim=0)  # (2,)
            z = fusion_w[0] * a + fusion_w[1] * m

        T = self._get_T()
        ca = self.gate_fn(z / T)  # (B,C,1,1)
        y = x * ca

        # The tuple length depends only on the flag, never on the fusion
        # mode, so callers that unpack three values also work with "sum".
        if self.return_fusion_weights:
            return y, ca, fusion_w
        return y, ca


class DynamicSpatialAttention(nn.Module):
    """Multi-branch spatial attention with per-sample branch routing.

    A two-channel descriptor (mean and max over dim 1) is fed to several
    ``Conv2d(2 -> 1)`` branches with different kernel sizes and, optionally,
    one dilated branch. A small router (global average pool followed by two
    1x1 convs) produces softmax weights over the branches for each sample.
    The weighted sum of branch logits is divided by the temperature and
    gated to form the spatial mask.

    Args:
        kernels: Kernel sizes of the plain branches; even values are
            incremented by one.
        use_dilated: Whether to append a dilated branch.
        dilated_kernel: Kernel size of the dilated branch; an even value is
            incremented by one.
        dilated_d: Dilation of the dilated branch (>= 1).
        gate: ``"sigmoid"`` or ``"hardsigmoid"`` (case-insensitive).
        temperature: Initial (or fixed) temperature; must be positive.
        learnable_temperature: Whether the temperature is trained.
        eps: Offset added to the softplus output of the learnable temperature.
        router_hidden: Hidden width of the router (>= 1).
        bias: Whether the router convs use a bias term. The branch convs
            never do.
        return_router_weights: If true, ``forward`` also returns the router
            weights.

    Raises:
        ValueError: On an unsupported gate, a non-positive temperature, an
            invalid kernel, dilation or router size, or when no branch is
            configured.
    """

    def __init__(
        self,
        kernels=(3, 7),
        use_dilated: bool = True,
        dilated_kernel: int = 7,
        dilated_d: int = 2,
        gate: str = "sigmoid",          # "sigmoid" | "hardsigmoid"
        temperature: float = 1.0,
        learnable_temperature: bool = False,
        eps: float = 1e-6,
        router_hidden: int = 8,
        bias: bool = True,
        return_router_weights: bool = False,
    ):
        super().__init__()
        if temperature <= 0:
            raise ValueError("temperature pozitif olmalı.")
        if gate.lower() not in ("sigmoid", "hardsigmoid"):
            raise ValueError("gate 'sigmoid' veya 'hardsigmoid' olmalı.")
        if router_hidden < 1:
            raise ValueError("router_hidden >= 1 olmalı.")

        self.eps = eps
        self.return_router_weights = return_router_weights

        ks = [self._odd_kernel(k) for k in kernels]

        # Without any branch the router would have zero outputs and
        # ``forward`` would fail when stacking an empty list.
        if not ks and not use_dilated:
            raise ValueError(
                "En az bir branch gerekli: kernels boş ise use_dilated=True olmalı."
            )

        self.branches = nn.ModuleList()
        for k in ks:
            p = k // 2
            self.branches.append(nn.Conv2d(2, 1, kernel_size=k, padding=p, bias=False))

        if use_dilated:
            k = self._odd_kernel(dilated_kernel)
            if dilated_d < 1:
                raise ValueError("dilated_d >= 1 olmalı.")
            # Padding that keeps the spatial size for an odd, dilated kernel.
            p = dilated_d * (k - 1) // 2
            self.branches.append(
                nn.Conv2d(2, 1, kernel_size=k, padding=p, dilation=dilated_d, bias=False)
            )

        self.num_branches = len(self.branches)

        if gate.lower() == "sigmoid":
            self.gate_fn = torch.sigmoid
        else:
            self.gate_fn = F.hardsigmoid

        self.learnable_temperature = learnable_temperature
        if learnable_temperature:
            self.t_raw = nn.Parameter(self._inverse_softplus(temperature, eps))
        else:
            self.register_buffer("T", torch.tensor(float(temperature)))

        self.router = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(2, router_hidden, 1, bias=bias),
            nn.ReLU(inplace=True),
            nn.Conv2d(router_hidden, self.num_branches, 1, bias=bias),
        )

    @staticmethod
    def _odd_kernel(k) -> int:
        """Coerce ``k`` to int, bump even values by one and reject values < 1."""
        k = int(k)
        if k % 2 == 0:
            k += 1
        if k < 1:
            raise ValueError("kernel_size >= 1 olmalı.")
        return k

    @staticmethod
    def _inverse_softplus(value: float, eps: float) -> torch.Tensor:
        """Return ``r`` such that ``softplus(r) + eps`` is close to ``value``.

        Uses ``y + log(-expm1(-y))``, which stays finite for large ``y``
        where ``log(exp(y) - 1)`` overflows.
        """
        # softplus is strictly positive, so the target is clamped above zero.
        target = max(float(value) - float(eps), 1e-12)
        y = torch.tensor(target, dtype=torch.float64)
        raw = y + torch.log(-torch.expm1(-y))
        return raw.to(torch.get_default_dtype())

    def _get_T(self) -> torch.Tensor:
        """Return the current temperature as a tensor."""
        if self.learnable_temperature:
            return F.softplus(self.t_raw) + self.eps
        return self.T

    def forward(self, x: torch.Tensor):
        """Return ``(y, sa)`` or, with ``return_router_weights``, ``(y, sa, rw)``.

        ``y`` is the gated input, ``sa`` the spatial mask and ``rw`` the
        per-sample softmax weights over the branches.
        """
        avg_map = torch.mean(x, dim=1, keepdim=True)
        max_map, _ = torch.max(x, dim=1, keepdim=True)
        s = torch.cat([avg_map, max_map], dim=1)  # (B,2,H,W)

        logits = self.router(s).flatten(1)             # (B,K)
        rw = torch.softmax(logits, dim=1)              # (B,K)

        z = torch.stack([br(s) for br in self.branches], dim=1)  # (B,K,1,H,W)
        wlogit = (rw[:, :, None, None, None] * z).sum(dim=1)     # (B,1,H,W)

        T = self._get_T()
        sa = self.gate_fn(wlogit / T)  # (B,1,H,W)
        y = x * sa

        if self.return_router_weights:
            return y, sa, rw
        return y, sa


class CBAMResidualDynamicSA(nn.Module):
    """Channel attention then dynamic spatial attention, with optional residual blend.

    The input is passed through ``ChannelAttentionFusionT`` and then through
    ``DynamicSpatialAttention``. With ``residual=True`` the attended tensor
    ``y`` is blended back into the input as ``x + alpha * (y - x)``;
    otherwise ``y`` is returned directly.

    Args:
        channels, reduction, min_hidden: Forwarded unchanged to
            ``ChannelAttentionFusionT``.
        ca_fusion, ca_gate, ca_temperature, ca_act: Forwarded to
            ``ChannelAttentionFusionT`` as ``fusion``, ``gate``,
            ``temperature`` and ``act``.
        sa_gate, sa_temperature: Forwarded to ``DynamicSpatialAttention`` as
            ``gate`` and ``temperature``.
        learnable_temperature: Passed to both attention sub-modules.
        sa_kernels, sa_use_dilated, sa_dilated_kernel, sa_dilated_d,
        sa_router_hidden: Forwarded to ``DynamicSpatialAttention`` as
            ``kernels``, ``use_dilated``, ``dilated_kernel``, ``dilated_d``
            and ``router_hidden``.
        residual: Enable the ``x + alpha * (y - x)`` blend.
        alpha_init: Initial value of ``alpha``; ``alpha`` is only created
            when ``residual`` is true.
        learnable_alpha: Store ``alpha`` as an ``nn.Parameter`` instead of a
            buffer.
        return_maps: Make ``forward`` return the attention maps and the
            fusion/router weights alongside the output. Also switches on the
            extra return value of both sub-modules.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16,
        min_hidden: int = 4,
        ca_fusion: str = "softmax",
        ca_gate: str = "sigmoid",
        ca_temperature: float = 1.0,
        ca_act: str = "relu",
        sa_gate: str = "sigmoid",
        sa_temperature: float = 1.0,
        learnable_temperature: bool = False,
        sa_kernels=(3, 7),
        sa_use_dilated: bool = True,
        sa_dilated_kernel: int = 7,
        sa_dilated_d: int = 2,
        sa_router_hidden: int = 8,
        residual: bool = True,
        alpha_init: float = 1.0,
        learnable_alpha: bool = False,
        return_maps: bool = False,
    ):
        super().__init__()
        self.return_maps = return_maps
        self.residual = residual

        # Channel attention is registered first, spatial attention second.
        channel_cfg = dict(
            channels=channels,
            reduction=reduction,
            min_hidden=min_hidden,
            fusion=ca_fusion,
            gate=ca_gate,
            temperature=ca_temperature,
            learnable_temperature=learnable_temperature,
            act=ca_act,
            return_fusion_weights=return_maps,
        )
        self.ca = ChannelAttentionFusionT(**channel_cfg)

        spatial_cfg = dict(
            kernels=sa_kernels,
            use_dilated=sa_use_dilated,
            dilated_kernel=sa_dilated_kernel,
            dilated_d=sa_dilated_d,
            gate=sa_gate,
            temperature=sa_temperature,
            learnable_temperature=learnable_temperature,
            router_hidden=sa_router_hidden,
            return_router_weights=return_maps,
        )
        self.sa = DynamicSpatialAttention(**spatial_cfg)

        # alpha exists only on the residual path; it is either trainable or
        # a buffer.
        if residual:
            alpha_value = torch.tensor(float(alpha_init))
            if learnable_alpha:
                self.alpha = nn.Parameter(alpha_value)
            else:
                self.register_buffer("alpha", alpha_value)

    def _blend(self, x: torch.Tensor, attended: torch.Tensor) -> torch.Tensor:
        """Mix the attended tensor back into ``x`` when the residual path is on."""
        if not self.residual:
            return attended
        return x + self.alpha * (attended - x)

    def forward(self, x: torch.Tensor):
        """Apply channel then spatial attention to ``x``.

        Returns:
            The output tensor alone, or, when ``self.return_maps`` is true,
            the tuple ``(out, ca, sa, fusion_w, router_w)``.
        """
        if not self.return_maps:
            attended, _ = self.ca(x)
            attended, _ = self.sa(attended)
            return self._blend(x, attended)

        attended, ca_map, fusion_w = self.ca(x)
        attended, sa_map, router_w = self.sa(attended)
        return self._blend(x, attended), ca_map, sa_map, fusion_w, router_w


if __name__ == "__main__":
    # Smoke test: push one random batch through the full module and print the
    # shape of every returned tensor. Global names are kept as-is because
    # notebook cells share one namespace.
    x = torch.randn(2, 64, 56, 56)

    demo_kwargs = dict(
        channels=64,
        reduction=16,
        ca_fusion="softmax",
        ca_gate="sigmoid",
        ca_temperature=1.0,
        ca_act="silu",
        sa_gate="sigmoid",
        sa_temperature=1.0,
        sa_kernels=(3, 7),
        sa_use_dilated=True,
        sa_dilated_kernel=7,
        sa_dilated_d=2,
        sa_router_hidden=8,
        residual=True,
        learnable_alpha=True,
        alpha_init=1.0,
        learnable_temperature=False,
        return_maps=True,
    )
    m = CBAMResidualDynamicSA(**demo_kwargs)

    # return_maps=True, so the module returns five values.
    out, ca, sa, fusion_w, router_w = m(x)
    print(*(t.shape for t in (out, ca, sa, fusion_w, router_w)))

torch.Size([2, 64, 56, 56]) torch.Size([2, 64, 1, 1]) torch.Size([2, 1, 56, 56]) torch.Size([2]) torch.Size([2, 3])


---
---

### Her class'ın açıklamaları klasör içerisindeki .ipynb dosyalarında mevcuttur.

---
---