# Xecption

### Import Module

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from torch import optim
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import os
from torchvision import utils
import numpy as np
from torchsummary import summary
import time
import matplotlib.pyplot as plt
%matplotlib inline

### Load Data

### Depth-wise Convolution

![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FFBiDE%2FbtqUczJkl0Q%2FlSLn7kwBNe9rPAW4hsBn90%2Fimg.png)

- 기존에 Standared Convolution 연산 과정은 Kernel Size x Kernel Size x Input Channel만큼의 parameter가 필요했음

- Depth-wise Convolution는 두 단계로 나눠짐.

#### < 1. Depth-wise Convolution : Filtering stage >
- 기존 standard convolution 연산에서는 한 개의 kernel이 channel C에 대해서 연산을 했음.
- 그러나 Depth-wise Convolution에서는 channel을 각각 나눠서 계산한다.
- 예를 들어, $D_G$ x $D_G$ x M의 tensor가 들어오면 1x1xM의 filter를 이용해 convolution 연산을 하면 $D_G$ x $D_G$ x 1의 shape의 tensor가 나오게 되고 filter가 총 N개 라면 $D_G$ x $D_G$ x $D_G$ x N이 출력이 됩니다.
- Kernel Shape = $D_k$ x $D_k$ x N

#### < 2.  Pointwise convolution : Combination stage >
- Standard convolution의 아웃풋은 standard convolution의 output과 같음.
- kernel의 shape가 1 x 1 x M
- 1개의 kernel이 input에 대해 계산하기 $D_G$ x $D_G$ x M이 필요함.
- Pointwise convolution이 총 연산량은 N x $D_G^2$ x M과 같음.

In [3]:
class DepthwiseConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.Depthwise = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, stride=1, padding=1, bias=False), ## depth_wise convolution에서는 in_channels과 out channels를 동일하게 설정 
            nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False)
        )

    def forward(self, x):
        x = self.Depthwise(x)
        return x

### Xception Block

![](https://miro.medium.com/max/1400/1*GjiwedRFaBCRvfD0ac93eQ.png)

In [4]:
class EntryFlow(nn.Module): # 위의 구조 Entry Flow와 똑같이 layer를 쌓음 / Inception module
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, stride=2, padding=1, bias=False), ## in_channels = 3, out_channels = 32, kernel size = 3x3
            nn.BatchNorm2d(32), ## conv-layer 후 batchnorm
            nn.ReLU(), ## batchnorm -> ReLU
            nn.Conv2d(32, 64, 3, stride=1, padding=0, bias=False), ## 다음 convolution layer
            nn.BatchNorm2d(64), ## batchnorm
            nn.ReLU() 
        )

        self.conv2 = nn.Sequential(
            DepthwiseConv(64, 128), ## DepthWise-Convolution out_channels-> 128 size
            nn.BatchNorm2d(128),
            nn.ReLU(),
            DepthwiseConv(128, 128),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(3, stride=2, padding=1) ## MaxPooling / kernel_size = 2, stride = 2
        )

        self.conv2_residual = nn.Sequential( ## Residual connection convolution / output size 128로 동일
            nn.Conv2d(64, 128, 1, stride=2, padding=0),
            nn.BatchNorm2d(128)
        )
##### 계속 같은 layer가 반복되는 구조 
        self.conv3 = nn.Sequential(
            nn.ReLU(),
            DepthwiseConv(128, 256),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            DepthwiseConv(256, 256),
            nn.BatchNorm2d(256),
            nn.MaxPool2d(3, stride=2, padding=1)
        )

        self.conv3_residual = nn.Sequential(
            nn.Conv2d(128, 256, 1, stride=2, padding=0),
            nn.BatchNorm2d(256)
        )

        self.conv4 = nn.Sequential(
            nn.ReLU(),
            DepthwiseConv(256, 728),
            nn.BatchNorm2d(728),
            nn.ReLU(),
            DepthwiseConv(728, 728),
            nn.BatchNorm2d(728),
            nn.MaxPool2d(3, stride=2, padding=1)
        )

        self.conv4_residual = nn.Sequential(
            nn.Conv2d(256, 728, 1, stride=2, padding=0),
            nn.BatchNorm2d(728)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x) + self.conv2_residual(x) ### residual connection으로 output 값에 더해줌 -> gradient vanishing 방지
        x = self.conv3(x) + self.conv3_residual(x)
        x = self.conv4(x) + self.conv4_residual(x)
        return x

### EntryFlow

### Middle Flow

In [5]:
class MiddleFlow(nn.Module): ## Data가 Entry Flow를 지나 8번의 Middle Flow를 지나게 됨.
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.ReLU(),
            DepthwiseConv(728, 728),
            nn.BatchNorm2d(728),
            nn.ReLU(),
            DepthwiseConv(728, 728),
            nn.BatchNorm2d(728),
            nn.ReLU(),
            DepthwiseConv(728, 728),
            nn.BatchNorm2d(728)
        )

        self.conv1_residual = nn.Sequential()

    def forward(self, x):
        return self.conv1(x) + self.conv1_residual(x)

### Exit Flow

In [6]:
class ExitFlow(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.ReLU(),
            DepthwiseConv(728, 1024),
            nn.BatchNorm2d(1024),
            nn.ReLU(),
            DepthwiseConv(1024, 1024),
            nn.BatchNorm2d(1024),
            nn.MaxPool2d(3, stride=2, padding=1)
        )

        self.conv1_residual = nn.Sequential(
            nn.Conv2d(728, 1024, 1, stride=2, padding=0),
            nn.BatchNorm2d(1024)
        )

        self.conv2 = nn.Sequential(
            DepthwiseConv(1024, 1536),
            nn.BatchNorm2d(1536),
            nn.ReLU(),
            DepthwiseConv(1536, 2048),
            nn.BatchNorm2d(2048),
            nn.ReLU()
        )

        self.avg_pool = nn.AdaptiveAvgPool2d((1,1)) ### 마지막 average_pooling / output_size = 1x1
    
    def forward(self, x):
        x = self.conv1(x) + self.conv1_residual(x)
        x = self.conv2(x)
        x = self.avg_pool(x)
        return x

In [7]:
class Xception(nn.Module):
    def __init__(self, num_classes=10, init_weights=True):
        super().__init__()
        self.init_weights = init_weights

        self.entry = EntryFlow()
        self.middle = self._make_middle_flow()
        self.exit = ExitFlow() 

        #### output_size 1x1 -> 2048로 linear layer에 들어감

        self.linear = nn.Linear(2048, num_classes) #### 2048 -> num_classes(output features)

        # weights initialization
        if self.init_weights:
            pass


    def forward(self, x):
        x = self.entry(x)
        x = self.middle(x)
        x = self.exit(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return x

    def _make_middle_flow(self):
        middle = nn.Sequential()
        for i in range(8):
            middle.add_module('middle_block_{}'.format(i), MiddleFlow())
        return middle

    def _initialize_weights(self): ### initial weights (가중치 초기화) -> He initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init_kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') ### fan_out -> 최댓값
                if m.bias is not None:
                    nn.init_constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init_constant_(m.weight, 1)
                nn.init_bias_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init_normal_(m.weight, 0, 0.01)
                nn.init_constant_(m.bias, 0)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Xception().to(device)

In [12]:
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
valid_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())
print(len(train_dataset))
print(len(valid_dataset))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified
50000
50000


In [19]:
train_dataset.tranform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize(299)]) ### size를 299로 

valid_dataset.tranform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize(299)])

test_dataset.tranform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize(299)])


In [20]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [22]:
optimizer= optim.Adam(model.parameters(), lr= 1e-2)
loss_fn= nn.CrossEntropyLoss().cuda()

In [None]:
for epoch in range(50):
    running_loss= 0
    validation_loss= 0
    model.train()
    print(f"Training epoch: {epoch+1}")
    for batch_ix, (imgs, targets) in enumerate(train_dataloader):
        imgs, targets= imgs.cuda(), targets.cuda()
        outs= model(imgs)
        loss= loss_fn(outs, targets)
        running_loss+= loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Train loss: {running_loss/len(train_dataloader)}")
    model.eval()
    for batch_ix, (imgs, targets) in enumerate(test_dataloader):
        imgs, targets= imgs.cuda(), targets.cuda()
        outs= model(imgs)
        with torch.no_grad():
            loss= loss_fn(outs, targets)
        validation_loss+= loss.item()
    print(f"Validation loss: {validation_loss/len(test_dataloader)}")
    torch.save(model.state_dict(), f"{epoch+1}.pth")
    print("Checkpoint saved\n\n")

Training epoch: 1
Train loss: 2.3413470719612643
Validation loss: 1.9369917460523856
Checkpoint saved


Training epoch: 2
Train loss: 1.8096344176577361
Validation loss: 1.7412323083359593
Checkpoint saved


Training epoch: 3
Train loss: 1.5849044795457323
Validation loss: 1.510769286094763
Checkpoint saved


Training epoch: 4
Train loss: 1.4514314256565584
Validation loss: 1.395704203520339
Checkpoint saved


Training epoch: 5
Train loss: 1.3379181061733707
Validation loss: 1.4586783826541596
Checkpoint saved


Training epoch: 6
Train loss: 1.2405835401150025
Validation loss: 1.2234804538873056
Checkpoint saved


Training epoch: 7
Train loss: 1.1686699927348931
Validation loss: 1.1453938396594014
Checkpoint saved


Training epoch: 8
Train loss: 1.0974433865595992
Validation loss: 1.1026344470703564
Checkpoint saved


Training epoch: 9
Train loss: 1.03459681952831
Validation loss: 1.015664773627211
Checkpoint saved


Training epoch: 10
Train loss: 0.9728504816881755
Validation loss: 1.