In [6]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [24]:
kernel_size = 5

sigma = 1.4
channels = 3

x_coord = torch.arange(kernel_size) #([0, 1, 2])
x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size) # ([0,1,2], [0,1,2], [0,1,2])
y_grid = x_grid.t() # ([0,0,0], [1,1,1], [2,2,2])
xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
mean = (kernel_size - 1)/2. # 1 for kernel size = 3
variance = sigma**2. # 4.0, for sigma = 2

# Calculate the 2-dimensional gaussian kernel which is
#gaussian_kernel = (1./(2.*math.pi*variance)) * torch.exp(-torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))
#print(gaussian_kernel)

log_kernel = (-1./(math.pi*(sigma**4))) \
                        * (1-(torch.sum((xy_grid - mean)**2., dim=-1) / (2*(sigma**2)))) \
                        * torch.exp(-torch.sum((xy_grid - mean)**2., dim=-1) / (2*(sigma**2)))
log_kernel = log_kernel / torch.sum(log_kernel)
print(log_kernel)


tensor([[-0.0410, -0.0233, -0.0022, -0.0233, -0.0410],
        [-0.0233,  0.0891,  0.1750,  0.0891, -0.0233],
        [-0.0022,  0.1750,  0.3031,  0.1750, -0.0022],
        [-0.0233,  0.0891,  0.1750,  0.0891, -0.0233],
        [-0.0410, -0.0233, -0.0022, -0.0233, -0.0410]])


In [6]:

# kernel_size = 3
# sigma = 2
# channels = 3

# x_coord = torch.arange(kernel_size) #([0, 1, 2])
# x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size) # ([0,1,2], [0,1,2], [0,1,2])
# y_grid = x_grid.t() # ([0,0,0], [1,1,1], [2,2,2])
# xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
# mean = (kernel_size - 1)/2. # 1 for kernel size = 3
# variance = sigma**2. # 4.0, for sigma = 2

# # Calculate the 2-dimensional gaussian kernel which is
# # the product of two gaussian distributions for two different
# # variables (in this case called x and y)
# gaussian_kernel = (1./(2.*math.pi*variance)) * torch.exp(-torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))

# # Make sure sum of values in gaussian kernel equals 1.
#     # tensor([[0.1019, 0.1154, 0.1019],
#     #         [0.1154, 0.1308, 0.1154],
#     #         [0.1019, 0.1154, 0.1019]])
# gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)

# # Reshape to 2d depthwise convolutional weight
#     # tensor([[[[0.1019, 0.1154, 0.1019],
#     #           [0.1154, 0.1308, 0.1154],
#     #           [0.1019, 0.1154, 0.1019]]]])
# gaussian_kernel = gaussian_kernel.view(1, 1, kernel_size, kernel_size)

# gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1, 1)
# padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
# gaussian_filter = nn.Conv2d(in_channels=channels, out_channels=channels,
#                             kernel_size=kernel_size, groups=channels,
#                             bias=False, padding=padding)
# gaussian_filter.weight.data = gaussian_kernel
# gaussian_filter.weight.requires_grad = False 
# print(gaussian_filter)
# print(gaussian_filter.weight.data)

In [1]:
# Architecture: resnet18 with gaussian filter
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''Gaussian filter layer in PyTorch.
Reference:
[1] Curriculum by Smoothing. NeurIPS2020
'''

def get_gaussian_filter(kernel_size=3, sigma=2, channels=3):
    # Create a x, y coordinate grid of shape (kernel_size, kernel_size, 2)
    x_coord = torch.arange(kernel_size)
    x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
    y_grid = x_grid.t()
    xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
    mean = (kernel_size - 1)/2.
    variance = sigma**2.

    # Calculate the 2-dimensional gaussian kernel which is
    # the product of two gaussian distributions for two different
    # variables (in this case called x and y)
    gaussian_kernel = (1./(2.*math.pi*variance)) * torch.exp(
                        -torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))

    # Make sure sum of values in gaussian kernel equals 1.
    gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)

    # Reshape to 2d depthwise convolutional weight
    gaussian_kernel = gaussian_kernel.view(1, 1, kernel_size, kernel_size)
    gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1, 1)
    padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
    gaussian_filter = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=kernel_size, 
                                groups=channels, bias=False, padding=padding)
    gaussian_filter.weight.data = gaussian_kernel
    gaussian_filter.weight.requires_grad = False 
    return gaussian_filter

'''ResNet in PyTorch.
For Pre-activation ResNet, see 'preact_resnet.py'.
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
class BasicBlock(nn.Module):
    expansion = 1

    def __init__( self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        
        self.planes = planes        
        
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut_kernel = True
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def get_new_kernels(self, kernel_size, std):
        self.kernel1 = get_gaussian_filter(kernel_size=kernel_size, sigma=std, channels=self.planes)
        self.kernel2 = get_gaussian_filter(kernel_size=kernel_size, sigma=std, channels=self.planes)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu(self.bn1(self.kernel1(out)))
        out = self.conv2(out)
        out = self.bn2(self.kernel2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, args):
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.std = args.std
        self.factor = args.std_factor
        self.epoch = args.epoch
        self.kernel_size = args.kernel_size

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, args.num_classes)

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None: nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu(self.bn1(self.kernel1(out)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

    def get_new_kernels(self, epoch_count):
        if epoch_count % self.epoch == 0 and epoch_count is not 0: self.std *= self.factor
            
        self.kernel1 = get_gaussian_filter(kernel_size=self.kernel_size, sigma=self.std, channels=64)

        for child in self.layer1.children(): child.get_new_kernels(self.kernel_size, self.std)
        for child in self.layer2.children(): child.get_new_kernels(self.kernel_size, self.std)
        for child in self.layer3.children(): child.get_new_kernels(self.kernel_size, self.std)
        for child in self.layer4.children(): child.get_new_kernels(self.kernel_size, self.std)

def ResNet18(args):
    return ResNet(BasicBlock, [2,2,2,2], args)

def ResNet34(args):
    return ResNet(BasicBlock, [3,4,6,3], args)

def test():
    net = ResNet18()
    y = net(torch.randn(1,3,32,32))
    print(y.size())

In [2]:
# Dateset preparation
import os
import copy

import math
import argparse
from sklearn.metrics import accuracy_score

import torch
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from torchvision import transforms, datasets

def seed_everything(seed=27):
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

#torch.backends.cudnn.enabled = False 

parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default='datasets/curriculum_learning')
parser.add_argument('--dataset', type=str, default='cifar100')
parser.add_argument('--log_name', type=str, default='cbs_res_def')
parser.add_argument('--alg', type=str, default='res', choices=['normal', 'vgg', 'res', 'wrn'])
parser.add_argument('--log_path', type=str, default='log')
parser.add_argument('--no-cuda', action='store_true')
parser.add_argument('--batch_size', type=int, default=512)
parser.add_argument('--num_epochs', type=int, default=200)
parser.add_argument('--percentage', type=int, default=10)
parser.add_argument('--save_model', action='store_true')
parser.add_argument('--lr', type=float, default=1e-1)

# CBS ARGS
parser.add_argument('--std', default=1, type=float)
parser.add_argument('--std_factor', default=0.9, type=float)
parser.add_argument('--epoch', default=5, type=int)
parser.add_argument('--kernel_size', default=3, type=int)

args = parser.parse_args(args = [])

transform = transforms.Compose([transforms.Scale(32),transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5,), (0.5, 0.5, 0.5,))])    
train_data = datasets.CIFAR100(root=args.data,download=True,train=True,transform=transform)
test_data = datasets.CIFAR100(root=args.data,download=True,train=False,transform=transform)
train_loader = data.DataLoader(train_data, batch_size=args.batch_size,pin_memory=True,num_workers=int(4),shuffle=True,drop_last=True )
test_loader = data.DataLoader(test_data,batch_size=args.batch_size,pin_memory=True,num_workers=int(4),shuffle=False,drop_last=False)
args.num_classes = 100
args.in_dim = 3
#from arguments import get_args

  "please use transforms.Resize instead.")


Files already downloaded and verified
Files already downloaded and verified


In [4]:
# Trainer
import os

os.environ["CUDA_VISIBLE_DEVICES" ]= "1"

seed_everything()
decay_epoch = 30
stop_decay_epoch = decay_epoch * 3 + 1
best_epoch, best_acc = 0, 0
num_iter = 0
model = ResNet18(args)##.cuda()
optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=5e-4, momentum=0.9)
criterion = F.cross_entropy

def train(model, train_loader, optimizer, criterion):
    model.train()
    for num_iter, (images, labels) in enumerate(train_loader):
        images, labels = images.cuda(), labels.cuda()
        preds = model(images)
        loss = criterion(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # if num_iter % 200 == 0:
        #     print('iter num: {} \t loss: {:.2f}'.format(num_iter, loss.item())) 

def test(model, test_loader):
    model.eval()
    total, correct = 0, 0
    for images, labels in test_loader:
        images = images.cuda()
        with torch.no_grad():
            preds = model(images)
            preds = torch.argmax(preds, dim=1).cpu().numpy()
            correct += accuracy_score(labels, preds, normalize=False)
            total += images.size(0)
    model.train()
    return correct / total * 100

num_epoch = 50 # need to increase to reproduce the paper 
for epoch_count in range(num_epoch):
    model.get_new_kernels(epoch_count)
    model = model.cuda()
    if epoch_count is not 0 and epoch_count % decay_epoch == 0 and epoch_count < stop_decay_epoch:
        for param in optim.param_groups:
            param['lr'] = param['lr'] / 10
        
    train(model, train_loader, optimizer, criterion)
    accuracy = test(model, test_loader)
    if accuracy > best_acc:
        best_acc = accuracy
        best_epoch = epoch_count
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), 'best_model.pth.tar')

    print('current epoch: {}  current acc: {:.2f}  best epoch: {}  best acc: {:.2f}'.format(
            epoch_count, accuracy, best_epoch, best_acc))

current epoch: 0  current acc: 13.91  best epoch: 0  best acc: 13.91
current epoch: 1  current acc: 17.19  best epoch: 1  best acc: 17.19
current epoch: 2  current acc: 25.45  best epoch: 2  best acc: 25.45
current epoch: 3  current acc: 28.02  best epoch: 3  best acc: 28.02
current epoch: 4  current acc: 31.85  best epoch: 4  best acc: 31.85
current epoch: 5  current acc: 36.36  best epoch: 5  best acc: 36.36
current epoch: 6  current acc: 40.15  best epoch: 6  best acc: 40.15
current epoch: 7  current acc: 36.41  best epoch: 6  best acc: 40.15
current epoch: 8  current acc: 39.68  best epoch: 6  best acc: 40.15
current epoch: 9  current acc: 42.89  best epoch: 9  best acc: 42.89
current epoch: 10  current acc: 42.81  best epoch: 9  best acc: 42.89
current epoch: 11  current acc: 39.69  best epoch: 9  best acc: 42.89
current epoch: 12  current acc: 45.54  best epoch: 12  best acc: 45.54
current epoch: 13  current acc: 45.60  best epoch: 13  best acc: 45.60
current epoch: 14  current a

AttributeError: module 'torch.optim' has no attribute 'param_groups'

In [None]:
# conv2d
import math
import torch
import torch.nn as nn

def get_gaussian_filter_2D(kernel_sizex=3,kernel_sizey=1, sigma=2, channels=3):
    # Create a x, y coordinate grid of shape (kernel_size, kernel_size, 2)
    kernel_size = max(kernel_sizex, kernel_sizey)
    x_coord = torch.arange(kernel_size)
    x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
    y_grid = x_grid.t()
    xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
    mean = (kernel_size - 1)/2.
    variance = sigma**2.

    # Calculate the 2-dimensional gaussian kernel which is
    # the product of two gaussian distributions for two different
    # variables (in this case called x and y)
    xy_grid = xy_grid[:kernel_sizex,:kernel_sizey,:]
    gaussian_kernel = (1./(2.*math.pi*variance)) * torch.exp(
                        -torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))

    # Make sure sum of values in gaussian kernel equals 1.
    gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)

    # Reshape to 2d depthwise convolutional weight
    gaussian_kernel = gaussian_kernel.view(1, 1, kernel_sizex, kernel_sizey)
    gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1, 1)
    padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
    gaussian_filter = nn.Conv2d(in_channels=channels, out_channels=channels,
                                kernel_size=(kernel_sizex,kernel_sizey), groups=channels,
                                bias=False, padding=padding)
    gaussian_filter.weight.data = gaussian_kernel
    gaussian_filter.weight.requires_grad = False 
    return gaussian_filter
    
get_gaussian_filter()

In [None]:
#conv1d
import math
import torch
import torch.nn as nn

def get_gaussian_filter_1D(kernel_sizex=3,kernel_sizey=1, sigma=2, channels=3):
    # Create a x, y coordinate grid of shape (kernel_size, kernel_size, 2)
    kernel_size = max(kernel_sizex, kernel_sizey)
    x_coord = torch.arange(kernel_size)
    x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
    y_grid = x_grid.t()
    xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
    mean = (kernel_size - 1)/2.
    variance = sigma**2.

    # Calculate the 2-dimensional gaussian kernel which is
    xy_grid = xy_grid[:kernel_sizex,:kernel_sizey,:]
    gaussian_kernel = (1./(2.*math.pi*variance)) * torch.exp(
                        -torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))

    # Make sure sum of values in gaussian kernel equals 1.
    gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)

    # Reshape to 2d depthwise convolutional weight
    gaussian_kernel = gaussian_kernel.view(1, 1, kernel_sizex, kernel_sizey)
    gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1, 1)
    padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
    gaussian_filter = nn.Conv1d(in_channels=channels, out_channels=channels,
                                kernel_size=kernel_sizex, groups=channels,
                                bias=False, padding=padding)
    gaussian_filter.weight.data = gaussian_kernel
    gaussian_filter.weight.requires_grad = False 
    return gaussian_filter

In [63]:
kernel_sizex = 3
kernel_sizey = 1
sigma = 2
channel = 3
kernel_size = max(kernel_sizex, kernel_sizey)
x_coord = torch.arange(kernel_size)
x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
y_grid = x_grid.t()
xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
mean = (kernel_size - 1)/2.
variance = sigma**2.

# Calculate the 2-dimensional gaussian kernel which is
xy_grid = xy_grid[:kernel_sizex,:kernel_sizey,:]
print(torch.sum((xy_grid - mean)**2., dim=-1))
gaussian_kernel = (1./((math.sqrt(2.*math.pi)*sigma))) * torch.exp(
                        -torch.sum((xy_grid - mean)**2., dim=-1) / (2*variance))

print(gaussian_kernel)
# Make sure sum of values in gaussian kernel equals 1.
gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)

# Reshape to 2d depthwise convolutional weight
gaussian_kernel = gaussian_kernel.view(1, 1, kernel_sizex, kernel_sizey)
gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1, 1)

padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
gaussian_filter = nn.Conv1d(in_channels=channels, out_channels=channels,
                            kernel_size=kernel_sizex, groups=channels,
                            bias=False, padding=padding)
#print(gaussian_filter.weight.data)
gaussian_filter.weight.data = gaussian_kernel
gaussian_filter.weight.requires_grad = False 
print(gaussian_filter.weight.data.shape)

tensor([[2.],
        [1.],
        [2.]])
tensor([[0.1553],
        [0.1760],
        [0.1553]])
torch.Size([3, 1, 3, 1])


In [65]:
import torch
import math
import torch.nn as nn
kernel_size = 3
sigma = 2
channels = 3

x_coord = torch.arange(kernel_size)
x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
y_grid = x_grid.t()

xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
mean = (kernel_size - 1)/2.
variance = sigma**2.
xy_grid = torch.sum((xy_grid[:kernel_size,:kernel_size,:] - mean)**2., dim=-1)
print(xy_grid)
print(xy_grid[int(kernel_size/2)])
gaussian_kernel = (1./((math.sqrt(2.*math.pi)*sigma))) * torch.exp(-1* (xy_grid[int(kernel_size/2)]) / (2*variance))

gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)
gaussian_kernel = gaussian_kernel.view(1, 1, 3)
gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1)
print(gaussian_kernel)
print(gaussian_kernel.shape)

padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
gaussian_filter = nn.Conv1d(in_channels=channels, out_channels=channels,
                            kernel_size=kernel_sizex, groups=channels,
                            bias=False, padding=padding)
gaussian_filter.weight.data = gaussian_kernel
gaussian_filter.weight.requires_grad = False 
print(gaussian_filter.weight.data.shape)

tensor([[2., 1., 2.],
        [1., 0., 1.],
        [2., 1., 2.]])
tensor([1., 0., 1.])
tensor([[[0.3192, 0.3617, 0.3192]],

        [[0.3192, 0.3617, 0.3192]],

        [[0.3192, 0.3617, 0.3192]]])
torch.Size([3, 1, 3])
torch.Size([3, 1, 3])


In [None]:
#conv1d
import math
import torch
import torch.nn as nn

def get_gaussian_filter_1D(kernel_size=3, sigma=2, channels=3):
    # Create a x, y coordinate grid of shape (kernel_size, kernel_size, 2)
    
    x_coord = torch.arange(kernel_size)
    x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
    y_grid = x_grid.t()

    xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
    mean = (kernel_size - 1)/2.
    variance = sigma**2.
    xy_grid = torch.sum((xy_grid[:kernel_size,:kernel_size,:] - mean)**2., dim=-1)

    # Calculate the 1-dimensional gaussian kernel
    gaussian_kernel = (1./((math.sqrt(2.*math.pi)*sigma))) * \
                        torch.exp(-1* (xy_grid[int(kernel_size/2)]) / (2*variance))

    gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)
    gaussian_kernel = gaussian_kernel.view(1, 1, kernel_size)
    gaussian_kernel = gaussian_kernel.repeat(channels, 1, 1)

    padding = 1 if kernel_size==3 else 2 if kernel_size == 5 else 0
    gaussian_filter = nn.Conv1d(in_channels=channels, out_channels=channels,
                                kernel_size=kernel_sizex, groups=channels,
                                bias=False, padding=padding)
    gaussian_filter.weight.data = gaussian_kernel
    gaussian_filter.weight.requires_grad = False 
    return gaussian_filter

In [58]:
a = nn.Conv1d(in_channels=channels, out_channels=channels,
                            kernel_size=kernel_sizex, groups=channels,
                        bias=False, padding=padding)
print(a.weight.data)
print(a.weight.data.shape)

tensor([[[ 0.3295, -0.4429,  0.0897]],

        [[ 0.0608,  0.3102, -0.0349]],

        [[ 0.3158,  0.5165, -0.4010]]])
torch.Size([3, 1, 3])
