In [4]:
#imports
%matplotlib inline
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch
from sklearn.datasets import load_digits
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.utils.data as torch_data
import sklearn
from sklearn.metrics import accuracy_score

In [5]:
# fetch the dataset.
digits, targets = load_digits(return_X_y=True)
digits = digits.astype(np.float32) / 255

digits_train, digits_test, targets_train, targets_test = train_test_split(digits, targets, random_state=0)

train_size = digits_train.shape[0]

input_size = 8*8
classes_n = 10

In [6]:
class MNISTData(torch_data.Dataset):
    def __init__(self, X, y):
        super(MNISTData, self).__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return (self.X[idx],self.y[idx])

In [16]:
def get_accuracy(net, val_dset):
    test_loader = torch_data.DataLoader(val_dset,batch_size = len(val_dset)) 
    net.eval()
    for X,y in test_loader:
        X = X.to(device)
        nn_outputs = net(X).detach().numpy().argmax(axis = 1)
    return accuracy_score(nn_outputs,y.detach().numpy())

In [8]:
train_dset = MNISTData(digits_train, targets_train) 
val_dset = MNISTData(digits_test, targets_test) 

## Non-convex Fullyconnected

In [9]:
class FCN(nn.Module):

    def __init__(self):
        super(FCN, self).__init__()
        self.fc0 = nn.Linear(64, 40)
        self.fc1 = nn.Linear(40, 20)
        self.fc2 = nn.Linear(20, 10)

    def forward(self, input_):
        h1 = F.relu(self.fc0(input_))
        h2 = F.relu(self.fc1(h1))
        h3 = self.fc2(h2)
        return h3
    
    def get_sparsities(self):
        get_sparsity = lambda layer : (layer.weight.data == 0).sum().item() / (layer.weight.data.shape[0] * layer.weight.data.shape[1])
        return {
                'fc0': get_sparsity(self.fc0),
                'fc1': get_sparsity(self.fc1),
                'fc2': get_sparsity(self.fc2),
        }
    
    def l1reg(self):
        l1_reg = torch.tensor(0., requires_grad=True)
        for name, W in self.named_parameters():
            l1_reg = l1_reg +  W.norm(1)
        return l1_reg

In [10]:
device = 'cpu'

net = FCN()  
criterion = F.cross_entropy
optimizer = torch.optim.Adam(net.parameters())
scheduler = None

train_loader = torch_data.DataLoader(train_dset, batch_size=30, shuffle=True) 
val_loader = torch_data.DataLoader(val_dset, batch_size=100, shuffle=False) 

In [11]:
def train_fcn(epochs, net, criterion, optimizer, train_loader, val_loader,scheduler=None, verbose=True, save_dir=None, l1alpha=0):
    net.to(device)
    for epoch in range(1, epochs+1):
        net.train()
        loss = []
        for X, y in train_loader:
            nn_outputs = net(X)
            loss1 = criterion(nn_outputs, y) + l1alpha * net.l1reg()
            optimizer.zero_grad()
            loss1.backward()
            loss.append(loss1.item())
            optimizer.step()
        net.eval()
        val_loss = []
        for X, y in val_loader:
            X = X.to(device)
            nn_outputs = net(X)
            val_loss1 = criterion(nn_outputs,y)
            val_loss.append(val_loss1.item())
         
        if scheduler is not None:
            scheduler.step()
        freq = max(epochs//20,1)
        if verbose and epoch%freq==0:
            print('Epoch {}/{} || Loss:  Train {:.4f} | Validation {:.4f}'.format(epoch, epochs, np.mean(loss), np.mean(val_loss)))

In [13]:
train_fcn(200, net, criterion, optimizer, train_loader, val_loader, scheduler)

Epoch 10/200 || Loss:  Train 1.4474 | Validation 1.4346
Epoch 20/200 || Loss:  Train 0.7897 | Validation 0.8063
Epoch 30/200 || Loss:  Train 0.5813 | Validation 0.5980
Epoch 40/200 || Loss:  Train 0.4897 | Validation 0.5130
Epoch 50/200 || Loss:  Train 0.4307 | Validation 0.4520
Epoch 60/200 || Loss:  Train 0.3870 | Validation 0.4098
Epoch 70/200 || Loss:  Train 0.3351 | Validation 0.3635
Epoch 80/200 || Loss:  Train 0.2840 | Validation 0.3125
Epoch 90/200 || Loss:  Train 0.2419 | Validation 0.2765
Epoch 100/200 || Loss:  Train 0.2135 | Validation 0.2561
Epoch 110/200 || Loss:  Train 0.1888 | Validation 0.2441
Epoch 120/200 || Loss:  Train 0.1733 | Validation 0.2343
Epoch 130/200 || Loss:  Train 0.1578 | Validation 0.2327
Epoch 140/200 || Loss:  Train 0.1457 | Validation 0.2249
Epoch 150/200 || Loss:  Train 0.1365 | Validation 0.2244
Epoch 160/200 || Loss:  Train 0.1255 | Validation 0.2189
Epoch 170/200 || Loss:  Train 0.1152 | Validation 0.2217
Epoch 180/200 || Loss:  Train 0.1096 | V

In [17]:
get_accuracy(net, val_dset)

0.9444444444444444

In [18]:
net.get_sparsities()

{'fc0': 0.0, 'fc1': 0.0, 'fc2': 0.0}

## POSDENSE = CNN WITHOUT ADDITIONAL LAYERS

In [19]:
class PosDense(nn.Module):

    def __init__(self):
        super(PosDense, self).__init__()
        self.fcpos0 = nn.Linear(64, 40)
        self.fcpos1 = nn.Linear(40, 20)
        self.fcpos2 = nn.Linear(20, 10)
        self.fc = nn.Linear(10, 10)

    def forward(self, input_):
        h1 = F.relu(self.fcpos0(input_))
        h2 = F.relu(self.fcpos1(h1))
        h3 = F.relu(self.fcpos2(h2))
        h4 = self.fc(h3)
        return h4
    
    def positivate(self):
        self.fcpos0.weight.data = F.relu(self.fcpos0.weight.data)
        self.fcpos1.weight.data = F.relu(self.fcpos1.weight.data)
        self.fcpos2.weight.data = F.relu(self.fcpos2.weight.data)
    
    def get_sparsities(self):
        get_sparsity = lambda layer : (layer.weight.data == 0).sum().item() / (layer.weight.data.shape[0] * layer.weight.data.shape[1])
        return {
                'fcpos0': get_sparsity(self.fcpos0),
                'fcpos1': get_sparsity(self.fcpos1),
                'fcpos2': get_sparsity(self.fcpos2),
        }
    
    def l1reg(self):
        l1_reg = torch.tensor(0., requires_grad=True)
        for name, W in self.named_parameters():
            l1_reg = l1_reg + W.norm(1)
        return l1_reg

In [20]:
device = 'cpu'

net = PosDense()  
criterion = F.cross_entropy
optimizer = torch.optim.Adam(net.parameters())
scheduler = None

train_loader = torch_data.DataLoader(train_dset, batch_size=30, shuffle=True) 
val_loader = torch_data.DataLoader(val_dset, batch_size=100, shuffle=False) 

In [21]:
def train_posdense(epochs, net, criterion, optimizer, train_loader, val_loader,scheduler=None, verbose=True, save_dir=None, l1alpha=0):
    net.to(device)
    for epoch in range(1, epochs+1):
        net.train()
        loss = []
        for X, y in train_loader:
            nn_outputs = net(X)
            loss1 = criterion(nn_outputs, y) + l1alpha * net.l1reg()
            optimizer.zero_grad()
            loss1.backward()
            loss.append(loss1.item())
            optimizer.step()
            net.positivate()
        net.eval()
        val_loss = []
        for X, y in val_loader:
            X = X.to(device)
            nn_outputs = net(X)
            val_loss1 = criterion(nn_outputs,y)
            val_loss.append(val_loss1.item())
         
        if scheduler is not None:
            scheduler.step()
        freq = max(epochs//20,1)
        if verbose and epoch%freq==0:
            print('Epoch {}/{} || Loss:  Train {:.4f} | Validation {:.4f}'.format(epoch, epochs, np.mean(loss), np.mean(val_loss)))

In [22]:
train_posdense(300, net, criterion, optimizer, train_loader, val_loader, scheduler)

Epoch 15/300 || Loss:  Train 1.8697 | Validation 1.9052
Epoch 30/300 || Loss:  Train 1.4758 | Validation 1.5126
Epoch 45/300 || Loss:  Train 1.0662 | Validation 1.1344
Epoch 60/300 || Loss:  Train 0.9406 | Validation 1.0101
Epoch 75/300 || Loss:  Train 0.8591 | Validation 0.9046
Epoch 90/300 || Loss:  Train 0.7875 | Validation 0.8420
Epoch 105/300 || Loss:  Train 0.7264 | Validation 0.7858
Epoch 120/300 || Loss:  Train 0.6772 | Validation 0.7455
Epoch 135/300 || Loss:  Train 0.6224 | Validation 0.6953
Epoch 150/300 || Loss:  Train 0.5756 | Validation 0.6480
Epoch 165/300 || Loss:  Train 0.5272 | Validation 0.6156
Epoch 180/300 || Loss:  Train 0.4770 | Validation 0.5761
Epoch 195/300 || Loss:  Train 0.4354 | Validation 0.5297
Epoch 210/300 || Loss:  Train 0.3787 | Validation 0.4898
Epoch 225/300 || Loss:  Train 0.3267 | Validation 0.4236
Epoch 240/300 || Loss:  Train 0.2918 | Validation 0.4116
Epoch 255/300 || Loss:  Train 0.2462 | Validation 0.3639
Epoch 270/300 || Loss:  Train 0.2150 

In [23]:
get_accuracy(net, val_dset)

0.9377777777777778

In [24]:
net.get_sparsities()

{'fcpos0': 0.29921875, 'fcpos1': 0.2525, 'fcpos2': 0.14}

## L1-regularization

In [32]:
device = 'cpu'

net = PosDense()  
criterion = F.cross_entropy 
optimizer = torch.optim.Adam(net.parameters())
scheduler = None

train_loader = torch_data.DataLoader(train_dset, batch_size=30, shuffle=True) 
val_loader = torch_data.DataLoader(val_dset, batch_size=100, shuffle=False) 

In [33]:
train_posdense(350, net, criterion, optimizer, train_loader, val_loader, scheduler, l1alpha=1e-5)

Epoch 17/350 || Loss:  Train 1.7239 | Validation 1.7186
Epoch 34/350 || Loss:  Train 1.5065 | Validation 1.5309
Epoch 51/350 || Loss:  Train 1.3759 | Validation 1.4565
Epoch 68/350 || Loss:  Train 1.2909 | Validation 1.3545
Epoch 85/350 || Loss:  Train 1.2349 | Validation 1.3132
Epoch 102/350 || Loss:  Train 1.1477 | Validation 1.2348
Epoch 119/350 || Loss:  Train 0.9866 | Validation 1.0758
Epoch 136/350 || Loss:  Train 0.8391 | Validation 0.9099
Epoch 153/350 || Loss:  Train 0.7170 | Validation 0.7803
Epoch 170/350 || Loss:  Train 0.6501 | Validation 0.7190
Epoch 187/350 || Loss:  Train 0.5748 | Validation 0.6507
Epoch 204/350 || Loss:  Train 0.5226 | Validation 0.6049
Epoch 221/350 || Loss:  Train 0.4785 | Validation 0.5371
Epoch 238/350 || Loss:  Train 0.4201 | Validation 0.5008
Epoch 255/350 || Loss:  Train 0.3864 | Validation 0.4512
Epoch 272/350 || Loss:  Train 0.3441 | Validation 0.4368
Epoch 289/350 || Loss:  Train 0.3020 | Validation 0.4044
Epoch 306/350 || Loss:  Train 0.2807

In [34]:
get_accuracy(net, val_dset)

0.9133333333333333

In [35]:
net.get_sparsities()

{'fcpos0': 0.3671875, 'fcpos1': 0.18625, 'fcpos2': 0.045}

In [38]:
device = 'cpu'

net = FCN()  
criterion = F.cross_entropy
optimizer = torch.optim.Adam(net.parameters())
scheduler = None

train_loader = torch_data.DataLoader(train_dset, batch_size=30, shuffle=True) 
val_loader = torch_data.DataLoader(val_dset, batch_size=100, shuffle=False) 

In [39]:
train_fcn(300, net, criterion, optimizer, train_loader, val_loader, scheduler, l1alpha=1e-5)

Epoch 15/300 || Loss:  Train 0.7658 | Validation 0.7679
Epoch 30/300 || Loss:  Train 0.4519 | Validation 0.4594
Epoch 45/300 || Loss:  Train 0.3258 | Validation 0.3434
Epoch 60/300 || Loss:  Train 0.2498 | Validation 0.2646
Epoch 75/300 || Loss:  Train 0.2031 | Validation 0.2246
Epoch 90/300 || Loss:  Train 0.1678 | Validation 0.2048
Epoch 105/300 || Loss:  Train 0.1447 | Validation 0.1899
Epoch 120/300 || Loss:  Train 0.1221 | Validation 0.1846
Epoch 135/300 || Loss:  Train 0.1081 | Validation 0.1860
Epoch 150/300 || Loss:  Train 0.0954 | Validation 0.1707
Epoch 165/300 || Loss:  Train 0.0861 | Validation 0.1673
Epoch 180/300 || Loss:  Train 0.0773 | Validation 0.1723
Epoch 195/300 || Loss:  Train 0.0687 | Validation 0.1680
Epoch 210/300 || Loss:  Train 0.0634 | Validation 0.1768
Epoch 225/300 || Loss:  Train 0.0576 | Validation 0.1706
Epoch 240/300 || Loss:  Train 0.0528 | Validation 0.1751
Epoch 255/300 || Loss:  Train 0.0482 | Validation 0.1700
Epoch 270/300 || Loss:  Train 0.0438 

In [40]:
get_accuracy(net, val_dset)

0.9644444444444444

In [41]:
net.get_sparsities()

{'fc0': 0.0, 'fc1': 0.0, 'fc2': 0.0}