In [None]:
# make cells wider if working locally in jupyter
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Important notice: any use of generative AI for completing the assignment is strictly prohibited.

In [None]:
import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import shutil

In [None]:
# use that if working in colab

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# you should be added as viewer to shared Google drive "ECE477 datasets"
#  at https://drive.google.com/drive/u/0/folders/0ABIZHKB-QPnRUk9PVA

!unzip "/content/drive/Shared drives/ECE477 datasets/Assignment2/data.zip" -d "."

## Warning: to ensure the reproducibility of your results and to achieve the full grade, do not change or remove RANDOM_STATE variables and setting random seed statements. If you remove or change them, you may not get the full grade. 

In [None]:
import random

RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# define some util functions

def neuronMask(mask, add_index, func='in', sparse=False, ratio=0.2):
    """Switch on mask entries for the neurons listed in ``add_index``.

    The mask is modified in place and the same object is returned.

    Args:
        mask: 2-D tensor indexed as ``mask[row, col]``.
        add_index: iterable of neuron indices to activate.
        func: ``'in'`` treats every index as a column (``mask[:, i]``),
            ``'out'`` treats every index as a row (``mask[i, :]``).
            Any other value leaves the mask untouched.
        sparse: if True, only a random subset of the entries along the
            selected column/row is switched on; otherwise all of them are.
        ratio: fraction of the entries along the selected column/row that is
            switched on in sparse mode.

    Returns:
        ``mask`` (the object that was passed in).
    """
    if func not in ('in', 'out'):
        return mask

    if not sparse:
        for i in add_index:
            if func == 'in':
                mask[:, i] = 1
            else:
                mask[i, :] = 1
        return mask

    # Length of the axis we sample from: rows of a column for 'in',
    # columns of a row for 'out'. (Previously size(0) was used for both.)
    axis_len = mask.size(0) if func == 'in' else mask.size(1)
    num = int(axis_len * ratio)
    for i in add_index:
        # One shuffle per neuron, drawn from NumPy's global RNG so that the
        # notebook's seeding keeps the result reproducible.
        indices = list(range(axis_len))
        np.random.shuffle(indices)
        for j in indices[:num]:
            if func == 'in':
                mask[j, i] = 1
            else:
                mask[i, j] = 1
    return mask


def load_data_train(data_path='./data/sonar/', mode='all', fold=None):
    """Load the training and validation arrays stored as ``.npy`` files.

    Args:
        data_path: directory holding the ``.npy`` files (a trailing slash is
            optional).
        mode: which training subset to read when ``fold`` is None:
            ``'all'`` -> ``X_train/y_train``, ``'easy'`` -> ``X_easy/y_easy``,
            ``'hard'`` -> ``X_hard/y_hard``. Ignored when ``fold`` is given.
        fold: if not None, its string form is appended to the file stems
            (``X_train<fold>.npy``, ``X_validation<fold>.npy``, ...).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the second pair is the
        validation split; features are cast to float and labels to int.

    Raises:
        ValueError: if ``fold`` is None and ``mode`` is not one of the
            supported values.
    """
    if fold is not None:
        suffix = str(fold)
        stems = ("X_train" + suffix, "y_train" + suffix,
                 "X_validation" + suffix, "y_validation" + suffix)
    else:
        train_stems = {
            'all': ("X_train", "y_train"),
            'easy': ("X_easy", "y_easy"),
            'hard': ("X_hard", "y_hard"),
        }
        if mode not in train_stems:
            raise ValueError(
                "mode must be one of %s, got %r" % (sorted(train_stems), mode))
        stems = train_stems[mode] + ("X_validation", "y_validation")

    # os.path.join tolerates data_path with or without a trailing separator.
    X_train, y_train, X_test, y_test = [
        np.load(os.path.join(data_path, stem + '.npy')) for stem in stems
    ]

    X_train = X_train.astype(float)
    X_test = X_test.astype(float)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    return X_train, y_train, X_test, y_test


def load_data_test(data_path='./data/sonar/', fold=None):
    """Load the full training set and the test set stored as ``.npy`` files.

    Args:
        data_path: directory holding the ``.npy`` files (a trailing slash is
            optional).
        fold: if None, reads ``X_train_total/y_train_total`` and
            ``X_test/y_test``; otherwise reads ``X_train<fold>/y_train<fold>``
            and ``X_test<fold>/y_test<fold>``.

    Returns:
        ``(X_train, y_train, X_test, y_test)``; features are cast to float and
        labels to int.
    """
    if fold is not None:
        suffix = str(fold)
        train_stems = ("X_train" + suffix, "y_train" + suffix)
        test_stems = ("X_test" + suffix, "y_test" + suffix)
    else:
        train_stems = ("X_train_total", "y_train_total")
        test_stems = ("X_test", "y_test")

    def _read(stem):
        # os.path.join tolerates data_path with or without a trailing separator.
        return np.load(os.path.join(data_path, stem + '.npy'))

    X_train = _read(train_stems[0]).astype(float)
    y_train = _read(train_stems[1]).astype(int)
    X_test = _read(test_stems[0]).astype(float)
    y_test = _read(test_stems[1]).astype(int)

    return X_train, y_train, X_test, y_test

# Define the Deep Neural Network

In [None]:
class SDNN(object):
    """Network with a pool of hidden neurons and a 0/1 mask per weight matrix.

    The constructor only stores hyperparameters and counters. The methods
    below read attributes that must be set on the instance beforehand:
    weights ``w1..w4``, masks ``m1..m4``, biases ``b1``/``b2`` and the list
    ``active_index`` of hidden-neuron indices currently in use.
    """
    def __init__(self, in_num, out_num, init_size, max_size, dataset, batch_size, **kwargs):
        """Store the architecture hyperparameters and reset bookkeeping state.

        Extra keyword arguments are accepted but not used in this method.
        """
        self.dataset = dataset # the name of the dataset
        self.in_num = in_num # The number of input features
        self.out_num = out_num # The number of output classes
        self.init_size = init_size # the initial number of hidden neurons
        self.max_size = max_size  # max number of allowed hidden neurons in the architecture
        self.cur_size = init_size - 1  # current size of hidden neurons
        self.batch_size = batch_size # Batch size
        self.name = dataset # prefix used for checkpoint file names
        self.flag = 0
        
        self.epoch = 0
        self.best_acc = 0
        self.best_acc_prune = 0
        self.now_acc = 0
        self.connection_count = 0
 

    # Forward pass  
    # we first compute the hidden activations by utilizing the input to hidden weights (w1) , and hidden to hidden weights,
    # (w2) and their corresponding mask matrices (m1, m2), as well as the hidden biases (b1)
    # After computing the hidden activations, we use it alongside hidden to out weight matrix (w3) and corresponding 
    # mask (m3), and input to output connections (w4) and its corresponding mask (m4), and output biases (b2)
    # at the end we return the computed output
    def forward(self, x):
        """Compute the network output for a batch ``x`` and cache ``self.hidden``.

        Hidden neurons are evaluated one at a time in ``active_index`` order,
        so a neuron sees the activations of the neurons evaluated before it
        (the rest of ``self.hidden`` is still zero at that point).

        Returns:
            The output tensor; ``retain_grad()`` is called on it so that its
            ``.grad`` is available after a backward pass.
        """
        # one column per potential hidden neuron; inactive columns stay zero
        self.hidden = torch.zeros(x.size(0), self.max_size)

        for i,j in enumerate(self.active_index):
            # .clone() gives the matmul its own copy of the activations
            # before column j of self.hidden is overwritten in place
            self.hidden[:, j] = F.relu((torch.mm(self.hidden.clone(), torch.mul(self.w2[:, j], self.m2[:, j]).view(-1, 1)) 
                                       + torch.mm(x, torch.mul(self.w1[:, j], self.m1[:, j]).view(-1,1))
                                       + self.b1[:, j])).squeeze(1)

        out = torch.mm(self.hidden, torch.mul(self.w3, self.m3)) \
                  + torch.mm(x, torch.mul(self.w4, self.m4)) \
                  + self.b2
        out.retain_grad()
        return out
    
    
    # used in connection growth 
    def forwardMask(self, display=True):
        """Zero the hidden-to-hidden mask entries that would break evaluation order.

        For the neuron ``j`` at position ``i`` of ``active_index``, every row of
        ``m2[:, j]`` is cleared except the rows of the neurons that precede it
        in ``active_index``. Optionally prints the number of non-zero entries
        left in ``m2``.
        """
        for i,j in enumerate(self.active_index):
            # every index that is NOT one of the first i active neurons
            mask_idx = list(set(range(self.max_size)) - set(self.active_index[:i]))
            self.m2.data[:, j][mask_idx] = 0
        if display:    
            print('Forward mask, m2: %d' %np.count_nonzero(self.m2.data))
            
    
    def backwardGrad(self, outgrad):
        """Overwrite ``self.hidden.grad`` with a manually propagated gradient.

        Starts from ``outgrad @ w3.T`` and then, walking ``active_index`` in
        reverse, adds contributions weighted by entries of ``w2``.
        """
        self.hidden.grad = torch.mm(outgrad, torch.t(self.w3))
        rev_idx = np.flip(self.active_index, axis=0)
        for i,j in enumerate(rev_idx):
            # NOTE(review): k is a loop position (0..i-1) but is used directly as a
            # neuron/column index instead of rev_idx[k] - confirm this is intended.
            for k in range(i):
                self.hidden.grad.data[:, j] = self.hidden.grad.data[:, j] + self.hidden.grad.data[:, k] \
                                                  *self.w2.data[j, k]  
                

    def displayConnection(self, display=True):
        """Count the non-zero mask entries that belong to active hidden neurons.

        ``m1`` and ``m3`` are counted over the active columns/rows, ``m2`` over
        entries ``m2[a][j]`` where ``a`` precedes ``j`` in ``active_index``, and
        ``m4`` over the whole matrix.

        Returns:
            ``(m1, m2, m3, m4, total)``; the same numbers are printed when
            ``display`` is True.
        """
        m1 = 0
        m2 = 0
        m3 = 0
        m4 = np.count_nonzero(self.m4.data)
        for i,j in enumerate(self.active_index):
            m1 += np.count_nonzero(self.m1.data[:, j])
            m3 += np.count_nonzero(self.m3.data[j, :])
            for k in range(i):
                m2 += np.count_nonzero(self.m2.data[self.active_index[k]][j])
        
        if display:
            print('Connection Info: ')
            print('m1: %d, m2: %d, m3: %d, m4: %d' %(m1,m2,m3,m4))
            print('Total: %d' % (m1+m2+m3+m4))
        return m1, m2, m3, m4, m1+m2+m3+m4


    def save_checkpoint(self, state, is_best, folder_to_save, filename = '_checkpoint.pth.tar'):
        """Serialize ``state`` to ``<folder_to_save>/<self.name><filename>``.

        When ``is_best`` is True the written file is also copied to
        ``<folder_to_save>/<self.name>_model_best.pth.tar``.
        """
        name_to_save = os.path.join(folder_to_save, self.name + filename)
        torch.save(state, name_to_save)
        if is_best:
            shutil.copyfile(name_to_save, os.path.join(folder_to_save, self.name + '_model_best.pth.tar'))
            print(f"also saved as the best checkpoint to {os.path.join(folder_to_save, self.name + '_model_best.pth.tar')}")

## Define SDNN class members

### Task: handle validation data (1 point)

fill in the missing code

In [None]:
# loading the data
# mode:train -> loading train and validation data, we use this mode in training the model and learning the 
# architecture
# mode: test - > loading train_total (train+validation set) and test data
# we use the test mode after learning the architecture to evaluate the model on the test set

def loadData(self, folder_to_load='data', mode='train', fold=None):
    """Load the dataset named ``self.dataset`` and build the data loaders.

    Args:
        folder_to_load: directory that contains one sub-folder per dataset.
        mode: ``'train'`` reads the train/validation split via
            ``load_data_train``; ``'test'`` reads train_total/test via
            ``load_data_test`` (the test arrays are then stored in the
            ``*_validation`` attributes).
        fold: forwarded to the loader; None selects the un-suffixed files.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    path_to_load = os.path.join(folder_to_load, self.dataset) + '/'
    if mode == 'train':
        self.X_train, self.y_train, self.X_validation, self.y_validation = load_data_train(path_to_load, fold=fold)
    elif mode == 'test':
        self.X_train, self.y_train, self.X_validation, self.y_validation = load_data_test(path_to_load, fold=fold)
    else:
        raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))
        
    # converting the numpy arrays to torch tensors
    self.X_train = torch.tensor(self.X_train, dtype=torch.float32)
    self.y_train = torch.tensor(self.y_train.reshape(-1), dtype=torch.long)
    # create validation splits from numpy arrays
    # your code is here
    self.X_validation = ...
    self.y_validation = ...

    # creating the data loaders for train
    self.traindata = torch.utils.data.TensorDataset(self.X_train, self.y_train)
    self.trainloader = torch.utils.data.DataLoader(self.traindata, batch_size=self.batch_size, shuffle=True)
    
    # creating the data loaders for validation
    # don't shuffle validation data!
    # your code is here
    self.validationdata = ...
    self.validationloader = ...
    
    
SDNN.loadData = loadData

### Task: define optimizer (2 points)

Define SGD optimizer on all network weights and biases. 

Parameters: lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=True

In [None]:
# initializing the structure
# four main components and their corresponding masks
# We define weight matrices, and masks, for connection between input neurons to hidden neurons,
# hidden neurons to hidden neurons, hidden neurons to output neurons, and input neurons to output neurons
# We also define bias terms for hidden neurons and output neurons

def structureInit(self, load=False, sparse=True, ratio=0.2, file=None):
    """Create weights, masks, biases, loss and optimizer; then start fresh or restore.

    Args:
        load: False builds the initial masks for the first ``init_size`` hidden
            neurons; any other value restores everything from ``file``.
        sparse: when starting fresh, whether the input-to-hidden mask is
            filled sparsely (see ``neuronMask``).
        ratio: fraction passed to ``neuronMask`` in sparse mode.
        file: checkpoint path passed to ``torch.load`` when ``load`` is set.
    """
    # NOTE: the order of the torch.randn calls below determines the initial
    # weights for a given torch seed.
    # input to hidden
    self.w1 = torch.randn(self.in_num, self.max_size) * 0.1
    self.m1 = torch.zeros(self.in_num, self.max_size)
    # hidden to hidden
    self.w2 = torch.randn(self.max_size, self.max_size) * 0.1
    self.m2 = torch.ones(self.max_size, self.max_size)
    # hidden to output
    self.w3 = torch.randn(self.max_size, self.out_num) * 0.1
    self.m3 = torch.zeros(self.max_size, self.out_num)
    # input to output
    self.w4 = torch.randn(self.in_num, self.out_num) * 0.1
    self.m4 = torch.ones(self.in_num, self.out_num)

    # hidden and output biases
    self.b1 = torch.zeros(1, self.max_size)
    self.b2 = torch.zeros(1, self.out_num)

    # weights and biases are trainable; the masks are left without gradients
    self.w1.requires_grad = True
    self.w2.requires_grad = True
    self.w3.requires_grad = True
    self.w4.requires_grad = True
    self.b1.requires_grad = True
    self.b2.requires_grad = True

    # lookup table by name; addConnection indexes it with mask names ('m1'..'m4')
    self.params = {'w1': self.w1, 'w2': self.w2, 'w3': self.w3, 'w4': self.w4,
                   'm1': self.m1, 'm2': self.m2, 'm3': self.m3, 'm4': self.m4,
                  }

    self.criterion = nn.CrossEntropyLoss()
    # your code is here
    # clarification: predefined self.params are intended for different purpose
    # don't pass it into the optimizer
    self.optimizer = ...

    # starting from scratch
    # at first, we have init_size active hidden neurons
    # neuronMask is the helper function defined in the util-functions cell
    # This function sets the appropriate number of mask values equal to 1
    # note that in the initialization step, we only switch on entries of the input to hidden mask (m1)
    # and the hidden to output mask (m3), and the biases of the active hidden neurons (b1);
    # m2 and m4 were already created as all-ones above
    if load == False:
        self.active_index = list(range(self.init_size))
        if sparse:
            self.m1.data = neuronMask(self.m1.data, self.active_index, sparse=True, ratio=ratio)
        else:
            self.m1.data = neuronMask(self.m1.data, self.active_index)
        self.m3.data = neuronMask(self.m3.data, self.active_index, 'out')
        # b1 is passed through the mask helper too, which writes 1 into the
        # bias entry of every initially active neuron
        self.b1.data = neuronMask(self.b1.data, self.active_index)

    # loading from a pretrained model
    # we have to load all the parameters of the model, including the index of active neurons
    # and all the learned weight, bias, and mask matrices
    else:
        checkpoint = torch.load(file, weights_only=True)
        self.active_index = checkpoint['active_index']
        self.w1.data = checkpoint['state_dict']['w1']
        self.m1.data = checkpoint['state_dict']['m1']
        self.w2.data = checkpoint['state_dict']['w2']
        self.m2.data = checkpoint['state_dict']['m2']
        self.w3.data = checkpoint['state_dict']['w3']
        self.m3.data = checkpoint['state_dict']['m3']
        self.w4.data = checkpoint['state_dict']['w4']
        self.m4.data = checkpoint['state_dict']['m4']
        self.b1.data = checkpoint['state_dict']['b1']
        self.b2.data = checkpoint['state_dict']['b2']
        self.epoch = checkpoint['epoch']
        self.best_acc = checkpoint['best_acc']
        self.now_acc = checkpoint['now_acc']
        # best_acc_prune is not restored here; it keeps its current value
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        
        
SDNN.structureInit = structureInit

In [None]:
def train(self, duration=10, folder_to_save='tmp'):
    """Train for ``duration`` epochs, re-applying the masks after every step.

    After each epoch the train and validation accuracies are measured and a
    checkpoint is written to ``folder_to_save``:

    * if ``self.flag == 1`` and the validation accuracy beats
      ``self.best_acc_prune``, a ``_prune.pth.tar`` checkpoint is saved;
    * if the validation accuracy beats ``self.best_acc``, the regular
      checkpoint is saved (recording ``epoch + 1``) and flagged as best;
    * otherwise the regular checkpoint is saved recording ``epoch``.

    ``self.epoch`` is advanced by ``duration`` at the end.
    """

    def tally(loader):
        """Return (correct predictions, samples seen) for one pass over ``loader``."""
        hits = 0
        seen = 0
        for inputs, labels in loader:
            outputs = self.forward(inputs)
            _, predicted = torch.max(outputs.data, 1)
            hits += (predicted == labels.data).sum()
            seen += labels.size(0)
        return hits, seen

    def snapshot(epoch_to_store):
        """Collect everything a checkpoint needs, as of right now."""
        tensor_names = ('w1', 'm1', 'w2', 'm2', 'w3', 'm3', 'w4', 'm4', 'b1', 'b2')
        return {
            'epoch': epoch_to_store,
            'best_acc': self.best_acc,
            'now_acc': self.now_acc,
            'state_dict': {key: getattr(self, key).data for key in tensor_names},
            'active_index': self.active_index,
            'optimizer': self.optimizer.state_dict(),
        }

    first_epoch = self.epoch
    for epoch in range(first_epoch, first_epoch + duration):
        running_loss = 0.0

        # one optimisation pass over the training batches
        for inputs, labels in self.trainloader:
            inputs.requires_grad_(True)
            self.optimizer.zero_grad()

            outputs = self.forward(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()

            # force pruned / not-yet-grown weights back to zero
            for weight, mask in ((self.w1, self.m1), (self.w2, self.m2),
                                 (self.w3, self.m3), (self.w4, self.m4)):
                weight.data = weight.data * mask.data

        # accuracy on the training data, then on the validation data
        correct, total = tally(self.trainloader)
        train_acc = correct * 1. / total

        correct, total = tally(self.validationloader)
        validation_acc = correct * 1. / total
        self.now_acc = validation_acc

        # best model seen while self.flag == 1 gets its own file
        if (validation_acc > self.best_acc_prune) and (self.flag == 1):
            self.best_acc_prune = validation_acc
            self.save_checkpoint(snapshot(epoch), False, folder_to_save, filename='_prune.pth.tar')

        # regular checkpoint; additionally copied as "best" on improvement
        if (validation_acc > self.best_acc):
            self.best_acc = validation_acc
            self.save_checkpoint(snapshot(epoch + 1), True, folder_to_save)
        else:
            self.save_checkpoint(snapshot(epoch), False, folder_to_save)

        print('Epoch: %d, Training accuracy: %f, Validation accuracy: %f' 
              % (epoch, train_acc, validation_acc))

        m1, m2, m3, m4, m_all = self.displayConnection(display=False)

    self.epoch += duration
    
    
SDNN.train = train

In [None]:
# cell division function, we have the options between activation based, gradient-based, and random cell-division
# We normally use activation-based (duplicating the cell with the highest activation value) or
# random cell division (randomly selecting a hidden cell to be duplicated)
# We can make this decision either by looking at the full data, or a batch of data
# Other than mode, the other inputs are num (the number of neurons to be duplicated),
# full_data, which says whether or not to use the full data for neuron selection,
# and, if full_data is False, size, which is how many batches to use to compute the neuron activations

def cellDivision(self, mode='acti', num=1, full_data=False, size=1):
    '''
    Function: add neurons by duplicating existing ones.

    Arguments:
        mode: 'acti' activation-based, 'grad' gradient-based, 'rand' random
        num: number of neurons added each time
        full_data: whether to use full data to decide which neuron to split
        size: if full_data=False, number of batches used to decide which
            neuron to split (must not exceed the number of batches the
            train loader yields)

    Each selected neuron is copied into the next free slot
    (index len(self.active_index)): its masks and bias are copied as-is and
    its weights are copied with a small Gaussian noise added. The new index
    is inserted into self.active_index right before the neuron it was cloned
    from. Nothing is added once self.max_size neurons are active.

    Raises:
        ValueError: if mode is not one of 'acti', 'grad', 'rand'.
    '''

    def selection_batches():
        # Either the whole loader, or the first `size` batches of a fresh pass.
        if full_data:
            return self.trainloader
        loader = iter(self.trainloader)
        return [next(loader) for _ in range(size)]

    if mode == 'acti':
        # sum of absolute hidden activations per neuron over the chosen batches
        activation = np.zeros(self.max_size)
        for inputs, _labels in selection_batches():
            self.forward(inputs)
            activation += torch.sum(torch.abs(self.hidden.data), 0).cpu().numpy()

        # the 'num' neurons with the highest score, best first
        max_index_arr = np.flip(np.argsort(activation)[-num:], axis=0)
    elif mode == 'grad':
        # same idea, but scoring neurons by the hidden gradients produced by
        # backwardGrad; not used in the final experiments of the paper
        activation = np.zeros(self.max_size)
        for inputs, labels in selection_batches():
            self.optimizer.zero_grad()
            outputs = self.forward(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.backwardGrad(outputs)
            activation += torch.sum(self.hidden.grad.data, 0).cpu().numpy()
        max_index_arr = np.flip(np.argsort(activation)[-num:], axis=0)
    elif mode == 'rand':
        # 'num' distinct neurons drawn at random from the active ones
        max_index_arr = np.random.choice(self.active_index, size=num, replace=False)
    else:
        raise ValueError("mode must be 'acti', 'grad' or 'rand', got %r" % (mode,))

    # duplicate every selected neuron together with its connections and add
    # noise to the weights of the copy

    for max_index in max_index_arr:
        # the copy goes into the first slot that is not in use yet
        add_index = len(self.active_index)
        # current size
        self.cur_size = add_index
        print('Max index: %d' %max_index)

        # only grow while we stay below the maximum number of neurons
        if add_index < self.max_size:
            print('Adding neuron: %d' %add_index)
            python_max_index = int(max_index)  # Convert to Python int
            if python_max_index in self.active_index:
                self.active_index.insert(self.active_index.index(python_max_index), add_index)
                # duplicating the masks
                self.m1.data[:, add_index] = self.m1.data[:, python_max_index]
                self.m2.data[:, add_index] = self.m2.data[:, python_max_index]
                self.m3.data[add_index, :] = self.m3.data[python_max_index, :]

                # duplicating the weights (with noise) and the bias
                self.w1.data[:, add_index] = self.w1.data[:, python_max_index] + torch.randn(self.in_num) * 0.01
                self.w2.data[:, add_index] = self.w2.data[:, python_max_index] + torch.randn(self.max_size) * 0.01
                self.w3.data[add_index, :] = self.w3.data[python_max_index, :] + torch.randn(self.out_num) * 0.01
                self.b1.data[:, add_index] = self.b1.data[:, python_max_index]
            else:
                print(f"Value {python_max_index} not found in active_index. Skipping insertion due to error.")
    
    # the noise may have touched masked-out entries: zero them again
    self.w1.data = self.w1.data * self.m1.data
    self.w2.data = self.w2.data * self.m2.data
    self.w3.data = self.w3.data * self.m3.data
    self.w4.data = self.w4.data * self.m4.data
    self.displayConnection()
    
    
SDNN.cellDivision = cellDivision

### Task: update correlation matrix and masking weights (6 points)

Fill in the missing code where prompted (2 spots)

In [None]:
def addConnection(self, mode='grad', percentile={'m2':90, }, size=1, full_data=False):
    '''
    Function: add connections.
    Arguments:
        mode: 'corr' correlation-based, 'grad' gradient-based, 'rand' random
        percentile: top-k percentile of connections are added
    '''
    print('\nAdding connection...')
    self.flag = 0

    cov_mat = {
        'm1': np.zeros([self.in_num, self.max_size]),
        'm2': np.zeros([self.max_size, self.max_size]),
        'm3': np.zeros([self.max_size, self.out_num]),
        'm4': np.zeros([self.in_num, self.out_num]),
    }

    # gradient-based growth 
    # we use the backwardGrad function to compute gradients
    if mode == 'grad':
        loader = iter(self.trainloader)
        for i in range(size):
            inputs, lables = next(loader)
            self.optimizer.zero_grad()
            outputs = self.forward(inputs)

            loss = self.criterion(outputs, labels)
            loss.backward()
            self.backwardGrad(outputs)

            cov_mat_m1 = torch.mm(inputs.T, self.hidden.grad)
            cov_mat_m2 = torch.mm(self.hidden.T, self.hidden.grad)
            cov_mat_m3 = torch.mm(self.hidden.T, outputs.grad)
            cov_mat_m4 = torch.mm(inputs.T, outputs.grad)

            # add to covariance matrix values cov_mat_m1, cov_mat_m2, cov_mat_m3, cov_mat_m4
            # your code is here
            cov_mat['m1'] += 
            cov_mat['m2'] += 
            cov_mat['m3'] += 
            cov_mat['m4'] += 

    elif mode == 'rand':
        cov_mat['m1'][:, :self.cur_size] = np.random.rand(self.in_num, self.cur_size)
        cov_mat['m2'][:self.cur_size, :self.cur_size] = np.random.rand(self.cur_size, self.cur_size)
        cov_mat['m3'][:self.cur_size, :] = np.random.rand(self.cur_size, self.out_num)
        cov_mat['m4'] = np.random.rand(self.in_num, self.out_num)


    for i in percentile:
        if len(np.nonzero(cov_mat[i])[0]) == 0:
            threshold = 0
        else:
            threshold = np.percentile(cov_mat[i][np.nonzero(cov_mat[i])], percentile[i])
        self.params[i].data[torch.Tensor(cov_mat[i])>threshold] = 1

    self.forwardMask()
    self.displayConnection()

    # self.m1, self.m2, self.m3, self.m4 are masks for correspondings weights.
    # they are float tensors containing 1. and 0. values
    # update weights masking out corresponding values.
    # Impprtant: For weights and masks tensors in calculation, call their .data() property
    # to prevent tracking gradients on these tensors by torch autograd system

    # your code is here
    self.w1 = 
    self.w2 =
    self.w3 =
    self.w4 =
    
    
    
SDNN.addConnection = addConnection

### Task: fill in validation accuracy calculation (2 points)
Fill in the missing code to calculate the validation accuracy

In [None]:
def displayAcc(self):
    """Compute and print the accuracy on the train loader and the validation loader.

    Returns:
        ``validation_acc``. That name is not assigned anywhere in the provided
        body; it has to be produced by the code filled in at the placeholder
        below, otherwise the final print raises a NameError.
    """
    # pass over the training data: count correct predictions
    total = 0
    correct = 0
    for i, data in enumerate(self.trainloader, 0):
        inputs, labels = data
        outputs = self.forward(inputs) 
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels.data).sum()
        total += labels.size(0)
    print('Train: %d/%d' %(correct, total))
    train_acc = correct * 1. / total

    # counters are reset for the validation pass
    total = 0
    correct = 0
    
    # your code is here
    # hint: use validationloader
    
    

    # the second value printed is the accuracy on the validation loader
    print('Train accuracy: %f, Test accuracy: %f'
          % (train_acc, validation_acc))
    return validation_acc


SDNN.displayAcc = displayAcc

# Dataset


In [None]:
# name of the dataset (folder) in the data directory
dataset = 'arrhythmia0'

expr = '_1'

### Task: define the number of features and classes for the loaded dataset (1 point)

These are required to correctly define number of input and output neurons in the network.

Display both in_num and out_num numbers explicitly to get the full credit.

In [None]:
# test dataset load to define number of features and classes only
# you don't need this further, since SDNN class loads its own dataset instance

test_dataset_load = load_data_train(f'./data/{dataset}/')

In [None]:
# your code is here

# number of input features
in_num = 
# number of output classes
out_num = 

In [None]:
# hyperparameters

params = {
    'init_num': 5,   # initial number of hidden neurons
    'max_num': 100,  # max number of hidden neurons
    'sparse': False, 
    'sparse_ratio': 0.1,   
    'loop_num': 10,   # number of iterations in growth and pruning applications
    'interval': 10,   # epochs per training phase; read by the training loop as params['interval']
    'full_data': False,  # for small data use full_data : True to use all the data
    'remove': False,  
}

# Constructive approach

This is a constructive approach. We start by having a small number of hidden neurons in the
architecture (‘init_size’). The starting number of neurons in the architecture is a hyperparameter
that can be chosen based on a task at hand and by looking at the validation performance. In an
iterative process, we apply connection growth and neuron growth to grow the network size. We
have a max number of neurons that we cannot surpass (one of the parameters).


Each of these architecture-changing functions has its own hyperparameters that
can be optimized by monitoring validation accuracy (these are explained in comments in the
code). The number of epochs to train the architecture, the number of iterations to change the
architecture, the number of neurons to add at each neuron growth operation, the o

In [None]:
# create folder to store checkpoints
os.makedirs('record_full', exist_ok=True)

## Task: run training (5 points)

**If training finishes successfully, it outputs "Training is done" at the end of the iterations. You will be able to see that both training and validation accuracies change as new connections are added.**

Create a new model from scratch and train it.

Make sure to store the training results in checkpoint folder created right above.

In [None]:
# ==============
# Random states:
# (re-seeded here so that this cell gives the same result every time it is run)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
# ==============



# create the SDNN object using params dict. Set batch_size to 256.
# your code is here
sdnet = ...


#starting the model from scratch
sdnet.structureInit(load=False, sparse=params['sparse'], ratio=params['sparse_ratio'])
# checkpoint files are prefixed with this name
name = dataset + expr + '_1'
sdnet.name = name


# loading the data in train mode
sdnet.loadData(mode='train')

# train the architecture for 10 epochs first, before applying the growth and pruning.
# save data to the checkpoint folder
# your code is here
...

# iterative GP in the loop
for i in range(params['loop_num']):
    # adding connections, mode, and percentile can be explored
    sdnet.addConnection(mode='grad', percentile={'m2':70, 'm1':70, 'm3':70, 'm4':70}, full_data=False)
    
    # after adding connections, train the architecture for 10 epochs, save data to the checkpoint folder
    # your code is here
    ...
    
    # neuron growth operation
    sdnet.cellDivision(full_data=params['full_data'])
    # after architecture changing operation is applied, again
    # train the architecture for 10 epochs, save data to the checkpoint folder
    # your code is here
    ...
    # NOTE(review): this call relies on an 'interval' entry (epochs per training
    # phase) being present in the params dict, and it sits directly below the
    # placeholder for the same training step - confirm it is not run twice.
    sdnet.train(params['interval'], 'record_full')

print("Training is done\n")

### Loading the best model learned in the iterative process

During training, we are seeking to maximize the validation accuracy.

Here you will see the best model that you got during iterative growing neurons and connections.

### Task: report the best validation accuracy (3 points)

In [1]:
# your code is here

# load the SDNN object from the best checkpoint
# hint: search for SDNN member function that loads from checkpoint 
# your code is here

# display the active connections and train and validation accuracies
# for the best checkpoint
# hint: use a predefined functions for that
# your code is here