## Intrinsic Dimension : Proof of Concept

This just demonstrates that the Intrinsic Dimension idea works for a simple MNIST-CNN model.

Of possible interest : This PyTorch version is 'generic' in the sense that it should be able
to take arbitrary parameterised models and investigate their Intrinsic Dimension (without
writing specialised `Module_IntrinsicDimension` classes for each module type).

In [1]:
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
import os

mnist_root = '/home/jevjev/Dropbox/Projects/datasets'

In [2]:
n_epochs = 20
batch_size, batch_log_interval, lr = 32, 600, 0.01
seed = 10

try_cuda = True

In [3]:
use_cuda = try_cuda and torch.cuda.is_available()
torch.manual_seed(seed)

device = torch.device("cuda" if use_cuda else "cpu")

In [4]:
loader_kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

mnist_transform = transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(mnist_root, train=True, download=True, transform=mnist_transform),
    batch_size=batch_size, shuffle=True, **loader_kwargs)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(mnist_root, train=False, transform=mnist_transform),
    batch_size=1000, shuffle=False, **loader_kwargs)

In [5]:
class RegularCNNModel(torch.nn.Module):
    def __init__(self):
        super(RegularCNNModel, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 16, kernel_size=3)
        self.conv2 = torch.nn.Conv2d(16, 32, kernel_size=3)
        #self.conv2_drop = torch.nn.Dropout2d()
        self.fc1 = torch.nn.Linear(32*5*5, 50)
        self.fc2 = torch.nn.Linear(50, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.max_pool2d( x, 2)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.max_pool2d( x, 2)
        x = F.relu(x)
        
        #print(x.size())
        
        x = x.view(-1, 32*5*5)
        
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        
        return F.log_softmax(x, dim=1)

In [6]:
def parameter_count(model):
    param_tot=0
    for name, param in model.named_parameters():
        if param.requires_grad:
            #print(name, param.data.size(), v_size)
            param_size = 1
            for d in list(param.data.size()):
                param_size *= d
            param_tot += param_size
    return param_tot    

In [7]:
def train(model, optimizer, epoch_num):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % batch_log_interval == 0:
            print('Train Epoch: {} [{: 6d}/{: 6d} ({:2.0f}%)]\tLoss: {:.4f}'.format(
                epoch_num, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model):
    model.eval()
    test_loss, correct = 0., 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, size_average=False).item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    pct_correct = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), pct_correct))
    return pct_correct

In [8]:
def use_model(model):  # , **kwargs
    _model = model.to(device)
    #optimizer = optim.SGD(_model.parameters(), lr=args.lr, momentum=momentum)
    optimizer = torch.optim.Adam(_model.parameters(), lr=lr)
    return _model, optimizer

def get_stats_for(model, n_epochs=n_epochs):
    print("Total model parameters : %d" % (parameter_count(model),) )
    _model, optimizer = use_model(model)
    for epoch in range(1, n_epochs + 1):
        train(_model, optimizer, epoch)
        pct_correct = test(_model)
    return pct_correct

In [9]:
get_stats_for(RegularCNNModel())

Total model parameters : 45360


KeyboardInterrupt: 

# Define Intrinsic Dimension Wrapper

In [22]:
class IntrinsicDimensionWrapper(torch.nn.Module):
    
    def __init__(self, module, intrinsic_dimension, verbose=False):
        """
        Wrapper to estimate the intrinsic dimensionality of the 
        objective landscape for a specific task given a specific model
        :param module: pytorch nn.Module
        :param intrinsic_dimension: dimensionality within which we search for solution
        :param verbose: if things should be printed out
        """
        super(IntrinsicDimensionWrapper, self).__init__()
        
        self.verbose = verbose
        
        self.m = [module] # Hide this from inspection by get_parameters()
        
        self.name_base_localname = []
        
        # Stores the initial value: \theta_{0}^{D}
        self.initial_value = dict()
        
        # Stores the randomly generated projection matrix P
        self.random_matrix = dict()
       
        # Parameter vector that is updated, initialised with zeros as per text: \theta^{d}
        V = torch.nn.Parameter( torch.zeros( (intrinsic_dimension) ).to(device) )
        self.register_parameter('V', V)
        v_size = (intrinsic_dimension, )
        
        # Iterates over layers in the Neural Network
        for name, param in module.named_parameters():
            # If the parameter requires gradient update
            if param.requires_grad:
                
                if self.verbose: print(name, param.data.size(), v_size)
                
                # Saves the initial values of the initialised parameters from param.data and sets them to no grad.
                # (initial values are the 'origin' of the search)
                self.initial_value[name] = v0 = param.clone().detach().requires_grad_(False).to(device)
                print(name, v0.size())
                print(v0)
                # If v0.size() is [4, 3], then below operation makes it [4, 3, v_size]
                matrix_size = v0.size() + v_size

                # Generates random projection matrices P, sets them to no grad
                self.random_matrix[name] = (torch.randn(matrix_size, requires_grad=False).to(device) / intrinsic_dimension**0.5)
                
                # NOTE!: lines below are not clear! 
                base, localname = module, name
                while '.' in localname:
                    if self.verbose: print('Local name', localname)
                    prefix, localname = localname.split('.', 1)
                    if self.verbose: print('Prefix', prefix, '  Name', name, '  Local name', localname)
                    base = base.__getattr__(prefix)
                self.name_base_localname.append( (name, base, localname) )

        for name, base, localname in self.name_base_localname:
            delattr(base, localname)

    def forward(self, x):   
        # Iterate over the layers
        for name, base, localname in self.name_base_localname:
            #if self.verbose: print(name, base, localname)
            #print(self.initial_value[name].size(), self.random_matrix[name].size(), self.V.size(), 
            #      torch.matmul(self.random_matrix[name], self.V).size())
            
            # Product between matrix P and \theta^{d}
            ray = torch.matmul(self.random_matrix[name], self.V)
            # Add the \theta_{0}^{D} to P \dot \theta^{d}
            param = self.initial_value[name] + torch.squeeze(ray, -1)

            setattr(base, localname, param)

        # Pass through the model, by getting the module from a list self.m
        module = self.m[0]
        x = module(x)
        return x

# Testing

In [23]:
#model_single= torch.nn.Linear(3,4)
model_single = IntrinsicDimensionWrapper(module=torch.nn.Linear(3, 4), 
                                         intrinsic_dimension=10,
                                         verbose=False)

#[p[0] for p in model_test.named_parameters()]
[ (p.view(-1), p.requires_grad) for p in model_single.parameters() ]
#model_test.initial_value

weight torch.Size([4, 3])
tensor([[ 0.3699,  0.4400,  0.5121],
        [-0.4485, -0.1343,  0.0431],
        [-0.4591,  0.5401,  0.4066],
        [-0.1691,  0.0712, -0.0065]])
bias torch.Size([4])
tensor([ 0.1371, -0.4178, -0.0163, -0.0169])


[(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<ViewBackward>),
  True)]

In [25]:
for name, base, localname in model_single.name_base_localname:
    print(model_single.random_matrix[name].size())
    print(model_single.V.size())
    print(torch.matmul(model_single.random_matrix[name], model_single.V))
    print('\n')

torch.Size([4, 3, 10])
torch.Size([10])
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], grad_fn=<UnsafeViewBackward>)


torch.Size([4, 10])
torch.Size([10])
tensor([0., 0., 0., 0.], grad_fn=<MvBackward>)




In [None]:
x = torch.tensor( [1., 5., -1.25] ).to(device)

model_single(x)

In [None]:
#model_test.V.data[3]=0.
model_single.V.data[3]+=.005
model_single(x)

In [None]:
opt_test = torch.optim.Adam(model_single.parameters(), lr=lr)

model_single.train()

#data, target = data.to(device), target.to(device)
x_batch = torch.unsqueeze(x,0)
data, target = x_batch, torch.tensor( [1,] ).to(device)

opt_test.zero_grad()
output = model_single(data)
loss = F.nll_loss(output, target)
print(loss)
loss.backward()
opt_test.step()

model_single.V.requires_grad,
#model_single.m[0].weight.grad, model_single.m[0].weight.grad

# Apply to regular CNN

In [None]:
## Now, let's build the CNN model with Intrinsic Dimension Wrapping...

intrinsic_dimension_guess = 100

In [None]:
model_base = RegularCNNModel()
#[name for name,param in model_base.named_parameters()]

model_wrapped = IntrinsicDimensionWrapper( model_base, intrinsic_dimension_guess )
# [name for name,param in model_wrapped.named_parameters()]
#[param for param in model_wrapped.parameters()]

In [None]:
get_stats_for(model_wrapped)

# Toy MNIST

In [6]:
class FCNAsInPAper(torch.nn.Module):
    def __init__(self):
        super(FCNAsInPAper, self).__init__()
        self.fc1 = torch.nn.Linear(784, 200)
        self.fc2 = torch.nn.Linear(200, 200)
        self.fc3 = torch.nn.Linear(200, 10)

    def forward(self, x):
        
        x = x.view(-1, 784)
        
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        
        return F.log_softmax(x, dim=1)

In [7]:
intrinsic_dimension_guess = 750

In [8]:
model_base = FCNAsInPAper()

In [12]:
print(parameter_count(model_base))

199210


In [13]:
model_wrapped = IntrinsicDimensionWrapper( model_base, intrinsic_dimension_guess )

In [16]:
get_stats_for(model_wrapped)

Total model parameters : 750





Test set: Average loss: 0.5160, Accuracy: 8422/10000 (84.2%)


Test set: Average loss: 0.6030, Accuracy: 8259/10000 (82.6%)


Test set: Average loss: 0.4815, Accuracy: 8491/10000 (84.9%)



KeyboardInterrupt: 