### Mount GDrive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Remove pre-existing logs

In [None]:
# !rm -r /content/linear/ /content/attention/

### Import Libraries

In [None]:
import logging
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import sys
import time
import torch
import torch.nn as nn

from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from torch.optim import Adam
from torchvision.datasets import MNIST
from torchvision.datasets import FashionMNIST
from torchvision.datasets import CIFAR10
from torchvision.datasets import CIFAR100
from torchvision.datasets import SVHN
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda
from torch.utils.data import DataLoader
from torch.utils.data.dataset import TensorDataset

### Select Device (GPU if available)

In [None]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Choose FF Type and Dataset

In [None]:
# Run configuration: uncomment exactly one value per setting.

# ff_type selects the network variant ('linear' or 'attention'); it is also
# used as the top-level log directory name.
ff_type = 'linear'
# ff_type = 'attention'

# dataset selects which torchvision dataset is downloaded; it is also used as
# the per-run log sub-directory name.
dataset = 'MNIST'
# dataset = 'FashionMNIST'
# dataset = 'CIFAR10'
# dataset = 'CIFAR100'
# dataset = 'SVHN'

### Create Log Directories

In [None]:
# Log directory for this run: <ff_type>/<dataset>/.
# makedirs(exist_ok=True) creates both levels in one call and is a no-op when
# they already exist, so this cell can be re-run safely (no check-then-create race).
path = os.path.join(ff_type, dataset)
os.makedirs(path, exist_ok=True)

### Define Nvidia SMI logging to be run as a background process

In [None]:
def start_logging(logger, interval_s=0.0):
    """Poll ``nvidia-smi`` in an endless loop and log one CSV row per sample.

    Intended to be the target of a background process: the loop never returns on
    its own and only stops when the process is terminated or interrupted.

    Args:
        logger: object with an ``info(str)`` method that receives each sample row.
        interval_s: seconds to sleep between samples. The default of 0 samples
            back-to-back (as fast as ``nvidia-smi`` answers); pass a positive
            value to reduce the polling overhead.
    """
    # `sed -n '2 p'` keeps only the second output line, i.e. the data row below the CSV header.
    query = "nvidia-smi --query-gpu=timestamp,temperature.gpu,utilization.gpu,utilization.memory,memory.used,memory.free,memory.total,power.draw --format=csv | sed -n '2 p'"
    print("******* nvidia-smi logging started *******")
    try:
        while True:
            # Close the pipe explicitly instead of leaving it to the garbage collector.
            with os.popen(query) as pipe:
                # Strip the trailing newline: the log handler appends its own
                # terminator, so keeping it would write a blank row after every sample.
                nvidia_smi_entry = pipe.read().strip()
            # An empty read (e.g. nvidia-smi unavailable) carries no data; skip it.
            if nvidia_smi_entry:
                logger.info(nvidia_smi_entry)
            if interval_s > 0:
                time.sleep(interval_s)
    finally:
        # Reached when the loop is left through an exception such as KeyboardInterrupt.
        print("******* nvidia-smi logging ended *******")

### Setup Loggers

In [None]:
# Log records are written as the bare message so each log file is plain CSV.
formatter = logging.Formatter('%(message)s')
def setup_logger(name, log_file, level=logging.INFO):
    """Return the logger called ``name``, writing its records to ``log_file``.

    Safe to call repeatedly with the same arguments (e.g. when the notebook cell
    is re-run): a file handler for ``log_file`` is attached only once, so rows
    are never written twice.

    Args:
        name: logger name passed to ``logging.getLogger``.
        log_file: path of the file the logger appends to.
        level: logging threshold applied to the logger (default ``logging.INFO``).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger

### Initialize Loggers

In [None]:
# --- nvidia-smi logger ---
# The background process is only constructed here; it does not run until it is started.

logger_nvidia_smi = setup_logger(f"{path}_logger_nvidia_smi", os.path.join(path, "nvidia_smi.csv"))
system_logger = multiprocessing.Process(target=start_logging, args=(logger_nvidia_smi,))

# --- per-layer training time ---

logger_layer_ff = setup_logger(f"{path}_logger_layer_ff", os.path.join(path, "layer_ff.csv"))
logger_layer_ff.info('layer_num,layertime_ms')

# --- end-to-end training time ---

logger_e2e_ff = setup_logger(f"{path}_logger_e2e_ff", os.path.join(path, "e2e_ff.csv"))
logger_e2e_ff.info('e2etime_ms')

# --- linear layer: per-batch GPU compute time and per-epoch time ---

logger_gpu_compute_linear_ff = setup_logger(
    f"{path}_logger_gpu_compute_linear_ff",
    os.path.join(path, "gpu_compute_linear_ff.csv"),
)
logger_gpu_compute_linear_ff.info('epoch,gpucomputetime_ms')

logger_epoch_linear_ff = setup_logger(
    f"{path}_logger_epoch_linear_ff",
    os.path.join(path, "epoch_linear_ff.csv"),
)
logger_epoch_linear_ff.info('epoch_num,epochtime_ms')

# --- attention layer: per-batch GPU compute time and per-epoch time ---

logger_gpu_compute_attention_ff = setup_logger(
    f"{path}_logger_gpu_compute_attention_ff",
    os.path.join(path, "gpu_compute_attention_ff.csv"),
)
logger_gpu_compute_attention_ff.info('epoch,gpucomputetime_ms')

logger_epoch_attention_ff = setup_logger(
    f"{path}_logger_epoch_attention_ff",
    os.path.join(path, "epoch_attention_ff.csv"),
)
logger_epoch_attention_ff.info('epoch_num,epochtime_ms')

INFO:linear/MNIST_logger_layer_ff:layer_num,layertime_ms
INFO:linear/MNIST_logger_e2e_ff:e2etime_ms
INFO:linear/MNIST_logger_gpu_compute_linear_ff:epoch,gpucomputetime_ms
INFO:linear/MNIST_logger_epoch_linear_ff:epoch_num,epochtime_ms
INFO:linear/MNIST_logger_gpu_compute_attention_ff:epoch,gpucomputetime_ms
INFO:linear/MNIST_logger_epoch_attention_ff:epoch_num,epochtime_ms


### Initialize Timers

In [None]:
# Each timer is a (start, end) pair of CUDA events created with timing enabled;
# start.elapsed_time(end) is what gets written to the corresponding log.

# per-layer training time
layer_ff_start, layer_ff_end = (torch.cuda.Event(enable_timing=True) for _ in range(2))

# end-to-end training time
e2e_ff_start, e2e_ff_end = (torch.cuda.Event(enable_timing=True) for _ in range(2))

# linear layer: per-batch GPU compute time, then per-epoch time
gpu_compute_linear_ff_start, gpu_compute_linear_ff_end = (
    torch.cuda.Event(enable_timing=True) for _ in range(2)
)
epoch_linear_ff_start, epoch_linear_ff_end = (
    torch.cuda.Event(enable_timing=True) for _ in range(2)
)

# attention layer: per-batch GPU compute time, per-epoch time, end-to-end time
gpu_compute_attention_ff_start, gpu_compute_attention_ff_end = (
    torch.cuda.Event(enable_timing=True) for _ in range(2)
)
epoch_attention_ff_start, epoch_attention_ff_end = (
    torch.cuda.Event(enable_timing=True) for _ in range(2)
)
e2e_attention_ff_start, e2e_attention_ff_end = (
    torch.cuda.Event(enable_timing=True) for _ in range(2)
)

### Define Overlay

In [None]:
def overlay_y_on_x(x, y, num_labels):
    """Return a copy of [x] whose first num_labels columns one-hot encode [y].

    The leading num_labels entries of every row are zeroed, then column y of each
    row is set to the largest value found anywhere in x. The input x is left
    untouched. y is used directly as a column index, so it may be a single int
    (same label for every row) or one index per row.
    """
    peak = x.max()
    overlaid = x.clone()
    # Blank out the label slots in place on the copy.
    overlaid[:, :num_labels].mul_(0.0)
    row_ids = range(x.shape[0])
    overlaid[row_ids, y] = peak
    return overlaid

### Define Attention Layer

In [None]:
class Attention_layer(nn.Module):
  """Patch-embedding + self-attention layer trained locally with a goodness loss.

  A flattened image is length-normalised, reshaped to (channels, height, width),
  cut into patches by a frozen strided convolution, passed through multi-head
  self-attention, averaged over patches, normalised again and projected to
  `output_size` features followed by ReLU.

  Relies on the module-level `device`, CUDA timing events and loggers.
  """

  def __init__(self, input_size: tuple, output_size: int, patch_size: int):
    """
      input_size: n_h, n_w, n_c
      output_size: int
      patch_size: int
    """
    super().__init__()
    self.input_size = input_size
    self.patch_size = patch_size
    self.output_size = output_size

    _, _, n_c = input_size
    # kernel_size == stride, so the convolution maps each non-overlapping patch to a 64-d vector.
    self.patch_embedding = nn.Conv2d(n_c, 64, kernel_size=patch_size, stride=patch_size, device=device)

    # The patch embedding keeps its initial weights; it is excluded from training.
    for param in self.patch_embedding.parameters():
      param.requires_grad = False

    self.self_attn = nn.MultiheadAttention(64, 8, device=device)
    self.fc1 = nn.Linear(64, output_size, device=device)
    self.lrelu = nn.ReLU() #nn.LeakyReLU()

  def __normalize(self, x):
    """Scale every sample (all non-batch dims flattened together) to unit L2 norm."""
    x_shape = x.shape
    x = x.reshape(x_shape[0], -1)
    x = x / (x.norm(2, 1, keepdim=True) + 1e-4)  # 1e-4 guards against division by zero
    x = x.reshape(x_shape)
    return x

  def __reshape(self, x):
    """Reshape flat samples to (batch, n_c, n_h, n_w) using `input_size`."""
    x = x.reshape(x.size(0), self.input_size[2], self.input_size[0], self.input_size[1])
    return x

  def forward(self, x):
    """Map a batch of flattened images to (batch_size, output_size) activations."""
    x = self.__normalize(x)
    x = self.__reshape(x) # (batch_size, channel_size, height, width)
    x = self.patch_embedding(x) # (batch_size, 64, patch_dim, patch_dim)
    x = x.flatten(2).transpose(1, 2) # (batch_size, num_patches, 64)

    x = x.transpose(0, 1)  # (num_patches, batch_size, 64)
    x, _ = self.self_attn(x, x, x)
    x = x.mean(dim=0) # (batch_size, 64)

    x = self.__normalize(x)
    x = self.fc1(x)

    x = self.lrelu(x)

    return x

  def train_ff(self, train_loader, epoch_range, batch_size, lr=0.03, threshold=2.0):
    """Train this layer on its own, then return its outputs as the next layer's data.

    The loss pushes the mean squared activation ("goodness") of positive samples
    above `threshold` and that of negative samples below it.

    Args:
      train_loader: iterable of (x_pos, x_neg, label) batches.
      epoch_range: iterable of epoch indices (may be wrapped in a progress bar).
      batch_size: batch size of the returned DataLoader.
      lr: accepted for signature parity with Linear_layer.train_ff but NOT used.
      threshold: goodness threshold separating positive from negative samples.

    Returns:
      A shuffling DataLoader over (layer(x_pos), layer(x_neg), label) for every
      sample seen in `train_loader`.
    """
    # NOTE(review): the learning rate is fixed at 1e-4 and the `lr` argument is
    # ignored. Confirm this is intended before wiring `lr` through, since callers
    # pass the same lr to every layer.
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
    next_pos = []
    next_neg = []
    next_label = []
    for e in epoch_range:
      epoch_attention_ff_start.record() # start recording epoch time
      for (x_pos, x_neg, label) in train_loader:
        gpu_compute_attention_ff_start.record() # start recording gpu compute time
        x_pos = x_pos.to(device)
        x_neg = x_neg.to(device)
        g_pos = self.forward(x_pos).pow(2).mean(dim=1)
        g_neg = self.forward(x_neg).pow(2).mean(dim=1)

        # softplus(z) == log(1 + exp(z)) but does not overflow to inf/NaN for large z.
        loss = nn.functional.softplus(
            torch.cat([
                threshold - g_pos,
                g_neg - threshold
            ])
        ).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        gpu_compute_attention_ff_end.record() # end recording gpu compute time
        torch.cuda.synchronize()
        logger_gpu_compute_attention_ff.info(str(e)+","+str(gpu_compute_attention_ff_start.elapsed_time(gpu_compute_attention_ff_end))) # log gpu compute time
      epoch_attention_ff_end.record() # end recording epoch time
      torch.cuda.synchronize()
      logger_epoch_attention_ff.info(str(e)+","+str(epoch_attention_ff_start.elapsed_time(epoch_attention_ff_end))) # log epoch time

    # Inference-only pass with the trained layer: no autograd graph is needed.
    with torch.no_grad():
      for (x_pos, x_neg, label) in train_loader:
        x_pos = x_pos.to(device)
        x_neg = x_neg.to(device)
        next_pos.append(self.forward(x_pos))
        next_neg.append(self.forward(x_neg))
        next_label.append(label)

    next_pos = torch.cat(next_pos, dim=0)
    next_neg = torch.cat(next_neg, dim=0)
    next_label = torch.cat(next_label, dim=0)

    return DataLoader(TensorDataset(
        next_pos,
        next_neg,
        next_label
    ), batch_size=batch_size, shuffle=True)
    


### Define Linear Layer

In [None]:
class Linear_layer(nn.Linear):
  """Fully connected layer trained locally with a goodness loss.

  The forward pass scales every input row to unit L2 norm, applies the affine
  map of nn.Linear and then ReLU. `train_ff` relies on the module-level `device`,
  CUDA timing events and loggers.
  """

  def __init__(self, in_features, out_features,
                 bias=True, device=None, dtype=None):
    super().__init__(in_features, out_features, bias, device, dtype)
    self.relu = torch.nn.ReLU()

  def forward(self, x):
    """Return ReLU(W @ x_hat + b), where x_hat is x scaled row-wise to unit L2 norm."""
    x_direction = x / (x.norm(2, 1, keepdim=True) + 1e-4)  # 1e-4 guards against division by zero
    # functional.linear also handles a layer built with bias=False (self.bias is None).
    return self.relu(nn.functional.linear(x_direction, self.weight, self.bias))

  def train_ff(self, train_loader, epoch_range, batch_size, lr=0.03, threshold=2.0):
    """Train this layer on its own, then return its outputs as the next layer's data.

    The loss pushes the mean squared activation ("goodness") of positive samples
    above `threshold` and that of negative samples below it.

    Args:
      train_loader: iterable of (x_pos, x_neg, label) batches.
      epoch_range: iterable of epoch indices (may be wrapped in a progress bar).
      batch_size: batch size of the returned DataLoader.
      lr: Adam learning rate.
      threshold: goodness threshold separating positive from negative samples.

    Returns:
      A shuffling DataLoader over (layer(x_pos), layer(x_neg), label) for every
      sample seen in `train_loader`.
    """
    optimizer = torch.optim.Adam(self.parameters(), lr=lr)
    next_pos = []
    next_neg = []
    next_label = []

    for i in epoch_range:
      epoch_linear_ff_start.record() # start recording epoch time
      for (x_pos, x_neg, label) in train_loader:
        gpu_compute_linear_ff_start.record() # start recording gpu compute time
        x_pos = x_pos.to(device)
        x_neg = x_neg.to(device)
        g_pos = self.forward(x_pos).pow(2).mean(1)
        g_neg = self.forward(x_neg).pow(2).mean(1)
        # softplus(z) == log(1 + exp(z)) but does not overflow to inf/NaN for large z.
        loss = nn.functional.softplus(torch.cat([
            -g_pos + threshold,
            g_neg - threshold])).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        gpu_compute_linear_ff_end.record() # end recording gpu compute time
        torch.cuda.synchronize()
        logger_gpu_compute_linear_ff.info(str(i)+","+str(gpu_compute_linear_ff_start.elapsed_time(gpu_compute_linear_ff_end))) # log gpu compute time
      epoch_linear_ff_end.record() # end recording epoch time
      torch.cuda.synchronize()
      logger_epoch_linear_ff.info(str(i)+","+str(epoch_linear_ff_start.elapsed_time(epoch_linear_ff_end))) # log epoch time

    # Inference-only pass with the trained layer: no autograd graph is needed.
    with torch.no_grad():
      for (x_pos, x_neg, label) in train_loader:
        x_pos = x_pos.to(device)
        x_neg = x_neg.to(device)
        next_pos.append(self.forward(x_pos))
        next_neg.append(self.forward(x_neg))
        next_label.append(label)

    next_pos = torch.cat(next_pos, dim=0)
    next_neg = torch.cat(next_neg, dim=0)
    next_label = torch.cat(next_label, dim=0)

    return DataLoader(TensorDataset(
        next_pos,
        next_neg,
        next_label
    ), batch_size=batch_size, shuffle=True)

### Define Net

In [None]:
class Net(nn.Module):
  """Stack of layers that are trained one after another via their own `train_ff`.

  Relies on the module-level `device`, `overlay_y_on_x`, CUDA timing events and
  loggers.
  """

  def __init__(self, layers, num_labels):
    """
      layers: iterable of nn.Module layers, each providing `train_ff`
      num_labels: number of classes tried by `predict_ff`
    """
    super().__init__()
    # ModuleList registers the layers as sub-modules, so parameters(), to() and
    # state_dict() see them; it still supports iteration, len() and indexing.
    self.layers = nn.ModuleList(layers)
    self.num_labels = num_labels

  def train_ff(self, train_loader, epochs=1000, **kwargs):
    """Train every layer in order; each layer's returned loader feeds the next one.

    Args:
      train_loader: DataLoader of (x_pos, x_neg, label) batches for the first layer.
      epochs: number of epochs each layer is trained for.
      **kwargs: forwarded unchanged to each layer's `train_ff` (e.g. lr, threshold).
    """
    cur_train_loader = train_loader
    batch_size = train_loader.batch_size
    for i, layer in enumerate(self.layers):

      # With at least 5 batches per epoch the progress bar tracks epochs;
      # otherwise it tracks the batches of the loader itself.
      is_large_batch = len(cur_train_loader) >= 5
      print(f"Training layer: {i+1} ... tqdm: {'loader' if not is_large_batch else 'epoch'}")

      cur_train_loader = tqdm(cur_train_loader) if not is_large_batch else cur_train_loader
      epoch_range = tqdm(range(epochs)) if is_large_batch else range(epochs)

      layer_ff_start.record() # start recording layer train time
      cur_train_loader = layer.train_ff(cur_train_loader, epoch_range=epoch_range, batch_size=batch_size, **kwargs) # train layer
      layer_ff_end.record() # end recording layer train time
      torch.cuda.synchronize()
      logger_layer_ff.info(str(i)+","+str(layer_ff_start.elapsed_time(layer_ff_end))) # log layer train time

  def forward(self, x):
    """Feed x through all layers in order and return the last layer's output."""
    for layer in self.layers:
      x = layer(x)
    return x

  @torch.no_grad()  # prediction only: do not build autograd graphs
  def predict_ff(self, data_loader):
    """Predict by trying every label and keeping the one with the highest goodness.

    For each candidate label the label is overlaid on the input, the input is
    passed through all layers, and the mean squared activation of every layer is
    summed.

    Returns:
      (preds, labels) as CPU tensors covering all batches of `data_loader`.
    """

    def predict(layers, x, num_labels):
      goodness_per_label = []
      for label in range(num_labels):
          h = overlay_y_on_x(x, label, num_labels)
          goodness = []
          for i, layer in enumerate(layers):
              h = layer(h)
              # if i==0:
                # continue
              goodness += [h.pow(2).mean(1)]
          goodness_per_label += [sum(goodness).unsqueeze(1)]
      goodness_per_label = torch.cat(goodness_per_label, 1)
      return goodness_per_label.argmax(1)

    preds = []
    labels = []
    for x, label in data_loader:
      x = x.to(device)
      preds.append(predict(self.layers, x, self.num_labels))
      labels.append(label)

    preds = torch.cat(preds, 0)
    labels = torch.cat(labels, 0)
    return preds.cpu(), labels.cpu()

  @torch.no_grad()  # prediction only: do not build autograd graphs
  def predict_bp(self, data_loader):
    """Predict with a plain forward pass: argmax over the last layer's outputs.

    Returns:
      (preds, labels) as CPU tensors covering all batches of `data_loader`.
    """
    preds = []
    labels = []
    for inputs, label in data_loader:
      inputs = inputs.to(device)
      pred = self.forward(inputs)
      preds.append(pred.argmax(1))
      labels.append(label)
    preds = torch.cat(preds, 0)
    labels = torch.cat(labels,0)
    return preds.cpu(), labels.cpu()

### Define MNIST Dataset

In [None]:
def MNIST_dataset():
  """Download MNIST (if needed) and return (train, test) datasets.

  Every image is converted to a tensor, normalised with mean 0.1307 and std
  0.3081, and flattened to a 1-D vector.
  """
  root = './data/MNIST/'
  preprocess = Compose([
      ToTensor(),
      Normalize((0.1307,), (0.3081,)),
      Lambda(lambda img: torch.flatten(img)),
  ])
  # Build the training split first, then the test split.
  train_split = MNIST(root, train=True, download=True, transform=preprocess)
  test_split = MNIST(root, train=False, download=True, transform=preprocess)
  return train_split, test_split

### Define Fashion MNIST Dataset

In [None]:
def FashionMNIST_dataset():
    """Download FashionMNIST (if needed) and return (train, test) datasets.

    Every image is converted to a tensor, normalised and flattened to a 1-D
    vector. Despite the local names, the returned objects are datasets, not
    DataLoaders.
    """

    transform = Compose([
        ToTensor(),
        # Commonly cited FashionMNIST training-set mean/std (the MNIST values
        # 0.1307/0.3081 used previously do not describe this dataset).
        Normalize((0.2860,), (0.3530,)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = FashionMNIST('./data/FashionMNIST', train=True,
              download=True,
              transform=transform)

    test_loader = FashionMNIST('./data/FashionMNIST', train=False,
              download=True,
              transform=transform)

    return train_loader, test_loader

### Define CIFAR10 Dataset

In [None]:
def CIFAR10_dataset():
    """Download CIFAR-10 (if needed) and return (train, test) datasets.

    Every image is converted to a tensor, normalised per channel and flattened
    to a 1-D vector. Despite the local names, the returned objects are
    datasets, not DataLoaders.
    """

    transform = Compose([
        ToTensor(),
        # Commonly cited CIFAR-10 per-channel mean and std. The std tuple was
        # previously a copy of the mean tuple, which mis-scaled every channel.
        Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = CIFAR10('./data/CIFAR10', train=True,
              download=True,
              transform=transform)

    test_loader = CIFAR10('./data/CIFAR10', train=False,
              download=True,
              transform=transform)

    return train_loader, test_loader

### Define CIFAR100 Dataset

In [None]:
def CIFAR100_dataset():
    """Download CIFAR-100 (if needed) and return (train, test) datasets.

    Every image is converted to a tensor, normalised per channel and flattened
    to a 1-D vector. Despite the local names, the returned objects are
    datasets, not DataLoaders.
    """

    transform = Compose([
        ToTensor(),
        # Commonly cited CIFAR-100 per-channel mean and std (the values used
        # previously were CIFAR-10 statistics).
        Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = CIFAR100('./data/CIFAR100', train=True,
              download=True,
              transform=transform)

    test_loader = CIFAR100('./data/CIFAR100', train=False,
              download=True,
              transform=transform)

    return train_loader, test_loader

### Define SVHN Dataset

In [None]:
def SVHN_dataset():
    """Download SVHN (if needed) and return (train, test) datasets.

    Every image is converted to a tensor, normalised per channel and flattened
    to a 1-D vector.
    """
    preprocess = Compose([
        ToTensor(),
        Normalize((0.4377, 0.4438, 0.4728), (0.1980, 0.2010, 0.1970)),
        Lambda(lambda img: torch.flatten(img)),
    ])
    shared_kwargs = dict(download=True, transform=preprocess)

    # Build the training split first, then the test split.
    train_split = SVHN('./data/SVHN', split='train', **shared_kwargs)
    test_split = SVHN('./data/SVHN', split='test', **shared_kwargs)
    return train_split, test_split

### Define Train and Test Sets

In [None]:
def create_ff_train_dataset(train_loader, num_labels):
  """Build the (positive, negative, label) training set from labelled batches.

  Positive samples carry their true label overlaid on the input; negative
  samples carry a randomly chosen label that is guaranteed to differ from the
  true one.

  Args:
    train_loader: iterable of (inputs, label) batches; labels are assumed to be
      integer class indices in [0, num_labels).
    num_labels: number of classes; must be at least 2 so a wrong label exists.

  Returns:
    TensorDataset of (x_pos, x_neg, label) over all batches.
  """
  pos_set = []
  neg_set = []
  label_set = []
  for inputs, label in tqdm(train_loader):
    x_pos = overlay_y_on_x(inputs, label, num_labels)
    # Shifting by an offset in [1, num_labels - 1] modulo num_labels can never
    # land on the true label. Permuting the batch labels could, which produced
    # "negative" samples identical to their positive counterparts.
    offset = torch.randint(1, num_labels, label.shape)
    wrong_label = (label + offset) % num_labels
    x_neg = overlay_y_on_x(inputs, wrong_label, num_labels)
    pos_set.append(x_pos)
    neg_set.append(x_neg)
    label_set.append(label)
  pos_set = torch.cat(pos_set, 0)
  neg_set = torch.cat(neg_set, 0)
  label_set = torch.cat(label_set, 0)
  return TensorDataset(pos_set, neg_set, label_set)

def create_ff_val_dataset(val_loader):
  """Concatenate every (inputs, label) batch of val_loader into one TensorDataset."""
  input_chunks, label_chunks = [], []
  for batch_inputs, batch_labels in tqdm(val_loader):
    input_chunks += [batch_inputs]
    label_chunks += [batch_labels]
  all_inputs = torch.cat(input_chunks, 0)
  all_labels = torch.cat(label_chunks, 0)
  return TensorDataset(all_inputs, all_labels)

### Set Random Seed

In [None]:
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(1234)

<torch._C.Generator at 0x7f3a500a2db0>

### Download Dataset

In [None]:
# Map each supported dataset name to the function that downloads and prepares it.
dataset_factories = {
  'MNIST': MNIST_dataset,
  'FashionMNIST': FashionMNIST_dataset,
  'CIFAR10': CIFAR10_dataset,
  'CIFAR100': CIFAR100_dataset,
  'SVHN': SVHN_dataset,
}
if dataset not in dataset_factories:
  # Raise instead of calling exit(): in a notebook exit() does not stop the cell
  # cleanly, and the code below would fail on undefined names.
  raise ValueError(f"Invalid dataset {dataset!r}; expected one of {sorted(dataset_factories)}")
train_data, test_data = dataset_factories[dataset]()
print(train_data, test_data)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 29270198.02it/s]


Extracting ./data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 8777948.83it/s]

Extracting ./data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 7790016.99it/s]


Extracting ./data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 18676988.99it/s]

Extracting ./data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/MNIST/raw

Dataset MNIST
    Number of datapoints: 60000
    Root location: ./data/MNIST/
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
               Lambda()
           ) Dataset MNIST
    Number of datapoints: 10000
    Root location: ./data/MNIST/
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
               Lambda()
           )





### Create Train and Test Sets

In [None]:
# Number of classes for the selected dataset: CIFAR100 has 100, every other
# dataset offered by this notebook has 10.
num_labels = 100 if dataset == 'CIFAR100' else 10

# Pre-compute the positive/negative training samples and collect the test set.
train_dataset = create_ff_train_dataset(DataLoader(train_data, batch_size=1024, shuffle=False), num_labels)
test_dataset = create_ff_val_dataset(DataLoader(test_data, batch_size=1024, shuffle=False))

100%|██████████| 59/59 [00:20<00:00,  2.91it/s]
100%|██████████| 10/10 [00:02<00:00,  3.68it/s]


### Build Net

In [None]:
# Derive input geometry and class count from the selected dataset instead of
# editing the layer definitions by hand.
image_shape = (28, 28, 1) if dataset in ('MNIST', 'FashionMNIST') else (32, 32, 3)  # (n_h, n_w, n_c)
num_labels = 100 if dataset == 'CIFAR100' else 10
n_h, n_w, n_c = image_shape

# Only the first layer depends on ff_type; it consumes the flattened image.
if ff_type == 'linear':
  first_layer = Linear_layer(n_h * n_w * n_c, 2000, device=device)
elif ff_type == 'attention':
  first_layer = Attention_layer(image_shape, 2000, 4)
else:
  # Raise instead of calling exit(): in a notebook exit() does not stop the cell
  # cleanly, and the code below would fail on undefined names.
  raise ValueError(f"Invalid net type {ff_type!r}; expected 'linear' or 'attention'")

layers = [
  first_layer,
  Linear_layer(2000, 2000, device=device),
  Linear_layer(2000, 2000, device=device),
  Linear_layer(2000, 2000, device=device)
]

net = Net(layers, num_labels)

In [None]:
# Display the layer stack to confirm the architecture that was built.
layers

[Linear_layer(
   in_features=784, out_features=2000, bias=True
   (relu): ReLU()
 ),
 Linear_layer(
   in_features=2000, out_features=2000, bias=True
   (relu): ReLU()
 ),
 Linear_layer(
   in_features=2000, out_features=2000, bias=True
   (relu): ReLU()
 ),
 Linear_layer(
   in_features=2000, out_features=2000, bias=True
   (relu): ReLU()
 )]

### Start Nvidia SMI logging

In [None]:
# system_logger.start()

### Train Net

In [None]:
# Lower the threshold of the end-to-end and attention-epoch loggers to DEBUG.
# NOTE(review): the calls on these loggers visible in this notebook are .info(),
# which already pass the default INFO level — confirm whether this is still needed.
logger_e2e_ff.setLevel(logging.DEBUG)
logger_epoch_attention_ff.setLevel(logging.DEBUG)

In [None]:
# Time the complete layer-by-layer training run with the end-to-end CUDA events.
e2e_ff_start.record()
ff_train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
net.train_ff(ff_train_loader, epochs=60, lr=0.02, threshold=15)
e2e_ff_end.record()

# Wait for the GPU to finish so that the elapsed time between the events is valid.
torch.cuda.synchronize()
e2e_elapsed = e2e_ff_start.elapsed_time(e2e_ff_end)
logger_e2e_ff.info(str(e2e_elapsed))

Training layer: 1 ... tqdm: epoch


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 30%|███       | 18/60 [00:31<01:17,  1.85s/it]INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.3554558753967285
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.898752212524414
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,6.299551963806152
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.4358720779418945
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.453760147094727
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.490880012512207
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.527103900909424
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.375999927520752
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.356800079345703
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.490079879760742
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.376031875610352
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.334688186645508
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.406400203704834
IN

Training layer: 2 ... tqdm: epoch


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 30%|███       | 18/60 [00:33<01:19,  1.89s/it]INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.488736152648926
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.389823913574219
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.420032024383545
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.415711879730225
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.4173760414123535
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.408448219299316
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.400576114654541
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.400512218475342
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.402239799499512
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.6229119300842285
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.397823810577393
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.421472072601318
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.371903896331787
IN

Training layer: 3 ... tqdm: epoch


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 30%|███       | 18/60 [00:35<01:22,  1.97s/it]INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.816800117492676
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.811391830444336
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.824480056762695
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.81987190246582
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.813695907592773
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.84438419342041
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.82966423034668
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.827360153198242
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.8368000984191895
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.8179521560668945
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.813663959503174
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.818367958068848
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.804448127746582
INFO:

Training layer: 4 ... tqdm: epoch


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.488607883453369
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.433343887329102
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.451776027679443
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.460608005523682
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.499135971069336
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.449024200439453
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.456096172332764
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.436927795410156
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.464064121246338
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.431935787200928
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.465280055999756
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.46560001373291
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.439455986022949
INFO:linear/MNIST_logger_gpu_compute_linear_ff:18,5.

### Check Performance of Net

In [None]:
pred, true = net.predict_ff(DataLoader(test_dataset, batch_size=512, shuffle=False))
# Print the accuracy explicitly: a notebook only displays the value of a cell's
# last expression, so a bare accuracy_score(...) call here would be discarded.
print(f"accuracy: {accuracy_score(true, pred):.4f}")
# Per-class F1 scores (average=None returns one score per label).
f1_score(true, pred, average=None)

array([0.11929275, 0.15569002, 0.09788268, 0.06449975, 0.0650042 ,
       0.03541473, 0.02707137, 0.02406739, 0.02703963, 0.02359882])

### Stop Nvidia SMI logging

In [None]:
# terminate() fails on a Process that was never started (the start cell above
# may be left commented out), so only stop the logger when it is running.
if system_logger.is_alive():
    system_logger.terminate()
    system_logger.join()  # reap the child process

### Create .zip files of logs

In [None]:
# Archive both log directories for download.
# NOTE(review): the absolute /content/... paths match the log directories created
# earlier only if the notebook's working directory is /content — confirm.
!zip -r /content/linear.zip /content/linear/
!zip -r /content/attention.zip /content/attention/

updating: content/linear/ (stored 0%)
updating: content/linear/FashionMNIST/ (stored 0%)
updating: content/linear/FashionMNIST/gpu_compute_attention_ff.csv (deflated 40%)
updating: content/linear/FashionMNIST/epoch_linear_ff.csv (deflated 59%)
updating: content/linear/FashionMNIST/nvidia_smi.csv (deflated 89%)
updating: content/linear/FashionMNIST/gpu_compute_linear_ff.csv (deflated 62%)
updating: content/linear/FashionMNIST/layer_ff.csv (deflated 27%)
updating: content/linear/FashionMNIST/epoch_attention_ff.csv (deflated 43%)
updating: content/linear/FashionMNIST/e2e_ff.csv (deflated 23%)
updating: content/linear/CIFAR10/ (stored 0%)
updating: content/linear/CIFAR10/gpu_compute_attention_ff.csv (stored 0%)
updating: content/linear/CIFAR10/epoch_linear_ff.csv (deflated 58%)
updating: content/linear/CIFAR10/nvidia_smi.csv (deflated 88%)
updating: content/linear/CIFAR10/gpu_compute_linear_ff.csv (deflated 62%)
updating: content/linear/CIFAR10/layer_ff.csv (deflated 11%)
updating: content