In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
import math
import sklearn
from sklearn import datasets
import argparse
import torch.nn.functional as F
from utils import timer
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    def __init__(self, in_features,hidden_dim,out_features, num_hidden_layers):
        super(MLP, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(in_features, hidden_dim))
        self.layers.append(nn.ReLU())
        for _ in range(num_hidden_layers - 1):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(hidden_dim, out_features))
        self.layers.append(nn.Softmax(dim=1))

        self.connections = (num_hidden_layers-1)* hidden_dim**2 + hidden_dim*(in_features+out_features)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

def run(args):
    num_epochs = args.num_epochs
    batch_size = args.batch_size    

    print(args)
    x, y = datasets.fetch_covtype(return_X_y=True)
    num_samples = args.sample_portion * x.shape[0]
    learning_rate = math.sqrt(batch_size/num_samples)*0.01
    
    model = MLP(in_features=x.shape[1], hidden_dim=99, out_features=7, num_hidden_layers=3)
    print(f'model in_features: {x.shape[1]}, hidden_dim: 99, out_features: 7, num_hidden_layers: 3, num_samples: {num_samples},leanring_rate: {learning_rate}')
    
    #GFLO
    num_connections  = model.connections
    GFLO = num_epochs*6*num_samples*num_connections / 1e9
    print(f"GFLO is {GFLO:.2f}, should take {GFLO/(5000 * 0.3):.2f} seconds on a 5 TFLOPS machine")

    # load data 
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    y = F.one_hot(y-1, num_classes=7).to(torch.float32)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # move data and model to GPU
    x = x.to(device)
    y = y.to(device)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    dataset = TensorDataset(x, y)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=args.shuffle)

    print('start training the model')
    epoch_timer = timer.Timer()
    test_timer = timer.Timer()

    loss = criterion(y*0,y)
    print(f'Initial Loss: {loss.item():.10f}')
    for epoch in range(num_epochs):
        epoch_timer.start()
        if args.data_loader==2:
            for i, (batch_x, batch_y) in enumerate(dataloader):
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()              
        elif args.data_loader==1:
            for i in range(0, num_samples, batch_size):
                batch_x = x[i:i+batch_size]
                batch_y = y[i:i+batch_size]
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        else:
                outputs = model(x)
                loss = criterion(outputs, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        if (epoch<10)| ((epoch+1) % 10 == 0) :
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.10f}')
        epoch_timer.end()        
    print('finished training the model')

    test_timer.start()
    model.eval()
    with torch.no_grad():
        test_outputs = model(x)
        test_loss = criterion(test_outputs, y)
        print(f'Test Loss: {test_loss.item():.10f}')
    test_timer.end()
    print(f'epoch : {epoch_timer.get_average_time()}, total : {epoch_timer.get_total_time()}, test : {test_timer.get_average_time()}')    

In [2]:
class Arg :
    def __init__(self):
        self.num_epochs = 300
        self.sample_portion = 1
        self.data_loader = 0
        self.batch_size = 16384
        self.shuffle = False

args = Arg()

In [3]:
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        run(args)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

<__main__.Arg object at 0x000001FBE7A854C0>
model in_features: 54, hidden_dim: 99, out_features: 7, num_hidden_layers: 3, num_samples: 581012,leanring_rate: 0.0016792579712777344
GFLO is 26815.91, should take 17.88 seconds on a 5 TFLOPS machine
start training the model
Initial Loss: 1.9459103346
Epoch [1/300], Loss: 2.0612566471
Epoch [2/300], Loss: 1.8008165359
Epoch [3/300], Loss: 1.8008171320
Epoch [4/300], Loss: 1.8008171320
Epoch [5/300], Loss: 1.8008171320
Epoch [6/300], Loss: 1.8008171320
Epoch [7/300], Loss: 1.8008171320
Epoch [8/300], Loss: 1.8008171320
Epoch [9/300], Loss: 1.8008171320
Epoch [10/300], Loss: 1.8008171320
Epoch [20/300], Loss: 1.8008171320
Epoch [30/300], Loss: 1.8008171320
Epoch [40/300], Loss: 1.8008171320
Epoch [50/300], Loss: 1.8008171320
Epoch [60/300], Loss: 1.8008171320
Epoch [70/300], Loss: 1.8008171320
Epoch [80/300], Loss: 1.8008171320
Epoch [90/300], Loss: 1.8008171320
Epoch [100/300], Loss: 1.8008171320
Epoch [110/300], Loss: 1.8008171320
Epoch [120