Итак, наша задача состоит в том, чтобы рассчитать время работы сети.
Она сводится к тому, чтобы:
1. Выделить признаки, которые мы будем варьировать в нашей сети
2. Сгенерировать данные, то есть получить время выполнения в зависимости от заданных параметров
3. Обучить сеть и сделать выводы о том, что имеет смысл улучшить 

In [1]:
#Импортируем зависимости
import pandas as pd 
import random

import torch
from torch import nn
from torchvision import models

from time import time
from tqdm import tqdm

Будем решать задачу с тем условием, что мы используем обычную cnn.
Выделим такие признаки:
1. Количество каждого типа слоя 
2. Общее количество учитываемых параметров
3. Оно же по каждому типу слоя

In [2]:
data = {
    'l_conv': [], 
    'l_pool': [],
    'l_batch_conv': [], 
    'l_batch_linear': [],  
    'l_dropout': [],
    'l_linear': [],
    'n_conv': [], 
    'n_linear':[],
    'n_batch_conv': [], 
    'n_batch_linear': [],  
    'p_dropout': [],
    'time': [],
    'p_all': []
}

Зафиксируем входную размерность на 128. Создадим датасет размера 100*128

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_s = 128
s_batch = 128
number = 100

s_kernel = [1, 3, 5]
pad = [0, 1, 3]
st = [1, 2, 3]

In [9]:
t_loader = []
for i in range(0, 100, 1):
    t_labels = torch.randint(0, 50, (1, s_batch)).reshape(-1)
    t_batch = torch.rand((s_batch, 1, input_s, input_s))
    t_loader.append((t_batch, t_labels))

Опишем класс сети.


In [12]:
class MyNet(nn.Module):

    def __init__(self, s_input):
        super().__init__()

        self.layers = None
        self.data = {
            'l_conv': 0, 
            'l_pool': 0,
            'l_batch_conv': 0, 
            'l_batch_linear': 0,  
            'l_dropout': 0,
            'l_linear': 0,
            'n_conv': 0, 
            'n_linear': 0,
            'n_batch_conv': 0, 
            'n_batch_linear': 0,  
            'p_dropout': 0,
            'time': 0,
            'p_all': 0
        } 
        in_channels = 1 
        out_channels = 16

        p_g = random.uniform(0,1)
        p_batch = random.uniform(0,1)
        p_dropout = random.uniform(0,1)

        p = torch.randint(2, 6, (1,1)) / 10
        self.data['p_dropout'] = round(float(p), 2)

        n_pool = random.randint(2, 4)
        self.data['l_pool'] = int(n_pool)

        init = True
        while s_input != 1:
            stride = st[random.randint(0, len(st)-1)]
            padding = pad[random.randint(0, len(st)-1)]
            kernel = s_kernel[random.randint(0, len(s_kernel)-1)]
            s_output = (s_input + 2 * padding - kernel) / stride + 1

            if s_input > kernel:
                self.data['l_conv'] += 1
                self.data['n_conv'] += (kernel * kernel * in_channels + 1) * out_channels

                if init:
                    self.layers = nn.ModuleList([nn.Conv2d(in_channels, out_channels, kernel, stride, padding)])
                    init = False
                else:
                    self.layers.append(nn.Conv2d(in_channels, out_channels, kernel, stride, padding))
                
                if p_batch >= p_g:
                    self.layers.append(nn.BatchNorm2d(out_channels))
                    self.data['l_batch_conv'] += 1
                    self.data['n_batch_conv'] += out_channels * 2

                
                                     
                self.layers.append(nn.ReLU(True))
                s_input = s_output

                if n_pool > 0 and s_input > 2:
                    self.layers.append(nn.MaxPool2d(2,2))
                    n_pool -= 1
                    s_output /= 2
                    s_input = s_output
                
                in_channels = out_channels
                if s_input < 64:
                    out_channels = 128
                elif s_input < 128:
                    out_channels = 64
                else:
                    out_channels = 32
            else:
              break

        # приводим входную размерность к ндиному виду
        s_input = int(s_input) * int(s_input) * in_channels

        n_lin = torch.randint(2, 4, (1,1)) # число линейных слоев
        self.data['l_linear'] = int(n_lin)
        self.layers.append(nn.Flatten())

        while(n_lin > 1):
            self.layers.append(nn.Linear(s_input, s_input))
            self.data['n_linear'] += (s_input + 1) * s_input
            
            if (p_g <= p_batch):
                self.layers.append(nn.BatchNorm1d(s_input))
                self.data['l_batch_linear'] += 1
                self.data['n_batch_linear'] += s_input * 2
            
            self.layers.append(nn.ReLU(True))
            if (p_g <= p_dropout):
                self.layers.append(nn.Dropout(round(float(p),2)))
                self.data['l_dropout'] += 1
            n_lin -= 1

        self.layers.append(nn.Linear(s_input, 1000))
        self.data['n_linear'] += (s_input + 1) * 1000

        self.data['l_pool'] -= int(n_pool)

    def forward(self, x):
        for i in range(len(self.layers)):
            x = self.layers[i](x)
        return x

Соберем информацию по 4 измерениям времени на каждой эпохе

In [13]:
for i in range(number):
    model = MyNet(input_s).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.15)
    loss_function = nn.CrossEntropyLoss() 
    k = 4
    model.train()
    common_time = 0
    for test in range(k):
        start = time()
        for i, (batch, labels) in enumerate(tqdm(t_loader)):
            loss = torch.tensor(0, dtype=torch.float32)
            optimizer.zero_grad()
            batch = batch.to(device)
            labels = labels.to(device)
            output = model(batch)
            loss = loss_function(output, labels)
            loss.backward()
            optimizer.step()

        common_time += time()-start

    common_time = round(float(common_time / k), k)
    model.data['time'] = common_time
    model.data['p_all'] = sum([p.numel() for p in model.parameters() if p.requires_grad])

    for key in data:
        data[key].append(model.data[key])

100%|██████████| 100/100 [00:02<00:00, 37.27it/s]
100%|██████████| 100/100 [00:02<00:00, 42.90it/s]
100%|██████████| 100/100 [00:02<00:00, 42.98it/s]
100%|██████████| 100/100 [00:02<00:00, 43.00it/s]
100%|██████████| 100/100 [00:02<00:00, 49.05it/s]
100%|██████████| 100/100 [00:02<00:00, 49.50it/s]
100%|██████████| 100/100 [00:02<00:00, 49.43it/s]
100%|██████████| 100/100 [00:02<00:00, 49.56it/s]
100%|██████████| 100/100 [00:04<00:00, 23.17it/s]
100%|██████████| 100/100 [00:04<00:00, 23.05it/s]
100%|██████████| 100/100 [00:04<00:00, 23.17it/s]
100%|██████████| 100/100 [00:04<00:00, 23.33it/s]
100%|██████████| 100/100 [00:06<00:00, 15.74it/s]
100%|██████████| 100/100 [00:06<00:00, 15.70it/s]
100%|██████████| 100/100 [00:06<00:00, 15.75it/s]
100%|██████████| 100/100 [00:06<00:00, 15.67it/s]
100%|██████████| 100/100 [00:01<00:00, 75.32it/s]
100%|██████████| 100/100 [00:01<00:00, 74.65it/s]
100%|██████████| 100/100 [00:01<00:00, 75.00it/s]
100%|██████████| 100/100 [00:01<00:00, 74.57it/s]


In [20]:
df = pd.DataFrame.from_dict(data) 
df.to_csv (r'out.csv', index = False, header=True)

In [21]:
df = pd.read_csv("out.csv")

In [22]:
df

Unnamed: 0,l_conv,l_pool,l_batch_conv,l_batch_linear,l_dropout,l_linear,n_conv,n_linear,n_batch_conv,n_batch_linear,p_dropout,time,p_all
0,3,3,0,0,0,1,42336,4097000,0,0,0.2,6.7518,4139336
1,3,3,0,0,0,2,148960,2074600,0,0,0.3,3.2267,2223560
2,4,3,0,0,0,1,412320,1025000,0,0,0.3,2.3776,1437320
3,4,3,0,0,0,2,136096,322792,0,0,0.2,0.9451,458888
4,25,2,0,0,0,1,10149792,257000,0,0,0.3,22.4521,10406792
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,8,3,0,0,0,1,1546400,2305000,0,0,0.2,3.4132,3851400
101,6,4,0,0,0,1,773408,2305000,0,0,0.2,6.9695,3078408
102,4,2,4,1,0,2,1807264,2074600,1568,2048,0.2,4.9956,3885480
103,4,3,4,0,0,1,1217696,257000,1568,0,0.3,7.8595,1476264


In [23]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

s = StandardScaler()
s.fit(df)
s_df = s.transform(df)

X_train, X_test, y_train, y_test = train_test_split(s_df, df['time'], test_size=0.25, random_state=42)

model = SGDRegressor() 
model.fit(X_train, y_train)

SGDRegressor()

В качестве модели выберем линейную регрессию.

In [24]:
train_predictions = model.predict(X_train) 
test_predictions = model.predict(X_test)

train_mae = mean_absolute_error(y_train, train_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)

train_mape = mean_absolute_percentage_error(y_train, train_predictions)
test_mape = mean_absolute_percentage_error(y_test, test_predictions)

print("Train MAE: {}".format(train_mae))
print("Test MAE: {}".format(test_mae))
print("Train MAPE: {}".format(train_mape))
print("Test MAPE: {}".format(test_mape))

Train MAE: 0.14573497948155398
Test MAE: 0.2183818895878307
Train MAPE: 0.02845370925927857
Test MAPE: 0.04093581121915958
