In [1]:
import sys
sys.path.append('../code')
from resnet import *
from cifar_very_tiny import *
from cifar_tiny import *
from cifar_dataset import *    
import torch as t 
import numpy as np
import tqdm
import matplotlib.pylab as plt
import matplotlib.cm as cm
import json
import hyperparams
from importlib import reload

%matplotlib inline
plt.rcParams['figure.figsize']=(12,9)
plt.rcParams['font.size']= 20

In [2]:
# добавил в загрузку валидационную выборку
# обрати внимание, maxsize --- это размер совокпного обучения и валидации
# поэтому размер обучающей выборки совпадает с тем, что было до этого
_, test_loader, train_loader_no_augumentation, valid_loader = cifar10_loader(batch_size=128, split_train_val=True,
                                                                             maxsize=10112*2)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [38]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epoch_num = 50
run_num = 5 # количество запусков эксперимента
# версия нужна, чтобы различать старые и новые результаты экспериментов. 
# менять нужно каждый раз, когда есть хотя бы незначительные изменения в эксперименте
experiment_version = '18' 

validate_every_epoch = 5 # каждые 5 эпох отслеживать параметры модели

# с этими гиперпараметрами мы начинаем эксперименты
start_beta = 0.9914 #0.3 
start_temp  = 6.5 #10**(0.5)

In [4]:
def accuracy(student):
        student.eval()
        total = 0 
        correct = 0
        with t.no_grad():
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)
                out = student(x)
                correct += t.eq(t.argmax(out, 1), y).sum()
                total+=len(x)
        student.train()
        return (correct/total).cpu().detach().numpy()

In [5]:
# запуск без дистилляции
for _ in range(run_num):
    internal_results = []
    student = Cifar_Very_Tiny(10).to(device)
    optim = t.optim.Adam(student.parameters())    
    crit = nn.CrossEntropyLoss()
    for e in range(epoch_num):
        tq = tqdm.tqdm(train_loader_no_augumentation)
        losses = []
        for x,y in tq:
            x = x.to(device)
            y = y.to(device)
            student.zero_grad()            
            loss = crit(student(x), y)
            losses.append(loss.cpu().detach().numpy())
            loss.backward()
            optim.step()
            tq.set_description('current loss:{}'.format(np.mean(losses[-10:])))        
        if e==0 or (e+1)%validate_every_epoch == 0: # если номер эпохи делится на 5 или эпоха - первая             
            test_loss = []
            student.eval()
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)                            
                test_loss.append(crit(student(x), y).detach().cpu().numpy())                 
            test_loss = float(np.mean(test_loss))
            acc = float(accuracy(student))
            student.train()
            internal_results.append({'epoch': e, 'test loss':test_loss, 'accuracy':acc})
            print (internal_results[-1])

    with open('exp'+experiment_version+'_basic.jsonl', 'a') as out:
        out.write(json.dumps({'results':internal_results, 'version': experiment_version})+'\n')

KeyboardInterrupt: 

In [6]:
kl = nn.KLDivLoss(reduction='batchmean')
sm = nn.Softmax(dim=1)

def distill(out, batch_logits, temp):
    g = sm(out/temp)
    f = F.log_softmax(batch_logits/temp)    
    return kl(f, g)

In [7]:
# Запуск --- с CNN-дистилляцией
# в качестве значений гиперпараметров ставим  start_beta, start_temp
logits = np.load('./logits_cnn.npy')
for _ in range(run_num):
    internal_results = []
    beta = start_beta
    temp = start_temp
    student = Cifar_Very_Tiny(10).to(device)
    optim = t.optim.Adam(student.parameters())   
    crit = nn.CrossEntropyLoss()
    for e in range(epoch_num):
        tq = tqdm.tqdm(train_loader_no_augumentation)
        losses = []
        for batch_id, (x,y) in enumerate(tq):
            x = x.to(device)
            y = y.to(device)            
            batch_logits = t.Tensor(logits[128*batch_id:128*(batch_id+1)]).to(device)            
            student.zero_grad()
            out = student(x)
            student_loss = crit(out, y)            
            distillation_loss = distill(out, batch_logits, temp)
            loss = (1-beta) * student_loss + beta*distillation_loss
            losses.append(loss.cpu().detach().numpy())
            loss.backward()
            optim.step()
            tq.set_description('current loss:{}'.format(np.mean(losses[-10:])))
        if e==0 or (e+1)%validate_every_epoch == 0: # если номер эпохи делится на 5 или эпоха - первая             
            test_loss = []
            student.eval()
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)                            
                test_loss.append(crit(student(x), y).detach().cpu().numpy())                 
            test_loss = float(np.mean(test_loss))
            acc = float(accuracy(student))
            student.train()
            internal_results.append({'epoch': e, 'test loss':test_loss, 'accuracy':acc})
            print (internal_results[-1])

            
    with open('exp'+experiment_version+'_distill.jsonl', 'a') as out:
        out.write(json.dumps({'results':internal_results, 'version': experiment_version})+'\n')

FileNotFoundError: [Errno 2] No such file or directory: './logits_cnn.npy'

In [11]:
teacher.get_features([0,2,4,6])

Cifar_Tiny(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
  (conv1_bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2_bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv3_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [39]:
# Запуск --- со случаными значениями гиперпараметров
crit = nn.CrossEntropyLoss()

# определяем функцию потерь как замкнутую относительно аргументов функцию
# нужно для подсчета градиентов гиперпараметров по двухуровневой оптимизации
def param_loss(batch,model, h):
    x,y,batch_logits,t_features, s_features, w = batch    
    beta,beta2,beta3,temp = h
    out = model(x)
    beta = F.sigmoid(beta)
    beta2 = F.sigmoid(beta2)
    beta3 = F.sigmoid(beta3)/100
    
    temp = F.sigmoid(temp) * 10
    distillation_loss = distill(out, batch_logits, temp)
    student_loss = crit(out, y)                
    
    w_loss = 0
    for i in range(4):        
        w_loss += t.sum((t.matmul(s_features[i], w[i]) -  t_features[i])**2, 1).mean()
        
        
    loss = beta * distillation_loss + beta2 * student_loss + w_loss * beta3
    
    return loss

logits = np.load('../code/logits_cnn.npy')
teacher = Cifar_Tiny(10).to(device)
teacher.load_state_dict(t.load('../code/aux_pkt.model?raw=true', map_location=device), )

for _ in range(run_num):
    internal_results = []
    
    # теперь beta и temp - не числа, а тензоры, по которым можно считать градиент
    beta1 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    beta2 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    beta3 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    temp = t.nn.Parameter(t.tensor(np.random.uniform(low=-2, high=0), device=device), requires_grad=True)    
    h = [beta1, beta2, beta3, temp]
    
    
    student = Cifar_Very_Tiny(10).to(device)
    optim = t.optim.Adam(student.parameters())   
    
    for x,_ in train_loader_no_augumentation:
        break
    x = x.to(device)
    w = []
    f_in = student.get_features(x, [0,1,2,3])
    f_out = teacher.get_features(x, [0,1,2,3])
    for in_, out_ in zip(f_in, f_out):
        w.append(t.randn(in_.shape[1], out_.shape[1]).cuda())
    
    for e in range(epoch_num): # хочется посмотреть куда сойдутся гиперпараметры, поэтому возьмем побольше эпох
        tq = tqdm.tqdm(train_loader_no_augumentation)
        losses = []
        for batch_id, ((x,y)) in enumerate(tq):
            x = x.to(device)
            y = y.to(device)            
            batch_logits = t.Tensor(logits[128*batch_id:128*(batch_id+1)]).to(device) 
            
            optim.zero_grad()
            f_in = student.get_features(x, [0,1,2,3])
            f_out = teacher.get_features(x, [0,1,2,3])
    
            loss = param_loss((x,y,batch_logits,f_out, f_in, w), student,h)
            losses.append(loss.cpu().detach().numpy())
            loss.backward()
            optim.step()
            tq.set_description('current loss:{}'.format(np.mean(losses[-10:])))
        if e==0 or (e+1)%validate_every_epoch == 0: # если номер эпохи делится на 5 или эпоха - первая             
            test_loss = []
            student.eval()
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)                            
                test_loss.append(crit(student(x), y).detach().cpu().numpy())                 
            test_loss = float(np.mean(test_loss))
            
            
            acc = float(accuracy(student))
            student.train()
            internal_results.append({'epoch': e, 'test loss':test_loss, 'accuracy':acc, 
                                     'temp':float(10*F.sigmoid(h[2]).cpu().detach().numpy()),
                                     'beta1':float(F.sigmoid(h[0]).cpu().detach().numpy()),
                                     'beta2':float(F.sigmoid(h[1]).cpu().detach().numpy()),
                                    'beta3':float(F.sigmoid(h[1]).cpu().detach().numpy())})
            
            print (internal_results[-1])

            
    with open('exp'+experiment_version+'_w_dist_h_rand.jsonl', 'a') as out:
        out.write(json.dumps({'results':internal_results, 'version': experiment_version})+'\n')

  f = F.log_softmax(batch_logits/temp)
current loss:8956.294921875: 100%|██████████| 79/79 [00:02<00:00, 30.98it/s]  
current loss:8640.8662109375:   5%|▌         | 4/79 [00:00<00:02, 30.88it/s]

{'epoch': 0, 'test loss': 2.3041396141052246, 'accuracy': 0.09999999403953552, 'temp': 2.699783742427826, 'beta1': 0.5665669441223145, 'beta2': 0.7055090069770813, 'beta3': 0.7055090069770813}


current loss:6319.2509765625: 100%|██████████| 79/79 [00:02<00:00, 30.80it/s] 
current loss:4380.1240234375: 100%|██████████| 79/79 [00:02<00:00, 31.16it/s] 
current loss:3049.644287109375: 100%|██████████| 79/79 [00:02<00:00, 31.38it/s]
current loss:2106.16455078125: 100%|██████████| 79/79 [00:02<00:00, 31.24it/s] 
current loss:2056.36181640625:   4%|▍         | 3/79 [00:00<00:02, 26.83it/s]

{'epoch': 4, 'test loss': 2.290347099304199, 'accuracy': 0.09999999403953552, 'temp': 2.699783742427826, 'beta1': 0.5665669441223145, 'beta2': 0.7055090069770813, 'beta3': 0.7055090069770813}


current loss:1444.6912841796875: 100%|██████████| 79/79 [00:02<00:00, 30.85it/s]
current loss:992.6282958984375: 100%|██████████| 79/79 [00:02<00:00, 31.15it/s] 
current loss:685.8023681640625: 100%|██████████| 79/79 [00:02<00:00, 30.96it/s]
current loss:479.0387268066406: 100%|██████████| 79/79 [00:02<00:00, 30.64it/s] 
current loss:338.35845947265625: 100%|██████████| 79/79 [00:02<00:00, 31.06it/s]
current loss:330.5971374511719:   4%|▍         | 3/79 [00:00<00:02, 27.80it/s]

{'epoch': 9, 'test loss': 2.2908222675323486, 'accuracy': 0.10679999738931656, 'temp': 2.699783742427826, 'beta1': 0.5665669441223145, 'beta2': 0.7055090069770813, 'beta3': 0.7055090069770813}


current loss:242.14859008789062: 100%|██████████| 79/79 [00:02<00:00, 31.12it/s]
current loss:175.82823181152344: 100%|██████████| 79/79 [00:02<00:00, 31.16it/s]
current loss:129.56983947753906: 100%|██████████| 79/79 [00:02<00:00, 31.46it/s]
current loss:97.01971435546875: 100%|██████████| 79/79 [00:02<00:00, 31.23it/s] 
current loss:73.92278289794922: 100%|██████████| 79/79 [00:02<00:00, 30.97it/s]
current loss:71.55269622802734:   5%|▌         | 4/79 [00:00<00:02, 31.40it/s]

{'epoch': 14, 'test loss': 2.304670810699463, 'accuracy': 0.10029999911785126, 'temp': 2.699783742427826, 'beta1': 0.5665669441223145, 'beta2': 0.7055090069770813, 'beta3': 0.7055090069770813}


current loss:57.37443161010742: 100%|██████████| 79/79 [00:02<00:00, 31.28it/s] 
current loss:45.39491271972656: 100%|██████████| 79/79 [00:02<00:00, 30.93it/s] 
current loss:36.630767822265625: 100%|██████████| 79/79 [00:02<00:00, 30.62it/s]
current loss:30.149276733398438: 100%|██████████| 79/79 [00:02<00:00, 31.07it/s]
current loss:25.297693252563477: 100%|██████████| 79/79 [00:02<00:00, 31.39it/s]


KeyboardInterrupt: 

In [25]:
w

[tensor([[-0.7514,  0.1773, -0.4779,  ...,  0.3393, -0.3660,  0.2624],
         [-0.8842, -0.1683,  0.4462,  ...,  3.1425,  0.3287, -0.6552],
         [-0.8631, -0.1994,  0.2775,  ..., -0.6473, -0.7569, -0.4378],
         ...,
         [ 0.8663, -1.7815,  1.3069,  ..., -0.0480, -0.7634, -0.4520],
         [ 0.5320,  0.3822, -0.8424,  ..., -0.4680, -1.6757, -0.2089],
         [ 0.9002, -1.2314,  0.3657,  ..., -0.7398,  0.0684, -0.9744]]),
 tensor([[-0.0401,  0.1368,  0.9736,  ...,  0.7400,  0.0514, -0.0842],
         [-0.3874, -0.7632, -0.0408,  ...,  0.5974,  0.1077, -0.4998],
         [-0.5327,  0.4283,  1.5127,  ..., -0.3498, -0.0648, -0.5655],
         ...,
         [-0.7716, -0.9684, -1.2373,  ..., -0.0454,  0.4990,  0.7507],
         [-1.0290,  0.3870, -1.4709,  ..., -0.0779, -0.4832,  1.4097],
         [-0.9672,  1.9007, -0.3325,  ..., -1.5615, -0.7924,  0.3367]]),
 tensor([[ 1.4746, -0.5375, -1.9071,  ..., -0.2746, -1.1300,  1.0408],
         [ 1.0869,  2.0572,  0.8932,  ...,  1

In [51]:
# Запуск --- с CNN-дистилляцией и оптимизацией гиперпараметров, 2-beta
crit = nn.CrossEntropyLoss()
# определяем функцию потерь как замкнутую относительно аргументов функцию
# нужно для подсчета градиентов гиперпараметров по двухуровневой оптимизации
def param_loss(batch,model, h):
    x,y,batch_logits,t_features, s_features, w = batch    
    beta,beta2,beta3,temp = h
    out = model(x)
    beta = F.sigmoid(beta)
    beta2 = F.sigmoid(beta2)
    beta3 = F.sigmoid(beta3)/10000
    
    temp = F.sigmoid(temp) * 10
    distillation_loss = distill(out, batch_logits, temp)
    student_loss = crit(out, y)                
    
    w_loss = 0
    for i in range(4):        
        w_loss += t.sum((t.matmul(s_features[i], w[i]) -  t_features[i])**2, 1).mean()
        
        
    loss = beta * distillation_loss + beta2 * student_loss + w_loss * beta3
    
    return loss

# определяем функцию валидационную функцию потерь как замкнутую относительно аргументов функцию
# нужно для подсчета градиентов гиперпараметров по двухуровневой оптимизации
def hyperparam_loss(batch, model):
    x,y = batch
    out = model(x)
    student_loss = crit(out, y)            
    return student_loss

hist = []
logits = np.load('../code/logits_cnn.npy')
teacher.eval()
for _ in range(run_num):
    internal_results = []    
    # теперь beta и temp - не числа, а тензоры, по которым можно считать градиент
    beta1 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    beta2 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    beta3 = t.nn.Parameter(t.tensor(np.random.uniform(low=-1, high=1), device=device), requires_grad=True)
    temp = t.nn.Parameter(t.tensor(np.random.uniform(low=-2, high=0), device=device), requires_grad=True)    
    h = [beta1, beta2, beta3, temp]
    
    student = Cifar_Very_Tiny(10).to(device)
    optim = t.optim.Adam(list(student.parameters())+ w)
    
    # параметры Adam и функцию подсчета градиента 
    # взял из статьи по DARTS (выбор архитектуры сети градиентными методами)
    # там также используется оптимизация гиперпараметров
    
    for x,_ in train_loader_no_augumentation:
        break
    x = x.to(device)
    w = []
    f_in = student.get_features(x, [0,1,2,3])
    f_out = teacher.get_features(x, [0,1,2,3])
    for in_, out_ in zip(f_in, f_out):
        w.append(t.randn(in_.shape[1], out_.shape[1]).cuda())
    
    
    optim2 = t.optim.SGD(h,  lr=10e4)   
    hyper_grad_calc = hyperparams.AdamHyperGradCalculator(student, param_loss, hyperparam_loss, optim, h)
    
    crit = t.nn.CrossEntropyLoss()

    for e in range(epoch_num): # хочется посмотреть куда сойдутся гиперпараметры, поэтому возьмем побольше эпох
        
        
        tq = tqdm.tqdm(zip(train_loader_no_augumentation, valid_loader))
        losses = []
        for batch_id, ((x,y), (v_x, v_y)) in enumerate(tq):
       
            x = x.to(device)
            y = y.to(device)            
                      
            batch_logits = t.Tensor(logits[128*batch_id:128*(batch_id+1)]).to(device) 
            # если настала пора понаблюдать за траекторий гиперпараметров
          
            #print (batch_id, 'train mini')
            v_x = v_x.to(device)
            v_y = v_y.to(device)  
            optim2.zero_grad()            
            f_in = student.get_features(x, [0,1,2,3])
            f_out = teacher.get_features(x, [0,1,2,3])
    
            hyper_grad_calc.calc_gradients((x,y,batch_logits,f_out, f_in, w), (v_x, v_y))    
            t.nn.utils.clip_grad_value_(h, 1.0)
            for h_ in h:
                h_.grad = t.where(t.isnan(h_.grad), t.zeros_like(h_.grad), h_.grad)  

            optim2.step()                         
            optim.zero_grad()
            f_in = student.get_features(x, [0,1,2,3])
            f_out = teacher.get_features(x, [0,1,2,3])
    
            loss = param_loss((x,y,batch_logits,f_out, f_in, w), student,h)
        
            losses.append(loss.cpu().detach().numpy())
            loss.backward()
            optim.step()
            tq.set_description('current loss:{}'.format(np.mean(losses[-10:])))
    
        if e==0 or (e+1)%validate_every_epoch == 0: # если номер эпохи делится на 5 или эпоха - первая             
            test_loss = []
            student.eval()
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)                            
                test_loss.append(crit(student(x), y).detach().cpu().numpy())                 
            test_loss = float(np.mean(test_loss))
            test_loss2 = []            
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)                            
                test_loss2.append(crit(student(x), y).detach().cpu().numpy())                 
            print (float(np.mean(test_loss2)))
            
            
            acc = float(accuracy(student))
            student.train()
            internal_results.append({'epoch': e, 'test loss':test_loss, 'accuracy':acc, 
                                     'temp':float(0.1+9.9*F.sigmoid(h[3]).cpu().detach().numpy()),
                                     'beta1':float(F.sigmoid(h[0]).cpu().detach().numpy()),
                                     'beta2':float(F.sigmoid(h[1]).cpu().detach().numpy()),
                                    'beta3':float(F.sigmoid(h[2]).cpu().detach().numpy())})
            
            print (internal_results[-1])

            
    with open('w_exp'+experiment_version+'_dist_h_b2_optim.jsonl', 'a') as out:
        out.write(json.dumps({'results':internal_results, 'version': experiment_version})+'\n')

  f = F.log_softmax(batch_logits/temp)
current loss:228.1383819580078: : 79it [00:05, 13.50it/s] 


2.1346006393432617


current loss:225.26449584960938: : 2it [00:00, 13.53it/s]

{'epoch': 0, 'test loss': 2.132808208465576, 'accuracy': 0.204599991440773, 'temp': 2.129318678379059, 'beta1': 0.4880298376083374, 'beta2': 0.7217536568641663, 'beta3': 0.7007215619087219}


current loss:162.95828247070312: : 79it [00:05, 13.65it/s]
current loss:116.46406555175781: : 79it [00:05, 13.76it/s]
current loss:82.61605072021484: : 79it [00:05, 13.75it/s] 
current loss:58.85979461669922: : 79it [00:05, 13.77it/s] 


1.8675687313079834


current loss:59.263771057128906: : 2it [00:00, 13.32it/s]

{'epoch': 4, 'test loss': 1.8707540035247803, 'accuracy': 0.2921999990940094, 'temp': 1.2544663459062577, 'beta1': 0.5404069423675537, 'beta2': 0.7507766485214233, 'beta3': 0.7007215619087219}


current loss:43.63921356201172: : 79it [00:05, 13.77it/s] 
current loss:36.539329528808594: : 79it [00:05, 13.45it/s]
current loss:22.653362274169922: : 79it [00:05, 13.48it/s]
current loss:18.780595779418945: : 79it [00:05, 13.57it/s]
current loss:15.52271842956543: : 79it [00:05, 13.62it/s] 


2.102837085723877


current loss:15.90566635131836: : 2it [00:00, 12.06it/s]

{'epoch': 9, 'test loss': 2.10451078414917, 'accuracy': 0.24939998984336853, 'temp': 4.356904476881027, 'beta1': 0.6707221269607544, 'beta2': 0.80565345287323, 'beta3': 0.7007215619087219}


current loss:13.205431938171387: : 79it [00:05, 13.53it/s]
current loss:11.421762466430664: : 79it [00:05, 13.61it/s]
current loss:10.018226623535156: : 79it [00:05, 13.65it/s]
current loss:8.90816593170166: : 79it [00:05, 13.68it/s] 
current loss:8.020467758178711: : 79it [00:05, 13.70it/s]


2.1593198776245117


current loss:8.231345176696777: : 2it [00:00, 12.72it/s]

{'epoch': 14, 'test loss': 2.1594324111938477, 'accuracy': 0.19999998807907104, 'temp': 3.7231019735336304, 'beta1': 0.6860544681549072, 'beta2': 0.8415436148643494, 'beta3': 0.7007215619087219}


current loss:7.310000419616699: : 79it [00:05, 13.58it/s] 
current loss:6.738398551940918: : 79it [00:05, 13.68it/s] 
current loss:6.287932395935059: : 79it [00:05, 13.61it/s] 
current loss:5.9333086013793945: : 79it [00:05, 13.70it/s]
current loss:5.65286922454834: : 79it [00:05, 13.87it/s]  


2.1667041778564453


current loss:5.759459495544434: : 2it [00:00, 12.62it/s]

{'epoch': 19, 'test loss': 2.1613965034484863, 'accuracy': 0.18769998848438263, 'temp': 3.1235377341508865, 'beta1': 0.699241042137146, 'beta2': 0.8607771396636963, 'beta3': 0.7007215619087219}


current loss:5.434043884277344: : 79it [00:05, 13.53it/s] 
current loss:5.279083251953125: : 79it [00:05, 13.47it/s] 
current loss:5.1768269538879395: : 79it [00:05, 13.67it/s]
current loss:5.122745990753174: : 79it [00:05, 13.59it/s] 
current loss:5.111889839172363: : 79it [00:05, 13.58it/s] 


2.1573472023010254


current loss:5.159760475158691: : 2it [00:00, 13.40it/s]

{'epoch': 24, 'test loss': 2.1618847846984863, 'accuracy': 0.19859999418258667, 'temp': 2.532135121524334, 'beta1': 0.711664080619812, 'beta2': 0.8712711930274963, 'beta3': 0.7007215619087219}


current loss:4.939513683319092: : 44it [00:03, 13.32it/s] 


KeyboardInterrupt: 

In [1]:
with open("exp6_basic.jsonl", "r") as read_file:
    data_b = [json.loads(line) for line in read_file]
with open("exp6_distill.jsonl", "r") as read_file:
    data_d = [json.loads(line) for line in read_file]
with open("exp6_dist_h_rand.jsonl", "r") as read_file:
    data_dr = [json.loads(line) for line in read_file]
with open("exp6_dist_h_optim.jsonl", "r") as read_file:
    data_h = [json.loads(line) for line in read_file]

FileNotFoundError: [Errno 2] No such file or directory: 'exp6_basic.jsonl'

In [None]:
from matplotlib import pylab as plt
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.markersize'] = 12
plt.rcParams['xtick.labelsize'] = 24
plt.rcParams['ytick.labelsize'] = 24
plt.rcParams['legend.fontsize'] = 24
plt.rcParams['axes.titlesize'] = 36
plt.rcParams['axes.labelsize'] = 24

epoch_b = np.array([data_b[2]['results'][i]['epoch'] for i in range(len(data_b[0]['results']))])
loss_b = np.array([subdata['results'][i]['test loss'] for i in range(len(data_b[0]['results'])) for subdata in data_b]).reshape(epoch_b.shape[0], -1)
plt.plot(epoch_b, loss_b.mean(1), '-', color='red', label='без дистилляции')
plt.fill_between(epoch_b, loss_b.mean(1)-loss_b.std(1), loss_b.mean(1)+loss_b.std(1), alpha=0.2, color='red')

epoch_d = np.array([data_d[2]['results'][i]['epoch'] for i in range(len(data_d[2]['results']))])
loss_d = np.array([subdata['results'][i]['test loss'] for i in range(len(data_d[0]['results'])) for subdata in data_d]).reshape(epoch_d.shape[0], -1)
plt.plot(epoch_d, loss_d.mean(1), '-', color='blue', label='оптимальные гипепараметров')
plt.fill_between(epoch_d, loss_d.mean(1)-loss_d.std(1), loss_d.mean(1)+loss_d.std(1), alpha=0.2, color='blue')

epoch_dr = np.array([data_dr[2]['results'][i]['epoch'] for i in range(len(data_dr[2]['results']))])
loss_dr = np.array([subdata['results'][i]['test loss'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_dr.shape[0], -1)
plt.plot(epoch_dr, loss_dr.mean(1), '-', color='black', label='случайные гипепараметры')
plt.fill_between(epoch_dr, loss_dr.mean(1)-loss_dr.std(1), loss_dr.mean(1)+loss_dr.std(1), alpha=0.2, color='black')


epoch_h = np.array([data_dr[2]['results'][i]['epoch'] for i in range(len(data_h[2]['results']))])
loss_h = np.array([subdata['results'][i]['test loss'] for i in range(len(data_d[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
plt.plot(epoch_h, loss_h.mean(1), '-', color='green', label='оптимизация гипепараметры')
plt.fill_between(epoch_h, loss_h.mean(1)-loss_h.std(1), loss_h.mean(1)+loss_h.std(1), alpha=0.2, color='green')

plt.xlabel('Количество эпох')
plt.ylabel('Потеря на тестовой выборке')

plt.legend()
plt.savefig('loss.pdf')

In [None]:
epoch_b = np.array([data_b[2]['results'][i]['epoch'] for i in range(len(data_b[0]['results']))])
acc_b = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_b[0]['results'])) for subdata in data_b]).reshape(epoch_b.shape[0], -1)
plt.plot(epoch_b, acc_b.mean(1), '-', color='red', label='без дистилляции')
plt.fill_between(epoch_b, acc_b.mean(1)-acc_b.std(1), acc_b.mean(1)+acc_b.std(1), alpha=0.2, color='red')

epoch_d = np.array([data_d[2]['results'][i]['epoch'] for i in range(len(data_d[2]['results']))])
acc_d = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_d[0]['results'])) for subdata in data_d]).reshape(epoch_d.shape[0], -1)
plt.plot(epoch_d, acc_d.mean(1), '-', color='blue', label='оптимальные гипепараметры')
plt.fill_between(epoch_d, acc_d.mean(1)-acc_d.std(1), acc_d.mean(1)+acc_d.std(1), alpha=0.2, color='blue')

epoch_h = np.array([data_h[2]['results'][i]['epoch'] for i in range(len(data_h[2]['results']))])
acc_h = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_d[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
plt.plot(epoch_h, acc_h.mean(1), '-', color='green', label='оптимизация гиперпараметров')
plt.fill_between(epoch_h, acc_h.mean(1)-acc_h.std(1), acc_h.mean(1)+acc_h.std(1), alpha=0.2, color='green')

epoch_dr = np.array([data_dr[2]['results'][i]['epoch'] for i in range(len(data_dr[2]['results']))])
acc_dr = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_h.shape[0], -1)
plt.plot(epoch_dr, acc_dr.mean(1), '-', color='black', label='случайные гиперпараметры')
plt.fill_between(epoch_dr, acc_dr.mean(1)-acc_h.std(1), acc_dr.mean(1)+acc_dr.std(1), alpha=0.2, color='black')


plt.xlabel('Количество эпох')
plt.ylabel('Точность классификации')
plt.legend()
plt.savefig('acc.pdf')

In [None]:
epoch_b = np.hstack((epoch_b, epoch_b, epoch_b, epoch_b, epoch_b))

In [None]:
epoch_b = np.array([data_b[2]['results'][i]['epoch'] for i in range(len(data_b[0]['results']))])
epoch_b.reshape(41, 1)
epoch_b = np.hstack((epoch_b, epoch_b, epoch_b, epoch_b, epoch_b))
loss_b = np.array([subdata['results'][i]['test loss'] for i in range(len(data_b[0]['results'])) for subdata in data_b]).reshape(epoch_b.shape[0], -1)
plt.scatter(epoch_b, loss_b, color='red', marker='.', label='без дистилляции')
#plt.fill_between(epoch_b, loss_b.mean(1)-loss_b.std(1), loss_b.mean(1)+loss_b.std(1), alpha=0.2, color='red')

epoch_d = np.array([data_d[2]['results'][i]['epoch'] for i in range(len(data_d[2]['results']))])
epoch_d.reshape(41, 1)
epoch_d = np.hstack((epoch_d, epoch_d, epoch_d, epoch_d, epoch_d))
loss_d = np.array([subdata['results'][i]['test loss'] for i in range(len(data_d[0]['results'])) for subdata in data_d]).reshape(epoch_d.shape[0], -1)
plt.scatter(epoch_d, loss_d, marker='d', color='blue', label='оптимальные гипепараметры')
#plt.fill_between(epoch_d, loss_d.mean(1)-loss_d.std(1), loss_d.mean(1)+loss_d.std(1), alpha=0.2, color='blue')

epoch_dr = np.array([data_dr[2]['results'][i]['epoch'] for i in range(len(data_dr[2]['results']))])
epoch_dr.reshape(41, 1)
epoch_dr = np.hstack((epoch_dr, epoch_dr, epoch_dr, epoch_dr, epoch_dr))
loss_dr = np.array([subdata['results'][i]['test loss'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_dr.shape[0], -1)
plt.scatter(epoch_dr, loss_dr, marker='x', color='black', label='случайные гипепараметры')
#plt.fill_between(epoch_dr, loss_dr.mean(1)-loss_dr.std(1), loss_dr.mean(1)+loss_dr.std(1), alpha=0.2, color='black')


epoch_h = np.array([data_dr[2]['results'][i]['epoch'] for i in range(len(data_h[2]['results']))])
epoch_h.reshape(41, 1)
epoch_h = np.hstack((epoch_h, epoch_h, epoch_h, epoch_h, epoch_h))
loss_h = np.array([subdata['results'][i]['test loss'] for i in range(len(data_d[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
plt.scatter(epoch_h, loss_h, marker='+', color='green', label='оптимизация гипепараметров')
#plt.fill_between(epoch_h, loss_h.mean(1)-loss_h.std(1), loss_h.mean(1)+loss_h.std(1), alpha=0.2, color='green')

plt.xlabel('Количество эпох')
plt.ylabel('Потеря на тестовой выборке')
plt.legend()
plt.savefig('scatter_plot_loss.pdf')

In [None]:
epoch_d = np.array([data_d[2]['results'][i]['epoch'] for i in range(len(data_d[2]['results']))])
beta_d = np.array([data_d[2]['results'][i]['beta'] for i in range(len(data_d[2]['results']))])
plt.plot(epoch_d, beta_d, '-', color='blue', label='дистилляция без оптимизации гипепараметров')
plt.fill_between(epoch_d, beta_d-beta_d.std(), beta_d+beta_d.std(), alpha=0.2, color='blue')

epoch_h = np.array([data_h[2]['results'][i]['epoch'] for i in range(len(data_h[2]['results']))])
beta_h = np.array([data_h[2]['results'][i]['beta'] for i in range(len(data_h[2]['results']))])
plt.plot(epoch_h, beta_h, '-', color='green', label='дистилляция с оптимизацией гипепараметров')
plt.fill_between(epoch_h, beta_h-beta_h.std(), beta_h+beta_h.std(), alpha=0.2, color='green')

plt.legend()
plt.savefig('3.eps')

In [None]:
epoch_d = np.array([data_d[2]['results'][i]['epoch'] for i in range(len(data_d[2]['results']))])
temp_d = np.array([data_d[2]['results'][i]['temp'] for i in range(len(data_d[2]['results']))])
plt.plot(epoch_d, temp_d, '-', color='blue', label='дистилляция без оптимизации гипепараметров')
plt.fill_between(epoch_d, temp_d-temp_d.std(), temp_d+temp_d.std(), alpha=0.2, color='blue')

epoch_h = np.array([data_h[2]['results'][i]['epoch'] for i in range(len(data_h[2]['results']))])
temp_h = np.array([data_h[2]['results'][i]['temp'] for i in range(len(data_h[2]['results']))])
plt.plot(epoch_h, temp_h, '-', color='green', label='дистилляция с оптимизацией гипепараметров')
plt.fill_between(temp_h, temp_h-temp_h.std(), temp_h+temp_h.std(), alpha=0.2, color='green')

plt.legend()
plt.savefig('4.eps')

In [None]:
l[0]

In [None]:
cm.seismic(l[0])

In [None]:
acc_dr = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_dr.shape[0], -1)
acc_h = np.array([subdata['results'][i]['accuracy'] for i in range(len(data_h[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
all_results = list(acc_dr) + list(acc_h)
max_ = np.max(all_results)
min_ = np.min(all_results)

colors = [cm.seismic((r-min_)/(max_-min_)) for r in acc_dr.flatten()]
temp_dr = np.array([subdata['results'][i]['temp'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_dr.shape[0], -1)
beta_dr = np.array([subdata['results'][i]['beta'] for i in range(len(data_dr[0]['results'])) for subdata in data_dr]).reshape(epoch_dr.shape[0], -1)
plt.scatter(beta_dr.flatten(), temp_dr.flatten(), marker='d', c=colors, label='случайные гипепараметры')

colors = [cm.seismic((r-min_)/(max_-min_)) for r in acc_h.flatten()]
temp_h = np.array([subdata['results'][i]['temp'] for i in range(len(data_h[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
beta_h = np.array([subdata['results'][i]['beta'] for i in range(len(data_h[0]['results'])) for subdata in data_h]).reshape(epoch_h.shape[0], -1)
plt.scatter(beta_h, temp_h, marker='x', c=colors, label='оптимизация гипепараметров')

plt.xlabel('beta')
plt.ylabel('$T_0$')
plt.legend()
plt.savefig('scatter_plot_beta_temp.pdf')

In [None]:
max_

In [None]:
"""
посмотреть, куда сходятся гиперпараметры.
Задача скорее всего невыпуклая по гиперпараметрам, поэтому может быть несколько точек экстремума.

Взять одно, наилучшее значение гиперпараметров.

Посчитать дистилляцию БЕЗ оптимизации гиперпараметров с наилушчими значениями.

НЕ ЗАБУДЬ ПОМЕНЯТЬ ИМЯ ФАЙЛА ДЛЯ СОХРАНЕНИЯ
"""

In [None]:
"""
Посчитать дистилляцию с оптимизацей гиперпараметров, в качестве начальной точки взять не случайные значения,
а start_beta, start_temp.

НЕ ЗАБУДЬ ПОМЕНЯТЬ ИМЯ ФАЙЛА ДЛЯ СОХРАНЕНИЯ
"""

In [None]:
"""
Построить график функции потерь на тесте в зависимости от эпохи. 
На графике должны быть линии для :
    - оптимизации без дистилляции
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют start_temp, start_beta
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют оптимизированным значениям гиперпараметров
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение соответсвуeт start_temp, start_beta
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение случайное
"""

In [None]:
"""
Построить график точности на тесте в зависимости от эпохи. 
На графике должны быть линии для :
    - оптимизации без дистилляции
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют start_temp, start_beta
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют оптимизированным значениям гиперпараметров
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение соответсвуeт start_temp, start_beta
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение случайное
"""

In [None]:
"""
Построить график беты в зависимости от эпохи. 
На графике должны быть линии для :    
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют start_temp, start_beta
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют оптимизированным значениям гиперпараметров
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение соответсвуeт start_temp, start_beta
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение случайное
"""

In [None]:
"""
Построить график температуры в зависимости от эпохи. 
На графике должны быть линии для :    
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют start_temp, start_beta
    - оптимизации с дистилляцией без оптимизации гиперпараметров, значения соответсвутют оптимизированным значениям гиперпараметров
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение соответсвуeт start_temp, start_beta
    - оптимизации с дистилляцией c оптимизацией гиперпараметров, начальное приближение случайное
"""