In [1]:
import scipy.io as scio
import numpy as np

import os
import time
import math
import torch
import torch.nn.functional as F
 
dataFile = './data1.mat'
data = scio.loadmat(dataFile)

In [2]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'TrainingX', 'TrainingY', 'TestX', 'TestY'])

In [3]:
x_train, y_train, x_test, y_test = data["TrainingX"], data["TrainingY"], data["TestX"], data["TestY"]
# x_train, y_train, x_test, y_test = np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

In [4]:
def kernel(X,Y):
    N1, N2 = len(X), len(Y)
    X_norm = np.sum(X ** 2, axis = -1)
    Y_norm = np.sum(Y ** 2, axis = -1)
    K = X_norm[:,None] + Y_norm[None,:] - 2 * np.dot(X, Y.T)
    sigma_2 = np.sum(K)/(N2**2)
    K = np.exp(-K/(2*sigma_2))
    return K

In [5]:
kx_train, kx_test = kernel(x_train,x_train), kernel(x_test, x_train)

dtype = torch.float32 
# put tensor on cpu(or you can try GPU)
device = torch.device("cpu")
KX_train = torch.tensor(kx_train, dtype=dtype, device=device)
Y_train = torch.tensor(y_train)

KX_test = torch.tensor(kx_test, dtype=dtype, device=device)
Y_test = torch.tensor(y_test)

In [6]:
def test_loss_accu(KX_test, Y_test, w, lambda_param):
    Z = torch.mm(KX_test, w)
    p = 1.0/(1.0+torch.exp(-Z))
    pred = (p>0.5)*2-1
    correct = pred.eq(Y_test.view_as(pred)).sum().item()
    accu = correct/len(Y_test)

    A = 1.0/(1.0+torch.exp(-Z*Y_test))
    l2_regularization = torch.sum(w ** 2)
    loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
    return loss, accu

#### Q1: GD

In [7]:
def GD(KX_train, Y_train, KX_test, Y_test, n_iterations, learning_rate, lambda_param, epsilon):   
    
    N = 10000
    w = torch.zeros((N,1), device=device, requires_grad = True)
    torch.nn.init.kaiming_uniform_(w, a=math.sqrt(1000))

    
    try:
        filename = './results/GD_{}_{}.txt'.format(learning_rate, lambda_param)
        os.remove(filename)
        print('Removed previous results!')
    except:
        pass

    t0 = time.time()
    t_test = 0
    for i in range(1, n_iterations + 1):

        Z = torch.mm(KX_train, w)
        p = 1.0/(1.0+torch.exp(-Z))
        pred = (p>0.5)*2-1
        correct = pred.eq(Y_train.view_as(pred)).sum().item()
        accu = correct/len(Y_train)

        A = 1.0/(1.0+torch.exp(-Z*Y_train))
        l2_regularization = torch.sum(w ** 2)
        loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
        if w.grad is not None:
            w.grad.zero_()  # 1

        loss.backward()  # 2

        with torch.no_grad():  # 3
            w -= learning_rate * w.grad


        if i == 1 or i % 25 == 0:
            t_test_0 = time.time()
            test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
            t_test += time.time()-t_test_0
            
            result = "iteration {}: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(i, time.time()-t_test-t0,loss, accu, test_loss, test_accu)
            print(result)
            filename = './results/GD_{}_{}.txt'.format(learning_rate, lambda_param)
            with open(filename, 'a') as fp: 
                fp.write(result+'\n')

        if w.grad.norm()<epsilon:
            break
            

In [8]:
n_iterations = 10000 
learning_rates = [0.005]
lambda_param = 1e-4
epsilon = 1e-5

for learning_rate in learning_rates:
    GD(KX_train, Y_train, KX_test, Y_test, n_iterations, learning_rate, lambda_param, epsilon)

Removed previous results!
iteration 1: Time:0.07, TrainLoss: 0.9229, TrainAccu: 0.5000, TestLoss: 0.6897, TestAccu: 0.5680
iteration 25: Time:1.07, TrainLoss: 1.8232, TrainAccu: 0.5000, TestLoss: 0.6496, TestAccu: 0.6530
iteration 50: Time:1.88, TrainLoss: 2.2196, TrainAccu: 0.5000, TestLoss: 0.6262, TestAccu: 0.8100
iteration 75: Time:2.71, TrainLoss: 1.6317, TrainAccu: 0.5000, TestLoss: 0.5817, TestAccu: 0.8040
iteration 100: Time:3.81, TrainLoss: 1.6618, TrainAccu: 0.5000, TestLoss: 0.5634, TestAccu: 0.8400
iteration 125: Time:4.78, TrainLoss: 1.3346, TrainAccu: 0.5194, TestLoss: 0.5317, TestAccu: 0.8550
iteration 150: Time:5.58, TrainLoss: 1.0791, TrainAccu: 0.5499, TestLoss: 0.5198, TestAccu: 0.8670
iteration 175: Time:6.38, TrainLoss: 0.8117, TrainAccu: 0.6714, TestLoss: 0.5030, TestAccu: 0.8690
iteration 200: Time:7.18, TrainLoss: 0.5371, TrainAccu: 0.7601, TestLoss: 0.4991, TestAccu: 0.8780
iteration 225: Time:7.98, TrainLoss: 0.4063, TrainAccu: 0.8256, TestLoss: 0.4957, TestAc

iteration 2050: Time:69.24, TrainLoss: 0.2801, TrainAccu: 0.8935, TestLoss: 0.4780, TestAccu: 0.9000
iteration 2075: Time:70.20, TrainLoss: 0.2797, TrainAccu: 0.8940, TestLoss: 0.4777, TestAccu: 0.9000
iteration 2100: Time:71.10, TrainLoss: 0.2794, TrainAccu: 0.8938, TestLoss: 0.4774, TestAccu: 0.8990
iteration 2125: Time:71.92, TrainLoss: 0.2791, TrainAccu: 0.8939, TestLoss: 0.4771, TestAccu: 0.8990
iteration 2150: Time:72.72, TrainLoss: 0.2787, TrainAccu: 0.8940, TestLoss: 0.4769, TestAccu: 0.8990
iteration 2175: Time:73.53, TrainLoss: 0.2784, TrainAccu: 0.8942, TestLoss: 0.4766, TestAccu: 0.9000
iteration 2200: Time:74.33, TrainLoss: 0.2781, TrainAccu: 0.8943, TestLoss: 0.4763, TestAccu: 0.9000
iteration 2225: Time:75.13, TrainLoss: 0.2778, TrainAccu: 0.8946, TestLoss: 0.4760, TestAccu: 0.9000
iteration 2250: Time:75.93, TrainLoss: 0.2775, TrainAccu: 0.8945, TestLoss: 0.4758, TestAccu: 0.9000
iteration 2275: Time:76.73, TrainLoss: 0.2772, TrainAccu: 0.8946, TestLoss: 0.4755, TestAcc

iteration 4075: Time:135.41, TrainLoss: 0.2621, TrainAccu: 0.9025, TestLoss: 0.4604, TestAccu: 0.9090
iteration 4100: Time:136.21, TrainLoss: 0.2620, TrainAccu: 0.9025, TestLoss: 0.4603, TestAccu: 0.9090
iteration 4125: Time:137.00, TrainLoss: 0.2618, TrainAccu: 0.9024, TestLoss: 0.4601, TestAccu: 0.9090
iteration 4150: Time:137.80, TrainLoss: 0.2617, TrainAccu: 0.9024, TestLoss: 0.4599, TestAccu: 0.9090
iteration 4175: Time:138.59, TrainLoss: 0.2616, TrainAccu: 0.9025, TestLoss: 0.4598, TestAccu: 0.9100
iteration 4200: Time:139.40, TrainLoss: 0.2614, TrainAccu: 0.9026, TestLoss: 0.4596, TestAccu: 0.9110
iteration 4225: Time:140.19, TrainLoss: 0.2613, TrainAccu: 0.9026, TestLoss: 0.4594, TestAccu: 0.9110
iteration 4250: Time:140.99, TrainLoss: 0.2612, TrainAccu: 0.9025, TestLoss: 0.4593, TestAccu: 0.9110
iteration 4275: Time:141.78, TrainLoss: 0.2610, TrainAccu: 0.9026, TestLoss: 0.4591, TestAccu: 0.9110
iteration 4300: Time:142.57, TrainLoss: 0.2609, TrainAccu: 0.9027, TestLoss: 0.459

iteration 6100: Time:199.78, TrainLoss: 0.2533, TrainAccu: 0.9054, TestLoss: 0.4490, TestAccu: 0.9120
iteration 6125: Time:200.57, TrainLoss: 0.2533, TrainAccu: 0.9054, TestLoss: 0.4489, TestAccu: 0.9120
iteration 6150: Time:201.36, TrainLoss: 0.2532, TrainAccu: 0.9055, TestLoss: 0.4487, TestAccu: 0.9120
iteration 6175: Time:202.16, TrainLoss: 0.2531, TrainAccu: 0.9057, TestLoss: 0.4486, TestAccu: 0.9120
iteration 6200: Time:202.95, TrainLoss: 0.2530, TrainAccu: 0.9058, TestLoss: 0.4485, TestAccu: 0.9120
iteration 6225: Time:203.75, TrainLoss: 0.2529, TrainAccu: 0.9059, TestLoss: 0.4484, TestAccu: 0.9120
iteration 6250: Time:204.54, TrainLoss: 0.2529, TrainAccu: 0.9061, TestLoss: 0.4483, TestAccu: 0.9120
iteration 6275: Time:205.33, TrainLoss: 0.2528, TrainAccu: 0.9064, TestLoss: 0.4481, TestAccu: 0.9120
iteration 6300: Time:206.12, TrainLoss: 0.2527, TrainAccu: 0.9064, TestLoss: 0.4480, TestAccu: 0.9120
iteration 6325: Time:206.91, TrainLoss: 0.2526, TrainAccu: 0.9063, TestLoss: 0.447

iteration 8125: Time:264.18, TrainLoss: 0.2479, TrainAccu: 0.9097, TestLoss: 0.4404, TestAccu: 0.9140
iteration 8150: Time:264.97, TrainLoss: 0.2479, TrainAccu: 0.9097, TestLoss: 0.4403, TestAccu: 0.9150
iteration 8175: Time:265.77, TrainLoss: 0.2478, TrainAccu: 0.9097, TestLoss: 0.4402, TestAccu: 0.9150
iteration 8200: Time:266.57, TrainLoss: 0.2478, TrainAccu: 0.9098, TestLoss: 0.4402, TestAccu: 0.9150
iteration 8225: Time:267.36, TrainLoss: 0.2477, TrainAccu: 0.9098, TestLoss: 0.4401, TestAccu: 0.9150
iteration 8250: Time:268.16, TrainLoss: 0.2477, TrainAccu: 0.9098, TestLoss: 0.4400, TestAccu: 0.9150
iteration 8275: Time:268.95, TrainLoss: 0.2476, TrainAccu: 0.9098, TestLoss: 0.4399, TestAccu: 0.9150
iteration 8300: Time:269.75, TrainLoss: 0.2476, TrainAccu: 0.9098, TestLoss: 0.4398, TestAccu: 0.9150
iteration 8325: Time:270.54, TrainLoss: 0.2475, TrainAccu: 0.9097, TestLoss: 0.4397, TestAccu: 0.9150
iteration 8350: Time:271.33, TrainLoss: 0.2475, TrainAccu: 0.9098, TestLoss: 0.439

#### Q2: SGD

In [9]:
from torch.utils.data import TensorDataset
import random

In [10]:
def SGD(KX_train, Y_train, KX_test, Y_test, n_epoches,batch_size, learning_rate, lambda_param, epsilon):
    N = 10000
    w = torch.zeros((N,1), device=device, requires_grad = True)
    torch.nn.init.kaiming_uniform_(w, a=math.sqrt(1000))
    
    try:
        filename = './results/SGD_{}_{}_{}.txt'.format(learning_rate, lambda_param,batch_size)
        os.remove(filename)
        print('Removed previous results!')
    except:
        pass

    train_dataset = TensorDataset(KX_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset,
        batch_size=batch_size, shuffle=True, sampler=None,
        num_workers=4, pin_memory=True)
    
    t0 = time.time()
    t_test = 0
    for epoch in range(1, n_epoches + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            Z = torch.mm(data, w)
            p = 1.0/(1.0+torch.exp(-Z))
            pred = (p>0.5)*2-1
            correct = pred.eq(target.view_as(pred)).sum().item()
            accu = correct/len(target)

            A = 1.0/(1.0+torch.exp(-Z*target))
            l2_regularization = torch.sum(w ** 2)
            loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
            if w.grad is not None:
                w.grad.zero_()  # 1

            loss.backward()  # 2

            with torch.no_grad():  # 3
                w -= learning_rate * w.grad


            if (batch_idx*batch_size) % 500 == 0:
                t_test_0 = time.time()
                test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
                t_test += time.time()-t_test_0
                
                result = "epoch {}[{}/{} ({:.0f}%)]: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(epoch, batch_idx*batch_size, len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), time.time()-t0, loss, accu, test_loss, test_accu)
                print(result)
                filename = './results/SGD_{}_{}_{}.txt'.format(learning_rate, lambda_param,batch_size)
                with open(filename, 'a') as fp: 
                    fp.write(result+'\n')

            if w.grad.norm()<epsilon:
                break

In [11]:
n_epoches = 100
batch_sizes = [1, 10, 100]
learning_rates =[0.001, 0.005]
lambda_param =1e-4
epsilon = 1e-5

for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        SGD(KX_train, Y_train, KX_test, Y_test, n_epoches, batch_size, learning_rate, lambda_param, epsilon)

Removed previous results!






















































Removed previous results!


































































































































































Removed previous results!




















































Removed previous results!






















































#### BFGS

In [12]:
x_train2 = np.vstack((x_train[:2000],x_train[-2000:]))
y_train2 = np.vstack((y_train[:2000],y_train[-2000:]))

In [13]:
kx_train2, kx_test2 = kernel(x_train2,x_train2), kernel(x_test, x_train2)

dtype = torch.float32 
# put tensor on cpu(or you can try GPU)
device = torch.device("cpu")

KX_train2 = torch.tensor(kx_train2, dtype=dtype, device=device)
Y_train2 = torch.tensor(y_train2)

KX_test2 = torch.tensor(kx_test2, dtype=dtype, device=device)
Y_test2 = torch.tensor(y_test)

In [14]:
def BFGS(KX_train, Y_train, KX_test, Y_test, n_iterations, learning_rate, lambda_param, epsilon):
    N = len(KX_train)
    w = torch.zeros((N,1), device=device, requires_grad = True)
    torch.nn.init.kaiming_uniform_(w, a=math.sqrt(1000))
    
    try:
        filename = './results/BFGS_{}_{}.txt'.format(learning_rate, lambda_param)
        os.remove(filename)
        print('Removed previous results!')
    except:
        pass

    train_dataset = TensorDataset(KX_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset,
        batch_size=batch_size, shuffle=True, sampler=None,
        num_workers=4, pin_memory=True)
    
    t0 = time.time()
    t_test = 0
    
    Z = torch.mm(KX_train, w)
    p = 1.0/(1.0+torch.exp(-Z))
    pred = (p>0.5)*2-1
    correct = pred.eq(Y_train.view_as(pred)).sum().item()
    accu = correct/len(Y_train)

    A = 1.0/(1.0+torch.exp(-Z*Y_train))
    l2_regularization = torch.sum(w ** 2)
    loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
    if w.grad is not None:
        w.grad.zero_()  # 1

    loss.backward()  # 2
    
    t_test_0 = time.time()
    test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
    t_test += time.time()-t_test_0
    
    result = "iteration {}: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(1, time.time()-t_test-t0,loss, accu, test_loss, test_accu)
    print(result)
    filename = './results/BFGS_{}_{}.txt'.format(learning_rate, lambda_param)
    with open(filename, 'a') as fp: 
        fp.write(result+'\n')

    H, r, grad_past = torch.eye(N), torch.Tensor(N,1), torch.Tensor(N,1)
    r.copy_(w.grad)
    grad_past.copy_(w.grad)
    
    for i in range(2, n_iterations + 1):
        
        with torch.no_grad():  # 3
            w -= learning_rate * r
            
        Z = torch.mm(KX_train, w)
        p = 1.0/(1.0+torch.exp(-Z))
        pred = (p>0.5)*2-1
        correct = pred.eq(Y_train.view_as(pred)).sum().item()
        accu = correct/len(Y_train)

        A = 1.0/(1.0+torch.exp(-Z*Y_train))
        l2_regularization = torch.sum(w ** 2)
        loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
        if w.grad is not None:
            w.grad.zero_()  # 1

        loss.backward()  # 2
        
        s = -learning_rate * r
        y = w.grad - grad_past
        grad_past.copy_(w.grad)

        p = 1.0/s.view(-1).dot(y.view(-1))
        I = torch.eye(N)
        H = torch.matmul(torch.matmul(I-p*torch.matmul(s,y.t()), H), I-p*torch.matmul(y,s.t()))+p*torch.matmul(s,s.t())

        r = torch.matmul(H, w.grad)

        if i == 1 or i % 1 == 0:
            t_test_0 = time.time()
            test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
            t_test += time.time()-t_test_0

            result = "iteration {}: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(i, time.time()-t_test-t0,loss, accu, test_loss, test_accu)
            print(result)
            filename = './results/BFGS_{}_{}.txt'.format(learning_rate, lambda_param)
            with open(filename, 'a') as fp: 
                fp.write(result+'\n')

        if w.grad.norm()<epsilon:
            break

In [15]:
n_iterations = 1000
learning_rates = [0.005]
lambda_param = 1e-4
epsilon = 1e-5

for learning_rate in learning_rates:
    BFGS(KX_train2, Y_train2, KX_test2, Y_test2, n_iterations, learning_rate, lambda_param, epsilon)

Removed previous results!
iteration 1: Time:0.01, TrainLoss: 0.6850, TrainAccu: 0.5000, TestLoss: 0.6470, TestAccu: 0.7060
iteration 2: Time:0.72, TrainLoss: 0.6702, TrainAccu: 0.5005, TestLoss: 0.6361, TestAccu: 0.6760
iteration 3: Time:1.34, TrainLoss: 0.6677, TrainAccu: 0.5005, TestLoss: 0.6338, TestAccu: 0.6870
iteration 4: Time:1.97, TrainLoss: 0.6650, TrainAccu: 0.5005, TestLoss: 0.6314, TestAccu: 0.6970
iteration 5: Time:2.59, TrainLoss: 0.6623, TrainAccu: 0.5005, TestLoss: 0.6289, TestAccu: 0.7060
iteration 6: Time:3.21, TrainLoss: 0.6596, TrainAccu: 0.5012, TestLoss: 0.6264, TestAccu: 0.7160
iteration 7: Time:3.83, TrainLoss: 0.6568, TrainAccu: 0.5020, TestLoss: 0.6239, TestAccu: 0.7220
iteration 8: Time:4.44, TrainLoss: 0.6541, TrainAccu: 0.5022, TestLoss: 0.6214, TestAccu: 0.7310
iteration 9: Time:5.06, TrainLoss: 0.6513, TrainAccu: 0.5022, TestLoss: 0.6189, TestAccu: 0.7360
iteration 10: Time:5.69, TrainLoss: 0.6486, TrainAccu: 0.5038, TestLoss: 0.6164, TestAccu: 0.7410
ite

iteration 84: Time:55.61, TrainLoss: 0.4908, TrainAccu: 0.8445, TestLoss: 0.4723, TestAccu: 0.8760
iteration 85: Time:56.27, TrainLoss: 0.4892, TrainAccu: 0.8452, TestLoss: 0.4708, TestAccu: 0.8760
iteration 86: Time:56.94, TrainLoss: 0.4876, TrainAccu: 0.8465, TestLoss: 0.4693, TestAccu: 0.8770
iteration 87: Time:57.61, TrainLoss: 0.4860, TrainAccu: 0.8478, TestLoss: 0.4679, TestAccu: 0.8770
iteration 88: Time:58.27, TrainLoss: 0.4845, TrainAccu: 0.8490, TestLoss: 0.4664, TestAccu: 0.8770
iteration 89: Time:58.94, TrainLoss: 0.4829, TrainAccu: 0.8495, TestLoss: 0.4649, TestAccu: 0.8770
iteration 90: Time:59.61, TrainLoss: 0.4813, TrainAccu: 0.8512, TestLoss: 0.4635, TestAccu: 0.8770
iteration 91: Time:60.29, TrainLoss: 0.4798, TrainAccu: 0.8535, TestLoss: 0.4620, TestAccu: 0.8780
iteration 92: Time:60.95, TrainLoss: 0.4783, TrainAccu: 0.8552, TestLoss: 0.4606, TestAccu: 0.8780
iteration 93: Time:61.62, TrainLoss: 0.4767, TrainAccu: 0.8565, TestLoss: 0.4592, TestAccu: 0.8780
iteration 

iteration 166: Time:113.69, TrainLoss: 0.3876, TrainAccu: 0.8975, TestLoss: 0.3754, TestAccu: 0.8990
iteration 167: Time:114.38, TrainLoss: 0.3866, TrainAccu: 0.8980, TestLoss: 0.3745, TestAccu: 0.9010
iteration 168: Time:115.09, TrainLoss: 0.3856, TrainAccu: 0.8980, TestLoss: 0.3736, TestAccu: 0.9010
iteration 169: Time:115.79, TrainLoss: 0.3847, TrainAccu: 0.8982, TestLoss: 0.3727, TestAccu: 0.9010
iteration 170: Time:116.49, TrainLoss: 0.3837, TrainAccu: 0.8988, TestLoss: 0.3718, TestAccu: 0.9010
iteration 171: Time:117.19, TrainLoss: 0.3827, TrainAccu: 0.8982, TestLoss: 0.3709, TestAccu: 0.9010
iteration 172: Time:117.89, TrainLoss: 0.3818, TrainAccu: 0.8978, TestLoss: 0.3700, TestAccu: 0.9010
iteration 173: Time:118.59, TrainLoss: 0.3809, TrainAccu: 0.8980, TestLoss: 0.3691, TestAccu: 0.9010
iteration 174: Time:119.30, TrainLoss: 0.3799, TrainAccu: 0.8978, TestLoss: 0.3682, TestAccu: 0.9010
iteration 175: Time:119.99, TrainLoss: 0.3790, TrainAccu: 0.8975, TestLoss: 0.3674, TestAcc

iteration 248: Time:171.60, TrainLoss: 0.3231, TrainAccu: 0.9073, TestLoss: 0.3154, TestAccu: 0.9060
iteration 249: Time:172.33, TrainLoss: 0.3225, TrainAccu: 0.9073, TestLoss: 0.3148, TestAccu: 0.9060
iteration 250: Time:173.06, TrainLoss: 0.3219, TrainAccu: 0.9073, TestLoss: 0.3142, TestAccu: 0.9070
iteration 251: Time:173.76, TrainLoss: 0.3213, TrainAccu: 0.9073, TestLoss: 0.3137, TestAccu: 0.9070
iteration 252: Time:174.49, TrainLoss: 0.3206, TrainAccu: 0.9073, TestLoss: 0.3131, TestAccu: 0.9070
iteration 253: Time:175.18, TrainLoss: 0.3200, TrainAccu: 0.9073, TestLoss: 0.3125, TestAccu: 0.9070
iteration 254: Time:175.88, TrainLoss: 0.3194, TrainAccu: 0.9077, TestLoss: 0.3120, TestAccu: 0.9070
iteration 255: Time:176.58, TrainLoss: 0.3188, TrainAccu: 0.9077, TestLoss: 0.3114, TestAccu: 0.9070
iteration 256: Time:177.28, TrainLoss: 0.3182, TrainAccu: 0.9080, TestLoss: 0.3109, TestAccu: 0.9060
iteration 257: Time:177.96, TrainLoss: 0.3176, TrainAccu: 0.9077, TestLoss: 0.3103, TestAcc

iteration 330: Time:230.12, TrainLoss: 0.2815, TrainAccu: 0.9135, TestLoss: 0.2779, TestAccu: 0.9150
iteration 331: Time:230.81, TrainLoss: 0.2811, TrainAccu: 0.9135, TestLoss: 0.2776, TestAccu: 0.9150
iteration 332: Time:231.51, TrainLoss: 0.2807, TrainAccu: 0.9135, TestLoss: 0.2772, TestAccu: 0.9150
iteration 333: Time:232.20, TrainLoss: 0.2803, TrainAccu: 0.9135, TestLoss: 0.2769, TestAccu: 0.9150
iteration 334: Time:232.90, TrainLoss: 0.2799, TrainAccu: 0.9135, TestLoss: 0.2765, TestAccu: 0.9150
iteration 335: Time:233.61, TrainLoss: 0.2795, TrainAccu: 0.9135, TestLoss: 0.2762, TestAccu: 0.9150
iteration 336: Time:234.37, TrainLoss: 0.2791, TrainAccu: 0.9135, TestLoss: 0.2758, TestAccu: 0.9150
iteration 337: Time:235.05, TrainLoss: 0.2787, TrainAccu: 0.9137, TestLoss: 0.2755, TestAccu: 0.9150
iteration 338: Time:235.76, TrainLoss: 0.2783, TrainAccu: 0.9137, TestLoss: 0.2752, TestAccu: 0.9150
iteration 339: Time:236.45, TrainLoss: 0.2779, TrainAccu: 0.9137, TestLoss: 0.2748, TestAcc

iteration 412: Time:287.94, TrainLoss: 0.2545, TrainAccu: 0.9163, TestLoss: 0.2553, TestAccu: 0.9170
iteration 413: Time:288.68, TrainLoss: 0.2542, TrainAccu: 0.9165, TestLoss: 0.2551, TestAccu: 0.9170
iteration 414: Time:289.39, TrainLoss: 0.2539, TrainAccu: 0.9167, TestLoss: 0.2549, TestAccu: 0.9170
iteration 415: Time:290.14, TrainLoss: 0.2537, TrainAccu: 0.9167, TestLoss: 0.2547, TestAccu: 0.9170
iteration 416: Time:290.85, TrainLoss: 0.2534, TrainAccu: 0.9167, TestLoss: 0.2545, TestAccu: 0.9170
iteration 417: Time:291.56, TrainLoss: 0.2532, TrainAccu: 0.9167, TestLoss: 0.2543, TestAccu: 0.9170
iteration 418: Time:292.27, TrainLoss: 0.2529, TrainAccu: 0.9167, TestLoss: 0.2541, TestAccu: 0.9170
iteration 419: Time:292.99, TrainLoss: 0.2527, TrainAccu: 0.9167, TestLoss: 0.2539, TestAccu: 0.9170
iteration 420: Time:293.69, TrainLoss: 0.2524, TrainAccu: 0.9167, TestLoss: 0.2537, TestAccu: 0.9170
iteration 421: Time:294.45, TrainLoss: 0.2522, TrainAccu: 0.9167, TestLoss: 0.2535, TestAcc

iteration 494: Time:346.49, TrainLoss: 0.2372, TrainAccu: 0.9203, TestLoss: 0.2426, TestAccu: 0.9180
iteration 495: Time:347.21, TrainLoss: 0.2370, TrainAccu: 0.9205, TestLoss: 0.2425, TestAccu: 0.9180
iteration 496: Time:347.92, TrainLoss: 0.2369, TrainAccu: 0.9205, TestLoss: 0.2423, TestAccu: 0.9180
iteration 497: Time:348.65, TrainLoss: 0.2367, TrainAccu: 0.9205, TestLoss: 0.2422, TestAccu: 0.9180
iteration 498: Time:349.37, TrainLoss: 0.2365, TrainAccu: 0.9205, TestLoss: 0.2421, TestAccu: 0.9180
iteration 499: Time:350.07, TrainLoss: 0.2364, TrainAccu: 0.9205, TestLoss: 0.2420, TestAccu: 0.9180
iteration 500: Time:350.76, TrainLoss: 0.2362, TrainAccu: 0.9205, TestLoss: 0.2419, TestAccu: 0.9180
iteration 501: Time:351.46, TrainLoss: 0.2361, TrainAccu: 0.9205, TestLoss: 0.2418, TestAccu: 0.9180
iteration 502: Time:352.16, TrainLoss: 0.2359, TrainAccu: 0.9205, TestLoss: 0.2417, TestAccu: 0.9190
iteration 503: Time:352.86, TrainLoss: 0.2357, TrainAccu: 0.9203, TestLoss: 0.2416, TestAcc

iteration 576: Time:404.49, TrainLoss: 0.2264, TrainAccu: 0.9215, TestLoss: 0.2364, TestAccu: 0.9230
iteration 577: Time:405.19, TrainLoss: 0.2263, TrainAccu: 0.9217, TestLoss: 0.2363, TestAccu: 0.9230
iteration 578: Time:405.88, TrainLoss: 0.2262, TrainAccu: 0.9217, TestLoss: 0.2363, TestAccu: 0.9230
iteration 579: Time:406.58, TrainLoss: 0.2261, TrainAccu: 0.9217, TestLoss: 0.2362, TestAccu: 0.9230
iteration 580: Time:407.27, TrainLoss: 0.2260, TrainAccu: 0.9217, TestLoss: 0.2362, TestAccu: 0.9230
iteration 581: Time:407.98, TrainLoss: 0.2259, TrainAccu: 0.9217, TestLoss: 0.2362, TestAccu: 0.9230
iteration 582: Time:408.67, TrainLoss: 0.2258, TrainAccu: 0.9217, TestLoss: 0.2361, TestAccu: 0.9230
iteration 583: Time:409.38, TrainLoss: 0.2257, TrainAccu: 0.9217, TestLoss: 0.2361, TestAccu: 0.9230
iteration 584: Time:410.07, TrainLoss: 0.2256, TrainAccu: 0.9217, TestLoss: 0.2360, TestAccu: 0.9230
iteration 585: Time:410.76, TrainLoss: 0.2255, TrainAccu: 0.9217, TestLoss: 0.2360, TestAcc

iteration 658: Time:462.64, TrainLoss: 0.2199, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 659: Time:463.36, TrainLoss: 0.2198, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 660: Time:464.08, TrainLoss: 0.2197, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 661: Time:464.77, TrainLoss: 0.2197, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 662: Time:465.47, TrainLoss: 0.2196, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 663: Time:466.16, TrainLoss: 0.2196, TrainAccu: 0.9235, TestLoss: 0.2343, TestAccu: 0.9240
iteration 664: Time:466.90, TrainLoss: 0.2195, TrainAccu: 0.9235, TestLoss: 0.2342, TestAccu: 0.9240
iteration 665: Time:467.62, TrainLoss: 0.2194, TrainAccu: 0.9235, TestLoss: 0.2342, TestAccu: 0.9240
iteration 666: Time:468.36, TrainLoss: 0.2194, TrainAccu: 0.9235, TestLoss: 0.2342, TestAccu: 0.9240
iteration 667: Time:469.08, TrainLoss: 0.2193, TrainAccu: 0.9235, TestLoss: 0.2342, TestAcc

iteration 740: Time:521.72, TrainLoss: 0.2159, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 741: Time:522.42, TrainLoss: 0.2159, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 742: Time:523.13, TrainLoss: 0.2159, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 743: Time:523.88, TrainLoss: 0.2158, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 744: Time:524.59, TrainLoss: 0.2158, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 745: Time:525.29, TrainLoss: 0.2158, TrainAccu: 0.9253, TestLoss: 0.2346, TestAccu: 0.9220
iteration 746: Time:525.98, TrainLoss: 0.2157, TrainAccu: 0.9255, TestLoss: 0.2346, TestAccu: 0.9220
iteration 747: Time:526.68, TrainLoss: 0.2157, TrainAccu: 0.9255, TestLoss: 0.2347, TestAccu: 0.9220
iteration 748: Time:527.38, TrainLoss: 0.2157, TrainAccu: 0.9255, TestLoss: 0.2347, TestAccu: 0.9210
iteration 749: Time:528.10, TrainLoss: 0.2156, TrainAccu: 0.9255, TestLoss: 0.2347, TestAcc

iteration 822: Time:579.82, TrainLoss: 0.2136, TrainAccu: 0.9263, TestLoss: 0.2361, TestAccu: 0.9180
iteration 823: Time:580.53, TrainLoss: 0.2136, TrainAccu: 0.9263, TestLoss: 0.2361, TestAccu: 0.9180
iteration 824: Time:581.23, TrainLoss: 0.2135, TrainAccu: 0.9263, TestLoss: 0.2361, TestAccu: 0.9180
iteration 825: Time:581.95, TrainLoss: 0.2135, TrainAccu: 0.9263, TestLoss: 0.2361, TestAccu: 0.9180
iteration 826: Time:582.65, TrainLoss: 0.2135, TrainAccu: 0.9263, TestLoss: 0.2362, TestAccu: 0.9180
iteration 827: Time:583.37, TrainLoss: 0.2135, TrainAccu: 0.9263, TestLoss: 0.2362, TestAccu: 0.9180
iteration 828: Time:584.08, TrainLoss: 0.2134, TrainAccu: 0.9263, TestLoss: 0.2362, TestAccu: 0.9180
iteration 829: Time:584.79, TrainLoss: 0.2134, TrainAccu: 0.9263, TestLoss: 0.2362, TestAccu: 0.9180
iteration 830: Time:585.50, TrainLoss: 0.2134, TrainAccu: 0.9263, TestLoss: 0.2362, TestAccu: 0.9180
iteration 831: Time:586.22, TrainLoss: 0.2134, TrainAccu: 0.9265, TestLoss: 0.2363, TestAcc

iteration 904: Time:638.59, TrainLoss: 0.2121, TrainAccu: 0.9273, TestLoss: 0.2381, TestAccu: 0.9190
iteration 905: Time:639.30, TrainLoss: 0.2121, TrainAccu: 0.9273, TestLoss: 0.2382, TestAccu: 0.9190
iteration 906: Time:640.02, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2382, TestAccu: 0.9190
iteration 907: Time:640.75, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2382, TestAccu: 0.9190
iteration 908: Time:641.46, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2382, TestAccu: 0.9190
iteration 909: Time:642.19, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2383, TestAccu: 0.9190
iteration 910: Time:642.92, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2383, TestAccu: 0.9190
iteration 911: Time:643.66, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2383, TestAccu: 0.9190
iteration 912: Time:644.38, TrainLoss: 0.2120, TrainAccu: 0.9273, TestLoss: 0.2383, TestAccu: 0.9190
iteration 913: Time:645.09, TrainLoss: 0.2119, TrainAccu: 0.9273, TestLoss: 0.2384, TestAcc

iteration 986: Time:697.39, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2405, TestAccu: 0.9190
iteration 987: Time:698.11, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2405, TestAccu: 0.9190
iteration 988: Time:698.84, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2405, TestAccu: 0.9190
iteration 989: Time:699.55, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2406, TestAccu: 0.9190
iteration 990: Time:700.28, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2406, TestAccu: 0.9190
iteration 991: Time:701.00, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2406, TestAccu: 0.9190
iteration 992: Time:701.72, TrainLoss: 0.2110, TrainAccu: 0.9283, TestLoss: 0.2407, TestAccu: 0.9190
iteration 993: Time:702.44, TrainLoss: 0.2109, TrainAccu: 0.9283, TestLoss: 0.2407, TestAccu: 0.9190
iteration 994: Time:703.16, TrainLoss: 0.2109, TrainAccu: 0.9283, TestLoss: 0.2407, TestAccu: 0.9190
iteration 995: Time:703.90, TrainLoss: 0.2109, TrainAccu: 0.9283, TestLoss: 0.2408, TestAcc

#### LBFGS

In [16]:
def LBFGS(KX_train, Y_train, KX_test, Y_test, n_iterations, history_size, learning_rate, lambda_param, epsilon):
    N = len(KX_train)
    w = torch.zeros((N,1), device=device, requires_grad = True)
    torch.nn.init.kaiming_uniform_(w, a=math.sqrt(1000))
    
    try:
        filename = './results/LBFGS_{}_{}.txt'.format(learning_rate, lambda_param)
        os.remove(filename)
        print('Removed previous results!')
    except:
        pass

    train_dataset = TensorDataset(KX_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset,
        batch_size=batch_size, shuffle=True, sampler=None,
        num_workers=4, pin_memory=True)
    
    t0 = time.time()
    t_test = 0
    
    Z = torch.mm(KX_train, w)
    p = 1.0/(1.0+torch.exp(-Z))
    pred = (p>0.5)*2-1
    correct = pred.eq(Y_train.view_as(pred)).sum().item()
    accu = correct/len(Y_train)

    A = 1.0/(1.0+torch.exp(-Z*Y_train))
    l2_regularization = torch.sum(w ** 2)
    loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
    if w.grad is not None:
        w.grad.zero_()  # 1

    loss.backward()  # 2
    
    t_test_0 = time.time()
    test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
    t_test += time.time()-t_test_0
    
    result = "iteration {}: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(1, time.time()-t_test-t0,loss, accu, test_loss, test_accu)
    print(result)
    filename = './results/LBFGS_{}_{}.txt'.format(learning_rate, lambda_param)
    with open(filename, 'a') as fp: 
        fp.write(result+'\n')

    r, grad_past = torch.Tensor(N,1), torch.Tensor(N,1)
    r.copy_(w.grad)
    grad_past.copy_(w.grad)
    
    rhos, ss, ys = [], [], []
    
    for idx in range(2, n_iterations + 1):
        
        with torch.no_grad():  # 3
            w -= learning_rate * r
            
        Z = torch.mm(KX_train, w)
        p = 1.0/(1.0+torch.exp(-Z))
        pred = (p>0.5)*2-1
        correct = pred.eq(Y_train.view_as(pred)).sum().item()
        accu = correct/len(Y_train)

        A = 1.0/(1.0+torch.exp(-Z*Y_train))
        l2_regularization = torch.sum(w ** 2)
        loss = -torch.mean(torch.log(A.view(-1)))+lambda_param*l2_regularization
        if w.grad is not None:
            w.grad.zero_()  # 1

        loss.backward()  # 2
        
        s = (-learning_rate * r).view(-1)
        y = (w.grad - grad_past).view(-1)
        grad_past.copy_(w.grad)
        
        q = torch.zeros(N)
        q.copy_(w.grad.view(-1))
        y_s = s.dot(y)
        rho = 1.0/y_s
        
        
        if len(ss)>=history_size:
            ss.pop(0)
            ys.pop(0)
            rhos.pop(0)
            
        ss.append(s)
        ys.append(y)
        rhos.append(rho)
        
        H_diag = y_s/y.dot(y)
        
        num_old = len(ss)
        al = [None] * history_size
        for i in range(num_old - 1, -1, -1):
            al[i] = ss[i].dot(q) * rhos[i]
            q.add_(ys[i], alpha=-al[i])

        # multiply by initial Hessian
        # r/d is the final direction
        d = r = torch.mul(q, H_diag)
        for i in range(num_old):
            be_i = ys[i].dot(r) * rhos[i]
            r.add_(ss[i], alpha=al[i] - be_i)
        
        r = r.view(N,1)

#         p = 1.0/s.view(-1).dot(y.view(-1))
#         I = torch.eye(N)
#         H = torch.matmul(torch.matmul(I-p*torch.matmul(s,y.t()), H), I-p*torch.matmul(y,s.t()))+p*torch.matmul(s,s.t())

#         r = torch.matmul(H, w.grad)

        if i == 1 or i % 1 == 0:
            t_test_0 = time.time()
            test_loss, test_accu = test_loss_accu(KX_test, Y_test, w, lambda_param)
            t_test += time.time()-t_test_0

            result = "iteration {}: Time:{:.2f}, TrainLoss: {:.4f}, TrainAccu: {:.4f}, TestLoss: {:.4f}, TestAccu: {:.4f}".format(idx, time.time()-t_test-t0,loss, accu, test_loss, test_accu)
            print(result)
            filename = './results/LBFGS_{}_{}.txt'.format(learning_rate, lambda_param)
            with open(filename, 'a') as fp: 
                fp.write(result+'\n')

        if w.grad.norm()<epsilon:
            break

In [17]:
n_iterations = 1000
history_size = 10
learning_rates = [0.01]
lambda_param = 1e-4
epsilon = 1e-5

for learning_rate in learning_rates:
    LBFGS(KX_train2, Y_train2, KX_test2, Y_test2, n_iterations,history_size,  learning_rate, lambda_param, epsilon)

Removed previous results!
iteration 1: Time:0.01, TrainLoss: 0.7338, TrainAccu: 0.2670, TestLoss: 0.7430, TestAccu: 0.3320
iteration 2: Time:0.01, TrainLoss: 0.7285, TrainAccu: 0.3088, TestLoss: 0.7383, TestAccu: 0.3840
iteration 3: Time:0.02, TrainLoss: 0.7283, TrainAccu: 0.3113, TestLoss: 0.7381, TestAccu: 0.3850
iteration 4: Time:0.03, TrainLoss: 0.7272, TrainAccu: 0.3207, TestLoss: 0.7369, TestAccu: 0.3910
iteration 5: Time:0.04, TrainLoss: 0.7248, TrainAccu: 0.3390, TestLoss: 0.7342, TestAccu: 0.4020
iteration 6: Time:0.04, TrainLoss: 0.7212, TrainAccu: 0.3703, TestLoss: 0.7302, TestAccu: 0.4120
iteration 7: Time:0.05, TrainLoss: 0.7167, TrainAccu: 0.4040, TestLoss: 0.7252, TestAccu: 0.4310
iteration 8: Time:0.06, TrainLoss: 0.7116, TrainAccu: 0.4310, TestLoss: 0.7195, TestAccu: 0.4560
iteration 9: Time:0.06, TrainLoss: 0.7061, TrainAccu: 0.4655, TestLoss: 0.7134, TestAccu: 0.4870
iteration 10: Time:0.07, TrainLoss: 0.7003, TrainAccu: 0.4965, TestLoss: 0.7070, TestAccu: 0.5040
ite

iteration 96: Time:0.68, TrainLoss: 0.4071, TrainAccu: 0.8888, TestLoss: 0.4048, TestAccu: 0.8900
iteration 97: Time:0.69, TrainLoss: 0.4052, TrainAccu: 0.8892, TestLoss: 0.4031, TestAccu: 0.8910
iteration 98: Time:0.70, TrainLoss: 0.4033, TrainAccu: 0.8892, TestLoss: 0.4014, TestAccu: 0.8930
iteration 99: Time:0.71, TrainLoss: 0.4015, TrainAccu: 0.8900, TestLoss: 0.3997, TestAccu: 0.8930
iteration 100: Time:0.71, TrainLoss: 0.3996, TrainAccu: 0.8902, TestLoss: 0.3980, TestAccu: 0.8920
iteration 101: Time:0.72, TrainLoss: 0.3978, TrainAccu: 0.8905, TestLoss: 0.3964, TestAccu: 0.8930
iteration 102: Time:0.73, TrainLoss: 0.3960, TrainAccu: 0.8912, TestLoss: 0.3947, TestAccu: 0.8930
iteration 103: Time:0.73, TrainLoss: 0.3942, TrainAccu: 0.8920, TestLoss: 0.3931, TestAccu: 0.8930
iteration 104: Time:0.74, TrainLoss: 0.3924, TrainAccu: 0.8920, TestLoss: 0.3914, TestAccu: 0.8930
iteration 105: Time:0.75, TrainLoss: 0.3906, TrainAccu: 0.8918, TestLoss: 0.3898, TestAccu: 0.8930
iteration 106:

iteration 196: Time:1.40, TrainLoss: 0.2931, TrainAccu: 0.9032, TestLoss: 0.3034, TestAccu: 0.8930
iteration 197: Time:1.41, TrainLoss: 0.2924, TrainAccu: 0.9035, TestLoss: 0.3028, TestAccu: 0.8930
iteration 198: Time:1.42, TrainLoss: 0.2917, TrainAccu: 0.9035, TestLoss: 0.3022, TestAccu: 0.8940
iteration 199: Time:1.43, TrainLoss: 0.2909, TrainAccu: 0.9032, TestLoss: 0.3016, TestAccu: 0.8940
iteration 200: Time:1.43, TrainLoss: 0.2902, TrainAccu: 0.9028, TestLoss: 0.3010, TestAccu: 0.8940
iteration 201: Time:1.44, TrainLoss: 0.2895, TrainAccu: 0.9032, TestLoss: 0.3004, TestAccu: 0.8950
iteration 202: Time:1.45, TrainLoss: 0.2884, TrainAccu: 0.9025, TestLoss: 0.2995, TestAccu: 0.8950
iteration 203: Time:1.46, TrainLoss: 0.2878, TrainAccu: 0.9030, TestLoss: 0.2989, TestAccu: 0.8950
iteration 204: Time:1.46, TrainLoss: 0.2871, TrainAccu: 0.9030, TestLoss: 0.2983, TestAccu: 0.8950
iteration 205: Time:1.47, TrainLoss: 0.2864, TrainAccu: 0.9030, TestLoss: 0.2978, TestAccu: 0.8950
iteration 

iteration 298: Time:2.13, TrainLoss: 0.2538, TrainAccu: 0.9062, TestLoss: 0.2602, TestAccu: 0.9070
iteration 299: Time:2.14, TrainLoss: 0.2535, TrainAccu: 0.9065, TestLoss: 0.2601, TestAccu: 0.9070
iteration 300: Time:2.14, TrainLoss: 0.2532, TrainAccu: 0.9067, TestLoss: 0.2600, TestAccu: 0.9070
iteration 301: Time:2.15, TrainLoss: 0.2530, TrainAccu: 0.9065, TestLoss: 0.2599, TestAccu: 0.9070
iteration 302: Time:2.16, TrainLoss: 0.2527, TrainAccu: 0.9067, TestLoss: 0.2599, TestAccu: 0.9070
iteration 303: Time:2.17, TrainLoss: 0.2525, TrainAccu: 0.9070, TestLoss: 0.2598, TestAccu: 0.9070
iteration 304: Time:2.17, TrainLoss: 0.2522, TrainAccu: 0.9070, TestLoss: 0.2597, TestAccu: 0.9070
iteration 305: Time:2.18, TrainLoss: 0.2520, TrainAccu: 0.9073, TestLoss: 0.2596, TestAccu: 0.9070
iteration 306: Time:2.19, TrainLoss: 0.2518, TrainAccu: 0.9075, TestLoss: 0.2596, TestAccu: 0.9070
iteration 307: Time:2.20, TrainLoss: 0.2515, TrainAccu: 0.9073, TestLoss: 0.2595, TestAccu: 0.9070
iteration 

iteration 398: Time:2.84, TrainLoss: 0.2398, TrainAccu: 0.9097, TestLoss: 0.2584, TestAccu: 0.9030
iteration 399: Time:2.85, TrainLoss: 0.2397, TrainAccu: 0.9097, TestLoss: 0.2584, TestAccu: 0.9030
iteration 400: Time:2.86, TrainLoss: 0.2397, TrainAccu: 0.9100, TestLoss: 0.2585, TestAccu: 0.9030
iteration 401: Time:2.87, TrainLoss: 0.2396, TrainAccu: 0.9100, TestLoss: 0.2585, TestAccu: 0.9030
iteration 402: Time:2.87, TrainLoss: 0.2396, TrainAccu: 0.9100, TestLoss: 0.2585, TestAccu: 0.9030
iteration 403: Time:2.88, TrainLoss: 0.2395, TrainAccu: 0.9100, TestLoss: 0.2585, TestAccu: 0.9030
iteration 404: Time:2.89, TrainLoss: 0.2395, TrainAccu: 0.9100, TestLoss: 0.2586, TestAccu: 0.9030
iteration 405: Time:2.90, TrainLoss: 0.2394, TrainAccu: 0.9100, TestLoss: 0.2586, TestAccu: 0.9030
iteration 406: Time:2.90, TrainLoss: 0.2393, TrainAccu: 0.9100, TestLoss: 0.2586, TestAccu: 0.9030
iteration 407: Time:2.91, TrainLoss: 0.2393, TrainAccu: 0.9100, TestLoss: 0.2586, TestAccu: 0.9030
iteration 

iteration 499: Time:3.56, TrainLoss: 0.2333, TrainAccu: 0.9140, TestLoss: 0.2568, TestAccu: 0.9040
iteration 500: Time:3.57, TrainLoss: 0.2439, TrainAccu: 0.9073, TestLoss: 0.2467, TestAccu: 0.9120
iteration 501: Time:3.58, TrainLoss: 0.2435, TrainAccu: 0.9073, TestLoss: 0.2465, TestAccu: 0.9120
iteration 502: Time:3.59, TrainLoss: 0.2431, TrainAccu: 0.9077, TestLoss: 0.2463, TestAccu: 0.9120
iteration 503: Time:3.59, TrainLoss: 0.2427, TrainAccu: 0.9075, TestLoss: 0.2461, TestAccu: 0.9120
iteration 504: Time:3.60, TrainLoss: 0.2422, TrainAccu: 0.9077, TestLoss: 0.2460, TestAccu: 0.9120
iteration 505: Time:3.61, TrainLoss: 0.2418, TrainAccu: 0.9083, TestLoss: 0.2458, TestAccu: 0.9120
iteration 506: Time:3.62, TrainLoss: 0.2414, TrainAccu: 0.9080, TestLoss: 0.2456, TestAccu: 0.9130
iteration 507: Time:3.62, TrainLoss: 0.2410, TrainAccu: 0.9083, TestLoss: 0.2454, TestAccu: 0.9120
iteration 508: Time:3.63, TrainLoss: 0.2407, TrainAccu: 0.9085, TestLoss: 0.2452, TestAccu: 0.9120
iteration 

iteration 600: Time:4.28, TrainLoss: 0.2255, TrainAccu: 0.9177, TestLoss: 0.2407, TestAccu: 0.9070
iteration 601: Time:4.29, TrainLoss: 0.2254, TrainAccu: 0.9180, TestLoss: 0.2407, TestAccu: 0.9070
iteration 602: Time:4.30, TrainLoss: 0.2253, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 603: Time:4.31, TrainLoss: 0.2253, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 604: Time:4.31, TrainLoss: 0.2252, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 605: Time:4.32, TrainLoss: 0.2252, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 606: Time:4.33, TrainLoss: 0.2251, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 607: Time:4.34, TrainLoss: 0.2251, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9060
iteration 608: Time:4.34, TrainLoss: 0.2251, TrainAccu: 0.9183, TestLoss: 0.2407, TestAccu: 0.9070
iteration 609: Time:4.35, TrainLoss: 0.2250, TrainAccu: 0.9185, TestLoss: 0.2407, TestAccu: 0.9070
iteration 

iteration 700: Time:5.00, TrainLoss: 0.2230, TrainAccu: 0.9207, TestLoss: 0.2417, TestAccu: 0.9070
iteration 701: Time:5.00, TrainLoss: 0.2230, TrainAccu: 0.9210, TestLoss: 0.2417, TestAccu: 0.9070
iteration 702: Time:5.01, TrainLoss: 0.2230, TrainAccu: 0.9210, TestLoss: 0.2417, TestAccu: 0.9070
iteration 703: Time:5.02, TrainLoss: 0.2230, TrainAccu: 0.9210, TestLoss: 0.2417, TestAccu: 0.9070
iteration 704: Time:5.03, TrainLoss: 0.2230, TrainAccu: 0.9213, TestLoss: 0.2417, TestAccu: 0.9070
iteration 705: Time:5.03, TrainLoss: 0.2229, TrainAccu: 0.9215, TestLoss: 0.2417, TestAccu: 0.9070
iteration 706: Time:5.04, TrainLoss: 0.2229, TrainAccu: 0.9215, TestLoss: 0.2417, TestAccu: 0.9070
iteration 707: Time:5.05, TrainLoss: 0.2229, TrainAccu: 0.9215, TestLoss: 0.2418, TestAccu: 0.9070
iteration 708: Time:5.06, TrainLoss: 0.2229, TrainAccu: 0.9215, TestLoss: 0.2418, TestAccu: 0.9070
iteration 709: Time:5.06, TrainLoss: 0.2229, TrainAccu: 0.9215, TestLoss: 0.2418, TestAccu: 0.9070
iteration 

iteration 795: Time:5.72, TrainLoss: 0.2222, TrainAccu: 0.9217, TestLoss: 0.2439, TestAccu: 0.9070
iteration 796: Time:5.73, TrainLoss: 0.2221, TrainAccu: 0.9217, TestLoss: 0.2440, TestAccu: 0.9070
iteration 797: Time:5.73, TrainLoss: 0.2221, TrainAccu: 0.9223, TestLoss: 0.2442, TestAccu: 0.9070
iteration 798: Time:5.74, TrainLoss: 0.2219, TrainAccu: 0.9225, TestLoss: 0.2442, TestAccu: 0.9060
iteration 799: Time:5.75, TrainLoss: 0.2218, TrainAccu: 0.9225, TestLoss: 0.2442, TestAccu: 0.9060
iteration 800: Time:5.75, TrainLoss: 0.2218, TrainAccu: 0.9220, TestLoss: 0.2427, TestAccu: 0.9060
iteration 801: Time:5.76, TrainLoss: 0.2217, TrainAccu: 0.9220, TestLoss: 0.2427, TestAccu: 0.9060
iteration 802: Time:5.77, TrainLoss: 0.2215, TrainAccu: 0.9220, TestLoss: 0.2427, TestAccu: 0.9060
iteration 803: Time:5.78, TrainLoss: 0.2214, TrainAccu: 0.9220, TestLoss: 0.2427, TestAccu: 0.9060
iteration 804: Time:5.78, TrainLoss: 0.2212, TrainAccu: 0.9217, TestLoss: 0.2427, TestAccu: 0.9060
iteration 

iteration 895: Time:6.44, TrainLoss: 0.2116, TrainAccu: 0.9285, TestLoss: 0.2448, TestAccu: 0.9170
iteration 896: Time:6.44, TrainLoss: 0.2115, TrainAccu: 0.9285, TestLoss: 0.2448, TestAccu: 0.9170
iteration 897: Time:6.45, TrainLoss: 0.2114, TrainAccu: 0.9293, TestLoss: 0.2448, TestAccu: 0.9170
iteration 898: Time:6.46, TrainLoss: 0.2113, TrainAccu: 0.9295, TestLoss: 0.2448, TestAccu: 0.9170
iteration 899: Time:6.47, TrainLoss: 0.2112, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 900: Time:6.47, TrainLoss: 0.2112, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 901: Time:6.48, TrainLoss: 0.2111, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 902: Time:6.49, TrainLoss: 0.2110, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 903: Time:6.49, TrainLoss: 0.2109, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 904: Time:6.50, TrainLoss: 0.2109, TrainAccu: 0.9295, TestLoss: 0.2447, TestAccu: 0.9170
iteration 

iteration 994: Time:7.16, TrainLoss: 0.2078, TrainAccu: 0.9320, TestLoss: 0.2463, TestAccu: 0.9190
iteration 995: Time:7.16, TrainLoss: 0.2078, TrainAccu: 0.9320, TestLoss: 0.2463, TestAccu: 0.9190
iteration 996: Time:7.17, TrainLoss: 0.2078, TrainAccu: 0.9320, TestLoss: 0.2463, TestAccu: 0.9190
iteration 997: Time:7.18, TrainLoss: 0.2078, TrainAccu: 0.9320, TestLoss: 0.2464, TestAccu: 0.9190
iteration 998: Time:7.18, TrainLoss: 0.2078, TrainAccu: 0.9320, TestLoss: 0.2464, TestAccu: 0.9190
iteration 999: Time:7.19, TrainLoss: 0.2077, TrainAccu: 0.9320, TestLoss: 0.2464, TestAccu: 0.9190
iteration 1000: Time:7.20, TrainLoss: 0.2077, TrainAccu: 0.9320, TestLoss: 0.2464, TestAccu: 0.9190
