### 3.3.3. numpy array,  Matrix multiplication

In [None]:
import numpy as np
x = np.array([1, 2])
x.shape

In [None]:
x.ndim

In [None]:
np.size(x)

In [None]:
Y = np.array([[1, 2, 3], [4, 5, 6]])
Y.shape

In [None]:
Y.shape[0]

In [None]:
Y.shape[1]

In [None]:
Y.ndim

In [None]:
np.size(Y)

In [None]:
z = np.dot(x, Y)
print(z)

### 4.2 Loss Function (손실 함수)

In [1]:
import numpy as np
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

Sum of squared errors (SSE, 오차제곱합)

In [2]:
def sum_squares_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``.

    Args:
        y: Predicted values (NumPy array).
        t: Target values, broadcastable against ``y``.

    Returns:
        ``0.5 * sum((y - t) ** 2)`` as a scalar.
    """
    residual = y - t
    squared = residual ** 2
    return 0.5 * np.sum(squared)

In [3]:
sum_squares_error(np.array(y), np.array(t))

0.09750000000000003

In [4]:
y = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]

In [5]:
sum_squares_error(np.array(y), np.array(t))

0.5975

cross entropy error (CEE, 교차 엔트로피 오차)

In [6]:
import numpy as np
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [7]:
def cross_entropy_error(y, t):
    """Cross-entropy error of one prediction ``y`` against target ``t``.

    A small constant (1e-7) is added to ``y`` before taking the logarithm so
    that a probability of exactly 0 does not produce ``-inf``.

    Args:
        y: Predicted probabilities (NumPy array).
        t: Target weights with the same shape as ``y`` (e.g. a one-hot vector).

    Returns:
        ``-sum(t * log(y + 1e-7))`` as a scalar.
    """
    eps = 1e-7
    log_probs = np.log(y + eps)
    weighted = t * log_probs
    return -np.sum(weighted)

In [11]:
np.log(2) # natural logarithm (base e)

0.6931471805599453

In [13]:
np.log2(2.) # base-2 logarithm (unit: bit); the "nat" is the unit when the natural logarithm np.log is used instead

1.0

In [8]:
cross_entropy_error(np.array(y), np.array(t))

0.510825457099338

In [9]:
y = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]

In [10]:
cross_entropy_error(np.array(y), np.array(t))

2.302584092994546

### 4.2.3 Mini batch learning

In [14]:
import sys, os
sys.path.append(os.pardir) 
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist

In [15]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

In [16]:
print(x_train.shape) 

(60000, 784)


In [17]:
print(t_train.shape) 

(60000, 10)


Randomly pick 10 training samples (the batch size) out of the 60,000 training samples.

In [18]:
# Number of training samples = length of the first axis of x_train.
train_size = x_train.shape[0]
batch_size = 10
# Draw batch_size random indices in [0, train_size). np.random.choice samples
# with replacement by default, so the same index may appear more than once.
batch_mask = np.random.choice(train_size, batch_size)
# Index both arrays with the same index array so inputs and labels stay paired.
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]

In [19]:
print(x_batch.shape, t_batch.shape)

(10, 784) (10, 10)


In [20]:
np.random.choice(60000, 10)

array([52464, 52403, 56279, 23228, 19908, 52572, 47463, 20971, 34712,
       12812])

### 4.2.4 (배치용) 교차엔트로피 오차 구현하기  220410

In [21]:
import numpy as np

In [22]:
def predict(network, x):
    """Run a forward pass through a three-layer network.

    Args:
        network: Mapping with weight entries 'W1', 'W2', 'W3' and bias
            entries 'b1', 'b2', 'b3'.
        x: Input array passed to ``np.dot`` together with 'W1'.

    Returns:
        The result of ``softmax`` applied to the last affine layer; the two
        earlier layers use ``sigmoid`` as their activation.
    """
    weights = [network[key] for key in ('W1', 'W2', 'W3')]
    biases = [network[key] for key in ('b1', 'b2', 'b3')]

    # Hidden layers: affine transform followed by sigmoid.
    activation = x
    for weight, bias in zip(weights[:-1], biases[:-1]):
        activation = sigmoid(np.dot(activation, weight) + bias)

    # Output layer: affine transform followed by softmax.
    return softmax(np.dot(activation, weights[-1]) + biases[-1])

In [23]:
import pickle
# Load the network parameters from a pickle file in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load "sample_weight.pkl" from a source you trust.
with open("sample_weight.pkl", 'rb') as f:
    network = pickle.load(f)

In [24]:
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 
from dataset.mnist import load_mnist
from common.functions import sigmoid, softmax

In [25]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

In [26]:
train_size = x_train.shape[0]
# Original note: "If 1, y.ndim becomes 1, else y.ndim is 2."
# NOTE(review): np.random.choice(train_size, 1) still returns a length-1 index
# array, so x_batch would be (1, 784) and stay 2-D — TODO confirm. A 1-D y
# appears when a single row such as y_batch[0] is selected instead.
batch_size = 10
# Random indices in [0, train_size); sampled with replacement by default.
batch_mask = np.random.choice(train_size, batch_size)
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]

In [27]:
x_batch.shape

(10, 784)

In [28]:
t_batch.shape

(10, 10)

In [29]:
y_batch = predict(network, x_batch)

In [30]:
print(x_batch.size, t_batch.size, y_batch.size)

7840 100 100


In [31]:
print(x_batch.shape, t_batch.shape, y_batch.shape)

(10, 784) (10, 10) (10, 10)


In [32]:
t_batch

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [33]:
y_batch

array([[1.00862482e-04, 3.17373633e-05, 6.39175298e-03, 1.05794439e-04,
        9.15013433e-01, 6.08591945e-04, 2.91680521e-03, 2.60034273e-03,
        1.14935646e-02, 6.07372075e-02],
       [1.10671303e-08, 9.94798899e-01, 8.18546978e-04, 2.21788161e-03,
        3.42789463e-05, 2.78159219e-04, 5.84611262e-05, 6.80758094e-04,
        1.02012244e-03, 9.28153313e-05],
       [9.94641916e-04, 3.54993099e-05, 1.77187268e-02, 1.00136122e-05,
        4.95269716e-01, 2.15609302e-03, 4.79867548e-01, 1.48523785e-03,
        6.57293422e-04, 1.80520117e-03],
       [1.77723775e-03, 4.16408002e-05, 4.42623626e-03, 7.75701919e-05,
        3.81212652e-04, 6.66161021e-03, 9.86588180e-01, 3.39906546e-06,
        4.14990318e-05, 1.26156328e-06],
       [9.10279414e-06, 9.59375873e-02, 8.21403742e-01, 1.56256836e-02,
        3.95061288e-05, 3.14915233e-04, 5.02072871e-02, 7.66992707e-06,
        1.64407101e-02, 1.37741745e-05],
       [7.58109731e-04, 9.40936338e-03, 5.60527034e-02, 1.29904679e-03,
   

In [34]:
# y = y_batch[0]   # ndim of y_batch[0] is 1 and batch_size = 1
y = y_batch
t = t_batch

In [35]:
y.shape

(10, 10)

In [36]:
y.ndim

2

In [37]:
t.ndim

2

In [38]:
print(t)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [39]:
t

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [40]:
y

array([[1.00862482e-04, 3.17373633e-05, 6.39175298e-03, 1.05794439e-04,
        9.15013433e-01, 6.08591945e-04, 2.91680521e-03, 2.60034273e-03,
        1.14935646e-02, 6.07372075e-02],
       [1.10671303e-08, 9.94798899e-01, 8.18546978e-04, 2.21788161e-03,
        3.42789463e-05, 2.78159219e-04, 5.84611262e-05, 6.80758094e-04,
        1.02012244e-03, 9.28153313e-05],
       [9.94641916e-04, 3.54993099e-05, 1.77187268e-02, 1.00136122e-05,
        4.95269716e-01, 2.15609302e-03, 4.79867548e-01, 1.48523785e-03,
        6.57293422e-04, 1.80520117e-03],
       [1.77723775e-03, 4.16408002e-05, 4.42623626e-03, 7.75701919e-05,
        3.81212652e-04, 6.66161021e-03, 9.86588180e-01, 3.39906546e-06,
        4.14990318e-05, 1.26156328e-06],
       [9.10279414e-06, 9.59375873e-02, 8.21403742e-01, 1.56256836e-02,
        3.95061288e-05, 3.14915233e-04, 5.02072871e-02, 7.66992707e-06,
        1.64407101e-02, 1.37741745e-05],
       [7.58109731e-04, 9.40936338e-03, 5.60527034e-02, 1.29904679e-03,
   

In [41]:
if y.ndim == 1:      
    t = t.reshape(1, t.size)  
    y = y.reshape(1, y.size) 
batch_size = y.shape[0] 
logsum = 0
delta = 1e-7

In [42]:
y.ndim

2

In [43]:
for i in range(batch_size):
    logsum += -np.sum(t[i,:] * np.log(y[i,:] + delta))

In [44]:
logsum

10.181800874881446

In [45]:
logsum / batch_size

1.0181800874881446

In [48]:
def cross_entropy_error(y, t):  # t is one hot encoded.
    # NOTE: this definition fails when called (see the TypeError recorded below
    # the next cell); the following definition replaces len(...) with
    # range(batch_size).
    if y.ndim == 1:
        # Promote a single sample to a batch of one row.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    logsum = 0
    delta = 1e-7  # added to y before np.log so that log(0) is never taken
    for i in len(batch_size):  # BUG: batch_size is an int -> TypeError: object of type 'int' has no len(); should be range(batch_size)
        logsum += -np.sum(t[i,:] * np.log(y[i,:] + delta))
    return logsum / batch_size

In [49]:
cross_entropy_error(y_batch, t_batch)

TypeError: object of type 'int' has no len()

In [28]:
def cross_entropy_error(y, t):  # t is one hot encoded.
    """Mean cross-entropy error over a batch, accumulated row by row.

    Args:
        y: Predicted probabilities, shape (batch, classes) or (classes,).
        t: One-hot targets with the same shape as ``y``.

    Returns:
        The per-row cross-entropy errors summed and divided by the number of
        rows. 1e-7 is added to ``y`` before the logarithm.
    """
    if y.ndim == 1:
        # A single sample is treated as a batch containing one row.
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)
    n_rows = y.shape[0]
    eps = 1e-7
    total = sum(
        -np.sum(t[row, :] * np.log(y[row, :] + eps))
        for row in range(n_rows)
    )
    return total / n_rows

In [29]:
batch_size

10

In [30]:
t

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [31]:
cross_entropy_error(y_batch, t_batch)

0.2997706563677639

### The cross_entropy_error function, slightly modified (vectorized, no explicit loop)

In [32]:
import sys, os
sys.path.append(os.pardir) 
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist

In [33]:
def cross_entropy_error(y, t):  # t is one hot encoded.
    """Mean cross-entropy error over a batch, computed without a Python loop.

    Args:
        y: Predicted probabilities, shape (batch, classes) or (classes,).
        t: One-hot targets with the same shape as ``y``.

    Returns:
        ``-sum(t * log(y + 1e-7))`` divided by the number of rows in ``y``.
    """
    if y.ndim == 1:
        # A single sample is treated as a batch containing one row.
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)
    n_rows = y.shape[0]
    eps = 1e-7
    log_probs = np.log(y + eps)
    return -np.sum(t * log_probs) / n_rows

In [34]:
print(x_batch.shape, t_batch.shape, y_batch.shape)

(10, 784) (10, 10) (10, 10)


In [35]:
print(x_batch.size, t_batch.size, y_batch.size)

7840 100 100


In [36]:
t = t_batch
y = y_batch

In [37]:
print(t)

[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [38]:
print(y)

[[1.15293276e-03 7.07504705e-06 2.51241704e-06 1.03445305e-03
  1.79391577e-06 9.95387137e-01 1.05647159e-05 9.21523167e-07
  2.39434815e-03 8.24990366e-06]
 [2.39063389e-04 3.85447202e-04 1.44925639e-01 2.57859435e-02
  4.32290100e-02 6.03880268e-04 3.08554241e-04 1.32424980e-01
  4.03055325e-02 6.11791968e-01]
 [2.36499945e-05 5.64022874e-03 5.88911993e-04 1.63000040e-02
  2.77122129e-02 9.86170955e-03 4.56593480e-05 1.41162008e-01
  2.35425942e-02 7.75123060e-01]
 [2.52238951e-05 1.28152617e-03 3.42768314e-03 3.77341220e-03
  2.04070332e-03 1.31857842e-02 4.56919806e-04 1.32784135e-05
  9.74742293e-01 1.05306611e-03]
 [8.72282908e-06 9.94733455e-06 1.37058887e-04 7.79044058e-05
  5.58543086e-01 1.60983545e-04 3.02890054e-04 2.43881438e-03
  1.23003405e-03 4.37090486e-01]
 [4.00488498e-05 2.73894670e-06 1.62178214e-04 3.73545627e-05
  1.61923375e-02 5.42967937e-05 1.34163483e-05 6.60358835e-03
  9.01244057e-04 9.75992739e-01]
 [2.03989519e-04 6.41779334e-05 3.03544360e-03 3.57602767e

In [39]:
print(t.size)

100


In [40]:
cross_entropy_error(y, t)

0.2997706563677639

In [41]:
import numpy as np
def cross_entropy_error(y, t):  # t is label encoded.
    """Mean cross-entropy error when ``t`` holds integer class labels.

    ``t`` is used as an index into ``y``, so it must have an integer (or
    boolean) dtype; a float one-hot matrix raises IndexError. No small
    constant is added before the logarithm in this version.

    Args:
        y: Predicted probabilities, shape (batch, classes) or (classes,).
        t: Integer class label for each row of ``y``.

    Returns:
        The negative log of each row's probability at its label, summed and
        divided by the number of rows.
    """
    if y.ndim == 1:
        # A single sample is treated as a batch containing one row.
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)
    n_rows = y.shape[0]
    # Fancy indexing picks y[i, t[i]] for every row i.
    row_ids = np.arange(n_rows)
    picked = y[row_ids, t]
    return -np.sum(np.log(picked)) / n_rows

In [42]:
cross_entropy_error(y, t)

IndexError: arrays used as indices must be of integer (or boolean) type

### When t is label encoded:

In [43]:
t.shape

(10, 10)

In [54]:
t.ndim

2

In [46]:
t

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [49]:
t_label = np.argmax(t, axis=1)

In [50]:
t_label

array([5, 7, 9, 8, 4, 9, 6, 0, 3, 9], dtype=int64)

In [52]:
t_label.shape

(10,)

In [53]:
t_label.ndim

1

In [51]:
cross_entropy_error(y, t_label)

0.2997708797454834

### 4.3 Numerical differentiation

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def numerical_diff(f, x):
    """Forward-difference derivative of ``f`` at ``x`` (poor example).

    The step of 1e-50 is far below floating-point resolution for ordinary
    ``x``: ``x + step`` rounds back to ``x``, so the numerator becomes 0 and
    the function returns 0 instead of the derivative.
    """
    step = 1e-50
    rise = f(x + step) - f(x)
    return rise / step

In [None]:
np.float32(1e-50)

In [None]:
def numerical_diff(f, x, h=1e-4):
    """Central-difference estimate of the derivative of ``f`` at ``x``.

    Args:
        f: Function of one variable.
        x: Point at which to differentiate.
        h: Half-width of the difference interval. The default 1e-4 matches
            the previously hard-coded step, so two-argument calls are
            unchanged.

    Returns:
        ``(f(x + h) - f(x - h)) / (2 * h)``.
    """
    return (f(x+h) - f(x-h)) / (2*h)

In [None]:
def function_1(x):
    """Evaluate the example quadratic ``0.01 * x**2 + 0.1 * x``."""
    quadratic_term = 0.01 * x ** 2
    linear_term = 0.1 * x
    return quadratic_term + linear_term

In [None]:
x = np.arange(0.0, 20.0, 0.1)
y = function_1(x)
plt.xlabel("x")
plt.ylabel("f(x)")
plt.plot(x, y)
plt.show()

In [None]:
numerical_diff(function_1, 5)

In [None]:
numerical_diff(function_1, 10)

Partial derivatives

In [None]:
def function_2(x):
    """Return ``x[0]**2 + x[1]**2`` for a two-element sequence ``x``.

    For a two-element NumPy array this equals ``np.sum(x**2)``.
    """
    first, second = x[0], x[1]
    return first ** 2 + second ** 2

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np


fig = plt.figure()
# Figure.gca(projection='3d') was deprecated in Matplotlib 3.4 and is rejected
# by later releases; add_subplot is the supported way to create 3D axes.
ax = fig.add_subplot(111, projection='3d')

# Make data: z = x^2 + y^2 sampled on a 0.25-spaced grid over [-3, 3).
X = np.arange(-3, 3, 0.25)
Y = np.arange(-3, 3, 0.25)
X, Y = np.meshgrid(X, Y)
R = (X**2 + Y**2)
Z = (R)

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.set_zlim(-1.01, 15.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()


In [None]:
# Partial derivative of f with respect to x0 at (x0, x1) = (3, 4):
# x1 is held fixed at 4.0, leaving a function of x0 only.
def function_tmp1(x0):
    """Return ``x0*x0 + 4.0**2.0`` (function_2 with x1 fixed at 4.0)."""
    fixed_term = 4.0 ** 2.0
    return x0 * x0 + fixed_term

In [None]:
numerical_diff(function_tmp1, 3.0)

In [None]:
# Partial derivative of f with respect to x1 at (x0, x1) = (3, 4):
# x0 is held fixed at 3.0, leaving a function of x1 only.
def function_tmp2(x1):
    """Return ``3.0**2.0 + x1*x1`` (function_2 with x0 fixed at 3.0)."""
    fixed_term = 3.0 ** 2.0
    return fixed_term + x1 * x1

In [None]:
numerical_diff(function_tmp2, 4.0)

### lambda function

파이썬에서 "lambda" 는 런타임에 생성해서 사용할 수 있는 익명 함수 입니다.

In [None]:
def inc(n):
    """Return a one-argument function that adds ``n`` to its input."""
    def add_n(x):
        return x + n
    return add_n

In [None]:
f = inc(2)
g = inc(4)
print(f(12))

In [None]:
print(g(12))

In [None]:
print(inc(2)(12))

### 4.4 Gradient

In [None]:
import numpy as np
import matplotlib.pylab as plt

In [None]:
x = np.array([3.0, 4.0])

In [None]:
grad = np.zeros_like(x)
print(grad)

In [None]:
x0 = np.arange(-2, 2.5, 0.25)
x1 = np.arange(-2, 2.5, 0.25)
X, Y = np.meshgrid(x0, x1)
np.size(Y)

In [None]:
4.5/0.25 

In [None]:
4.5/0.25* 4.5/0.25

In [None]:
X = X.flatten()
Y = Y.flatten()
np.size(X)

$\left( \frac{\partial f}{\partial x_0},  \frac{\partial f}{\partial x_1},  \frac{\partial f}{\partial x_2}, \cdots    \right)$ 처럼 모든 변수의 편미분을 벡터로 정리한 것을 gradient (기울기) 라고 함.

In [None]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Every element of ``x`` is perturbed in turn by +h and -h (h = 1e-4),
    ``f(x)`` is evaluated each time, and the element is restored afterwards.
    The perturbation is written into ``x`` itself, so ``f`` may read the array
    through another reference. Arrays of any number of dimensions are
    supported (the earlier flat-index loop only handled 1-D arrays).

    Args:
        f: Function called as ``f(x)`` that returns a scalar.
        x: NumPy array to differentiate with respect to; it is modified
            temporarily and restored before returning.

    Returns:
        Array created with ``np.zeros_like(x)`` holding the partial
        derivative for each element.
    """
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x) # array with the same shape as x

    # np.ndindex yields every index tuple of x.shape in C order, so this
    # visits 1-D arrays in the same order as a plain range(x.size) loop.
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]

        # f(x + h)
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)

        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val # restore the original value

    return grad

In [None]:
def function_2(x):
    """Sum of squares of ``x``.

    A 1-D array is summed over all elements to a scalar; any other array is
    summed along axis 1.
    """
    reduce_axis = None if x.ndim == 1 else 1
    return np.sum(x ** 2, axis=reduce_axis)

In [None]:
numerical_gradient(function_2, np.array([3.0, 4.0]))

In [None]:
numerical_gradient(function_2, np.array([0.0, 4.0]))

In [None]:
numerical_gradient(function_2, np.array([3.0, 0.0]))

### 4.4.1 Gradient descent (경사하강법)

In [None]:
import numpy as np
import matplotlib.pylab as plt
from gradient_2d import numerical_gradient

$x_i = x_i -\eta \frac{\partial f}{\partial x_i}$

In [None]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimize ``f`` by repeated gradient steps starting from ``init_x``.

    Each step applies ``x = x - lr * numerical_gradient(f, x)``.

    Args:
        f: Function to minimize; passed to ``numerical_gradient``.
        init_x: Starting point.
        lr: Learning rate (step-size multiplier).
        step_num: Number of update steps.

    Returns:
        The position after ``step_num`` updates.
    """
    x = init_x
    for _ in range(step_num):
        x = x - lr * numerical_gradient(f, x)
    return x

In [None]:
def function_2(x):
    """Return the sum of the squares of the first two elements of ``x``."""
    x0_squared = x[0] ** 2
    x1_squared = x[1] ** 2
    return x0_squared + x1_squared

In [None]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x, lr=0.1, step_num=100)

Learning rate : lr = 10.0

In [None]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x, lr=10.0, step_num=100)

Learning rate : lr = 1e-10

In [None]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x, lr=1e-10, step_num=100)

### 4.4.2 신경망에서의 기울기

In [None]:
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
import numpy as np
from common.functions import softmax, cross_entropy_error
from common.gradient import numerical_gradient

In [None]:
class simpleNet:
    """Single-layer network with one 2x3 weight matrix and no bias."""

    def __init__(self):
        # Initialize the weights from a standard normal distribution.
        self.W = np.random.randn(2, 3)

    def predict(self, x):
        """Return the scores ``np.dot(x, W)``."""
        scores = np.dot(x, self.W)
        return scores

    def loss(self, x, t):
        """Cross-entropy error of ``softmax(predict(x))`` against ``t``."""
        probabilities = softmax(self.predict(x))
        return cross_entropy_error(probabilities, t)

In [None]:
net = simpleNet()
print(net.W)

In [None]:
x = np.array([0.6, 0.9])
p = net.predict(x)
print(p)

In [None]:
np.argmax(p)

In [None]:
t = np.array([0, 0, 1])  # 정답 레이블
net.loss(x, t)

Obtaining gradient of W

In [None]:
def f(W):
    """Return the current loss of ``net`` on ``(x, t)``.

    The argument ``W`` is not used in the body; the loss is always computed
    through the notebook-level ``net``, ``x`` and ``t``.
    """
    current_loss = net.loss(x, t)
    return current_loss

In [None]:
dW = numerical_gradient(f, net.W)
print(dW)

Using lambda

In [None]:
# Same as the def-based f, written as a lambda: the argument `w` is ignored
# and the loss is re-evaluated through net.
# presumably numerical_gradient perturbs net.W in place so that net.loss sees
# each change — verify against common.gradient
f = lambda w:net.loss(x, t)
dW = numerical_gradient(f, net.W)
print(dW)

### 4.5.1 2층 신경망 클래스 구현하기

In [None]:
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
from common.functions import *
from common.gradient import numerical_gradient

In [None]:
class TwoLayerNet:          # ch04/two_layer_net.py
    """Two-layer fully connected network: sigmoid hidden layer, softmax output.

    Parameters live in ``self.params`` under the keys 'W1', 'b1', 'W2', 'b2'.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are Gaussian noise scaled by weight_init_std; biases are zeros.
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        first_bias = np.zeros(hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        second_bias = np.zeros(output_size)
        self.params = {
            'W1': first_weights,
            'b1': first_bias,
            'W2': second_weights,
            'b2': second_bias,
        }

    def predict(self, x):
        """Forward pass: affine -> sigmoid -> affine -> softmax."""
        params = self.params
        hidden = sigmoid(np.dot(x, params['W1']) + params['b1'])
        scores = np.dot(hidden, params['W2']) + params['b2']
        return softmax(scores)

    # x: input data, t: target labels
    def loss(self, x, t):
        """Cross-entropy error of the prediction for ``x`` against ``t``."""
        return cross_entropy_error(self.predict(x), t)

    def accuracy(self, x, t):
        """Fraction of rows whose predicted argmax equals the argmax of ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        expected = np.argmax(t, axis=1)
        n_correct = np.sum(predicted == expected)
        return n_correct / float(x.shape[0])

    # x: input data, t: target labels
    def numerical_gradient(self, x, t):
        """Gradients of the loss for every parameter via the module-level
        ``numerical_gradient`` helper."""
        def loss_W(W):
            # The argument is unused; the loss is read through self.params.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Gradients of the loss for every parameter via backpropagation."""
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        n_rows = x.shape[0]

        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        y = softmax(np.dot(z1, W2) + b2)

        # backward: start from (y - t) averaged over the batch
        dy = (y - t) / n_rows
        grad_W2 = np.dot(z1.T, dy)
        grad_b2 = np.sum(dy, axis=0)

        grad_z1 = np.dot(dy, W2.T)
        grad_a1 = sigmoid_grad(a1) * grad_z1
        grad_W1 = np.dot(x.T, grad_a1)
        grad_b1 = np.sum(grad_a1, axis=0)

        return {'W2': grad_W2, 'b2': grad_b2, 'W1': grad_W1, 'b1': grad_b1}

In [None]:
net = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)
net.params['W1'].shape

In [None]:
net.params['b1'].shape

In [None]:
net.params['W2'].shape

In [None]:
net.params['b2'].shape

In [None]:
x = np.random.rand(1, 784)  # Dummy input data (1장 분량)
y = net.predict(x)

In [None]:
print(np.size(y))

In [None]:
x = np.random.rand(100, 784)  # Dummy input data (100장 분량)
y = net.predict(x)

In [None]:
print(np.size(y))

In [None]:
x = np.random.rand(100, 784)  # Dummy input  data (100장 분량)
t = np.random.rand(100, 10)   # Dummy target data (100장 분량)

In [None]:
grads = net.numerical_gradient(x, t)  # gradient (기울기) 계산

In [None]:
grads['W1'].shape

In [None]:
grads['b1'].shape

In [None]:
grads['W2'].shape

In [None]:
grads['b2'].shape

### 4.5.2 ~ 4.5.3 미니배치 학습 구현 및 시험 데이터로 평가

In [None]:
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

In [None]:
# 데이터 읽기
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

In [None]:
# Hyperparameters
iters_num = 10000  # number of training iterations; adjust as needed
train_size = x_train.shape[0]
batch_size = 100   # mini-batch size
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []
#%%
# Iterations per epoch, e.g. 60000 / 100 = 600.
# Integer division keeps this an int, so the per-epoch check
# `i % iter_per_epoch == 0` also fires regularly when train_size is not an
# exact multiple of batch_size (a float such as 468.75 would rarely divide i).
iter_per_epoch = max(train_size // batch_size, 1)

### 1 epoch 는 학습에서 training data (훈련 데이터)를 모두 소진했을 때의 횟수. 

예를 들면, 10000개의 training data를 크기가 200인 mini batch (미니배치)로 학습할 경우, stochastic gradient descent (확률적 경사하강법)을 50회 (10000 / 200 = 50) 반복하면 모든 훈련 데이터를 소진한다고 보는 것입니다.

이 경우 50회가 1 epoch가 됨.

In [None]:
print(train_size)

In [None]:
print(iter_per_epoch)

In [None]:
for i in range(iters_num):
    # Draw a mini-batch: batch_size random indices into the training set.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradients (the numerical version is kept commented out as an alternative).
    #grad = network.numerical_gradient(x_batch, t_batch)
    grad = network.gradient(x_batch, t_batch)
    
    # Update every parameter in place with one gradient-descent step.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Record training progress: the loss is evaluated after the update, on the same mini-batch.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    # Whenever i is a multiple of iter_per_epoch, record accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

In [None]:
print(np.size(train_loss_list))

In [None]:
plt.plot(train_loss_list) # 171223 15:30 Sa. 
plt.show()

In [None]:
plt.plot(train_loss_list[0:1000]) # 200603 12:45 We. 
plt.show()

In [None]:
print(len(train_acc_list))

In [None]:
train_acc_list

In [None]:
# Plot train and test accuracy against the index of each recorded measurement.
# NOTE(review): `markers` is defined but not used by the plot calls below.
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()