In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

  """Entry point for launching an IPython kernel.


OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  """Entry point for launching an IPython kernel.


In [8]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [9]:
X_train = X_train / 255
X_test = X_test / 255

In [10]:
X_train.shape

(60000, 784)

In [11]:
X_train, X_valid, y_train, y_valid = X_train[:59000], X_train[59000:], y_train[:59000], y_train[59000:]

In [12]:
X_valid = np.hstack((np.ones((np.size(X_valid, 0),1)),X_valid))

In [13]:
X_valid.shape

(1000, 785)

In [14]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [15]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1))) # reshape(, -1) -> (10, 1) -> (10)
    Y = B @ A
    return Y

In [16]:
def compute_cost(X, T, W, L2_Regular):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    # 기존 cost + L2
    #cost = - (1/N) * np.ones((1,N)) @ ((np.multiply(np.log(softmax(X, W) + epsilon), T))) @ np.ones((K,1)) + L2_Regular
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    return cost

In [17]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [18]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        
        # lambda 적용..?
        valid = predict(X_valid, W)
        
        #lamda = float(sum(valid == np.argmax(y_valid, axis=1)))/ float(len(y_valid))
        #lamda = (1 - float(sum(valid == np.argmax(y_valid, axis=1)))/ float(len(y_valid))) * 0.001
        lamda = 0
        # L2 구하기: W 제곱의 합 * lambda/2n
        
        L2_Regular = (lamda/(2 * len(X))) * np.ones((1, len(W))) @ (W ** 2) @ np.ones((np.size(W, 1), 1))
        L2_norm_d = W * 2 * lamda # L2 norm의 미분값
        
        W = W*(1-L2_norm_d) - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        # W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) 
        cost_history[i] = compute_cost(X_batch, T_batch, W, L2_Regular)
        if i % 1000 == 0:
            #print(cost_history[i][0])
            print("lamda", lamda)
            print("L2_norm_d", np.ones((1, len(W))) @ W ** 2 @ np.ones((np.size(W, 1), 1)))
            print("-------------")

    return (cost_history, W)

In [19]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    for j in range(1000):
        lamda = -0.01 + 0.002 * j
        W = np.zeros((np.size(X, 1), np.size(T, 1)))
        for i in range(iterations):
            j = i % N
            X_batch = X_shuffled[j:j+batch_size]
            T_batch = T_shuffled[j:j+batch_size]
            # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
            if X_batch.shape[0] < batch_size:
                X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
                T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))

            # lambda 적용..?
            #valid = predict(X_valid, W)

            #lamda = float(sum(valid == np.argmax(y_valid, axis=1)))/ float(len(y_valid))
            #lamda = (1 - float(sum(valid == np.argmax(y_valid, axis=1)))/ float(len(y_valid))) * 0.001
            # L2 구하기: W 제곱의 합 * lambda/2n

            L2_Regular = (lamda/(2 * len(X))) * np.ones((1, len(W))) @ (W ** 2) @ np.ones((np.size(W, 1), 1))
            L2_norm_d = W * 2 * lamda # L2 norm의 미분값

            W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + L2_norm_d)
            # W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) 
            cost_history[i] = compute_cost(X_batch, T_batch, W, L2_Regular)
            #if i % 1000 == 0:
                #print(cost_history[i][0])
                
        X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
        T_ = y_test
        y_pred = predict(X_, W)
        score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))
        print("lamda", lamda)
        print("L2_norm_d", np.ones((1, len(W))) @ W ** 2 @ np.ones((np.size(W, 1), 1)))
        print(score)
        print("-------------")

    return (cost_history, W)

In [20]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W, 0)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

Initial Cost is: 2.3024850979937743 

lamda -0.01
L2_norm_d [[140.05919262]]
0.9165
-------------
lamda -0.008
L2_norm_d [[136.57780404]]
0.9163
-------------
lamda -0.006
L2_norm_d [[133.24720557]]
0.916
-------------
lamda -0.004
L2_norm_d [[130.05949527]]
0.9159
-------------
lamda -0.002
L2_norm_d [[127.00722035]]
0.9162
-------------
lamda 0.0
L2_norm_d [[124.08335114]]
0.9158
-------------
lamda 0.002
L2_norm_d [[121.2812566]]
0.9153
-------------
lamda 0.004
L2_norm_d [[118.59468108]]
0.9152
-------------
lamda 0.006
L2_norm_d [[116.01772254]]
0.9151
-------------
lamda 0.008000000000000002
L2_norm_d [[113.54481192]]
0.9147
-------------
lamda 0.01
L2_norm_d [[111.17069367]]
0.9147
-------------
lamda 0.011999999999999999
L2_norm_d [[108.89040747]]
0.9148
-------------
lamda 0.014
L2_norm_d [[106.69927094]]
0.9148
-------------
lamda 0.016
L2_norm_d [[104.59286339]]
0.9147
-------------
lamda 0.018000000000000002
L2_norm_d [[102.56701045]]
0.9146
-------------
lamda 0.0199999999

KeyboardInterrupt: 

In [178]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)
# basic: 0.9144
# L2 Regular: 0.9125
# 0.9166
# 0.9162
# 0.9121
# 0.917 0.8

0.9165
