# Softmax  
$ h = WX + b $  
$ p_i = {\exp(h_i)\over\sum_j{\exp(h_j)}} $  
$ L = -\sum_i{T_i\log(p_i)} $  
$ {\partial L\over\partial h_i} = p_i - T_i $  
$ {\partial h_i\over\partial W_i} = X $

$ {\partial L\over\partial h_i}$ 설명    
<img src="img/fig a-5.png">

In [79]:
from load_cifar_10 import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [106]:
class Softmax:
    """Single-layer softmax (multinomial logistic regression) classifier.

    Parameters are stored in ``self.params``:
    ``'W'`` with shape ``(input_dim, num_classes)`` and
    ``'b'`` with shape ``(num_classes,)``.
    """

    def __init__(self, input_dim=3072, num_classes=10):
        """Initialise small random weights and a bias vector of ones.

        Args:
            input_dim: Number of features per sample (default 3072).
            num_classes: Number of output classes (default 10).
        """
        self.params = {}
        self.params['W'] = 0.0001 * np.random.randn(input_dim, num_classes)
        self.params['b'] = np.ones(num_classes)

    def forward(self, X):
        """Return class probabilities for each row of ``X``.

        Args:
            X: 2-D array of shape ``(n, input_dim)``.

        Returns:
            Array of shape ``(n, num_classes)`` whose rows sum to 1.
        """
        W = self.params['W']
        b = self.params['b']
        h = np.dot(X, W) + b
        # Subtracting the row-wise maximum leaves the softmax unchanged but
        # keeps np.exp from overflowing on large logits.
        a = np.exp(h - np.max(h, axis=1, keepdims=True))
        return a / np.sum(a, axis=1, keepdims=True)

    def loss(self, X, T):
        """Return the mean cross-entropy of the predictions on ``X``.

        Args:
            X: 2-D array of shape ``(n, input_dim)``.
            T: 1-D integer array of ``n`` class indices.

        Returns:
            The negative log-likelihood averaged over the ``n`` samples.
            Averaging (rather than summing) matches the ``1/n`` factor
            applied to the gradients in ``gradient``.
        """
        p = self.forward(X)
        n = T.shape[0]
        p_correct = p[np.arange(n), T]
        # A probability can underflow to exactly 0; floor it so the loss
        # stays finite instead of becoming inf.
        p_correct = np.maximum(p_correct, np.finfo(p_correct.dtype).tiny)
        return np.sum(-np.log(p_correct)) / n

    def accuracy(self, X, T):
        """Return the fraction of rows of ``X`` whose argmax class equals ``T``.

        Args:
            X: 2-D array of shape ``(n, input_dim)``.
            T: 1-D integer array of ``n`` class indices.
        """
        # Index of the most probable class for every sample (1-D array).
        predict = np.argmax(self.forward(X), axis=1)
        return float(np.mean(predict == T))

    def gradient(self, X, T, learning_rate=0.0001):
        """Take one gradient-descent step on the mean cross-entropy loss.

        Updates ``self.params['W']`` and ``self.params['b']`` in place and
        returns ``None``.

        Args:
            X: 2-D array of shape ``(n, input_dim)``.
            T: 1-D integer array of ``n`` class indices.
            learning_rate: Step size applied to both gradients.
        """
        p = self.forward(X)
        n = T.shape[0]

        # One-hot encoding of the index labels T. Its width is taken from p
        # (the model's class count), not from max(T), so a batch that does
        # not contain the highest class still lines up with p.
        t = np.zeros_like(p)
        t[np.arange(n), T] = 1

        # dL/dh = p - t; average the per-sample gradients over the batch.
        grads = {}
        grads['W'] = (1 / n) * np.dot(X.T, p - t)
        grads['b'] = (1 / n) * np.sum(p - t, axis=0)

        self.params['W'] -= learning_rate * grads['W']
        self.params['b'] -= learning_rate * grads['b']

In [2]:
def Processing_data(train, test):
    """Flatten both sets to 2-D float64 arrays and centre them on the train mean.

    Each input is copied into a float64 array and reshaped to
    ``(num_samples, -1)``. The per-feature mean of the flattened training
    set is then subtracted from both sets. The inputs are not modified.

    Args:
        train: Array-like of training samples, first axis indexing samples.
        test: Array-like of test samples, first axis indexing samples.

    Returns:
        Tuple ``(train, test)`` of the centred 2-D float64 arrays.
    """
    flattened = []
    for samples in (train, test):
        # np.array always copies, so the caller's data is left untouched.
        as_float = np.array(samples, dtype=np.float64)
        flattened.append(as_float.reshape(as_float.shape[0], -1))
    train_flat, test_flat = flattened

    # The mean comes from the training set only and is reused for the test set.
    feature_mean = train_flat.mean(axis=0)
    return train_flat - feature_mean, test_flat - feature_mean

In [3]:
# Path handed to load_cifar_10_data below; presumably the extracted CIFAR-10
# python-batches directory, relative to the notebook -- confirm.
cifar_10_dir = 'cifar-10-batches-py'

In [109]:
# Load the seven values returned by load_cifar_10_data for the dataset directory.
(
    train_data,
    train_filenames,
    train_labels,
    test_data,
    test_filenames,
    test_labels,
    label_names,
) = load_cifar_10_data(cifar_10_dir)

In [110]:
# Flatten both splits to 2-D float64 arrays and subtract the training-set mean.
train_data, test_data = Processing_data(train_data, test_data)

float64


In [6]:
# Show the shape of every split. All four values are displayed because
# InteractiveShell.ast_node_interactivity was set to "all" earlier.
train_data.shape
train_labels.shape
test_data.shape
test_labels.shape

(50000, 3072)

(50000,)

(10000, 3072)

(10000,)

In [7]:
# Keep only the first 20 training samples and the first 10 test samples
# (data and labels are sliced together so they stay aligned).
train_data, train_labels = train_data[:20], train_labels[:20]
test_data, test_labels = test_data[:10], test_labels[:10]

In [9]:
# NOTE(review): `train` is not assigned in any cell shown here; unless the
# star import provides it, this raises NameError on a fresh run. The recorded
# output is a list of file names, so a *_filenames slice was probably
# intended -- confirm.
train

array([b'leptodactylus_pentadactylus_s_000004.png',
       b'camion_s_000148.png', b'tipper_truck_s_001250.png',
       b'american_elk_s_001521.png', b'station_wagon_s_000293.png',
       b'coupe_s_001735.png', b'cassowary_s_001300.png',
       b'cow_pony_s_001168.png', b'sea_boat_s_001584.png',
       b'tabby_s_001355.png'], dtype='|S40')

In [107]:
# Create a classifier with freshly initialised parameters.
softmax = Softmax()

In [111]:
# Run 50 gradient steps on the whole training array and report the training
# accuracy and loss after every fifth step (i = 0, 5, ..., 45).
for i in range(50):
    softmax.gradient(train_data, train_labels)
    if i % 5 != 0:
        continue
    print("Accuracy : " , softmax.accuracy(train_data, train_labels))
    print("Loss     : " , softmax.loss(train_data, train_labels))

0.15976
28.213080503499956
0.22799999999999998
32.43722659612448
0.23197999999999996
31.042650906750453
0.23141999999999996
27.68210914175905
0.22162000000000004
36.63419428172896
0.21955999999999998
41.70288534450684
0.26961999999999997
26.68141851596112
0.2388
34.00342800448953
0.23462000000000005
31.040791766268374
0.23997999999999997
33.321209963051665
