# Tobig's 14기 2주차 Optimization 과제
### Made by 이지용

# Gradient Descent 구현하기

### 1) "..." 표시되어 있는 빈 칸을 채워주세요  
### 2) 강의내용과 코드에 대해 공부한 내용을 적어서 과제를 채워주세요

In [80]:
import pandas as pd
import numpy as np
import random

In [81]:
# Load the assignment data set from a CSV file located relative to the working directory.
data = pd.read_csv('assignment_2.csv')
# Show the first rows as a quick sanity check of the loaded table.
data.head()

Unnamed: 0,Label,bias,experience,salary
0,1,1,0.7,48000
1,0,1,1.9,48000
2,1,1,2.5,60000
3,0,1,4.2,63000
4,0,1,6.0,76000


## Train Test 데이터 나누기
### 데이터셋을 train/test로 나눠주는 메소드  
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
# Features are every column after the first; the target is the first column.
# A quarter of the rows is held out for testing, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], test_size=0.25, random_state = 0)

In [84]:
# Inspect the dimensions of the four splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150,), (50,))

## Scaling  

experience와 salary의 단위, 평균, 분산이 크게 차이나므로 scaler를 사용해 단위를 맞춰줍니다. 

In [85]:
# When using gradient descent, all features should share the same scale so that convergence takes less time.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Save the bias column before scaling and renumber its index from 0 so it lines up with the rebuilt frame.
bias_train = X_train["bias"]
bias_train = bias_train.reset_index()["bias"]
# Fit the scaler on the training features and rebuild a DataFrame with the same column names.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
# Put the saved bias values back in place of the scaled bias column.
X_train["bias"] = bias_train
X_train.head()

Unnamed: 0,bias,experience,salary
0,1,0.187893,-1.143335
1,1,1.185555,0.043974
2,1,-0.310938,-0.351795
3,1,-1.629277,-1.34122
4,1,-1.3086,0.043974


이때 scaler는 X_train에 fit 해주시고, fit한 scaler를 X_test에 적용시켜줍니다.  
똑같이 X_test에다 fit하면 안돼요!

In [86]:
# Save the bias column before scaling and renumber its index from 0 so it lines up with the rebuilt frame.
bias_test = X_test["bias"]
bias_test = bias_test.reset_index()["bias"]
# Only transform here: the scaler keeps the statistics it was fitted with earlier and is not re-fitted on the test split.
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
# Put the saved bias values back in place of the scaled bias column.
X_test["bias"] = bias_test
X_test.head()

Unnamed: 0,bias,experience,salary
0,1,-1.344231,-0.615642
1,1,0.50857,0.307821
2,1,-0.310938,0.571667
3,1,1.363709,1.956862
4,1,-0.987923,-0.747565


In [87]:
# Number of parameters: the length of the row labelled 0, i.e. one per column of X_train.
# NOTE(review): .loc[0] is a label lookup, so this relies on X_train having a row labelled 0.
N = len(X_train.loc[0])

In [88]:
# Set the initial parameters to random values (random.random() draws from [0, 1)).
parameters = np.array([random.random() for i in range(N)])
parameters

array([0.23501448, 0.6370105 , 0.59580049])

### * LaTeX   

Jupyter Notebook은 LaTeX 문법으로 수식 입력을 지원하고 있습니다.  
http://triki.net/apps/3466  
https://jjycjnmath.tistory.com/117

## Logistic Function

## $p = {1 \over 1 + e^{-(\beta_{0} + \beta_{1}x)}}$

In [89]:
def logistic(X, parameters):
    """Return the logistic (sigmoid) probability for a single sample.

    Computes ``1 / (1 + exp(-z))`` where ``z`` is the dot product of the
    feature values in ``X`` and the coefficients in ``parameters``.

    Args:
        X: Feature values of one sample (list, NumPy array or pandas Series),
            ordered to match ``parameters``.
        parameters: Model coefficients, one per feature.

    Returns:
        The predicted probability as a NumPy float.

    Raises:
        ValueError: If ``X`` and ``parameters`` differ in length.
    """
    # Work on plain arrays so values are matched by position. Indexing a
    # label-indexed pandas Series with an integer only works through a
    # deprecated positional fallback.
    features = np.asarray(X, dtype=float)
    weights = np.asarray(parameters, dtype=float)
    z = np.dot(features, weights)
    p = 1 / (1 + np.exp(-z))

    return p

In [90]:
# Predicted probability for the training row at position 1 under the current parameters.
logistic(X_train.iloc[1], parameters)

0.734275759414701

## Objective Function

Objective Function : 목적함수는 Gradient Descent를 통해 최적화하고자 하는 함수입니다.  
로지스틱 회귀의 목적함수를 작성해주세요
## $l(p) = -{1 \over N} \sum_{i=1}^{N}{[y_{i}\log(p_{i}) + (1-y_{i})\log(1-p_{i})]}$  
  
+ 전체 데이터를 사용하는 batch gradient descent의 경우 N으로 나누어 주는 것이 맞다.

In [91]:
def cross_entropy_i(X, y, parameters) :
    """Return the cross-entropy loss of a single sample.

    Args:
        X: Feature values of one sample, passed on to ``logistic``.
        y: Label of the sample (the formula treats it as 0 or 1).
        parameters: Model coefficients, passed on to ``logistic``.

    Returns:
        ``-(y * log(p) + (1 - y) * log(1 - p))`` where ``p`` is the value
        returned by ``logistic(X, parameters)``.
    """
    p = logistic(X, parameters)  # predicted probability from the logistic function
    # Both logarithms come from NumPy; there is no bare ``log`` in scope.
    loss = y * np.log(p) + (1 - y) * np.log(1 - p)
    return -loss

In [92]:
def cross_entropy(X_set, y_set, parameters) :
    """Return the mean cross-entropy loss over a data set.

    Args:
        X_set: Feature table; rows are read by position with ``.iloc``.
        y_set: Labels, read by position with ``.iloc`` to pair with the rows
            of ``X_set``.
        parameters: Model coefficients, passed on to ``logistic``.

    Returns:
        The negated sum of the per-row log-likelihood terms divided by the
        number of rows in ``X_set``.
    """
    n_rows = X_set.shape[0]

    def log_likelihood(row):
        # Log-likelihood contribution of the sample at position ``row``.
        target = y_set.iloc[row]
        prob = logistic(X_set.iloc[row, :], parameters)
        return target * np.log(prob) + (1 - target) * np.log(1 - prob)

    total = sum(log_likelihood(row) for row in range(n_rows))
    return -total / n_rows

In [93]:
# Mean cross-entropy on the test split under the current parameters.
cross_entropy(X_test, y_test, parameters)

1.0733776087172844

## Gradient of Cross Entropy

## ${\partial\over{\partial \theta_j}}l(p)= -{1 \over N}\sum_{i=1}^{N}{(y_{i}-p_{i})x_{ij}}$  
  
+ 전체 데이터를 사용하는 batch gradient descent의 경우 N으로 나누어 주는 것이 맞다.

In [94]:
# Partial derivative of the cross-entropy with respect to theta_j for one sample.
def get_gradient_ij_cross_entropy(X, y, parameters, j):
    """Return the gradient of one sample's cross-entropy w.r.t. parameter ``j``.

    Args:
        X: Feature values of one sample (list, NumPy array or pandas Series).
        y: Label of the sample.
        parameters: Model coefficients, passed on to ``logistic``.
        j: Position of the parameter (and matching feature) to differentiate by.

    Returns:
        ``-(y - p) * x_j`` where ``p`` is ``logistic(X, parameters)`` and
        ``x_j`` is the feature at position ``j``.
    """
    p = logistic(X, parameters)
    # Read feature j by position. Integer indexing on a label-indexed pandas
    # Series only works through a deprecated positional fallback.
    x_j = np.asarray(X)[j]
    gradient = (y - p) * x_j
    return -gradient

In [95]:
# Gradient contribution of the first training sample for the parameter at position 1.
get_gradient_ij_cross_entropy(X_train.iloc[0, :], y_train.iloc[0], parameters, 1)

-0.1091482631097553

## Batch Gradient Descent  

Batch Gradient Descent : 학습 한번에 전체 데이터에 대해서 기울기(=Gradient)를 구한다.

In [96]:
def get_gradients_bgd(X_train, y_train, parameters) :
    """Return the batch gradient: per-sample gradients averaged over all rows.

    Args:
        X_train: Feature table; rows are read by position with ``.iloc``.
        y_train: Labels, read by position with ``.iloc``.
        parameters: Model coefficients.

    Returns:
        A list with one entry per parameter, each the sum over all rows of
        ``get_gradient_ij_cross_entropy(...) / number_of_rows``.
    """
    n_rows = X_train.shape[0]
    n_params = len(parameters)
    totals = [0] * n_params

    for row in range(n_rows):
        features, label = X_train.iloc[row, :], y_train.iloc[row]
        # Each sample's contribution is divided by the row count before it is
        # accumulated, so the totals end up as the mean gradient.
        contributions = [
            get_gradient_ij_cross_entropy(features, label, parameters, j) / n_rows
            for j in range(n_params)
        ]
        totals = [running + extra for running, extra in zip(totals, contributions)]

    return totals

In [97]:
# Batch gradient over the whole training split under the current parameters.
gradients_bgd = get_gradients_bgd(X_train, y_train, parameters)
gradients_bgd

[0.26154037881092196, 0.10079688431933197, 0.3073212832443908]

## Stochastic Gradient Descent  

Stochastic Gradient Descent : 학습 한번에 임의의 데이터 하나에 대해서만 기울기(=Gradient)를 구한다.

In [98]:
def get_gradients_sgd(X_train, y, parameters) :
    """Return the gradient computed from one randomly chosen sample.

    Args:
        X_train: Feature table; the chosen row is read with ``.iloc``.
        y: Labels, read by position with ``.iloc`` to pair with the rows of
            ``X_train``.
        parameters: Model coefficients.

    Returns:
        A list with one entry per parameter, holding
        ``get_gradient_ij_cross_entropy`` for the chosen sample.
    """
    # random.random() is in [0, 1), so r is a valid row position.
    r = int(random.random() * X_train.shape[0])
    sample = X_train.iloc[r, :]
    # Take the label from the ``y`` argument so the function works on whatever
    # data it is given, not on a global variable.
    label = y.iloc[r]

    return [
        get_gradient_ij_cross_entropy(sample, label, parameters, j)
        for j in range(len(parameters))
    ]

In [99]:
# Gradient from a single randomly drawn training sample; the result varies between runs.
gradients_sgd = get_gradients_sgd(X_train, y_train, parameters)
gradients_sgd

[0.7329621899464898, 0.13771849469037306, 0.8057897045752572]

## Update Parameters  

In [100]:
def update_parameters(parameters, gradients, learning_rate) :
    """Take one gradient-descent step and return the updated parameters.

    Args:
        parameters: NumPy array of model coefficients. It is updated in place
            and the same array is returned.
        gradients: Sequence of gradients, one per parameter. Left unmodified.
        learning_rate: Step size multiplied into every gradient.

    Returns:
        ``parameters`` after subtracting ``learning_rate * gradients``.
    """
    # Build the scaled step as a new array so the caller's gradient list
    # keeps its original values.
    step = learning_rate * np.asarray(gradients, dtype=float)
    parameters -= step
    return parameters

In [101]:
# One descent step with the batch gradient.
# NOTE: `parameters` is a NumPy array and update_parameters subtracts in place, so this global is changed too.
update_parameters(parameters, gradients_bgd, 0.01)

array([0.23239907, 0.63600253, 0.59272728])

## Gradient Descent  

위에서 작성한 함수들을 조합해서 Gradient Descent를 진행하는 함수를 완성해주세요

learning_rate = 학습 시 스텝의 크기. 학습률이 너무 크면 학습시간이 적게 걸리지만 global minimum에서 멀어질 수 있고, 학습률이 너무 작으면 학습시간이 많이 걸리고 local minimum으로 갈 확률이 커진다.  
max_iter = 최대 반복 횟수  
tolerance = 허용오차. 허용오차보다 작아지면 (loss가 수렴하면) 거의 최솟값에 도달한 것이므로 알고리즘을 중지한다.

In [102]:
def gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="bgd") :
    """Fit logistic-regression parameters with gradient descent.

    Starts from random parameters and repeatedly applies
    ``update_parameters`` with gradients from the selected optimizer. At every
    checkpoint (every 100 iterations for "bgd", every 10000 for "sgd") the
    training loss is printed and compared with the previous checkpoint.

    Args:
        X_train: Feature table; its first row determines the parameter count.
        y_train: Labels for ``X_train``.
        learning_rate: Step size of each update.
        max_iter: Maximum number of iterations to run.
        tolerance: Stop when the loss changes by less than this between two
            consecutive checkpoints.
        optimizer: "bgd" (batch) or "sgd" (stochastic).

    Returns:
        NumPy array with the fitted parameters.

    Raises:
        ValueError: If ``optimizer`` is neither "bgd" nor "sgd".
    """
    # Reject unknown optimizers up front; otherwise no gradient would ever be
    # computed and the loop would run without learning anything.
    if optimizer not in ("bgd", "sgd"):
        raise ValueError('optimizer must be "bgd" or "sgd", got {!r}'.format(optimizer))

    point = 100 if optimizer == "bgd" else 10000
    N = len(X_train.iloc[0])
    parameters = np.array([random.random() for _ in range(N)])
    loss = 0

    # count runs from 1 to max_iter inclusive, so up to max_iter updates happen.
    for count in range(1, max_iter + 1):

        if optimizer == "bgd" :
            gradients = get_gradients_bgd(X_train, y_train, parameters)
        else :
            gradients = get_gradients_sgd(X_train, y_train, parameters)

        # Report the loss and check the stop condition at each checkpoint.
        if count % point == 0 :
            new_loss = cross_entropy(X_train, y_train, parameters)
            print(count, "loss: ",new_loss, "params: ", parameters, "gradients: ", gradients)

            # Stop condition. tolerance is deliberately not divided by
            # len(y_train) (doing so made training take too long).
            if abs(new_loss - loss) < tolerance :
                break
            loss = new_loss

        parameters = update_parameters(parameters, gradients, learning_rate)
    return parameters

In [103]:
# Train with the default settings (batch gradient descent).
new_param_bgd = gradient_descent(X_train, y_train)
new_param_bgd

100 loss:  0.6610136832832992 params:  [-0.17980118  0.30967514  0.16991084] gradients:  [0.17693611910162732, -0.015512830017243275, 0.18972904473839206]
200 loss:  0.6060316621867174 params:  [-0.33702844  0.33794688 -0.00108748] gradients:  [0.13865477064885207, -0.03959224728377187, 0.15370335976206226]
300 loss:  0.5687923807669935 params:  [-0.46038231  0.38558391 -0.14122876] gradients:  [0.10910902838634964, -0.054495684552909734, 0.12785224046892288]
400 loss:  0.5417272876101894 params:  [-0.55790586  0.44461183 -0.2596124 ] gradients:  [0.08676087264192894, -0.06275484567437424, 0.10980785154785537]
500 loss:  0.5207518549087546 params:  [-0.63591166  0.50962035 -0.36275119] gradients:  [0.06985949208715926, -0.06675208439206008, 0.097046995281208]
600 loss:  0.503671047012797 params:  [-0.6991142   0.57722828 -0.45495815] gradients:  [0.05699206173327469, -0.06814673490930157, 0.08773179862665158]
700 loss:  0.48925524696121975 params:  [-0.75100318  0.6453942  -0.53904169]

5400 loss:  0.32906022408466207 params:  [-1.30672056  2.40066422 -2.28485763] gradients:  [0.0057205706643429725, -0.020280464870758883, 0.019420218152247083]
5500 loss:  0.3282535623729943 params:  [-1.31239818  2.42076644 -2.30410283] gradients:  [0.005634118700184003, -0.019922186139381508, 0.01906846691378284]
5600 loss:  0.3274751473707747 params:  [-1.31799029  2.44051563 -2.32300154] gradients:  [0.0055495779646090974, -0.019574441057124862, 0.018727254494538197]
5700 loss:  0.32672367190967005 params:  [-1.32349879  2.4599221  -2.34156406] gradients:  [0.005466891970350074, -0.019236775523326317, 0.0183961212440659]
5800 loss:  0.3259979054701648 params:  [-1.3289255   2.47899573 -2.35980026] gradients:  [0.005386006920149981, -0.01890876095916604, 0.01807463355003417]
5900 loss:  0.32529668872845496 params:  [-1.33427219  2.49774594 -2.37771958] gradients:  [0.005306871400678162, -0.018589992571571873, 0.017762382056385466]
6000 loss:  0.3246189285581033 params:  [-1.33954059

10600 loss:  0.3079168691196356 params:  [-1.51927809  3.12790548 -2.9755519 ] gradients:  [0.0029144609642860527, -0.009723163462571623, 0.009159053228430384]
10700 loss:  0.3077320986782444 params:  [-1.52217657  3.1375731  -2.98465766] gradients:  [0.0028822653898970055, -0.009611287560152195, 0.009051717293528756]
10800 loss:  0.3075515713142946 params:  [-1.52504311  3.1471298  -2.99365701] gradients:  [0.002850577932244602, -0.00950131312555471, 0.008946241006566736]
10900 loss:  0.3073751667322343 params:  [-1.52787821  3.15657744 -3.00255179] gradients:  [0.0028193874753912364, -0.009393193485796512, 0.008842578053446916]
11000 loss:  0.3072027688324979 params:  [-1.53068235  3.16591786 -3.01134379] gradients:  [0.002788683210097136, -0.009286883493553345, 0.008740683648671935]
11100 loss:  0.3070342655351266 params:  [-1.53345603  3.17515285 -3.02003475] gradients:  [0.002758454624112139, -0.00918233946507756, 0.008640514472733632]
11200 loss:  0.3068695486121027 params:  [-1.

array([-1.59411527,  3.37622891, -3.20882636])

## Hyper Parameter Tuning

Hyper Parameter들을 매번 다르게 해서 학습을 진행해 보세요. 다른 점들을 발견할 수 있습니다.

In [104]:
# Same hyperparameters as the defaults, but with the stochastic optimizer.
new_param_sgd = gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="sgd")
new_param_sgd

10000 loss:  0.3082366151209839 params:  [-1.52177894  3.09306755 -2.99082588] gradients:  [0.29818306047707704, -0.37957791553096276, -0.45893496405346057]
20000 loss:  0.2998456524311901 params:  [-1.75576061  3.8278484  -3.56701914] gradients:  [0.48816113452001186, -0.4822656519073101, -0.751331119765138]
30000 loss:  0.2986904749333766 params:  [-1.80062944  4.06156537 -3.78809866] gradients:  [0.09161285057045816, 0.10534785341865044, 0.1248872848875075]
40000 loss:  0.2986427450112795 params:  [-1.84300994  4.19719901 -3.88207703] gradients:  [0.02263589994401144, 0.0002204529089488953, 0.011447106923489957]


array([-1.84300994,  4.19719901, -3.88207703])

Batch Gradient Descent를 이용하면 시간이 오래 걸린다. (전체 데이터를 사용하기 때문이다.)  
그러나 Stochastic Gradient Descent를 이용하면 시간은 짧게 걸린다. (랜덤하게 한 자료만을 골라서 사용하기 때문이다.)

In [107]:
# Batch gradient descent again with a looser stop condition.
new_param_bgd1 = gradient_descent(X_train, y_train, tolerance = 0.01) # tolerance raised
new_param_bgd1

100 loss:  0.8706463644215906 params:  [0.41099396 0.60807901 0.40978645] gradients:  [0.3020330440664404, 0.0717445125619561, 0.2716870406752564]
200 loss:  0.7290629590855446 params:  [0.13486991 0.55403978 0.16072047] gradients:  [0.2493185225078109, 0.035455429221407084, 0.22517120839450772]
300 loss:  0.6374508466116798 params:  [-0.08890297  0.53618322 -0.04135887] gradients:  [0.19874235533605758, 0.0009646425724988842, 0.1795962278203253]
400 loss:  0.5798860217138031 params:  [-0.26588802  0.54887709 -0.2020722 ] gradients:  [0.1564008929191305, -0.024971273029981395, 0.14331206646118258]
500 loss:  0.5423423578989851 params:  [-0.40523545  0.58279954 -0.33185299] gradients:  [0.12343951731996146, -0.04166049499184284, 0.11759187642065191]
600 loss:  0.5161362543207566 params:  [-0.51569471  0.6297537  -0.44018238] gradients:  [0.09838357006005281, -0.05140771869377538, 0.1000035187785643]
700 loss:  0.49655257259757235 params:  [-0.60422709  0.6840393  -0.53381329] gradients:

array([-0.82537564,  0.92021835, -0.82904615])

stop 조건이 완화되었기 때문에 tolerance값이 매우 작았을 때보다 (stop 조건이 더 강화되었을 때) 시간은 덜 소요되지만, 찾은 최적값이 global minimum loss일지는 정확하지 않다. (abs(new_loss-loss)가 0만큼의 작은 값으로 수렴하지는 않았기 때문이다.)

In [108]:
# Stochastic gradient descent with a larger learning rate (0.05 instead of 0.01).
new_param_sgd1 = gradient_descent(X_train, y_train, learning_rate=0.05, max_iter=100000, tolerance=0.0001, optimizer="sgd")
new_param_sgd1

10000 loss:  0.30468900529745196 params:  [-1.74382893  4.46759366 -3.86873795] gradients:  [0.002799653575721985, -0.0045614120995812095, -0.0022775932011110897]
20000 loss:  0.2998085400876687 params:  [-1.66886504  4.2212814  -3.9867748 ] gradients:  [0.005548435295077723, -0.006469902997431278, -0.0019519138525176077]
30000 loss:  0.3036923565653971 params:  [-1.94846687  4.19322638 -3.66729919] gradients:  [0.014021397765032948, -0.023344338998658162, -0.017880902748647617]
40000 loss:  0.3069402973883745 params:  [-2.02833891  4.65187149 -3.96165637] gradients:  [0.03053533911419363, -0.011670595133130833, -0.0026855477372689828]
50000 loss:  0.30236305268367747 params:  [-1.88820146  4.47445718 -3.92241949] gradients:  [0.17525467083222973, 0.31392995213102726, 0.3429491619659808]
60000 loss:  0.30368475861503696 params:  [-1.93901921  4.16930199 -4.30569713] gradients:  [0.044886664762745214, -0.028351098174998743, -0.015790920878440026]
70000 loss:  0.3003845878171088 params: 

array([-1.72859725,  4.26486366, -3.98777016])

learning_rate가 작았을 때와 결과값은 비슷하지만, gradients 값이 작아졌고 시행 횟수가 더 많아졌다. (학습률이 높아지면서 어디로 튈지 모르는 불안정성이 더 커졌기 때문인 것으로 생각된다.)

## Predict Label

In [105]:
# Turn the predicted probability of every test row into a 0/1 label.
y_predict = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i,:], new_param_bgd)
    # Probabilities strictly above 0.5 are labelled 1, everything else 0.
    y_predict.append(1 if p > 0.5 else 0)

## Confusion Matrix

In [106]:
# NOTE(review): the wildcard import pulls every public sklearn.metrics name into the namespace;
# only confusion_matrix is used in this cell. Consider an explicit import once later usage is confirmed.
from sklearn.metrics import *
# Flatten the 2x2 matrix into its four cells; per the scikit-learn docs the binary order is tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
# Computed a second time so the notebook displays the full matrix.
confusion_matrix(y_test, y_predict)

array([[38,  2],
       [ 2,  8]], dtype=int64)