In [1]:
from check_grad import check_grad
import numpy as np
from matplotlib import pyplot as plt
from utils import *

In [2]:
def logistic_predict(weights, data):
    data_with_bias = np.hstack((data, np.ones((data.shape[0], 1))))

    logit = np.dot(data_with_bias, weights)

    y = sigmoid(logit)

    return y

In [3]:
def evaluate(targets, y):
    ce = -np.mean(targets * np.log(y) + (1 - targets) * np.log(1 - y))

    predictions = (y >= 0.5).astype(int)

    frac_correct = np.mean(predictions == targets)

    return ce, frac_correct

In [4]:
def logistic(weights, data, targets, hyperparameters):

    y = logistic_predict(weights, data)

    if hyperparameters['weight_regularization'] is True:
        f, df = logistic_pen(weights, data, targets, hyperparameters)
    else:
        f = -np.mean(targets * np.log(y) + (1 - targets) * np.log(1 - y))

        data_with_bias = np.hstack((data, np.ones((data.shape[0], 1))))
        df = np.dot(data_with_bias.T, (y - targets))

    return f, df, y

In [5]:
def logistic_pen(weights, data, targets, hyperparameters):
    N = len(targets)
    M = len(weights) - 1  

    data_with_bias = np.hstack((data, np.ones((N, 1))))

    logit = np.dot(data_with_bias, weights)

    y = sigmoid(logit)

    f = -np.mean(targets * np.log(y) + (1 - targets) * np.log(1 - y))

    df = np.dot(data_with_bias.T, (y - targets))

    reg_term = (hyperparameters['weight_decay'] / 2) * np.sum(weights[:-1] ** 2)

    f += reg_term
    df[:-1] += hyperparameters['weight_decay'] * weights[:-1]

    return f, df

In [6]:
def train(hyperparameters):
    print_interval = hyperparameters.setdefault('print_interval', 1)
    if hyperparameters['mnist_train_small']:
        train_inputs, train_targets = load_train_small()
    else:
        train_inputs, train_targets = load_train()

    valid_inputs, valid_targets = load_valid()

    N, M = train_inputs.shape

    weights = np.random.randn(M + 1, 1)

    run_check_grad(hyperparameters)

    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for t in range(hyperparameters['num_iterations']):

        f, df, predictions = logistic(weights, train_inputs, train_targets, hyperparameters)
        
        cross_entropy_train, frac_correct_train = evaluate(train_targets, predictions)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        weights = weights - hyperparameters['learning_rate'] * df / N

        predictions_valid = logistic_predict(weights, valid_inputs)

        cross_entropy_valid, frac_correct_valid = evaluate(valid_targets, predictions_valid)
        
        logging[t] = [f / N, cross_entropy_train, frac_correct_train*100, cross_entropy_valid, frac_correct_valid*100]
        
        if t % print_interval != 0:
            continue
        print(f"ITERATION:{t+1:4d}  "
              f"TRAIN NLOGL:{f / N:4.2f}  "
              f"TRAIN CE:{cross_entropy_train:.6f}  "
              f"TRAIN FRAC:{frac_correct_train*100:5.1f}  "
              f"VALID CE:{cross_entropy_valid:.6f}   "
              f"VALID FRAC:{frac_correct_valid*100:5.1f}")
        
    return logging, weights

def run_check_grad(hyperparameters):
    """Performs gradient check on logistic function.
    """


    num_examples = 7
    num_dimensions = 9

    weights = np.random.randn(num_dimensions+1, 1)
    data    = np.random.randn(num_examples, num_dimensions)
    targets = (np.random.rand(num_examples, 1) > 0.5).astype(int)

    diff = check_grad(logistic,      
                      weights,
                      0.001,         
                      data,
                      targets,
                      hyperparameters)

    print("diff =", diff)

In [7]:
def run_train(hyperparameters):
    num_runs = 1
    weights = []
    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for i in range(num_runs):
        _logging, _weights = train(hyperparameters)
        logging += _logging
        weights.append(_weights)
    logging /= num_runs

    plot_loss_curve(logging)
    plot_accuracy_curve(logging)
    
    return logging, weights

def plot_loss_curve(logging):
    plt.figure(figsize=(12, 8))
    plt.plot(range(1, logging.shape[0] + 1), logging[:, 1], label='Training Loss')
    plt.plot(range(1, logging.shape[0] + 1), logging[:, 3], label='Validation Loss')
    plt.title('Loss Curve')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def plot_accuracy_curve(logging):
    plt.figure(figsize=(12, 8))
    plt.plot(range(1, logging.shape[0] + 1), logging[:, 2], label='Training Accuracy')
    plt.plot(range(1, logging.shape[0] + 1), logging[:, 4], label='Validation Accuracy')
    plt.title('Accuracy Curve')
    plt.xlabel('Iteration')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.show()

def run_test(weights: list):
    inputs, targets = load_test()
    
    for i, weight in enumerate(weights, 1):
        predictions = logistic_predict(weight, inputs)
        cross_entropy, frac_correct = evaluate(targets, predictions)
        print(f'WEIGHTS #{i}   TEST CE: {cross_entropy:.6f}   TEST FRAC: {frac_correct*100:5.1f}')

In [None]:
hyperparameters = {
    'mnist_train_small': False,      # boolean, True for using small dataset
    'learning_rate': 0.01,          # learning rate for gradient descent
    'weight_regularization': True,   # boolean, True for using regularization
    'num_iterations': 1000,           # number of iterations for training
    'weight_decay': 0.1,           # regularization strength lambda
    'print_interval': 1              # adjust print interval higher if the number of iterations is large. not affects logging
}

logging, trained_weights = run_train(hyperparameters)

In [None]:
run_test(trained_weights)

<a id='logistic_regression'></a>
#### Logistic Regression
#### mnist_train과 mnist_train_small의 두종류 데이터 집합 분석 및 평가

hyperparameters = {</br>
    </br>'mnist_train_small': False & True      
    </br>'learning_rate': 0.01,       #최적의 파라미터
    </br>'weight_regularization': False,  
    </br>'num_iterations': 1000,          #최적의 파라미터
    </br>'weight_decay': 0.1,         
    </br>'print_interval': 1          
}
</br>

</br>
ITERATION:1000  TRAIN NLOGL:0.00  TRAIN CE:0.459104  TRAIN FRAC: 86.5  VALID CE:0.820872   VALID FRAC: 80.0

#### mnist_train data
![plot sample](./plot_sample01_png.png)

#### mnist_train_small

ITERATION:1000  TRAIN NLOGL:0.00  TRAIN CE:0.011276  TRAIN FRAC:100.0  VALID CE:2.309556   VALID FRAC: 64.0
![plot sample](./image02.png)

- mnist_train_small 같은 경우 데이터 셋이 상대적으로 mani_train 보다 적기 때문에 크로스 엔트로피 손실이 낮다. 

- 데이터 셋이 적기 때문에 mnist_train_small 데이터 셋은 훈련정확도가 높지만 학습 데이터에 의존성이 커 Overfitting의 성향을 나타내고 있다. 이에 따라 validation accurcy에서 mnist_train_small 은 약한 모습을 보이고 있다.

- mnist_train 이 mnist_train_small 보다 일반화 능력이 높으며 안정적인 성능을 보이고 있다.

#### 훈련 반복 평가

ITERATION:1000  TRAIN NLOGL:0.00  TRAIN CE:0.459648  TRAIN FRAC: 88.0  VALID CE:0.920228   VALID FRAC: 78.0
![plot sample](./image03.png)

- 5회 이하 정도의 훈련 반복시 일시적으로 정확도의 값이 올라간다. 하지만 훈련이 계속 반복되는 경우 위의그래프처럼 성능이 오히려 떨어지는 경향을 보이고 있다.

- 정규화를 적용하지 않았기 때문에 훈련을 진행할수록 모델이 훈련 데이터에 맞춰줘 train data에만 특화가 되는 Overfitting 현상이 일어난다. 이로 인해 새로운 데이터에 대한 일반화 능력이 떨어지게 된다.

- Gradient Descent가 수렴하지 못하고 발산하기 때문에 loss 함수의 최소값을 찾지 못하고 무한히 큰 loss 함수를 가지게 될 수 있어 모델의 성능이 저하된다.

<a id='regularized_logistic_regression'></a>
#### Regularized Logistic Regression

hyperparameters = {</br>
    </br>'mnist_train_small': False # 최적의 파라민터     
    </br>'learning_rate': 0.01,       #최적의 파라미터
    </br>'weight_regularization': True,  #regularized_logistic 적용
    </br>'num_iterations': 1000,          #최적의 파라미터
    </br>'weight_decay': 1 & 0.1 & 0.01 & 0.001         
    </br>'print_interval': 1          
}
</br>


#### $\lambda$ = 1
ITERATION:1000  TRAIN NLOGL:0.19  TRAIN CE:0.402023  TRAIN FRAC: 84.5  VALID CE:1.279483   VALID FRAC: 72.0
![plot sample](./image04.png)

#### $\lambda$ = 0.1
ITERATION:1000  TRAIN NLOGL:0.20  TRAIN CE:0.300512  TRAIN FRAC: 90.5  VALID CE:0.842592   VALID FRAC: 74.0
![plot sample](./image07.png)

#### $\lambda$ = 0.01
ITERATION:1000  TRAIN NLOGL:0.02  TRAIN CE:0.354615  TRAIN FRAC: 88.0  VALID CE:0.935801   VALID FRAC: 78.0
![plot sample](./image05.png)

#### $\lambda$ = 0.001
ITERATION:1000  TRAIN NLOGL:0.00  TRAIN CE:0.479517  TRAIN FRAC: 86.5  VALID CE:0.559413   VALID FRAC: 84.0
![plot sample](./image06.png)

- $\lambda$ 값이 낮아질수록 엔트로피 손실 값은 낮아지며 훈련 데이터와 검증 데이터의 정확도가 증가하고 있다.

- $\lambda$ 낮아질수록 regularization의 강도를 약화시켜 복잡한 패턴으로 훈련을 할 수 있게 된다. 즉 모델의 복잡성이 증가하게 되어 validation accuracy가 증가하는 모습을 볼 수 있다.

- 위를 통하여 현재 모델의 최적의 $\lambda$ = 0.001 이다.

#### logistic_regression 과 regularized_logistic_regression 성능비교

-  regularized_logistic_regression
</br>

</br> WEIGHTS #1   TEST CE: 0.655075   TEST FRAC:  82.0 </br>

-  logistic_regression
</br> 

</br> WEIGHTS #1   TEST CE: 0.741427   TEST FRAC:  74.0



- 두개의 성능 평가를 보게 되면 새로운 test data set에 대해 regularized 된 logistic regression 이 더 정확도와 cross entropy 가 더 우수하게 나왔다.

- logistic regression 의 loss 함수는 regularzation이 없이 학습을 진행하였기 때문에 모델이 훈련 데이터에 overfitting 되기가 쉽다는 단점이 있다. 

- regularized-logistic 은가중치의 페널티를 부여하여 모델이 일반화하는데 도움을 준다.
그렇기 때문에 지금까지 보지 못한 test-set에 데이터는 regularized-logistic-regression이 우수한 성능을 보일 수 있다.