In [None]:
# Pin scikit-learn to 1.0.2: this notebook uses sklearn.datasets.load_boston, which
# newer scikit-learn releases no longer provide.
# %pip (rather than !pip) installs into the environment of the running kernel, and -q
# keeps the installer log out of the notebook output.
# NOTE: in the recorded run this downgrade conflicted with bigframes (needs scikit-learn>=1.2.2).
%pip install -q scikit-learn==1.0.2

Collecting scikit-learn==1.0.2
  Downloading scikit_learn-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.25.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.0.2


In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import load_boston

# Load the Boston housing data and build a single DataFrame that holds the
# feature columns plus the regression target under the name 'PRICE'.
boston = load_boston()
bostonDF = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(PRICE=boston.target)
bostonDF.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### Stochastic Gradient Descent와 Mini Batch Gradient Descent 구현
* SGD는 전체 데이터에서 **한 건만** 임의로 선택하여 Gradient Descent로 Weight/Bias Update를 계산한 뒤 Weight/Bias에 적용
* Mini Batch GD는 전체 데이터에서 **Batch 건수만큼** 데이터를 선택하여 Gradient Descent로 Weight/Bias Update를 계산한 뒤 Weight/Bias에 적용

In [None]:
# Min-max scale only the two features used by the model: RM and LSTAT.
from sklearn.preprocessing import MinMaxScaler

selected_columns = ['RM', 'LSTAT']
scaler = MinMaxScaler()
# Column 0 of the result corresponds to RM, column 1 to LSTAT.
scaled_features = scaler.fit_transform(bostonDF[selected_columns])

### 반복 시 순차적으로 일정한 batch 크기만큼의 데이터를 전체 학습데이터에 걸쳐서 가져오는 Mini-Batch GD 수행

In [None]:
def get_update_value_batch(bias, w1, w2, rm_batch, lstat_batch, target_batch, learning_rate=0.01):
    """Compute one mini-batch gradient-descent step for a two-feature linear model.

    The model is ``w1 * rm + w2 * lstat + bias`` and the loss is the mean
    squared error over the batch. The returned amounts are already scaled by
    ``learning_rate``; the caller applies them by subtraction
    (``w1 = w1 - w1_update`` and so on).

    Args:
        bias, w1, w2: current parameters; they must broadcast against the batch arrays.
        rm_batch: RM feature values of the batch.
        lstat_batch: LSTAT feature values of the batch.
        target_batch: target values of the batch; ``shape[0]`` is used as the batch size.
        learning_rate: step size multiplied into every returned update.

    Returns:
        Tuple ``(bias_update, w1_update, w2_update)``.
    """
    # Number of samples in this batch
    batch_len = target_batch.shape[0]

    # Residual between the actual targets and the model prediction
    residual = target_batch - (w1 * rm_batch + w2 * lstat_batch + bias)

    # Factor shared by all three updates: MSE gradient coefficient times the step size
    step_scale = -(2/batch_len)*learning_rate

    # A vector of ones lets the bias update use the same dot-product form as the weights
    ones_column = np.ones((batch_len,))

    w1_update = step_scale*(np.dot(rm_batch.T, residual))
    w2_update = step_scale*(np.dot(lstat_batch.T, residual))
    bias_update = step_scale*(np.dot(ones_column.T, residual))

    return bias_update, w1_update, w2_update

In [None]:
# batch_gradient_descent() takes the mini-batch size through batch_size
def batch_gradient_descent(features, target, iter_epochs=1000, batch_size=30, verbose=1, learning_rate=0.01):
    """Fit ``w1 * RM + w2 * LSTAT + bias`` with sequential mini-batch gradient descent.

    In every epoch the data is walked front to back in consecutive slices of
    ``batch_size`` rows (the last slice may be shorter), and the parameters are
    updated once per slice via ``get_update_value_batch``.

    Args:
        features: 2-D array; column 0 is read as RM and column 1 as LSTAT.
        target: 1-D array of target values, aligned with the rows of ``features``.
        iter_epochs: number of full passes over the data.
        batch_size: number of consecutive rows per update; must be at least 1.
        verbose: when truthy, print the parameters and the MSE over the full
            data after every batch update.
        learning_rate: step size forwarded to ``get_update_value_batch``.

    Returns:
        Tuple ``(w1, w2, bias)``; each starts as a length-1 array of zeros.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # A non-positive batch size would either fail inside range() or silently skip training
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

    # w1, w2 and bias are kept as 1-D arrays for numpy arithmetic; all three start at 0.
    w1 = np.zeros((1,))
    w2 = np.zeros((1,))
    bias = np.zeros((1, ))
    print('최초 w1, w2, bias:', w1, w2, bias)

    # Split the 2-D feature array into its RM and LSTAT columns
    rm = features[:, 0]
    lstat = features[:, 1]

    # Seeds numpy's global RNG. Nothing in this function draws random numbers (batches are
    # taken sequentially); the call is kept so the global RNG state after this function is unchanged.
    np.random.seed(2024)

    n_samples = target.shape[0]

    # Repeat the weight/bias updates for iter_epochs passes over the data
    for i in range(iter_epochs):

        # Walk over the data in steps of batch_size, e.g. 0, 30, 60, ... for batch_size=30
        for batch_step in range(0, n_samples, batch_size):

            # Consecutive rows [batch_step, batch_step + batch_size)
            batch_rows = slice(batch_step, batch_step + batch_size)
            rm_batch = rm[batch_rows]
            lstat_batch = lstat[batch_rows]
            target_batch = target[batch_rows]

            # Update amounts computed from this batch only
            bias_update, w1_update, w2_update = get_update_value_batch(
                bias, w1, w2, rm_batch, lstat_batch, target_batch, learning_rate)

            # Apply the updates
            w1 = w1 - w1_update
            w2 = w2 - w2_update
            bias = bias - bias_update

            if verbose:
                print('Epoch:', i+1,'/', iter_epochs, 'batch step:', batch_step)

                # The reported loss is computed on the full training data, not just the batch
                predicted = w1 * rm + w2*lstat + bias
                diff = target - predicted
                mse_loss = np.mean(np.square(diff))
                print('w1:', w1, 'w2:', w2, 'bias:', bias, 'loss:', mse_loss)

    return w1, w2, bias

In [None]:
# Train with batch_gradient_descent, which walks the whole data set sequentially
# (500 epochs, batch_size=30). Every row is used in every epoch, so this takes a while.
price_values = bostonDF['PRICE'].values
w1, w2, bias = batch_gradient_descent(
    scaled_features,
    price_values,
    iter_epochs=500,
    batch_size=30,
    verbose=1,
)
print('##### 최종 w1, w2, bias #######')
print(w1, w2, bias)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch: 354 / 500 batch step: 0
w1: [25.63681754] w2: [-23.59856786] bias: [16.13577515] loss: 30.540060069710925
Epoch: 354 / 500 batch step: 30
w1: [25.6299224] w2: [-23.59871148] bias: [16.12200024] loss: 30.545085325058558
Epoch: 354 / 500 batch step: 60
w1: [25.60814658] w2: [-23.60586014] bias: [16.08046181] loss: 30.56492597712335
Epoch: 354 / 500 batch step: 90
w1: [25.60169053] w2: [-23.60985298] bias: [16.06522279] loss: 30.5735075276295
Epoch: 354 / 500 batch step: 120
w1: [25.61146441] w2: [-23.57915213] bias: [16.10154573] loss: 30.553448874027207
Epoch: 354 / 500 batch step: 150
w1: [25.64605267] w2: [-23.57786479] bias: [16.14542527] loss: 30.53498963798446
Epoch: 354 / 500 batch step: 180
w1: [25.7140912] w2: [-23.56081123] bias: [16.24842632] loss: 30.522521686008563
Epoch: 354 / 500 batch step: 210
w1: [25.7513791] w2: [-23.54041121] bias: [16.30803687] loss: 30.53481679295732
Epoch: 354 / 500 batch step: 240
w1: [25.79

In [None]:
# Build predictions from the learned weights and bias, then store them in the
# 'PREDICTED_PRICE_BATCH' column of the DataFrame.
rm_scaled = scaled_features[:, 0]
lstat_scaled = scaled_features[:, 1]
predicted = rm_scaled*w1 + lstat_scaled*w2 + bias
bostonDF['PREDICTED_PRICE_BATCH'] = predicted
bostonDF.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PREDICTED_PRICE_BATCH
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,28.850257
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,25.380192
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,32.493701
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,32.27533
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,31.459523


### Mini Batch GD를 Keras로 수행
* Keras는 기본적으로 (Random) Mini Batch GD를 수행

In [None]:
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

# A single Dense unit fed with 2 input features; this is a regression, so no activation is set.
# Weights and bias are initialised through kernel_initializer and bias_initializer.
model = Sequential()
model.add(
    Dense(1, input_shape=(2, ), activation=None, kernel_initializer='zeros', bias_initializer='ones')
)

# Adam optimizer, Mean Squared Error as the loss, and MSE again as the reported metric.
adam_optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=adam_optimizer, loss='mse', metrics=['mse'])

# Keras always trains in batches: batch_size is 30 here (Keras uses 32 when it is None), for 500 epochs.
model.fit(scaled_features, bostonDF['PRICE'].values, batch_size=30, epochs=500)

In [None]:
# Predict with the trained Keras model and store the result in the
# 'KERAS_PREDICTED_PRICE_BATCH' column of the DataFrame.
# NOTE(review): model.predict presumably returns a 2-D (n_samples, 1) array here; the
# column assignment below relies on pandas accepting that shape — confirm.
predicted = model.predict(scaled_features)
bostonDF['KERAS_PREDICTED_PRICE_BATCH'] = predicted

bostonDF.head()



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PREDICTED_PRICE_BATCH,KERAS_PREDICTED_PRICE_BATCH
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,28.850257,28.910387
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,25.380192,25.478966
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,32.493701,32.455235
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,32.27533,32.262241
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,31.459523,31.431549
