[모델 Autoencoder + 선형회귀]
1. Autoencoder를 사용해 데이터를 효율적으로 압축하고 특징을 추출한 후, 그 결과를 이용해 회귀 모델을 학습
- 3개의 은닉층을 사용하는 Autoencoder 모델구조
- 절단 정규 분포 (tf.truncated_normal), ReLU와 sigmoid 활성화 함수 사용
- 선형 회귀 모델 y=W*x+b
2. 손실함수 : MSE
3. 옵티마이저 : Adam Optimizer (learning rate=0.0005, 0.02)
4. 학습 epoch=300
- batch size=64

In [16]:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import StandardScaler

In [17]:
# 데이터 로드 함수
def load_csv(file_path):
    data = np.genfromtxt(file_path, delimiter=',', skip_header=1, usecols=range(8))
    x_data = np.delete(data, 1, axis=1)  # 두 번째 열 제거 (입력 변수)
    y_data = data[:, 1].reshape(-1, 1)  # 두 번째 열이 목표 변수
    return x_data, y_data

# 데이터 분할 및 표준화
def split_data(x_data, y_data):
    x_test = x_data[:400]
    y_test = y_data[:400]
    x_train = x_data[400:]
    y_train = y_data[400:]

    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    
    return x_train, x_test, y_train, y_test

In [18]:
# 데이터 로드 및 분할
x_data, y_data = load_csv('data-ori.csv')
x_train, x_test, y_train, y_test = split_data(x_data, y_data)

In [19]:
autoencoder_lr = 0.0005  # Autoencoder의 학습률
regression_lr = 0.02  # Regression의 학습률
autoencoder_epochs = 300  # Autoencoder의 epoch
regression_epochs = 300  # Regression의 epoch
batch_size = 64
input_size = x_train.shape[1]
hidden1_size = 16  
hidden2_size = 64  
hidden3_size = 32  
output_size = 1

In [20]:
# 입력을 받기 위한 플레이스홀더 정의
x = tf.placeholder(tf.float32, shape=[None, input_size])
y = tf.placeholder(tf.float32, shape=[None, output_size])

In [21]:
# Autoencoder 구조 정의
def build_autoencoder(x):
    # 인코딩(Encoding)
    Wh_1 = tf.Variable(tf.truncated_normal([input_size, hidden1_size], stddev=0.05))
    bh_1 = tf.Variable(tf.zeros([hidden1_size]))
    H1_output = tf.nn.relu(tf.matmul(x, Wh_1) + bh_1)
    
    Wh_2 = tf.Variable(tf.truncated_normal([hidden1_size, hidden2_size], stddev=0.05))
    bh_2 = tf.Variable(tf.zeros([hidden2_size]))
    H2_output = tf.nn.relu(tf.matmul(H1_output, Wh_2) + bh_2)
    
    Wh_3 = tf.Variable(tf.truncated_normal([hidden2_size, hidden3_size], stddev=0.05))  # 추가된 레이어
    bh_3 = tf.Variable(tf.zeros([hidden3_size]))
    H3_output = tf.nn.relu(tf.matmul(H2_output, Wh_3) + bh_3)

    # 디코딩(Decoding)
    Wh_4 = tf.Variable(tf.truncated_normal([hidden3_size, hidden2_size], stddev=0.05))
    bh_4 = tf.Variable(tf.zeros([hidden2_size]))
    H4_output = tf.nn.relu(tf.matmul(H3_output, Wh_4) + bh_4)
    
    Wh_5 = tf.Variable(tf.truncated_normal([hidden2_size, hidden1_size], stddev=0.05))
    bh_5 = tf.Variable(tf.zeros([hidden1_size]))
    H5_output = tf.nn.relu(tf.matmul(H4_output, Wh_5) + bh_5)

    Wo = tf.Variable(tf.truncated_normal([hidden1_size, input_size], stddev=0.05))
    bo = tf.Variable(tf.zeros([input_size]))
    X_reconstructed = tf.nn.sigmoid(tf.matmul(H5_output, Wo) + bo)
    
    return X_reconstructed, H3_output

In [22]:
# 회귀를 위한 출력층 정의
def build_regression(x):
    W_reg = tf.Variable(tf.truncated_normal([hidden3_size, output_size], stddev=0.1))
    b_reg = tf.Variable(tf.zeros([output_size]))
    y_pred = tf.matmul(x, W_reg) + b_reg 
    
    return y_pred

In [23]:
# Autoencoder 선언
y_pred_autoencoder, extracted_features = build_autoencoder(x)

# Autoencoder의 손실 함수 정의
autoencoder_loss = tf.reduce_mean(tf.square(x - y_pred_autoencoder))

# Autoencoder 옵티마이저 정의 (학습률 autoencoder_lr 사용)
autoencoder_train_step = tf.train.AdamOptimizer(learning_rate=autoencoder_lr).minimize(autoencoder_loss)

In [24]:
# 회귀 모델 선언
y_pred_regression = build_regression(extracted_features)

# 회귀 손실 함수 정의 (Mean Squared Error)
regression_loss = tf.reduce_mean(tf.square(y - y_pred_regression))

# Regression 옵티마이저 정의 (학습률 regression_lr 사용)
regression_train_step = tf.train.AdamOptimizer(learning_rate=regression_lr).minimize(regression_loss)

In [25]:
# 세션을 열고 학습 진행
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    total_batch = int(len(x_train) / batch_size)
    
    # Step 1: Autoencoder 학습 (Pre-training)
    for epoch in range(autoencoder_epochs):
        for i in range(total_batch):
            batch_x = x_train[i * batch_size:(i + 1) * batch_size]
            _, loss_val = sess.run([autoencoder_train_step, autoencoder_loss], feed_dict={x: batch_x})
        
        if epoch % 30 == 0:
            print(f"Autoencoder Epoch {epoch}, Loss: {loss_val}")
    
    print("Step 1: Autoencoder Pre-training 완료")

    # Step 2: Regression 모델 학습 (Fine-tuning)
    for epoch in range(regression_epochs):
        for i in range(total_batch):
            batch_x = x_train[i * batch_size:(i + 1) * batch_size]
            batch_y = y_train[i * batch_size:(i + 1) * batch_size]
            _, loss_val = sess.run([regression_train_step, regression_loss], feed_dict={x: batch_x, y: batch_y})
        
        if epoch % 30 == 0:
            print(f"Regression Epoch {epoch}, Loss: {loss_val}")
    
    print("Step 2: Regression Fine-tuning 완료")
    
    # 테스트 데이터로 성능 평가
    y_pred = sess.run(y_pred_regression, feed_dict={x: x_test})
    
    # 예측값과 실제값의 차이 합 계산
    difference_sum = sum(abs(y_pred - y_test))
    print(f'\n차이 합: {difference_sum}')


Autoencoder Epoch 0, Loss: 1.4385449886322021
Autoencoder Epoch 30, Loss: 0.9035625457763672
Autoencoder Epoch 60, Loss: 0.8645066618919373
Autoencoder Epoch 90, Loss: 0.8425618410110474
Autoencoder Epoch 120, Loss: 0.8376906514167786
Autoencoder Epoch 150, Loss: 0.8273600339889526
Autoencoder Epoch 180, Loss: 0.8122987747192383
Autoencoder Epoch 210, Loss: 0.7707395553588867
Autoencoder Epoch 240, Loss: 0.74178546667099
Autoencoder Epoch 270, Loss: 0.7256460189819336
Step 1: Autoencoder Pre-training 완료
Regression Epoch 0, Loss: 0.3845107853412628
Regression Epoch 30, Loss: 0.005148601718246937
Regression Epoch 60, Loss: 0.0022980188950896263
Regression Epoch 90, Loss: 0.0037763267755508423
Regression Epoch 120, Loss: 0.005386823322623968
Regression Epoch 150, Loss: 0.002133948728442192
Regression Epoch 180, Loss: 0.0024484931491315365
Regression Epoch 210, Loss: 0.004260100424289703
Regression Epoch 240, Loss: 0.0010864827781915665
Regression Epoch 270, Loss: 0.0030475063249468803
Ste