In [1]:
import sys
import sklearn
import numpy as np
import os
import pandas as pd
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras

# Force CPU execution by hiding every CUDA device from TensorFlow.
# NOTE(review): this assignment runs after `import tensorflow`; it presumably
# still takes effect because no GPU has been initialised yet — confirm, or
# move it above the TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ann"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure into ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension) for the saved figure.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [2]:
import numpy as np
import pandas as pd

# Read the house-sales CSV and shuffle its rows once with a fixed seed so the
# row order is the same on every run.
df = shuffle(pd.read_csv("kc_house_data.csv"), random_state=42)

In [3]:
# Show column dtypes and non-null counts of the shuffled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 735 to 15795
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long

In [4]:
# NOTE(review): same call as in the previous cell; this cell looks redundant.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 735 to 15795
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long

In [5]:
# Derive a numeric "days since the sale" feature from the sale date.
import datetime  # only used to read today's calendar date

df["date"] = pd.to_datetime(df["date"])
# "now" holds today's date (midnight) on every row, so the subtraction below is vectorised.
df["now"] = pd.to_datetime(datetime.date.today().isoformat())
df["now-date"] = df["now"].sub(df["date"]).dt.days
df["now-date"]  # whole days between today and the sale date

735      2801
2830     2847
4106     2853
16218    2625
19964    2885
         ... 
11964    2763
21575    2765
5390     2807
860      2909
15795    2631
Name: now-date, Length: 21613, dtype: int64

In [6]:
# Separate the regression target from the model inputs. The record id, the
# two datetime columns and the target itself are excluded from the inputs.
data_train = df.copy()
y_train = data_train["price"]
X_train = data_train.drop(columns=["id", "date", "price", "now"])
X_train.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,now-date
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652,2777.806922
std,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631,113.048011
min,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0,2568.0
25%,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0,2667.0
50%,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0,2791.0
75%,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0,2877.0
max,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0,2958.0


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric preprocessing: standardise every column, then fill any missing
# entries with the column median.
# NOTE(review): imputing before scaling is the more common order — confirm
# that scaling first is intentional.
num_pipline = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('Imputer', SimpleImputer(strategy="median")),
])
# X_train is rebound to the pipeline output from here on.
X_train = num_pipline.fit(X_train).transform(X_train)

In [18]:
# Build the model
def build_model(n_features=19):
    """Create and compile the residual SELU regression network.

    Layer stack (every hidden Dense layer uses SELU activation with
    LeCun-normal initialisation)::

        Flatten(n_features)
        Dense(256) -> Dense(128)
        ResidualBlock(128) -> Dropout(0.1)
        ResidualBlock(128) -> Dropout(0.1)
        Dense(128) -> Dropout(0.2)
        Dense(1)

    Args:
        n_features: Number of input columns per sample. Defaults to 19, the
            width that was previously hard-coded.

    Returns:
        A ``keras.models.Sequential`` model compiled with MSE loss, the Adam
        optimizer and an ``"mse"`` metric (so ``fit`` reports ``val_mse``).
    """
    def selu_dense(units):
        """Return a Dense layer with SELU activation and LeCun-normal init."""
        return keras.layers.Dense(units, kernel_initializer='lecun_normal',
                                  activation='selu')

    class ResidualBlock(keras.layers.Layer):
        """Dense SELU stack whose output is added to the block's own input.

        The addition requires the input width to equal ``n_neurons``.
        """

        def __init__(self, n_neurons, **kwargs):
            super().__init__(**kwargs)
            self.main_layers = [selu_dense(n_neurons)]

        def call(self, inputs):
            # Bind the skip path once, before the loop, so it exists even if
            # main_layers is empty (previously it was assigned inside the loop).
            skip_Z = inputs
            Z = inputs
            for layer in self.main_layers:
                Z = layer(Z)
            return Z + skip_Z

    # Sequential API model
    model = keras.models.Sequential()
    model.add(keras.layers.Flatten(input_shape=(n_features,)))

    def add_with_dropout(layer, rate):
        """Append ``layer``; follow it with Dropout(rate) only when rate > 0."""
        model.add(layer)
        if rate > 0:
            model.add(keras.layers.Dropout(rate=rate))

    add_with_dropout(selu_dense(256), 0)
    add_with_dropout(selu_dense(128), 0)
    add_with_dropout(ResidualBlock(128), 0.1)
    add_with_dropout(ResidualBlock(128), 0.1)
    add_with_dropout(selu_dense(128), 0.2)
    model.add(keras.layers.Dense(1))

    model.compile(loss="mse",
                  optimizer="adam",
                  metrics=["mse"])

    return model

In [19]:
def k_folding(num_epochs, k=10, rmse_threshold=130000, batch_size=32):
    """Run k-fold cross-validation and report the epochs with a low validation RMSE.

    For each fold a fresh model from ``build_model()`` is trained for
    ``num_epochs`` epochs and its per-epoch ``val_mse`` is recorded. The
    histories are averaged across folds, converted to RMSE (square root of
    the mean MSE) and every epoch whose averaged RMSE is below
    ``rmse_threshold`` is printed as ``"<rmse> epoch = <epoch>"``.

    The threshold used to be compared against the raw averaged MSE, which is
    on the squared scale, so an RMSE-sized threshold such as 130000 could
    never match. The comparison is now done on RMSE.

    Reads the module-level ``X_train`` and ``y_train``.

    NOTE(review): ``X_train`` is scaled on the full data set before this
    function runs, so each validation fold has already influenced the scaler
    statistics.

    Args:
        num_epochs: Number of training epochs per fold.
        k: Number of folds. Defaults to 10.
        rmse_threshold: Only epochs whose fold-averaged validation RMSE is
            strictly below this value are printed. Defaults to 130000.
        batch_size: Mini-batch size passed to ``model.fit``. Defaults to 32.

    Raises:
        ValueError: If ``k`` is smaller than 2 or there are fewer samples
            than folds.
    """
    # Convert once to NumPy so the fold slices below are purely positional,
    # regardless of the (shuffled) pandas index carried by y_train.
    features = np.asarray(X_train)
    targets = np.asarray(y_train)

    if k < 2:
        raise ValueError("k must be at least 2")
    num_val = len(features) // k
    if num_val == 0:
        raise ValueError("not enough samples for the requested number of folds")

    np.random.seed(42)
    tf.random.set_seed(42)

    all_val_mse_histories = []
    for fold in range(k):
        print('processing fold #', fold)

        # Samples beyond k * num_val (the remainder of the integer division)
        # are never used for validation; they stay in every training part.
        val_start = fold * num_val
        val_stop = (fold + 1) * num_val
        X_val = features[val_start:val_stop]
        y_val = targets[val_start:val_stop]
        X_train_part = np.concatenate(
            [features[:val_start], features[val_stop:]], axis=0)
        y_train_part = np.concatenate(
            [targets[:val_start], targets[val_stop:]], axis=0)

        # Release Keras state left over from the previous fold's model so
        # memory does not grow with every fold.
        keras.backend.clear_session()
        model = build_model()
        history = model.fit(X_train_part, y_train_part,
                            epochs=num_epochs,
                            validation_data=(X_val, y_val),
                            verbose=0, batch_size=batch_size)
        all_val_mse_histories.append(history.history['val_mse'])

    # Average the validation MSE across folds for every epoch, then take the
    # square root so the values are on the same scale as the target.
    ave_val_mse_history = np.mean(all_val_mse_histories, axis=0)
    ave_val_rmse_history = np.sqrt(ave_val_mse_history)

    for epoch, rmse in enumerate(ave_val_rmse_history, start=1):
        if rmse < rmse_threshold:
            print(f'{rmse} epoch = {epoch}')

In [None]:
# Run the cross-validation with 500 training epochs per fold.
k_folding(500)

processing fold # 0


In [11]:
# # Build the model
# def build_model():
#     class ResidualBlock(keras.layers.Layer):
#         def __init__(self, n_neurons, **kwargs):
#             super().__init__(**kwargs)
#             self.main_layers = [
#                 keras.layers.Dense(n_neurons, kernel_initializer = 'lecun_normal', 
#                                    activation = "selu")]
            
#             self.skip_layers = []
            
#         def call(self, inputs):
#             Z = inputs
#             for layer in self.main_layers:
#                 Z = layer(Z)
#                 skip_Z = inputs
#             for layer in self.skip_layers:
#                 skip_Z = layer(skip_Z)
#             return Z + skip_Z
    
#     # Sequential API model
#     model = keras.models.Sequential()
#     model.add(keras.layers.Flatten(input_shape = (19,)))
#     model.add(keras.layers.Dense(256, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(ResidualBlock(256))
#     model.add(ResidualBlock(256))
#     model.add(keras.layers.Dense(128, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.Dropout(rate = 0.2))
    
#     model.add(keras.layers.Dense(1))
    
#     model.compile(loss="mse", 
#                   optimizer = "adam", 
#                   metrics=["mse"])
    
#     117679.0273787135, epoch 70 batch 32

In [12]:
# # Build the model
# def build_model():
#     class ResidualBlock(keras.layers.Layer):
#         def __init__(self, n_neurons, **kwargs):
#             super().__init__(**kwargs)
#             self.main_layers = [
#                 keras.layers.Dense(n_neurons, kernel_initializer = 'lecun_normal', 
#                                    activation = "selu"),
#                 keras.layers.BatchNormalization()]
            
#             self.skip_layers = []
            
#         def call(self, inputs):
#             Z = inputs
#             for layer in self.main_layers:
#                 Z = layer(Z)
#                 skip_Z = inputs
#             for layer in self.skip_layers:
#                 skip_Z = layer(skip_Z)
#             return Z + skip_Z
    
#     class ResidualRegressor(keras.models.Model):
#         def __init__(self, **kwargs):
#             super().__init__(**kwargs)
#             self.hidden1 = keras.layers.Dense(300, activation="selu",
#                                               kernel_initializer="lecun_normal")
#             self.hidden2 = keras.layers.BatchNormalization()
#             self.block1 = ResidualBlock(300)
#             self.block2 = keras.layers.Dense(300, activation="selu",
#                                               kernel_initializer="lecun_normal")
#             self.block3 = keras.layers.BatchNormalization()
#             self.block4 = keras.layers.Dense(300, activation="selu",
#                                               kernel_initializer="lecun_normal")
#             self.block5 = keras.layers.BatchNormalization()
#             self.out = keras.layers.Dense(1)

#         def call(self, inputs):
#             Z = self.hidden1(inputs)
#             Z = self.hidden2(Z)
#             Z = self.block1(Z)
#             Z = self.block2(Z)
#             Z = self.block3(Z)
#             Z = self.block4(Z)
#             Z = self.block5(Z)
#             return self.out(Z)
            
#     model = ResidualRegressor()
    
#     model.compile(loss="mse", 
#                   optimizer = "adam", 
#                   metrics=["mse"])
    
#     125017.94057494309 batch 32

In [13]:
# # Build the model
# def build_model():
#     model = keras.models.Sequential()
#     model.add(keras.layers.Flatten(input_shape = (18,)))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(300, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(300, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(300, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(1))
    
#     model.compile(loss="mse", 
#                   optimizer = "adam", 
#                   metrics=["mse"])
    
#     139864.39512613637 batch 32

In [14]:
# def k_folding2(num_epochs):
#     early_stop_patiences = 20
#     k = 10
#     num_val = len(X_train) // k
#     all_scores = []
    
#     np.random.seed(42)
#     tf.random.set_seed(42)

#     #early stopping
#     early_stopping_cb = keras.callbacks.EarlyStopping(patience = early_stop_patiences,
#                                                      restore_best_weights = True)

#     #learning_rate down
#     lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience = 10)
    
#     for i in range(k):
#         #callback
#         checkpoint_cb = keras.callbacks.ModelCheckpoint(f"my_proj03_model_{i}.h5",
#                                                    save_best_only = True)
#         print('processing fold #', i)

#         X_val = X_train[i * num_val: (i + 1) * num_val]
#         y_val = y_train[i * num_val: (i + 1) * num_val]
#         X_train_part = np.concatenate(
#             [X_train[:i * num_val],
#             X_train[(i + 1) * num_val:]],
#             axis = 0)
#         y_train_part = np.concatenate(
#             [y_train[:i * num_val],
#             y_train[(i + 1) * num_val:]],
#             axis = 0)

#         model = build_model()
#         model.fit(X_train_part, y_train_part,
#                  epochs = num_epochs, validation_data = (X_val, y_val),
#                  batch_size = 32,
#                  callbacks = [early_stopping_cb, checkpoint_cb])
        
#         model = keras.models.load_model(f"my_proj03_model_{i}.h5")
#         val_mse, val_mae = model.evaluate(X_val, y_val, verbose = 0)
#         all_scores.append(val_mse)
#         print(f"{val_mse}\n")
#     print(all_scores)
#     print(f"score : {np.sqrt(np.mean(all_scores))}")

In [15]:
# k_folding2(500)

In [16]:
# # Build the model
# def build_model():
#     model = keras.models.Sequential()
#     model.add(keras.layers.Flatten(input_shape = (18,)))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(128, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(128, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(32, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(16, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
#     model.add(keras.layers.Dense(16, kernel_initializer = 'lecun_normal', activation = "selu"))
#     model.add(keras.layers.BatchNormalization())
# #     model.add(keras.layers.Dropout(rate=0.5))
#     model.add(keras.layers.Dense(1))
    
#     model.compile(loss="mse", 
#                   optimizer = 'nadam', 
#                   metrics=["mse"])
    
#     197558.73356953874