In [1]:
# Import modules for dataframes
import pandas as pd 
import numpy as np 

# Import modules for TensorFlow
import tensorflow as tf 
from tensorflow import keras
from keras import layers
from keras import backend as K

# Import module for Kfold validation
from sklearn.model_selection import KFold

# Import modules for plotting
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Import others
import os

In [2]:
# Setting up working directory
# os.chdir("c:\\PythonProject\\EDA-Project")
# os.getcwd()

In [3]:
# Paths for data files
str_path_train_data = "./data/sKor_data_tot_train.csv"
str_path_test_data = "./data/sKor_data_tot_test.csv"

In [4]:
# Read training data set
df_raw_train = pd.read_csv(str_path_train_data, index_col= 0)
# df_raw_train["cat_year"] = df_raw_train["id_hs"].apply(lambda x: int(x.split("_")[0]))
df_raw_train.head(10)

Unnamed: 0,cat_loc_div,cat_sz_cty,cat_hus_typ,num_flr,num_out_wls,cat_hus_dir_fce,num_hus_blt_yr,num_hus_ar,num_bed,num_liv,num_bat,num_out_wds,cat_fuel_heat,cat_cool,cat_fuel_cook,num_hus_mems,cat_mhh_occu,num_tot_energy_heat,num_mhh_age,num_hh_ann_incm
0,1,2,3,3,2,1,4,105.6,3,1,1,11,1,1,4,2,5,13741.16,67,25000000
1,3,2,3,2,4,2,5,59.4,2,1,1,3,3,3,4,5,1,12531.197472,55,25000000
2,1,1,3,5,6,2,6,115.5,3,1,2,4,1,3,4,3,1,6329.650468,35,42000000
3,1,1,1,1,6,1,1,165.0,2,1,1,4,2,1,3,3,1,10775.93,45,25000000
4,1,2,2,4,4,1,5,99.0,3,1,1,4,2,1,3,4,1,8710.22,45,42000000
5,3,1,1,1,6,2,4,99.0,4,1,2,8,1,3,4,4,1,9325.095106,55,4
6,2,1,3,28,4,2,5,141.9,4,1,2,17,1,3,4,2,1,5691.611227,55,96000000
7,1,2,2,2,4,3,5,72.6,2,1,1,8,1,3,4,3,1,9653.92,55,42000000
8,1,1,2,3,5,1,4,66.0,2,1,1,7,1,3,4,2,1,5783.180994,35,96000000
9,3,2,3,2,4,2,5,79.2,2,1,1,8,1,3,4,3,1,12291.820247,55,54000000


In [5]:
# Read test data set
df_raw_test = pd.read_csv(str_path_test_data, index_col= 0)
# df_raw_test["cat_year"] = df_raw_test["id_hs"].apply(lambda x: int(x.split("_")[0]))
df_raw_test.head(10)

Unnamed: 0,cat_loc_div,cat_sz_cty,cat_hus_typ,num_flr,num_out_wls,cat_hus_dir_fce,num_hus_blt_yr,num_hus_ar,num_bed,num_liv,num_bat,num_out_wds,cat_fuel_heat,cat_cool,cat_fuel_cook,num_hus_mems,cat_mhh_occu,num_tot_energy_heat,num_mhh_age,num_hh_ann_incm
0,2,2,1,1,6,1,4,23.1,1,1,1,2,1,3,3,2,5,2947.48,67,18000000
1,3,2,1,1,5,2,5,46.2,2,0,0,7,2,1,3,1,5,4774.3,67,12000000
2,2,1,2,2,5,4,4,138.6,4,1,2,16,1,3,4,4,3,39091.290564,55,54000000
3,3,1,3,4,5,3,1,79.2,3,1,1,14,1,3,4,6,1,16780.7,55,54000000
4,3,2,3,8,4,4,4,72.6,3,1,1,5,1,3,3,4,1,8685.600719,35,25000000
5,1,1,3,13,2,1,4,69.3,2,1,1,6,1,3,4,4,3,8510.54,55,3
6,1,1,3,13,2,1,4,49.5,1,1,1,4,3,1,4,3,3,11136.165175,45,4
7,3,2,3,11,4,2,4,125.4,3,1,2,18,1,3,3,3,1,8029.53,55,90000000
8,3,2,1,1,6,2,1,33.0,2,1,1,4,2,3,3,2,3,9906.49,67,18000000
9,3,1,2,2,3,3,6,102.3,3,1,2,26,1,2,4,3,1,9446.150342,45,5


In [6]:
# Copy datasets for isolation.
df_train = df_raw_train.copy()
df_test = df_raw_test.copy()

In [7]:
# Columns for variables (features).
lst_cols = list(df_train.columns)

# Numerical feature columns 
lst_cols_num = [l for l in lst_cols if l.split("_")[0] == "num"]
lst_cols_num.pop(lst_cols_num.index("num_tot_energy_heat")) # Excluding target variable.
print(lst_cols_num)

# Categorical feature columns
lst_cols_cat = [l for l in lst_cols if l.split("_")[0] == "cat"]
print(lst_cols_cat) 

['num_flr', 'num_out_wls', 'num_hus_blt_yr', 'num_hus_ar', 'num_bed', 'num_liv', 'num_bat', 'num_out_wds', 'num_hus_mems', 'num_mhh_age', 'num_hh_ann_incm']
['cat_loc_div', 'cat_sz_cty', 'cat_hus_typ', 'cat_hus_dir_fce', 'cat_fuel_heat', 'cat_cool', 'cat_fuel_cook', 'cat_mhh_occu']


In [8]:
# If needed, filter data frame here and check data types
lst_cols_fil = lst_cols_num + lst_cols_cat + ["num_tot_energy_heat"]
df_train_fil = df_train[lst_cols_fil]   # Organizing columns.
df_test_fil = df_test[lst_cols_fil]     # Organizing columns.
print(df_train_fil.dtypes)
print("------BREAK------")
print(df_test_fil.dtypes)

num_flr                  int64
num_out_wls              int64
num_hus_blt_yr           int64
num_hus_ar             float64
num_bed                  int64
num_liv                  int64
num_bat                  int64
num_out_wds              int64
num_hus_mems             int64
num_mhh_age              int64
num_hh_ann_incm          int64
cat_loc_div              int64
cat_sz_cty               int64
cat_hus_typ              int64
cat_hus_dir_fce          int64
cat_fuel_heat            int64
cat_cool                 int64
cat_fuel_cook            int64
cat_mhh_occu             int64
num_tot_energy_heat    float64
dtype: object
------BREAK------
num_flr                  int64
num_out_wls              int64
num_hus_blt_yr           int64
num_hus_ar             float64
num_bed                  int64
num_liv                  int64
num_bat                  int64
num_out_wds              int64
num_hus_mems             int64
num_mhh_age              int64
num_hh_ann_incm          int64
cat_loc

In [9]:
# Convert categorical variables into category type
for cat in lst_cols_cat:
    df_train_fil[cat] = df_train_fil[cat].astype("category")
    df_test_fil[cat] = df_test_fil[cat].astype("category")
    
print(df_train_fil.dtypes)
print("------BREAK------")
print(df_test_fil.dtypes)

num_flr                   int64
num_out_wls               int64
num_hus_blt_yr            int64
num_hus_ar              float64
num_bed                   int64
num_liv                   int64
num_bat                   int64
num_out_wds               int64
num_hus_mems              int64
num_mhh_age               int64
num_hh_ann_incm           int64
cat_loc_div            category
cat_sz_cty             category
cat_hus_typ            category
cat_hus_dir_fce        category
cat_fuel_heat          category
cat_cool               category
cat_fuel_cook          category
cat_mhh_occu           category
num_tot_energy_heat     float64
dtype: object
------BREAK------
num_flr                   int64
num_out_wls               int64
num_hus_blt_yr            int64
num_hus_ar              float64
num_bed                   int64
num_liv                   int64
num_bat                   int64
num_out_wds               int64
num_hus_mems              int64
num_mhh_age               int64
num_hh_a

In [10]:
# Create dummy (one-hot coding) columns for each categorical labels
for cat in lst_cols_cat:
    df_train_fil = pd.get_dummies(df_train_fil, columns= [cat], prefix= cat, prefix_sep="_")
    df_test_fil = pd.get_dummies(df_test_fil, columns= [cat], prefix= cat, prefix_sep="_")
    
print(df_train_fil.dtypes)
print("------BREAK------")
print(df_test_fil.dtypes)

num_flr                  int64
num_out_wls              int64
num_hus_blt_yr           int64
num_hus_ar             float64
num_bed                  int64
num_liv                  int64
num_bat                  int64
num_out_wds              int64
num_hus_mems             int64
num_mhh_age              int64
num_hh_ann_incm          int64
num_tot_energy_heat    float64
cat_loc_div_1            uint8
cat_loc_div_2            uint8
cat_loc_div_3            uint8
cat_sz_cty_1             uint8
cat_sz_cty_2             uint8
cat_hus_typ_1            uint8
cat_hus_typ_2            uint8
cat_hus_typ_3            uint8
cat_hus_dir_fce_1        uint8
cat_hus_dir_fce_2        uint8
cat_hus_dir_fce_3        uint8
cat_hus_dir_fce_4        uint8
cat_fuel_heat_1          uint8
cat_fuel_heat_2          uint8
cat_fuel_heat_3          uint8
cat_fuel_heat_4          uint8
cat_fuel_heat_5          uint8
cat_cool_1               uint8
cat_cool_2               uint8
cat_cool_3               uint8
cat_fuel

In [11]:
# Define beackend function for R_Squared metrics
# Use it if necessary for model quality metrics.
def coeff_determination(y_true, y_pred):
    SS_res = K.sum(K.square( y_true-y_pred )) 
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [12]:
# Check physical devices.
print(tf.config.list_physical_devices("CPU"))
print(tf.config.list_physical_devices("GPU"))

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [13]:
# Define a function to generate DNN model for regression.
# For a general regression function, two dense layers were configured.
# Variable hyperparameter is number of nodes in the layers.
# More than two layers would be too excessive.
def build_and_compile_model(norm, in_int_nodes):
    
    model = keras.Sequential([
        norm,
        layers.Dense(in_int_nodes, activation='relu'),
        # layers.Dropout(rate= 0.2),
        layers.Dense(in_int_nodes, activation='relu'),
        # layers.Dropout(rate= 0.2),
        # layers.Dense(100, activation='relu'),
        layers.Dense(1)
    ])
    
    optimizer = tf.keras.optimizers.Adam(0.0005) # Set optimization step!
        
    model.compile(
        loss = "mean_absolute_percentage_error",
        optimizer = optimizer,
        metrics = [
                tf.keras.metrics.RootMeanSquaredError(), # RMSE                
                tf.keras.metrics.MeanAbsoluteError(), # MAE
                tf.keras.metrics.MeanAbsolutePercentageError(), # MAPE
                # coeff_determination, # Use if necessary
            ],        
    )
    return model

In [14]:
# Prepare datasets.
train_features = df_train_fil.copy()
test_featrues = df_test_fil.copy()
train_label = train_features.pop("num_tot_energy_heat")
test_label = test_featrues.pop("num_tot_energy_heat")

In [15]:
# Define trial nodes
lst_int_dnn_nodes = [10, 20, 30, 40, 50, 80]

# Define K value for Kfold CV
n_split = 10

# Define early stop call back
# As K-fold CV will be applied, call-back will monitor 'loss' not 'val_loss' from validation data.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

# Define normalization layer and adapt with training data
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

# Adapt normalizing layer and check it.
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print("------BREAK------")
    print('Normalized:', normalizer(first).numpy()) 

First example: [[       3.         2.         4.       105.6        3.         1.
         1.        11.         2.        67.  25000000.         1.
         0.         0.         0.         1.         0.         0.
         1.         1.         0.         0.         0.         1.
         0.         0.         0.         0.         1.         0.
         0.         0.         1.         0.         0.         0.
         0.         0.         1. ]]
------BREAK------
Normalized: [[-0.29 -1.7   0.09  0.66  0.38  0.17 -0.66  0.52 -0.74  1.2  -0.01  1.33
  -0.55 -0.82 -0.89  0.89 -0.78 -0.43  1.06  1.14 -0.66 -0.46 -0.3   0.74
  -0.48 -0.3  -0.2  -0.22  1.64 -0.11 -1.59 -0.63  0.83 -0.38 -0.98 -0.24
  -0.56 -0.2   2.14]]


In [16]:
# Convert Dataframes into arrays. (Dropping columns...)
train_features_array = np.array(train_features)
train_label_array = np.array(train_label)
test_featrues_array = np.array(test_featrues)
test_label_array = np.array(test_label)

In [17]:
# Empty lists to record training results.
lst_df_eval_nodes = []
lst_df_eval_nodes_cv = []

# LOOP1: Loop calculation for all number of input nodes in dense layers.
for node in lst_int_dnn_nodes:
    
    dic_model_eval_cv = {}  # Empty list to record model quality metrics.
    cv_idx = 0              # Set up K-fold sample index number.
    
    # LOOP2: Loop calculation for KFold samples. 
    for train_idx, val_idx in KFold(n_split, shuffle= True, random_state= 123).split(train_features_array, train_label_array):
        # Update K-fold sample index.
        cv_idx += 1
        print(f"Training for node number: {node} & CV sample: {cv_idx} has been commenced.")
        # Extract K-fold sample from raw datasets.
        cv_features_train = train_features_array[train_idx]     
        cv_features_val = train_features_array[val_idx]
        cv_label_train = train_label_array[train_idx]
        cv_label_val = train_label_array[val_idx]         
        # Build DNN model with defined normalizer and node numbers.
        dnn_model = build_and_compile_model(normalizer, node)
        # Train the model. You may adjust epoch number.
        dnn_model.fit(
            cv_features_train,
            cv_label_train,
            validation_split= 0, # This option is not for CV method.
            batch_size= 128,
            verbose=0, epochs=500,
            callbacks= [early_stop],
        )
        # Record evaluation of the trained model with test dataset.
        dic_model_eval_cv["cv_{}".format(cv_idx)] = list(dnn_model.evaluate(cv_features_val, cv_label_val, verbose=0))
    
    print(f"All K models for node number: {node} has been trained and evaluated.")
    
    # Post-process evaluation results from all K models.
    df_cv_res = pd.DataFrame(dic_model_eval_cv)
    df_cv_res = df_cv_res.iloc[1:,]
    df_cv_res = df_cv_res.transpose()
    
    # Calculate mean values of K-fold evaluation results for each node value.
    dic_single = pd.DataFrame([list(df_cv_res.mean())], columns= ["RMSE", "MAE", "MAPE"])
    dic_single.rename(index= {0:"node_{}".format(node)}, inplace=True)
    
    # Append lists for results recording.
    lst_df_eval_nodes.append(dic_single)
    lst_df_eval_nodes_cv.append(df_cv_res)

Training for node number: 10 & CV sample: 1 has been commenced.
Training for node number: 10 & CV sample: 2 has been commenced.
Training for node number: 10 & CV sample: 3 has been commenced.
Training for node number: 10 & CV sample: 4 has been commenced.
Training for node number: 10 & CV sample: 5 has been commenced.
Training for node number: 10 & CV sample: 6 has been commenced.
Training for node number: 10 & CV sample: 7 has been commenced.
Training for node number: 10 & CV sample: 8 has been commenced.
Training for node number: 10 & CV sample: 9 has been commenced.
Training for node number: 10 & CV sample: 10 has been commenced.
All K models for node number: 10 has been trained and evaluated.
Training for node number: 20 & CV sample: 1 has been commenced.
Training for node number: 20 & CV sample: 2 has been commenced.
Training for node number: 20 & CV sample: 3 has been commenced.
Training for node number: 20 & CV sample: 4 has been commenced.
Training for node number: 20 & CV samp

In [18]:
# Concatnate evaluation results for each node number of dense layer.
df_concat_eval_nodes = pd.concat(lst_df_eval_nodes, ignore_index= False, sort= False)
df_concat_eval_nodes

Unnamed: 0,RMSE,MAE,MAPE
node_10,5435.650195,3741.777515,33.72204
node_20,5331.768994,3671.88606,33.250787
node_30,5287.033691,3640.141479,33.090785
node_40,5269.679443,3634.826099,33.181406
node_50,5261.232617,3626.412036,33.028295
node_80,5195.071875,3583.489038,32.922137


In [19]:
# Store the evaluation results.
df_concat_eval_nodes.to_csv("./results/dnn_10_cv_node.csv")

In [20]:
# Check more details on 10-fold CV results.
lst_df_eval_nodes_cv

[                 1            2          3
 cv_1   5350.889160  3824.616211  33.039158
 cv_2   5119.219238  3656.346436  34.032211
 cv_3   5624.202148  3827.785156  32.083580
 cv_4   5093.550293  3488.932373  34.541592
 cv_5   5189.332031  3622.850342  34.221638
 cv_6   6642.273438  4078.069824  36.798416
 cv_7   5548.025879  3821.425049  33.330925
 cv_8   5172.862305  3659.938721  33.949535
 cv_9   5521.609375  3820.698486  33.814651
 cv_10  5094.538086  3617.112549  31.408697,
                  1            2          3
 cv_1   5274.268066  3787.189941  32.526669
 cv_2   5059.735840  3638.032959  33.961136
 cv_3   5558.224121  3785.165771  31.802855
 cv_4   5004.063477  3403.668213  33.970932
 cv_5   5100.689941  3567.480225  33.850037
 cv_6   6459.402344  3960.286865  35.838200
 cv_7   5488.666992  3776.479980  33.168800
 cv_8   5061.389648  3572.966797  33.218372
 cv_9   5311.005371  3690.055420  33.122284
 cv_10  5000.244141  3537.534424  31.048584,
                  1           

In [21]:
# Build a final DNN model to use all training dataset.
# Let's use a node number which shows the best performance.
dnn_model_fin = build_and_compile_model(normalizer, 30)
dnn_model_fin.summary()

Model: "sequential_60"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, 39)               79        
 n)                                                              
                                                                 
 dense_180 (Dense)           (None, 30)                1200      
                                                                 
 dense_181 (Dense)           (None, 30)                930       
                                                                 
 dense_182 (Dense)           (None, 1)                 31        
                                                                 
Total params: 2,240
Trainable params: 2,161
Non-trainable params: 79
_________________________________________________________________


In [22]:
# Train the final model with more epochs.
history = dnn_model_fin.fit(
    train_features_array,
    train_label_array,
    validation_split= 0, # No validation set.
    batch_size= 128,
    verbose=1, epochs=1000,
    # callbacks= [early_stop],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [23]:
# Final trained model evaluation with test data.
dnn_model_fin.evaluate(test_featrues_array, test_label_array, verbose=0)

[33.75315475463867, 5887.86865234375, 3793.4873046875, 33.75315475463867]

In [24]:
# Save trained model.
dnn_model_fin.save("./results/dnn_model_fin")

INFO:tensorflow:Assets written to: ./results/dnn_model_fin\assets


In [25]:
# Load trained model and check evaluation results.
saved_dnn_model_fin = tf.keras.models.load_model("./results/dnn_model_fin")
saved_dnn_model_fin.evaluate(test_featrues_array, test_label_array, verbose=0)

[33.75315475463867, 5887.86865234375, 3793.4873046875, 33.75315475463867]

In [26]:
# Visualisation Templete.
eda_tempelete_01_white = dict(
    layout = go.Layout(
        # Layout properties
        title_font_size= 14,
        title_x= 0.1,
        font_size= 11,
        font_color= "#000000",
        font_family= "Times New Roman",
        margin_b = 65,
        margin_l = 60,
        margin_r = 30,
        margin_t = 50,
        plot_bgcolor= "#ffffff",
        # X axis properties
        xaxis_color= "#000000",
        xaxis_linecolor= "#000000",
        xaxis_ticks= "inside",        
        xaxis_tickfont_color= "#000000",
        xaxis_tickfont_family= "Times New Roman",
        xaxis_mirror= True,
        xaxis_showline= True,
        xaxis_showgrid= False,
        # Y axis properties
        yaxis_color= "#000000",
        yaxis_linecolor= "#000000",
        yaxis_ticks= "inside",
        yaxis_tickfont_color= "#000000",
        yaxis_tickfont_family= "Times New Roman",
        yaxis_mirror= True,
        yaxis_showline= True,
        yaxis_showgrid= False,
    )
)

In [27]:
# Predictions from the trained model with test dataset.
test_predictions = saved_dnn_model_fin.predict(test_featrues_array).flatten()
residual = test_predictions - test_label_array
residual



array([   783.69041016,   2602.1375    , -26868.743689  , ...,
         2055.87484238,   2291.95122911,   4391.87089844])

In [31]:
# Residual plot.
fig_residual = px.scatter(
    x= test_predictions,
    y= residual,
    trendline= "ols",
    trendline_color_override= "#fc4040",    
)

fig_residual.update_traces(
    marker_symbol= "circle-open",
    marker_color= "#000000"
)

fig_residual.update_layout(
    title= "Residuals for DNN predictions",
    xaxis_title= "Predicted Energy Consumption [MCal]",
    yaxis_title= "Residual [MCal]",
    # yaxis_zeroline= False,
    xaxis_zeroline= False,
    width= 350,
    height= 350,
    template= eda_tempelete_01_white,    
)

fig_residual.show()

In [32]:
# Prediction Plot.
fig_prediction =  go.Figure()

# lim_pred = max(test_label_array)
lim_pred = 30000

fig_prediction.add_traces(
    go.Scatter(
        x= test_label_array,
        y= test_predictions,
        mode= "markers",
        marker_symbol= "circle-open",
        marker_color= "#000000",        
    )
)

fig_prediction.add_traces(
    go.Scatter(
        x= [0,lim_pred],
        y= [0,lim_pred],
        mode= "lines",        
        marker_color= "#fc4040",        
    )
)

fig_prediction.update_layout(
    title= "Prediction of DNN",
    xaxis_title= "True Values [MCal]",    
    xaxis_fixedrange= True,
    xaxis_range = [0, lim_pred],
    yaxis_title= "Predictions [MCal]",
    yaxis_fixedrange= True,
    yaxis_range = [0, lim_pred],
    width= 350,
    height= 350,
    showlegend= False,
    template= eda_tempelete_01_white,    
)