In [32]:
# https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_04_3_regression.ipynb

In [16]:
# import the required library
import pandas as pd
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

In [3]:
df.head()

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [4]:
df.isna().sum()

id                 0
job                0
area               0
income            59
aspect             0
subscriptions      0
dist_healthy       0
save_rate          0
dist_unhealthy     0
age                0
pop_dense          0
retail_dense       0
crime              0
product            0
dtype: int64

In [5]:
"""
Our purpose is to predict the age by analysisng shopping pattern
we can see income has only missing value
"""


'\nOur purpose is to predict the age by analysisng shopping pattern\nwe can see income has only missing value\n'

In [6]:
pd.get_dummies(df["job"], prefix="job").head(4)

Unnamed: 0,job_11,job_al,job_am,job_ax,job_bf,job_by,job_cv,job_de,job_dz,job_e2,...,job_pe,job_po,job_pq,job_pz,job_qp,job_qw,job_rn,job_sa,job_vv,job_zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df["job"], prefix="job") ], axis=1)
df.drop("job", axis = 1, inplace = True)

In [8]:
df.head(2)

Unnamed: 0,id,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,...,job_pe,job_po,job_pq,job_pz,job_qp,job_qw,job_rn,job_sa,job_vv,job_zz
0,1,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,...,0,0,0,0,0,0,0,0,1,0
1,2,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Generate dummies for area
df = pd.concat([df, pd.get_dummies(df["area"], prefix = "area")], axis = 1)
df.drop("area", axis = 1, inplace=True)


In [10]:
# Generate dummies for product
df = pd.concat([df, pd.get_dummies(df["product"], prefix="product")], axis = 1)
df.drop("product", axis = 1, inplace=True)

In [12]:
# Fill median income to na 
df["income"].isna().sum()

59

In [14]:
df['income'].fillna(value=df["income"].median(), inplace=True)

In [17]:
# Standardize ranges
df["income"] = zscore(df["income"])
df["aspect"] = zscore(df["aspect"])
df["save_rate"] = zscore(df["subscriptions"])


In [18]:
df.head()

Unnamed: 0,id,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,1,-0.60755,-0.664918,1,9.017895,-0.208449,11.738935,49,0.885827,0.492126,...,0,1,0,0,1,0,0,0,0,0
1,2,0.338053,-0.207748,2,7.766643,0.839031,6.805396,51,0.874016,0.34252,...,0,1,0,0,0,1,0,0,0,0
2,3,-0.184205,1.127906,1,3.632069,-0.208449,13.671772,44,0.944882,0.724409,...,0,1,0,0,1,0,0,0,0,0
3,4,-0.526467,-0.440815,1,5.372942,-0.208449,4.333286,50,0.889764,0.444882,...,0,1,0,0,1,0,0,0,0,0
4,5,-2.851675,1.638861,3,3.822477,1.886511,5.967121,38,0.744094,0.661417,...,0,0,1,1,0,0,0,0,0,0


In [19]:
x_cols = df.drop(columns=["age", "id"]) # age is y and id is no meaning in regression
x_cols.head()

Unnamed: 0,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,pop_dense,retail_dense,crime,job_11,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,-0.60755,-0.664918,1,9.017895,-0.208449,11.738935,0.885827,0.492126,0.0711,0,...,0,1,0,0,1,0,0,0,0,0
1,0.338053,-0.207748,2,7.766643,0.839031,6.805396,0.874016,0.34252,0.400809,0,...,0,1,0,0,0,1,0,0,0,0
2,-0.184205,1.127906,1,3.632069,-0.208449,13.671772,0.944882,0.724409,0.207723,0,...,0,1,0,0,1,0,0,0,0,0
3,-0.526467,-0.440815,1,5.372942,-0.208449,4.333286,0.889764,0.444882,0.361216,1,...,0,1,0,0,1,0,0,0,0,0
4,-2.851675,1.638861,3,3.822477,1.886511,5.967121,0.744094,0.661417,0.068033,0,...,0,0,1,1,0,0,0,0,0,0


In [20]:
x = x_cols.values
y = df["age"].values

In [21]:
# Split Data to train and Test
x_train, x_test , y_train, y_test = train_test_split(x,y, test_size=0.2,
                                                     random_state=42)
x_train.shape, x_test.shape

((1600, 53), (400, 53))

## Design DNN Model

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [23]:
## Build the model
model = Sequential()
model.add(Dense(25, input_dim = x.shape[1], activation = "relu")) #Hidden 1
model.add(Dense(10, activation = "relu")) # Hidden 2 
model.add(Dense(1))

model.compile(loss = "mean_squared_error", optimizer = "adam")

monitor = EarlyStopping(monitor = "val_loss", min_delta=1e-3,
                       patience=5, verbose=1, mode = "auto")

model.fit(x_train, y_train, validation_data=(x_test, y_test),
         callbacks=[monitor], verbose=2, epochs = 1000)

Epoch 1/1000
50/50 - 2s - loss: 1842.9844 - val_loss: 1521.2675 - 2s/epoch - 36ms/step
Epoch 2/1000
50/50 - 0s - loss: 1129.6904 - val_loss: 599.1052 - 126ms/epoch - 3ms/step
Epoch 3/1000
50/50 - 0s - loss: 335.4672 - val_loss: 231.3333 - 123ms/epoch - 2ms/step
Epoch 4/1000
50/50 - 0s - loss: 241.8071 - val_loss: 203.6827 - 103ms/epoch - 2ms/step
Epoch 5/1000
50/50 - 0s - loss: 209.5699 - val_loss: 172.1028 - 101ms/epoch - 2ms/step
Epoch 6/1000
50/50 - 0s - loss: 169.8161 - val_loss: 132.4540 - 102ms/epoch - 2ms/step
Epoch 7/1000
50/50 - 0s - loss: 123.9975 - val_loss: 93.5410 - 101ms/epoch - 2ms/step
Epoch 8/1000
50/50 - 0s - loss: 83.7607 - val_loss: 63.6562 - 101ms/epoch - 2ms/step
Epoch 9/1000
50/50 - 0s - loss: 53.8779 - val_loss: 41.6163 - 98ms/epoch - 2ms/step
Epoch 10/1000
50/50 - 0s - loss: 34.4512 - val_loss: 28.6969 - 96ms/epoch - 2ms/step
Epoch 11/1000
50/50 - 0s - loss: 23.7140 - val_loss: 22.0576 - 98ms/epoch - 2ms/step
Epoch 12/1000
50/50 - 0s - loss: 18.3811 - val_loss:

50/50 - 0s - loss: 0.5855 - val_loss: 0.8007 - 105ms/epoch - 2ms/step
Epoch 99/1000
50/50 - 0s - loss: 0.5811 - val_loss: 0.7730 - 102ms/epoch - 2ms/step
Epoch 100/1000
50/50 - 0s - loss: 0.5726 - val_loss: 0.7016 - 104ms/epoch - 2ms/step
Epoch 101/1000
50/50 - 0s - loss: 0.5455 - val_loss: 0.7070 - 107ms/epoch - 2ms/step
Epoch 102/1000
50/50 - 0s - loss: 0.5538 - val_loss: 0.8342 - 103ms/epoch - 2ms/step
Epoch 103/1000
50/50 - 0s - loss: 0.6363 - val_loss: 0.7145 - 101ms/epoch - 2ms/step
Epoch 104/1000
50/50 - 0s - loss: 0.5449 - val_loss: 0.7082 - 105ms/epoch - 2ms/step
Epoch 105/1000
50/50 - 0s - loss: 0.5722 - val_loss: 1.1711 - 108ms/epoch - 2ms/step
Epoch 00105: early stopping


<keras.callbacks.History at 0x24b93de6730>

### Mean Square Error

The mean square error is the sum of the squared differences between the prediction (
) and the expected (). MSE values are not of a particular unit. If an MSE value has decreased for a model, that is good. However, beyond this, there is not much more you can determine. Low MSE values are desired.

In [24]:
from sklearn import metrics

# predict by our pretrained model
pred = model.predict(x_test)

# Measure MSE error
score = metrics.mean_squared_error(pred, y_test)
print("Final Score (MSE): {}".format(score))

Final Score (MSE): 1.1711487756253063


## Lift Chart

In [None]:
def chart_regression(pred,y, sort = True):
    """
    plot LIFT Chart, 
    showing in sorting show where are the outliers are exist 
    in lower regions or in higher regions
    """
    t = pd.DataFrame()