In [0]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, train_test_split
from sklearn import metrics
from scipy.stats import zscore
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping 
import time
import statistics

In [0]:
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [6]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])
df.drop("id", axis=1, inplace=True)

# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Generate dummies for product
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)
nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "age"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "age"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.8454637982792446

In [0]:
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
df["dist_unhealthy"] = zscore(df["dist_unhealthy"])
df["dist_healthy"] = zscore(df["dist_healthy"])
x_columns = df.columns.drop('age')
x = df[x_columns].values
y = df['age'].values

In [8]:
df.head()

Unnamed: 0,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,job_11,job_al,job_am,job_ax,job_bf,job_by,job_cv,job_de,job_dz,job_e2,job_f8,job_gj,job_gv,job_kd,job_ke,job_kl,job_kp,job_ks,job_kw,job_mm,job_nb,job_nn,job_ob,job_pe,job_po,job_pq,job_pz,job_qp,job_qw,job_rn,job_sa,job_vv,job_zz,area_a,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
35,-0.150065,-0.4946,-0.208449,-0.670247,-0.387695,-1.105761,50,0.874016,0.417323,0.238394,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
69,0.245186,-0.996591,-0.208449,-0.000942,-0.542432,-1.169511,47,0.858268,0.503937,0.263349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
101,0.848228,1.360973,-1.255928,1.294946,-0.353308,-0.310185,44,0.925197,0.692913,0.086156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
134,0.130981,-1.40894,-0.208449,-0.247778,0.282834,-0.487775,44,0.826772,0.586614,0.303499,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
155,0.424151,0.724521,-0.208449,0.644628,0.661081,2.321132,43,0.992126,0.775591,0.443469,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0


In [9]:
SPLITS = 50

boot = ShuffleSplit(n_splits=SPLITS, test_size=0.1, random_state=69)

mean_benchmark = []
epochs_needed = []
num = 0

for train, test in boot.split(x):
  num += 1
  print(f"Cycle #{num}\n")

  x_train = x[train]
  x_test = x[test]
  y_train = y[train]
  y_test = y[test]

  model = Sequential()
  model.add(Dense(50, input_dim=x.shape[1], activation="relu", ))
  model.add(Dropout(0.5))
  model.add(Dense(25, activation="relu"))
  model.add(Dense(1))
  model.compile(loss="mean_squared_error", optimizer="adam")
  
  monitor = EarlyStopping(monitor="val_loss", min_delta=1e-3, patience=10,
                          verbose=0, mode="auto", restore_best_weights=True)
  
  start_time = time.time()
  model.fit(x_train, y_train, validation_data=(x_test, y_test),
           callbacks=[monitor], verbose=0, epochs=500)
  time_took = time.time() - start_time
  
  epochs = monitor.stopped_epoch
  epochs_needed.append(epochs)
 
  pred = model.predict(x_test)

  score = np.sqrt(metrics.mean_squared_error(y_test, pred))
  mean_benchmark.append(score)

  m1 = statistics.mean(mean_benchmark)
  m2 = statistics.mean(epochs_needed)
  mdev = statistics.pstdev(mean_benchmark)
  print(f"score={score:.6f}, mean score={m1:.6f}, stdev={mdev:.6f}, epochs={epochs}, mean epochs={int(m2)}, time={hms_string(time_took)}")

Cycle #1

score=1.037836, mean score=1.037836, stdev=0.000000, epochs=89, mean epochs=89, time=0:00:08.14
Cycle #2

score=1.193826, mean score=1.115831, stdev=0.077995, epochs=88, mean epochs=88, time=0:00:07.96
Cycle #3

score=1.331748, mean score=1.187803, stdev=0.120065, epochs=98, mean epochs=91, time=0:00:08.77
Cycle #4

score=1.484980, mean score=1.262098, stdev=0.165440, epochs=102, mean epochs=94, time=0:00:08.96
Cycle #5

score=1.234768, mean score=1.256632, stdev=0.148378, epochs=102, mean epochs=95, time=0:00:09.03
Cycle #6

score=1.449047, mean score=1.288701, stdev=0.153261, epochs=96, mean epochs=95, time=0:00:08.87
Cycle #7

score=1.118257, mean score=1.264352, stdev=0.153917, epochs=79, mean epochs=93, time=0:00:07.21
Cycle #8

score=1.201830, mean score=1.256537, stdev=0.145454, epochs=106, mean epochs=95, time=0:00:09.34
Cycle #9

score=1.512303, mean score=1.284955, stdev=0.158956, epochs=70, mean epochs=92, time=0:00:06.24
Cycle #10

score=1.100035, mean score=1.266

In [13]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])
# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)
nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "product"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "product"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.8128780309405559

In [0]:
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
df["dist_unhealthy"] = zscore(df["dist_unhealthy"])

x_col = df.columns.drop("product").drop("id")
x = df[x_col].values

dummies = pd.get_dummies(df['product']) # Classification
products = dummies.columns
y = dummies.values

In [28]:
Splits = 50

boot = StratifiedShuffleSplit(n_splits=SPLITS, test_size=0.1, random_state=69)

mean_benchmark = []
epochs_needed = []
cycle = 0

for train, test in boot.split(x, df["product"]):
  cycle += 1
  print(f"Cycle #{cycle}")

  x_train = x[train]
  x_test = x[test]
  y_train = y[train]
  y_test = y[test]

  model = Sequential()
  model.add(Dense(50, input_dim=x.shape[1], activation="relu"))
  model.add(Dropout(0.5))
  model.add(Dense(25, activation="relu"))
  model.add(Dense(y.shape[1], activation="softmax"))
  model.compile(loss="categorical_crossentropy", optimizer="adam")

  monitor = EarlyStopping(monitor="val_loss", min_delta=1e-3, patience=10,
                          verbose=0, mode="auto", restore_best_weights=True)

  start_time = time.time()
  model.fit(x_train, y_train, validation_data=(x_test, y_test),
            epochs=500, verbose=0, callbacks=[monitor])
  time_took = time.time() - start_time

  epochs = monitor.stopped_epoch

  pred = model.predict(x_test)
  y_compare = np.argmax(y_test, axis=1)
  score = metrics.log_loss(y_compare, pred)

  epochs_needed.append(epochs)
  mean_benchmark.append(score)

  m1 = statistics.mean(mean_benchmark)
  m2 = statistics.mean(epochs_needed)
  mdev = statistics.pstdev(mean_benchmark)

  print(f"log_loss={score:.6f}, mean log_loss={m1:.6f}, stdev={mdev:.6f}, epochs={epochs}, mean epochs={int(m2)}, time={hms_string(time_took)}")

Cycle #1
log_loss=0.712854, mean log_loss=0.712854, stdev=0.000000, epochs=67, mean epochs=67, time=0:00:06.27
Cycle #2
log_loss=0.712304, mean log_loss=0.712579, stdev=0.000275, epochs=65, mean epochs=66, time=0:00:06.07
Cycle #3
log_loss=0.704724, mean log_loss=0.709961, stdev=0.003710, epochs=71, mean epochs=67, time=0:00:06.67
Cycle #4
log_loss=0.724125, mean log_loss=0.713502, stdev=0.006924, epochs=62, mean epochs=66, time=0:00:05.74
Cycle #5
log_loss=0.719642, mean log_loss=0.714730, stdev=0.006662, epochs=58, mean epochs=64, time=0:00:05.42
Cycle #6
log_loss=0.683200, mean log_loss=0.709475, stdev=0.013231, epochs=158, mean epochs=80, time=0:00:14.24
Cycle #7
log_loss=0.746456, mean log_loss=0.714758, stdev=0.017819, epochs=101, mean epochs=83, time=0:00:09.25
Cycle #8
log_loss=0.711388, mean log_loss=0.714337, stdev=0.016705, epochs=84, mean epochs=83, time=0:00:07.75
Cycle #9
log_loss=0.756264, mean log_loss=0.718995, stdev=0.020535, epochs=63, mean epochs=81, time=0:00:06.19

num 1


In [26]:
#####OPTIMAL########
import pandas as pd
import os
import numpy as np
import time
import tensorflow.keras.initializers
import statistics
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras.layers import LeakyReLU,PReLU

SPLITS = 100

# Bootstrap
boot = StratifiedShuffleSplit(n_splits=SPLITS, test_size=0.1)

# Track progress
mean_benchmark = []
epochs_needed = []
num = 0

# Loop through samples
for train, test in boot.split(x,df['product']):
    start_time = time.time()
    num+=1

    # Split train and test
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]

    # Construct neural network
    # kernel_initializer = tensorflow.keras.initializers.he_uniform(seed=None)
    model = Sequential()
    model.add(Dense(100, input_dim=x.shape[1], activation=PReLU(), kernel_regularizer=regularizers.l2(1e-4)
    )) # Hidden 1
    model.add(Dropout(0.5))
    model.add(Dense(100, activation=PReLU(), activity_regularizer=regularizers.l2(1e-4)
    )) # Hidden 2
    model.add(Dropout(0.5))
    model.add(Dense(100, activation=PReLU(), activity_regularizer=regularizers.l2(1e-4)
    )) # Hidden 3
#    model.add(Dropout(0.5)) - Usually better performance without dropout on final layer
    model.add(Dense(y.shape[1],activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
        patience=100, verbose=0, mode='auto', restore_best_weights=True)

    # Train on the bootstrap sample
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    epochs = monitor.stopped_epoch
    epochs_needed.append(epochs)
    
    # Predict on the out of boot (validation)
    pred = model.predict(x_test)
  
    # Measure this bootstrap's log loss
    y_compare = np.argmax(y_test,axis=1) # For log loss calculation
    score = metrics.log_loss(y_compare, pred)
    mean_benchmark.append(score)
    m1 = statistics.mean(mean_benchmark)
    m2 = statistics.mean(epochs_needed)
    mdev = statistics.pstdev(mean_benchmark)
    
    # Record this iteration
    time_took = time.time() - start_time
    print(f"#{num}: score={score:.6f}, mean score={m1:.6f}, stdev={mdev:.6f}, epochs={epochs}, mean epochs={int(m2)}, time={hms_string(time_took)}")

0.69716