In [0]:
import pandas as pd
import numpy as np
import os
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation



In [0]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])
# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Generate dummies for product
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)

In [4]:
nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "age"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "age"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0.8328741332235645

In [0]:
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
df["dist_unhealthy"] = zscore(df["dist_unhealthy"])

In [6]:
df.head()

Unnamed: 0,id,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,job_11,job_al,job_am,job_ax,job_bf,job_by,job_cv,job_de,job_dz,job_e2,job_f8,job_gj,job_gv,job_kd,job_ke,job_kl,job_kp,job_ks,job_kw,job_mm,job_nb,job_nn,job_ob,job_pe,job_po,job_pq,job_pz,job_qp,job_qw,job_rn,job_sa,job_vv,job_zz,area_a,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
35,36,-0.187617,-0.4946,-0.208449,5.454545,-0.387695,-1.105761,50,0.874016,0.417323,0.238394,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
69,70,0.23754,-0.996591,-0.208449,9.289907,-0.542432,-1.169511,47,0.858268,0.503937,0.263349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
101,102,0.893694,1.360973,-1.255928,16.71582,-0.353308,-0.310185,44,0.925197,0.692913,0.086156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
134,135,0.133534,-1.40894,-0.208449,7.875447,0.282834,-0.487775,44,0.826772,0.586614,0.303499,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
155,156,0.45587,0.724521,-0.208449,12.989263,0.661081,2.321132,43,0.992126,0.775591,0.443469,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0


In [0]:
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values
y = df['age'].values

In [15]:
kf = KFold(5, shuffle=True, random_state=42)

oos_y = []
oos_pred = []
fold = 0
for train, test in kf.split(x):
    fold += 1
    print(f"Fold #{fold}")
    
    x_train = x[train]
    x_test = x[test]
    y_train = y[train]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(20, input_dim=x.shape[1], activation="relu"))
    model.add(Dense(10, activation="relu"))
    model.add(Dense(1))
    model.compile(loss="mean_squared_error", optimizer="adam")
    model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0, epochs=500)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)
    
    score = np.sqrt(metrics.mean_squared_error(pred, y_test))
    print(f"Fold score (RMSE): {score}")
    
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_y, oos_pred))



Fold #1
Fold score (RMSE): 0.536621344367143
Fold #2
Fold score (RMSE): 0.9061623428878205
Fold #3
Fold score (RMSE): 0.6020487421118796
Fold #4
Fold score (RMSE): 0.5377725083217846
Fold #5
Fold score (RMSE): 0.5707149905393886


In [17]:
print(f"Final score {score}")

Final score 0.6459830056448728


In [0]:
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )

In [0]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])
# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)


In [36]:
nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "product"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "product"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0.773531504009265

In [0]:
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
df["dist_unhealthy"] = zscore(df["dist_unhealthy"])

x_col = df.columns.drop("product").drop("id")
x = df[x_col].values

dummies = pd.get_dummies(df['product']) # Classification
products = dummies.columns
y = dummies.values

In [40]:
kf = StratifiedKFold(5, shuffle=True, random_state=69)

oos_y = []
oos_pred = []
fold = 0


for train, test in kf.split(x, df["product"]):
  fold += 1
  print(f"Fold #{fold}")

  x_train = x[train]
  x_test = x[test]
  y_train = y[train]
  y_test = y[test]

  model = Sequential()
  model.add(Dense(25, input_dim=x.shape[1], activation="relu"))
  model.add(Dense(10, activation="relu"))
  model.add(Dense(y.shape[1], activation="softmax"))
  model.compile(loss="categorical_crossentropy", optimizer="adam")
  model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0, epochs=500)

  pred = model.predict(x_test)
  pred_category = np.argmax(pred, axis=1)
  y_category = np.argmax(y_test, axis=1)
  score = metrics.accuracy_score(y_category, pred_category)
  print(f"Fold accuracy {score}")

  oos_y.append(y_category)
  oos_pred.append(pred_category)

oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)

score = metrics.accuracy_score(oos_y, oos_pred)
print(f"accuracy {score}")

Fold #1
Fold RMSE 0.8246211251235321
Fold #2
Fold RMSE 0.8381527307120105
Fold #3
Fold RMSE 0.8558621384311845
Fold #4
Fold RMSE 0.8351646544245033
Fold #5
Fold RMSE 0.8336666000266533
RMSE 0.7015


In [50]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])
# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Generate dummies for product
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)

nonna = df[df["income"].notna()]
isna = df[df["income"].isnull()]
nx = nonna.drop(["income", "age"], axis=1).values
ny = nonna["income"].values
nx_train, nx_test, ny_train, ny_test = train_test_split(nx, ny, test_size=0.2)
reg = LinearRegression().fit(nx_train, ny_train)
isna["income"] = reg.predict(isna.drop(["income", "age"], axis=1).values)
df = pd.concat([isna, nonna], axis=0)
reg.score(nx_test, ny_test)

df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
df["dist_unhealthy"] = zscore(df["dist_unhealthy"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values
y = df['age'].values

In [52]:
x_main, x_holdout, y_main, y_holdout = train_test_split(x, y, test_size=0.1)

kf = KFold(5, shuffle=True, random_state=69)

oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(x_main):
  fold += 1
  print(f"Fold #{fold}")

  x_train = x[train]
  x_test = x[test]
  y_train = y[train]
  y_test = y[test]

  model = Sequential()
  model.add(Dense(25, input_dim=x.shape[1], activation="relu"))
  model.add(Dense(10, activation="relu"))
  model.add(Dense(1))
  model.compile(loss="mean_squared_error", optimizer="adam")
  model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0, epochs=500)

  pred = model.predict(x_test)
  score = np.sqrt(metrics.mean_squared_error(y_test, pred))
  print(f"Fold RMSE: {score}")

  oos_y.append(y_test)
  oos_pred.append(pred)

  holdout_pred = model.predict(x_holdout)
  score = np.sqrt(metrics.mean_squared_error(y_holdout, holdout_pred))
  print(f"Fold holdout RMSE: {score}")


oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)

score = np.sqrt(metrics.mean_squared_error(oos_y, oos_pred))
print(f"total RMSE: {score}")



Fold #1
Fold RMSE: 0.9296196436310094
Fold holdout RMSE: 1.0366690143975446
Fold #2
Fold RMSE: 0.530721087694666
Fold holdout RMSE: 0.7741477278327994
Fold #3
Fold RMSE: 0.6344112708708834
Fold holdout RMSE: 0.8648449022433842
Fold #4
Fold RMSE: 0.7618854103749648
Fold holdout RMSE: 1.0052624038585476
Fold #5
Fold RMSE: 0.6486535599352393
Fold holdout RMSE: 0.8142324028732599
total RMSE: 0.7140806725735337
