In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
%load_ext autoreload
%autoreload 2

### Get the data and pre-process the data

In [2]:
df = pd.read_csv('../data/data_ready.csv')
df.head()

Unnamed: 0,LotFrontage,LotArea,Street,Utilities,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType_New,SaleType_Oth,SaleType_VWD,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.955827,10.366309,1,1,6,5,1960,1960,4.727388,3,...,0,0,0,1,0,0,0,0,1,0
1,4.394449,9.360741,1,1,5,6,1961,1961,0.0,3,...,0,0,0,1,0,0,0,0,1,0
2,4.406719,9.565775,1,1,6,6,1958,1958,4.691348,3,...,0,0,0,1,0,0,0,0,1,0
3,4.543295,9.320181,1,1,7,5,1968,1968,0.0,4,...,0,0,0,1,0,0,0,0,1,0
4,4.317488,9.534668,1,1,5,5,1997,1998,0.0,3,...,0,0,0,1,0,0,0,0,1,0


In [3]:
y = df.SalePrice.values
X = df.drop('SalePrice', axis=1).values
print("X shape: {}".format(X.shape))
print("y shape: {}".format(y.shape))

X shape: (2930, 271)
y shape: (2930,)


In [4]:
# Shuffle the data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

In [5]:
# Get training, validation, and test data
N = X.shape[0] # 2930 observations in total
num_trn = 2200
num_val = 230
num_tst = N - num_trn - num_val # 500
trn_mask = range(0,num_trn)
val_mask = range(num_trn,num_trn+num_val)
tst_mask = range(num_trn+num_val,N)
X_trn = X[trn_mask]
y_trn = y[trn_mask]
X_val = X[val_mask]
y_val = y[val_mask]
X_tst = X[tst_mask]
y_tst = y[tst_mask]

In [6]:
# Standardize Features
mean = X_trn.mean()
std  = X_trn.std()
X_trn = (X_trn - mean) / std
X_val = (X_val - mean) / std
X_tst = (X_tst - mean) / std

### Simple Prediction with Sklearn

In [39]:
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(X_trn, y_trn)
y_val_pred_linear = model.predict(X_val)
y_tst_pred_linear = model.predict(X_tst)

### Simple SVR with Sklearn

In [40]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf', C=5e4) # large C means small margin
svr.fit(X_trn, y_trn)
y_val_pred_svr = svr.predict(X_val)
y_tst_pred_svr = svr.predict(X_tst)

### Accuracy Evaluation

In [10]:
def evaluate(y, y_pred):
  rmse = np.sqrt(((y - y_pred)**2).mean())
  print("RMSE: {}".format(rmse))
  sse = ((y - y_pred)**2).sum()
  sst = ((y - y.mean())**2).sum()
  r2 = 1.0 - sse / sst
  print("R2: {}".format(r2))

In [41]:
evaluate(y_val, y_val_pred_linear)
evaluate(y_val, y_val_pred_svr)

RMSE: 3595187.8018426555
R2: -90687636748230.16
RMSE: 0.12322332669413108
R2: 0.8934653752924656


## TensorFlow

In [12]:
model_number = 0

In [59]:
import scripts.hedonic as hd

# hyperparameters
w_scale = 1e-3
reg = 0
init_lr = 1e-3
decay_steps = 100
decay_rate = 0.94
keep_prob_value = 1.0

# build the graph
g = tf.Graph()
with g.as_default():
  is_training = tf.placeholder(tf.bool, name="is_training")
  X = tf.placeholder(tf.float32, [None,271], name="X")
  y = tf.placeholder(tf.float32, [None], name="y")
  
  global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
  learning_rate = tf.train.exponential_decay(init_lr,
      global_step, decay_steps, decay_rate)
  
  # the core model: calculate scores from X
  out = X
#   with tf.name_scope("FC1"):
#     out = hd.affine(out, [271, 128], w_scale)
#     out = hd.batch_norm(out, is_training)
#     out = tf.nn.tanh(out)
#     out = hd.dropout(out, is_training, keep_prob_value)
  with tf.name_scope("FCOut"):
    out = hd.affine(out, [271, 1], w_scale) 
  scores = out
  
  loss = hd.loss(scores, y, reg)
  rmse = hd.rmse(scores, y)
  tf.summary.scalar("RMSE", rmse)
  r2 = hd.r2(scores, y)
  tf.summary.scalar("R2", r2)
  
  train_op = hd.train_op(loss, learning_rate, global_step)
  
  init_op = tf.global_variables_initializer()
  saver = tf.train.Saver()
  merged = tf.summary.merge_all()

In [60]:
# Prepare data
def trn_dict(batch_size):
  num_train = X_trn.shape[0]
  mask = np.random.choice(num_train, batch_size)
  return {
    is_training: True,
    X: X_trn[mask],
    y: y_trn[mask]
  }

def small_trn_dict(batch_size):
  # ignore batch size
  return {
    is_training: True,
    X: X_trn[:100],
    y: y_trn[:100]
  }

def val_dict():
  return {
    is_training: False,
    X: X_val,
    y: y_val
  }

In [61]:
model_number += 1
with tf.Session(graph=g) as sess:
  print("model{}".format(model_number))
  writer_graph = tf.summary.FileWriter('./graphs/nn/model{}'.format(model_number))
  writer_graph.add_graph(sess.graph)
  writer_graph.close()
  writer_trn = tf.summary.FileWriter('./graphs/nn/model{}/trn'.format(model_number))
  writer_val = tf.summary.FileWriter('./graphs/nn/model{}/val'.format(model_number))
  batch_size = 64
  
  sess.run(init_op)
  
  for i in range(20000):
    _, loss_trn, rmse_trn, summary = sess.run([train_op, loss, rmse, merged], feed_dict=trn_dict(batch_size))
    writer_trn.add_summary(summary, global_step.eval())
    rmse_val, summary = sess.run([rmse, merged], feed_dict=val_dict())
    writer_val.add_summary(summary, global_step.eval())
    if i != 0 and i % 500 == 0:
      print("({}) training loss: {:.6f}".format(i, loss_trn))
      print("rmse_trn: {:.4f}, rmse_val: {:.4f}".format(rmse_trn, rmse_val))
    
  # close writers
  writer_graph.close()
  writer_trn.close()
  writer_val.close()

model14
(500) training loss: 0.160371
rmse_trn: 0.4005, rmse_val: 0.3906
(1000) training loss: 0.144892
rmse_trn: 0.3806, rmse_val: 0.3806
(1500) training loss: 0.134752
rmse_trn: 0.3671, rmse_val: 0.3806
(2000) training loss: 0.150328
rmse_trn: 0.3877, rmse_val: 0.3805
(2500) training loss: 0.147956
rmse_trn: 0.3847, rmse_val: 0.3805
(3000) training loss: 0.171427
rmse_trn: 0.4140, rmse_val: 0.3805
(3500) training loss: 0.176987
rmse_trn: 0.4207, rmse_val: 0.3809
(4000) training loss: 0.159729
rmse_trn: 0.3997, rmse_val: 0.3806
(4500) training loss: 0.152059
rmse_trn: 0.3899, rmse_val: 0.3807
(5000) training loss: 0.130689
rmse_trn: 0.3615, rmse_val: 0.3806
(5500) training loss: 0.131090
rmse_trn: 0.3621, rmse_val: 0.3805
(6000) training loss: 0.114629
rmse_trn: 0.3386, rmse_val: 0.3805
(6500) training loss: 0.210705
rmse_trn: 0.4590, rmse_val: 0.3805
(7000) training loss: 0.174122
rmse_trn: 0.4173, rmse_val: 0.3807
(7500) training loss: 0.198806
rmse_trn: 0.4459, rmse_val: 0.3805
(80

KeyboardInterrupt: 