In [1]:
from __future__ import print_function
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from sklearn import metrics
import seaborn as sns
import tensorflow as tf

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
# from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)

# read csv
train_data = pd.read_csv("./dataSet/train.csv")
test_data = pd.read_csv("./dataSet/test.csv")

test_data_copy = test_data.copy()
train_data_copy = train_data.copy()


In [15]:
# display.display(train_data.describe())
display.display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 88 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

None

In [3]:
def construct_features(input_data):
    return (set([tf.feature_column.numeric_column(my_feature)
                 for my_feature in input_data]))

In [4]:
def train(train_example, learning_rate, batch_size, steps, periods, mod_dir):
    # Use sample method to seperate train dataset and validation dataset
    train_set = train_example.sample(frac=0.8, replace=False, random_state=100)
    cv_set = train_example.loc[set(train_example.index) - set(train_set.index)]
    feature_columns = construct_features(train_example.drop('SalePrice', axis=1))
    
    # Create train and cv input function
    train_input = tf.estimator.inputs.pandas_input_fn(
        x = train_set.drop('SalePrice', axis=1),
        y = train_set.SalePrice,
        num_epochs=None,
        shuffle=True,
        batch_size=batch_size
    )
    
    train_eval_input = tf.estimator.inputs.pandas_input_fn(
        x = train_set.drop('SalePrice', axis=1),
        y = train_set.SalePrice,
        num_epochs=1,
        shuffle=True,
    )
    
    cv_input = tf.estimator.inputs.pandas_input_fn(
        x = cv_set.drop('SalePrice', axis=1),
        y = cv_set.SalePrice,
        num_epochs=1,
        shuffle=False,
    )
    
    # Declare optimizer for estimator
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 2.5)
    estimator = tf.estimator.DNNRegressor(
        hidden_units=[256, 128, 64],
        feature_columns=feature_columns,
        optimizer=my_optimizer,
        model_dir=mod_dir
    )
    
    training_rmse = []
    validation_rmse = []
    
    for i in range (periods):
        # Train model
        print('%d period:' % (i + 1), end='')
        estimator.train(input_fn=train_input, steps=steps)
        
        # Evaluate model with validation dataset
        eval_cv = estimator.evaluate(input_fn=cv_input)
        print(eval_cv['average_loss'], end=' ')
        
        # Take a break and compute predictions.
        training_predictions = estimator.predict(input_fn=train_eval_input)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        validation_predictions = estimator.predict(input_fn=cv_input)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
    
        # Compute training and validation loss.
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_predictions, train_set.SalePrice))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(validation_predictions, cv_set.SalePrice))        
        print("  RMSE : %0.2f, %0.2f" % (training_root_mean_squared_error, validation_root_mean_squared_error))
        
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

  
    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()
    
    return estimator
        

In [5]:
def my_stupid_one_hot(data, feature, one_hot_map, feature_map):
    for i in range(len(feature_map)):
        data[feature_map[i]] = data[feature] == one_hot_map[i]
    return data

feature_map = ('ExterQual1', 'ExterQual2', 'ExterQual3', 'ExterQual4')
one_hot_map = ('Gd', 'TA', 'Ex', 'Fa')

data = my_stupid_one_hot(train_data, 'ExterQual', one_hot_map, feature_map)
data = my_stupid_one_hot(test_data, 'ExterQual', one_hot_map, feature_map)
# display.display(train_data.head(10))

feature_map = ('BsmtQual1', 'BsmtQual2', 'BsmtQual3', 'BsmtQual4')
one_hot_map = ('Gd', 'TA', 'Ex', 'Fa')

train_data = my_stupid_one_hot(train_data, 'BsmtQual', one_hot_map, feature_map)
test_data = my_stupid_one_hot(test_data, 'BsmtQual', one_hot_map, feature_map)


In [6]:
def normalize(series):
    return (series - series.mean()) / series.std()


In [20]:
# model missing data
missing_data_set = pd.DataFrame({
    '1stFlrSF':normalize(train_data['1stFlrSF']),
    'GrLivArea':normalize(train_data['GrLivArea']),
    'OverallQual':normalize(train_data['OverallQual']),
    'FullBath':normalize(train_data['FullBath']),
    'TotalBsmtSF':normalize(train_data['TotalBsmtSF']),
    'BsmtQual':train_data['BsmtQual'],
    'SalePrice':np.log(train_data['SalePrice']),
})

estimator = train(
    learning_rate = 0.01,
    batch_size = 10,
    steps = 100,
    periods = 15,
    train_example = missing_data_set,
    mod_dir='./tensorboard/missing2/train'
)

1 period:

InvalidArgumentError: Restoring from checkpoint failed. This is most likely due to a mismatch between the current graph and the graph from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

tensor_name = dnn/hiddenlayer_0/kernel; shape in shape_and_slice spec [6,256] does not match the shape stored in checkpoint: [9,256]
	 [[node save/RestoreV2_1 (defined at /nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py:1403) ]]

Caused by op 'save/RestoreV2_1', defined at:
  File "/nfs/2018/f/fhong/.brew/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/nfs/2018/f/fhong/.brew/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/nfs/2018/f/fhong/.brew/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/asyncio/base_events.py", line 528, in run_forever
    self._run_once()
  File "/nfs/2018/f/fhong/.brew/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/asyncio/base_events.py", line 1764, in _run_once
    handle._run()
  File "/nfs/2018/f/fhong/.brew/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2843, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2869, in _run_cell
    return runner(coro)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3044, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3209, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/nfs/2018/f/fhong/.brew/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-20-9fe1b3fbdca6>", line 18, in <module>
    mod_dir='./tensorboard/missing2/train'
  File "<ipython-input-4-db3d209aa40f>", line 46, in train
    estimator.train(input_fn=train_input, steps=steps)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model_default
    saving_listeners)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1403, in _train_with_estimator_spec
    log_step_count_steps=log_step_count_steps) as mon_sess:
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 508, in MonitoredTrainingSession
    stop_grace_period_secs=stop_grace_period_secs)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 934, in __init__
    stop_grace_period_secs=stop_grace_period_secs)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 648, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 1122, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 1127, in _create_session
    return self._sess_creator.create_session()
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 805, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 562, in create_session
    self._scaffold.finalize()
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/monitored_session.py", line 219, in finalize
    self._saver.build()
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 844, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 881, in _build
    build_save=build_save, build_restore=build_restore)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 507, in _build_internal
    restore_sequentially, reshape)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 385, in _AddShardedRestoreOps
    name="restore_shard"))
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 332, in _AddRestoreOps
    restore_sequentially)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/training/saver.py", line 580, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1572, in restore_v2
    name=name)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Restoring from checkpoint failed. This is most likely due to a mismatch between the current graph and the graph from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

tensor_name = dnn/hiddenlayer_0/kernel; shape in shape_and_slice spec [6,256] does not match the shape stored in checkpoint: [9,256]
	 [[node save/RestoreV2_1 (defined at /nfs/2018/f/fhong/Library/Python/3.7/lib/python/site-packages/tensorflow_estimator/python/estimator/estimator.py:1403) ]]


In [17]:
# Final prediction
def test_input(test_example):
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
          x=test_example,
          num_epochs=1, # only to predict
          shuffle=False,
    )
    return test_input_fn

tmp = test_data['TotalBsmtSF'].copy().dropna()
tmp = test_data['TotalBsmtSF'].sum() / len(test_data['TotalBsmtSF'])
test_data['TotalBsmtSF'] = test_data['TotalBsmtSF'].fillna(tmp)
# print(test_simple_example['TotalBsmtSF'].isnull().sum())

tmp = test_data['GrLivArea'].copy().dropna()
tmp = test_data['GrLivArea'].sum() / len(test_data['GrLivArea'])
test_data['GrLivArea'] = test_data['GrLivArea'].fillna(tmp)

display.display(test_data.info())
final_input = test_input(test_data)

predictions = list(estimator.predict(input_fn=final_input))
predicted_classes = [math.exp(float(prediction['predictions'][0])) for prediction in predictions]

evaluation = test_data_copy[''].copy().to_frame()
evaluation["SalePrice"] = predicted_classes
evaluation.to_csv("evaluation_submission.csv", index=False)
# evaluation["actual"] = train_data.SalePrice
display.display(evaluation.describe())
display.display(evaluation)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 88 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

None

InternalError: Unable to get element as bytes.

In [19]:
predictions = list(estimator.predict(input_fn=final_input))

InternalError: Unable to get element as bytes.