<a href="https://colab.research.google.com/github/jhwang1992/KaggleHousePricesPrediction/blob/master/kagglepriceprediction_part4_Estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# update to TensorFlow 2.0

In [0]:
# update to tensorflow 2.0

!pip install --upgrade tensorflow-gpu

In [1]:
# check whether the update is successful

from __future__ import absolute_import, division, print_function, unicode_literals

try:
  import tensorflow.compat.v2 as tf
except Exception:
  pass

tf.enable_v2_behavior()

print(tf.__version__)

2.1.0


# pickle load data

In [0]:
from google.colab import files
uploaded = files.upload()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pickle

import warnings as wrn
wrn.filterwarnings('ignore', category = DeprecationWarning) 
wrn.filterwarnings('ignore', category = FutureWarning) 
wrn.filterwarnings('ignore', category = UserWarning)

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import make_scorer, mean_squared_error

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import feature_column
from tensorflow.keras import layers
from keras.utils.vis_utils import plot_model
from tensorflow import keras

Using TensorFlow backend.


In [0]:
with open('df_train.pkl', 'rb') as f:
    df_train = pickle.load(f)

with open('df_test.pkl', 'rb') as f:
    df_test = pickle.load(f)

In [4]:
print('df_train shape: ', df_train.shape)
print('df_test shape: ', df_test.shape)

df_train shape:  (1450, 76)
df_test shape:  (1459, 75)


# identify numeric and categorical columns

In [5]:
numericColumns = []
categoricalColumns = []

for column in df_train.columns:
  if df_train[column].dtypes==int or df_train[column].dtypes==float:
    numericColumns.append(column)
  else:
    categoricalColumns.append(column)

numericColumns.remove('Id')
numericColumns.remove('SalePrice')

print( len(numericColumns), 'numeric columns: ', numericColumns)
print( len(categoricalColumns), 'categorical columns: ', categoricalColumns)
print( 'ID and SalePrice are seperated')

54 numeric columns:  ['MSSubClass', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
20 categorical columns:  ['MSZoning', 'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtCond',

# data preprocessing using pandas dataframe and scikit-learn

1. impute numeric and categorical columns

2. labelencode string categorical columns

3. split the data into train, validation

4. standardscale the input datasets

In [0]:
# categorical columns fillna
# firstly, to avoid loss nan during NN training
# secondly, to be able to pass to labelencoder

for column in categoricalColumns:
  df_train[column].fillna('missing', inplace = True)
  df_test[column].fillna('missing', inplace = True)

In [0]:
# labelencode string categorical column to integer categorical column
# tf.dataset cannot take in mixed data type, and thus need to change to numeric
# take care of df_test labelencoder transform, there are unique labels not in df_train

import bisect

for column in categoricalColumns:
  le = LabelEncoder()
  le.fit(df_train[column])
  df_train[column] = le.transform(df_train[column])
  le_classes = le.classes_.tolist()
  
  # to handle categorical feature only in testing data
  # handle int and string categorical columns differently
  if type(le_classes[0]) is str:
    df_test[column] = df_test[column].map(lambda s: 'other' if s not in le.classes_ else s)
    bisect.insort_left(le_classes, 'other')
    le.classes_ = le_classes
    df_test[column] = le.transform(df_test[column])
  else:
    df_test[column] = df_test[column].map(lambda s: -1 if s not in le.classes_ else s)
    bisect.insort_left(le_classes, -1)
    le.classes_ = le_classes
    df_test[column] = le.transform(df_test[column])

In [8]:
# split data into train/validation/testing pandas dataframes
# validation dataframe is to be passed into fit()

df_train, df_validation = train_test_split(df_train, test_size=0.2)
print(len(df_train), 'train examples')
print(len(df_validation), 'validation examples')
df_test.SalePrice = 0
print(len(df_test), 'test examples')

1160 train examples
290 validation examples
1459 test examples


In [0]:
# numeric columns fillna, to avoid loss nan during NN training

for column in numericColumns:
  df_train[column].fillna(df_train[column].median(), inplace = True)
  df_validation[column].fillna(df_train[column].median(), inplace = True)
  df_test[column].fillna(df_test[column].median(), inplace = True)

In [0]:
# standardscale the input data

scaler = StandardScaler()
df_train[numericColumns] = scaler.fit_transform(df_train[numericColumns])
df_validation[numericColumns] = scaler.transform(df_validation[numericColumns])
df_test[numericColumns] = scaler.transform(df_test[numericColumns])

In [0]:
df_test['SalePrice'] = 0

# step 1: build input_fn

In [0]:
# function to build tf.dataset from pandas dataframes
# three seperate tensorflow.dataset need to be built for train,validation,and testing
# tf.dataset has repeat(equivalent to epoch) and batch_size
# steps = repeat * (total sample number/batch_size)

def input_fn(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('SalePrice')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

# step 2: define feature_column

In [0]:
feature_columns = []

for col in numericColumns:
  col = feature_column.numeric_column(col)
  feature_columns.append(col)

for col in categoricalColumns:
  col = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list(col,df_train[col].unique()))
  feature_columns.append(col)

# step 3: initiate prebuild estimator

In [14]:
# instantiate pre-built estimator
regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpwmc9p8q4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


# step 4: apply train, evaluate, predict methods of the evaluator

In [15]:
# train the estimator
regressor.train(
    input_fn=lambda: input_fn(df_train, shuffle=True, batch_size=32),
    steps=2000) 

# steps define how many epochs to loop through the dataset
# here, define 100 epochs for training

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constr

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2 at 0x7fcf20150128>

In [16]:
# evaluate the estimator
eval_result = regressor.evaluate(
    input_fn=lambda: input_fn(df_validation, shuffle=False, batch_size=32))

print(pd.Series(eval_result))
# here, only one epoch for estimator evaluation

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-11T13:37:05Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpwmc9p8q4/model.ckpt-37
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.75418s
INFO:tensorflow:Finished evaluation at 2020-02-11-13:37:07
INFO:tensorflow:Saving dict for global step 37: average_loss = 0.21613409, global_step = 37, label/mean = 12.0290985, loss = 0.21110424, prediction/mean = 12.024432
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 37: /tmp/tmpwmc9p8q4/model.ckpt-37
average

# generate prediction output

In [17]:
submission = []

for ele in regressor.predict(input_fn=lambda: input_fn(df_test, shuffle=False, batch_size=32)):
  submission.append(np.expm1(ele['predictions']))

df = pd.DataFrame(columns = ['Id', 'SalePrice'])
df['Id'] = df_test['Id']
df['SalePrice'] = submission

df.to_csv(r"submission.csv")

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpwmc9p8q4/model.ckpt-37
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
