In [None]:
from __future__ import print_function
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from sklearn import metrics
import seaborn as sns
import tensorflow as tf

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
# from tensorflow.python.data import Dataset

# Keep TensorFlow quiet: only ERROR-level messages are logged.
tf.logging.set_verbosity(tf.logging.ERROR)

# Load the train and test tables from the CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataSet/train.csv", "./dataSet/test.csv")
)

# Independent snapshots of both frames exactly as they were loaded.
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()


In [None]:
# display.display(train_data.describe())
# display.display(train_data.info())

In [None]:
def construct_features(input_data):
    """Build one numeric TensorFlow feature column per entry of ``input_data``.

    Args:
        input_data: any iterable that yields feature names when iterated
            (iterating a pandas DataFrame yields its column labels).

    Returns:
        A set holding one ``tf.feature_column.numeric_column`` per name.
    """
    return {tf.feature_column.numeric_column(column_name)
            for column_name in input_data}

In [None]:
def train(train_example, learning_rate, batch_size, steps, periods, mod_dir):
    """Train a DNN regressor on an 80/20 split and plot RMSE per period.

    Every column of ``train_example`` except 'SalePrice' is handed to
    ``construct_features`` and used as a model input; 'SalePrice' is the
    regression target.

    Args:
        train_example: DataFrame holding the features and a 'SalePrice' column.
        learning_rate: learning rate of the gradient-descent optimizer.
        batch_size: number of rows per training batch.
        steps: training steps run in each period.
        periods: number of train/evaluate rounds (total steps = steps * periods).
        mod_dir: model directory passed to the estimator.

    Returns:
        The trained ``tf.estimator.DNNRegressor``.
    """
    # Hold out 20% of the rows for validation; the fixed seed keeps the split
    # reproducible.
    train_set = train_example.sample(frac=0.8, replace=False, random_state=100)
    # Dropping the sampled labels yields the complement in original row order.
    # (Indexing .loc with a set is rejected by recent pandas versions.)
    cv_set = train_example.drop(train_set.index)
    feature_columns = construct_features(train_example.drop('SalePrice', axis=1))

    train_features = train_set.drop('SalePrice', axis=1)
    cv_features = cv_set.drop('SalePrice', axis=1)

    # Training input: repeats forever, shuffled, mini-batched.
    train_input = tf.estimator.inputs.pandas_input_fn(
        x=train_features,
        y=train_set.SalePrice,
        num_epochs=None,
        shuffle=True,
        batch_size=batch_size
    )

    # Single ordered pass over the training rows. It must NOT shuffle: the
    # predictions are compared position-by-position with train_set.SalePrice
    # below, so a shuffled pass would pair predictions with the wrong labels.
    train_eval_input = tf.estimator.inputs.pandas_input_fn(
        x=train_features,
        y=train_set.SalePrice,
        num_epochs=1,
        shuffle=False,
    )

    # Single ordered pass over the validation rows.
    cv_input = tf.estimator.inputs.pandas_input_fn(
        x=cv_features,
        y=cv_set.SalePrice,
        num_epochs=1,
        shuffle=False,
    )

    # Gradient descent with gradient-norm clipping at 2.5.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 2.5)
    estimator = tf.estimator.DNNRegressor(
        hidden_units=[256, 128, 64],
        feature_columns=feature_columns,
        optimizer=my_optimizer,
        model_dir=mod_dir
    )

    training_rmse = []
    validation_rmse = []

    for i in range(periods):
        # Train for one period.
        print('%d period:' % (i + 1), end='')
        estimator.train(input_fn=train_input, steps=steps)

        # Report the estimator's own average loss on the validation set.
        eval_cv = estimator.evaluate(input_fn=cv_input)
        print(eval_cv['average_loss'], end=' ')

        # Collect predictions for both splits, in frame order.
        training_predictions = np.array(
            [item['predictions'][0]
             for item in estimator.predict(input_fn=train_eval_input)])
        validation_predictions = np.array(
            [item['predictions'][0]
             for item in estimator.predict(input_fn=cv_input)])

        # RMSE on each split (arguments in sklearn's y_true, y_pred order).
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(train_set.SalePrice, training_predictions))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(cv_set.SalePrice, validation_predictions))
        print("  RMSE : %0.2f, %0.2f" % (training_root_mean_squared_error, validation_root_mean_squared_error))

        # Keep the per-period metrics for the plot below.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

    # Plot both RMSE curves against the period number.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()
    # Lay out after everything is drawn so labels and legend are accounted for.
    plt.tight_layout()

    return estimator
        

In [None]:
def my_stupid_one_hot(data, feature, one_hot_map, feature_map):
    """Add one boolean indicator column per entry of ``feature_map``.

    Column ``feature_map[k]`` is True on the rows where ``data[feature]``
    equals ``one_hot_map[k]``.

    Args:
        data: frame to extend; it is modified in place.
        feature: name of the categorical column to encode.
        one_hot_map: category values, aligned by position with ``feature_map``.
        feature_map: names of the indicator columns to create.

    Returns:
        The same ``data`` object that was passed in.
    """
    for position, flag_column in enumerate(feature_map):
        category = one_hot_map[position]
        data[flag_column] = data[feature] == category
    return data

# Indicator columns for ExterQual: position k of one_hot_map is the category
# value flagged by the k-th column named in feature_map.
feature_map = ('ExterQual1', 'ExterQual2', 'ExterQual3', 'ExterQual4')
one_hot_map = ('Gd', 'TA', 'Ex', 'Fa')

# my_stupid_one_hot writes the new columns into the frame it is given, so
# train_data and test_data themselves gain the indicator columns.
data = my_stupid_one_hot(train_data, 'ExterQual', one_hot_map, feature_map)
data = my_stupid_one_hot(test_data, 'ExterQual', one_hot_map, feature_map)
# display.display(train_data.head(10))

# Indicator columns for BsmtQual, using the same four category values.
feature_map = ('BsmtQual1', 'BsmtQual2', 'BsmtQual3', 'BsmtQual4')
one_hot_map = ('Gd', 'TA', 'Ex', 'Fa')

# Encode the BsmtQual column itself: the BsmtQual* flags must not be derived
# from ExterQual, otherwise they merely duplicate the ExterQual* flags.
data = my_stupid_one_hot(train_data, 'BsmtQual', one_hot_map, feature_map)
data = my_stupid_one_hot(test_data, 'BsmtQual', one_hot_map, feature_map)


In [None]:
def normalize(series):
    """Standardise ``series``: subtract its mean, then divide by its std.

    The mean and standard deviation are whatever ``series.mean()`` and
    ``series.std()`` return (for a pandas Series, ``std()`` is the sample
    standard deviation). The input is not modified.
    """
    center = series.mean()
    spread = series.std()
    return (series - center) / spread


In [None]:
# outlier and missing data
# The three area features are capped (2500 / 3000 / 3000) before being
# standardised; SalePrice is modelled on a log scale.
outlier_data_set = pd.DataFrame({
    '1stFlrSF': normalize(train_data['1stFlrSF'].clip(upper=2500)),
    'GrLivArea': normalize(train_data['GrLivArea'].clip(upper=3000)),
    'OverallQual': normalize(train_data['OverallQual']),
    'FullBath': normalize(train_data['FullBath']),
    'TotalBsmtSF': normalize(train_data['TotalBsmtSF'].clip(upper=3000)),
    # NOTE(review): BsmtQual is passed through untransformed, yet train() turns
    # every non-target column into a numeric feature column. If this column
    # holds the 'Gd'/'TA'/'Ex'/'Fa' labels it presumably needs encoding first
    # (e.g. via the BsmtQual1..4 indicator columns) - confirm.
    'BsmtQual': train_data['BsmtQual'],
    'SalePrice': np.log(train_data['SalePrice']),
})

# Train on the frame built above. (The call used to reference
# `missing_data_set`, a name that is never defined in this notebook.)
estimator = train(
    train_example = outlier_data_set,
    learning_rate = 0.01,
    batch_size = 10,
    steps = 100,
    periods = 15,
    mod_dir='./tensorboard/outlier2/train'
)

In [None]:
# Final prediction
def test_input(test_example):
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
          x=test_example,
          num_epochs=1, # only to predict
          shuffle=False,
    )
    return test_input_fn

# Build the test features with the same caps the training features use
# (2500 / 3000 / 3000) so both frames go through identical clipping.
# NOTE(review): normalize() standardises with the test set's own mean/std;
# reusing the training statistics would keep both feature spaces aligned -
# confirm which is intended.
test_simple_example = pd.DataFrame({
    '1stFlrSF': normalize(test_data['1stFlrSF'].clip(upper=2500)),
    'GrLivArea': normalize(test_data['GrLivArea'].clip(upper=3000)),
    'OverallQual': normalize(test_data['OverallQual']),
    'FullBath': normalize(test_data['FullBath']),
    'TotalBsmtSF': normalize(test_data['TotalBsmtSF'].clip(upper=3000)),
    'BsmtQual': test_data['BsmtQual'],
})

# Fill gaps in the numeric features with the column mean. Series.mean() skips
# NaN; dividing sum() by len() would count the missing rows in the denominator.
# GarageArea is not imputed because this frame has no such column (looking it
# up raises KeyError).
numeric_features = ['1stFlrSF', 'GrLivArea', 'OverallQual', 'FullBath', 'TotalBsmtSF']
for feature in numeric_features:
    column = test_simple_example[feature]
    test_simple_example[feature] = column.fillna(column.mean())
# print(test_simple_example['TotalBsmtSF'].isnull().sum())

# info() prints its report itself and returns None, so it is called directly.
test_simple_example.info()
final_input = test_input(test_simple_example)

predictions = list(estimator.predict(input_fn=final_input))
# The model is trained on log(SalePrice); exp() maps predictions back to prices.
predicted_classes = [math.exp(float(prediction['predictions'][0])) for prediction in predictions]

# NOTE(review): assumes the row identifier column of test.csv is named 'Id'
# (an empty column name cannot be looked up) - confirm against the file.
evaluation = test_data_copy['Id'].copy().to_frame()
evaluation["SalePrice"] = predicted_classes
evaluation.to_csv("evaluation_submission.csv", index=False)
# evaluation["actual"] = train_data.SalePrice
display.display(evaluation.describe())
# Show a preview only; the complete table is in the CSV written above.
display.display(evaluation.head())
