In [None]:
import subprocess

import numpy as np
import pandas as pd
import scipy.special
import scipy.stats
import sklearn.ensemble
import sklearn.impute
import sklearn.kernel_ridge
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Plot styling: grab seaborn's current palette, then switch to the dark grid theme.
color = sns.color_palette()
sns.set_style('darkgrid')

# Let pandas render up to 1000 columns and 100 rows before truncating output.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100

## Load data

In [None]:
# Read the labelled training set and the set we have to predict for.
train_df = pd.read_csv("../input/train.csv")
# Fix: this used to re-read train.csv, so the "test" frame was a copy of the
# training data and every prediction was made on rows the model was fitted on.
# NOTE(review): assumes the test file sits next to train.csv as test.csv — confirm.
test_df = pd.read_csv("../input/test.csv")

# Stack both sets so that cleaning and encoding are applied identically to each.
# ignore_index gives the combined frame one unique 0..n-1 index instead of two
# overlapping ranges of row labels.
df = pd.concat([train_df, test_df], ignore_index=True)

print('train_df shape: ', train_df.shape)
print('test_df shape: ', test_df.shape)
print('df shape: ', df.shape)

## Explore data
 
 * Data types
 * Sparsity
 * Data distributions

In [None]:
# Column labels of the combined train + test frame.
df.columns

In [None]:
# dtype of every column, to tell numeric features from object (string) ones.
df.dtypes

In [None]:
# Peek at the first three rows of the combined frame.
df.head(3)

In [None]:
# Summary statistics of the target column in the training set.
print(train_df['SalePrice'].describe())

# Fix: the label used to read "Medium value"; the number printed is the median.
print('Median value is ', train_df['SalePrice'].median())

# Distribution of the target with a fitted normal curve overlaid for comparison.
# NOTE(review): distplot is deprecated in recent seaborn releases; migrate to
# histplot/displot once the environment's seaborn version is pinned.
# The trailing semicolon keeps the Axes repr out of the cell output.
sns.distplot(train_df['SalePrice'], fit=scipy.stats.norm);

### Data sparsity

In [None]:
# Count the missing entries in every column of the combined frame.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Keep only the labels of columns with at least one missing entry (frame order).
cols_with_missing = list(missing_per_column.index[missing_per_column > 0])
print("Columns with missing features: ",  cols_with_missing)

## Treat null values

In [None]:
# Fix: sklearn.preprocessing.Imputer has been removed from scikit-learn;
# sklearn.impute.SimpleImputer is its replacement and also defaults to
# mean imputation.
imputer = sklearn.impute.SimpleImputer()

In [None]:
# Drop columns where large number of values are null
columns_to_drop = ['PoolQC', 'Fence', 'MiscFeature', 'Alley']
df = df.drop(columns_to_drop, axis=1)

In [None]:
# Fill missing LotFrontage values with the column mean (SimpleImputer's default
# strategy).
# Fix: the removed sklearn.preprocessing.Imputer is replaced by
# sklearn.impute.SimpleImputer, and the mid-notebook import is gone — the module
# is imported with the other dependencies at the top.
my_imputer = sklearn.impute.SimpleImputer()
df[['LotFrontage']] = my_imputer.fit_transform(df[['LotFrontage']])

In [None]:
# Re-check the per-column missing counts now that columns were dropped/imputed.
print(df.isna().sum())

In [None]:
# Placeholder fills, one per column. String columns get an explicit label so the
# gap becomes its own category; numeric columns get 0.
# NOTE(review): presumably a missing entry in these columns means the house
# lacks that feature — confirm against the data description.
placeholder_fill = {
    'MasVnrType': 'None',
    'MasVnrArea': 0,

    'BsmtQual': 'None',
    # Fix: was filled with the integer 0, which mixed ints into a string column.
    'BsmtCond': 'NA',
    'BsmtExposure': 'NA',
    'BsmtFinType1': 'NA',
    'BsmtFinType2': 'NA',

    'FireplaceQu': 'NA',

    'GarageType': 'NA',
    'GarageFinish': 'NA',
    'GarageQual': 'NA',
    'GarageCond': 'NA',
    'GarageYrBlt': 0,
}
df = df.fillna(value=placeholder_fill)

# Fix: rows with a missing Electrical value used to be dropped from the combined
# frame. Removing rows shifts the row positions that the later positional
# train/test split relies on and can throw away rows that need a prediction,
# so fill the few gaps with the most frequent value instead.
electrical_mode = df['Electrical'].mode()
if not electrical_mode.empty:
    df['Electrical'] = df['Electrical'].fillna(electrical_mode.iloc[0])

# Safety net: fill any feature gaps that are still left (median for numeric
# columns, most frequent value otherwise) so the estimator never receives NaN.
# SalePrice is skipped: it is the target and must never be imputed.
for col in df.columns:
    if col == 'SalePrice' or not df[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        fallback = df[col].median()
    else:
        most_frequent = df[col].mode()
        if most_frequent.empty:
            continue  # column is entirely null; nothing sensible to fill with
        fallback = most_frequent.iloc[0]
    df[col] = df[col].fillna(fallback)

In [None]:
# Store these columns as strings; they are handled as categorical labels by the
# label-encoding cell that follows.
for code_col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    df[code_col] = df[code_col].astype(str)


In [None]:
# Categorical features that are converted to integer codes.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'BsmtFinType1',
    'BsmtFinType2', 'Functional', 'BsmtExposure', 'GarageFinish', 'LandSlope',
    'LotShape', 'PavedDrive', 'Street', 'CentralAir', 'MSSubClass', 'OverallCond',
    'YrSold', 'MoSold',
)

# Each column gets its own LabelEncoder; the column is overwritten with the
# integer codes that encoder assigns.
for c in cols:
    lbl = sklearn.preprocessing.LabelEncoder()
    df[c] = lbl.fit_transform(df[c].tolist())

# Shape of the combined frame after encoding.
print('data_df.shape = ', df.shape)

In [None]:
# Labels of the columns that are still non-numeric at this point.
remaining_categoricals = df.select_dtypes(exclude=[np.number]).columns
cols = remaining_categoricals.values

# Expand the remaining non-numeric columns into indicator columns.
df = pd.get_dummies(df).copy()

In [None]:
# The first train_df.shape[0] rows of the combined frame are the training rows.
n_train_rows = train_df.shape[0]
train_df = df.iloc[:n_train_rows]

# Separate the target (kept as a one-column frame) from the features.
train_df_y = train_df[['SalePrice']]
train_df = train_df.drop(columns='SalePrice')

In [None]:
# Everything after the training rows is the test set.
# Fix: the slice used to start at train_df.shape[0] - 1, which put the last
# training row into the test set as well. Starting exactly where the training
# slice ends keeps the two sets disjoint.
test_df = df.iloc[train_df.shape[0]:]

# The target column is not a feature (it is used as the training label above).
test_df = test_df.drop('SalePrice', axis=1)

In [None]:
# (rows, columns) of the test feature frame.
test_df.shape

In [None]:
# Show the last row of the test frame.
# Fix: .loc[-1:] is a label-based slice (rows whose index label is >= -1), not
# "the last row" — on an integer index it returns the entire frame. Positional
# indexing with .iloc gives the intended single row.
test_df.iloc[-1:]

In [None]:
# 5-fold cross-validation splitter with shuffling.
# Fix: shuffle=True without a seed produced different folds on every run; the
# seed matches the random_state used for the model so results are reproducible.
kfold = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=5)

In [None]:
# Hyper-parameters of the gradient boosting regressor, gathered in one place.
gbr_params = dict(
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

model = sklearn.ensemble.GradientBoostingRegressor(**gbr_params)

In [None]:
# Fit on the training features; the one-column target frame is flattened to 1-D.
model.fit(train_df, np.ravel(train_df_y.values))

In [None]:
# Predict the target for every row of the test feature frame.
predictions = model.predict(test_df)

In [None]:
# Pair every test Id with its predicted price and write the result to disk
# without the index column.
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': predictions,
})
submission.to_csv('submission3.csv', index=False)
submission.head(5)