In [55]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

In [56]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainer
import cupy

from chainer import training
from chainer.training import extensions
from chainer.optimizer import WeightDecay

In [73]:
def pre_process(org_house_df):
    """Convert raw house records into the numeric feature frame the models consume.

    Missing values in a fixed set of columns are replaced with hard-coded
    defaults, categorical columns are label-encoded with fixed integer codes,
    and a few derived columns (FlrSF, TotalSF, HouseAge, IsRebuild) are added.
    The input frame is not modified.

    Parameters
    ----------
    org_house_df : pandas.DataFrame
        Raw records. Must contain every source column read below
        (MSZoning, LandContour, ..., YrSold, YearRemodAdd, YearBuilt).

    Returns
    -------
    pandas.DataFrame
        A new frame on the same index as ``org_house_df`` with 26 columns, in
        the order they are assigned below. Callers pass ``.values`` to the
        models positionally, so the column order is part of the contract.

    Raises
    ------
    KeyError
        If a required source column is absent.
    ValueError
        If a categorical column holds a value (or an unfilled NaN) that has no
        integer code.
    """

    def codes(*labels):
        # Every encoding used here is "position of the label in its list".
        return {label: code for code, label in enumerate(labels)}

    # Five-level scale shared by several quality columns. KitchenQual is
    # deliberately separate: it has no 'Po' level, so 'TA' maps to 3 there.
    quality5 = codes('Ex', 'Fa', 'Gd', 'Po', 'TA')

    category_codes = {
        'MSZoning': codes('C (all)', 'FV', 'RH', 'RL', 'RM'),
        'LandContour': codes('Bnk', 'HLS', 'Low', 'Lvl'),
        'Condition1': codes('Artery', 'Feedr', 'Norm', 'PosA', 'PosN',
                            'RRAe', 'RRAn', 'RRNe', 'RRNn'),
        # Condition2 has no 'RRNe' entry, so 'RRNn' is 7 here (8 in Condition1).
        'Condition2': codes('Artery', 'Feedr', 'Norm', 'PosA', 'PosN',
                            'RRAe', 'RRAn', 'RRNn'),
        'Neighborhood': codes('Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr',
                              'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR',
                              'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes',
                              'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer',
                              'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'),
        'BldgType': codes('1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'),
        'HeatingQC': quality5,
        'KitchenQual': codes('Ex', 'Fa', 'Gd', 'TA'),
        'FireplaceQu': quality5,
        'GarageType': codes('2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd'),
        'GarageQual': quality5,
        'GarageCond': quality5,
    }

    # Per-column defaults for missing values. fillna returns a new frame, so
    # the caller's data is left untouched.
    filled = org_house_df.fillna({
        'MSZoning': 'RL',
        'TotalBsmtSF': 991,
        'KitchenQual': 'TA',
        'FireplaceQu': 'Gd',
        'GarageType': 'Attchd',
        'GarageCars': 2,
        'GarageQual': 'TA',
        'GarageCond': 'TA',
    })

    def encode(column):
        # Label-encode one categorical column, failing loudly on unknown labels
        # instead of letting the NaN-to-int cast raise an opaque error.
        source = filled[column]
        mapped = source.map(category_codes[column])
        unmapped = mapped.isnull()
        if unmapped.any():
            unknown = sorted(set(source[unmapped].astype(str)))
            raise ValueError(
                "Column {!r} contains values with no integer code: {}".format(column, unknown)
            )
        return mapped.astype(int)

    house_df = pd.DataFrame()

    for column in ('MSZoning', 'LandContour', 'Condition1', 'Condition2',
                   'Neighborhood', 'BldgType'):
        house_df[column] = encode(column)

    # Floor areas and their totals.
    house_df['1stFlrSF'] = filled['1stFlrSF']
    house_df['2ndFlrSF'] = filled['2ndFlrSF']
    house_df['TotalBsmtSF'] = filled['TotalBsmtSF']
    house_df['FlrSF'] = filled['1stFlrSF'] + filled['2ndFlrSF']
    house_df['TotalSF'] = house_df['TotalBsmtSF'] + house_df['FlrSF']
    house_df['GrLivArea'] = filled['GrLivArea']

    house_df['HeatingQC'] = encode('HeatingQC')
    house_df['BedroomAbvGr'] = filled['BedroomAbvGr']
    house_df['KitchenQual'] = encode('KitchenQual')
    house_df['TotRmsAbvGrd'] = filled['TotRmsAbvGrd']
    house_df['Fireplaces'] = filled['Fireplaces']
    house_df['FireplaceQu'] = encode('FireplaceQu')

    house_df['GarageType'] = encode('GarageType')
    house_df['GarageCars'] = filled['GarageCars']
    house_df['GarageQual'] = encode('GarageQual')
    house_df['GarageCond'] = encode('GarageCond')

    house_df['OverallCond'] = filled['OverallCond']
    house_df['OverallQual'] = filled['OverallQual']

    # Note: "age" is measured from the last remodel year, not the build year.
    house_df['HouseAge'] = filled['YrSold'] - filled['YearRemodAdd']

    # Years between construction and remodel, with positive gaps collapsed to 1.
    # Zero and negative gaps are left as computed. The write goes through a
    # single .loc[rows, column] call on the frame itself: the previous chained
    # form (frame[col].loc[mask] = 1) assigns to an intermediate Series, which
    # emits SettingWithCopyWarning and is a no-op under copy-on-write.
    house_df['IsRebuild'] = filled['YearRemodAdd'] - filled['YearBuilt']
    house_df.loc[house_df['IsRebuild'] > 0, 'IsRebuild'] = 1

    return house_df
 

In [74]:
class NN(chainer.Chain):
    """Fully connected regressor: input batch normalization, then 20-20-10-n_out.

    Parameters
    ----------
    n_out : int
        Width of the final linear layer (default 1).
    n_in : int
        Number of input features, used to size the input batch normalization.
        Defaults to 26, the width that was previously hard-coded, so existing
        ``NN()`` / ``NN(n_out)`` calls build the same network as before.
    """

    def __init__(self, n_out=1, n_in=26):
        super().__init__()
        with self.init_scope():
            # Link attribute names are kept unchanged; renaming them would
            # presumably break loading of previously saved weights.
            # Input size None leaves each Linear's input width unspecified here.
            self.fc1 = L.Linear(None, 20)
            self.fc2 = L.Linear(None, 20)
            self.fc3 = L.Linear(None, 10)
            self.fc4 = L.Linear(None, n_out)

            self.bn = L.BatchNormalization(n_in)

    def __call__(self, x):
        """Run the forward pass on a batch ``x`` and return the raw outputs.

        No activation is applied after the last layer.
        """
        h = self.bn(x)
        h = F.relu(self.fc1(h))
        h = F.relu(self.fc2(h))
        h = F.relu(self.fc3(h))
        return self.fc4(h)

In [75]:
# Rebuild the network inside a Classifier wrapper and load saved weights into it.
# Only `model_nn.predictor` (the wrapped NN) is used for inference below.
# NOTE(review): presumably the snapshot was saved from a Classifier-wrapped
# model, which is why the wrapper is recreated here — confirm.
model_nn = L.Classifier(NN())
chainer.serializers.load_npz("models/house_price_4.npz", model_nn)

In [76]:
# Load the previously fitted first-level model from disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model_lr = joblib.load('model_20190120.pkl')

In [77]:
# Raw training records; kept unmodified so later cells can read SalePrice from it.
org_house_df = pd.read_csv('train.csv')

In [78]:
# Numeric feature frame for the training rows (fills, label encoding, derived columns).
house_df = pre_process(org_house_df)

In [85]:
# Attach the target, then drop rows matching two (size, price) outlier rules.
# NOTE(review): the cut-offs (5, 12.5, 13) look like values on a transformed
# (log / standardized) scale, while TotalSF, GrLivArea and SalePrice are the raw
# columns at this point — confirm whether these filters remove any rows at all.
house_df['SalePrice'] = org_house_df['SalePrice']
house_df = house_df.drop(house_df[(house_df['TotalSF']>5) & (house_df['SalePrice']<12.5)].index)
house_df = house_df.drop(house_df[(house_df['GrLivArea']>5) & (house_df['SalePrice']<13)].index)

In [86]:
# Feature-only view of the training frame: everything except the target column.
tmp_house_df = house_df.drop(columns=['SalePrice'])

In [87]:
# Feature matrix cast to single precision ('f') for the network, plus the
# list that collects its per-row predictions.
x = tmp_house_df.values.astype('f')
sale_prices_nn = list()

In [88]:
# Score every training row with the network, with training mode and backprop
# bookkeeping switched off.
# The result list is reset here so that re-running this cell does not append a
# second copy of the predictions.
sale_prices_nn = []
with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
    for row in x:
        # Add a leading batch axis: (n_features,) -> (1, n_features).
        y = model_nn.predictor(row[np.newaxis])
        sale_prices_nn.append(y.data)

In [89]:
#tmp_house_df.columns[np.isnan(tmp_house_df).any()]

In [90]:
# First-level predictions of the loaded linear model on the training features.
# np.exp undoes a log transform — presumably model_lr was trained on
# log(SalePrice); confirm against the training notebook.
sale_prices_lr = np.exp(model_lr.predict(tmp_house_df))

In [91]:
# Sanity check: one linear-model prediction per training row.
sale_prices_lr.shape

(1460,)

In [92]:
# Sanity check: the network produced the same number of predictions.
len(sale_prices_nn)

1460

In [93]:
# Second-level (stacking) features: one column per first-level model.
house_df_emsemble = pd.DataFrame()
house_df_emsemble['predict_lr'] = sale_prices_lr
# Flatten the list of per-row network outputs with -1 instead of a hard-coded
# row count, so this keeps working when the number of rows changes. A length
# mismatch with predict_lr still raises on assignment.
house_df_emsemble['predict_nn'] = np.reshape(sale_prices_nn, -1)

In [102]:
# Log-price target for the second-level model. It is taken from house_df (the
# frame the first-level predictions were computed from) rather than the
# unfiltered org_house_df, so features and target stay row-aligned even if the
# outlier filter removes rows. The index is reset to match the positional index
# of house_df_emsemble.
y_log_price = np.log(house_df['SalePrice']).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(
    house_df_emsemble, y_log_price, test_size=0.4, random_state=1)

In [103]:
# Grid-search tree depth and number of trees for the second-level XGBoost
# regressor, then fit it on the stacked training predictions.
param_grid = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
}
xgb_regressor = xgb.XGBRegressor()
model_xgb = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, verbose=1)

model_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    1.5s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [105]:
# Score of the refit best estimator on the held-out split (targets are log prices).
model_xgb.score(X_test, y_test) 

0.863345434540151

In [109]:
# Records to predict, run through the same feature pipeline as the training data.
predict_house_df = pd.read_csv('test.csv')
X_predict = pre_process(predict_house_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [110]:
# First-level linear-model predictions for the rows to predict.
# np.exp undoes a log transform — presumably model_lr outputs log prices; confirm.
y_predict_lr = np.exp(model_lr.predict(X_predict))

In [111]:
# Single-precision ('f') feature matrix for the rows to predict, plus the list
# that collects the network's outputs. Note that `x` is rebound here and no
# longer refers to the training matrix.
x = X_predict.values.astype('f')
y_predict_nn = list()

In [112]:
# Score every row to predict with the network, with training mode and backprop
# bookkeeping switched off.
# The result list is reset here so that re-running this cell does not append a
# second copy of the predictions.
y_predict_nn = []
with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
    for row in x:
        # Add a leading batch axis: (n_features,) -> (1, n_features).
        y = model_nn.predictor(row[np.newaxis])
        y_predict_nn.append(y.data)

In [113]:
# Second-level features for the rows to predict, with the same column names and
# order the second-level model was fitted on. Note that this rebinds
# house_df_emsemble, replacing the training-set frame of the same name.
house_df_emsemble = pd.DataFrame()
house_df_emsemble['predict_lr'] = y_predict_lr
# Flatten with -1 instead of a hard-coded row count so the cell does not depend
# on the exact number of rows in the input file.
house_df_emsemble['predict_nn'] = np.reshape(y_predict_nn, -1)

In [117]:
# Final stacked prediction. model_xgb was fitted on log(SalePrice) targets, so
# np.exp converts its output back to price units.
# house_df_emsemble must be the frame built from the rows to predict (the name
# is shared with the training-set frame).
y_predict = np.exp(model_xgb.predict(house_df_emsemble))

In [118]:
# Result table: the Id of each input row next to its predicted price.
output_df = pd.DataFrame({
    'Id': predict_house_df['Id'],
    'SalePrice': np.array(y_predict),
})

In [119]:
# Preview the first few predictions before writing them out.
output_df.head()

Unnamed: 0,Id,SalePrice
0,1461,121638.625
1,1462,151680.5625
2,1463,154607.15625
3,1464,186815.625
4,1465,183017.703125


In [120]:
# Write only the Id and SalePrice columns. index=False keeps the positional row
# index out of the file; without it, it is written as an extra unnamed first column.
output_df.to_csv('predict_20190120_2.csv', index=False)