# House Prices: Advanced Regression Techniques

In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [3]:
print(train.shape,test.shape)

(1460, 81) (1459, 80)


In [4]:
#欠損値が多いものを消す
train=train.drop('Alley',axis=1).drop('FireplaceQu',axis=1).drop('PoolQC',axis=1).drop('Fence',axis=1).drop('MiscFeature',axis=1)

test=test.drop('Alley',axis=1).drop('FireplaceQu',axis=1).drop('PoolQC',axis=1).drop('Fence',axis=1).drop('MiscFeature',axis=1)

In [6]:
train_id = train['Id']
test_id = test['Id']#テストセットのIDは提出用のファイルを作る際に必要

y_train = train['SalePrice']
x_train = train.drop(['Id','SalePrice'], axis=1)
x_test = test.drop('Id', axis=1)

In [9]:
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [11]:
y_train.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [12]:
x_train = x_train.fillna(x_train.median())
x_test = x_test.fillna(x_test.median())

x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 no

出力結果からわかる通り、int型・float型の特徴量は欠損値が補完されましたが、object型についてはまだ欠損値が残ったままです

object型の欠損値をmodeで埋める

In [13]:
#object型の欠損値をmodeで埋めたい
for i in range(x_train.shape[1]):
    if x_train.iloc[:,i].dtypes == object:
        mode = x_train.mode()[x_train.columns.values[i]].values
        for j in range(x_train.shape[0]):
            if x_train.isnull().iloc[j,i] == True:
                x_train.iloc[j,i] = mode

In [15]:
for i in range(x_test.shape[1]):
    if x_test.iloc[:,i].dtypes == object:
        mode = x_test.mode()[x_test.columns.values[i]].values
        for j in range(x_test.shape[0]):
            if x_test.isnull().iloc[j,i]==True:
                x_test.iloc[j,i] = mode

確認してみましょう。以下のコードを実行すると、全インスタンスの欠損値の合計の値が出力されます。
下記コードで、sum()を2回呼んでいますが、1回目のsum()では全ての列毎に合計しており、2回目のsum()にて、その列全てを足し合わせた結果を取得しています。

In [16]:
x_train.isnull().sum().sum()

0

# ラベルエンコーディング

In [17]:
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#ラベルエンコーダー(訓練セット）
for i in range(x_train.shape[1]):
    if x_train.iloc[:,i].dtypes == object:
        le.fit(list(x_train[x_train.columns.values[i]].values)) 
        x_train[x_train.columns.values[i]] = le.transform(list(x_train[x_train.columns.values[i]].values))

#ラベルエンコーダー(テストセット）
for i in range(x_test.shape[1]):
    if x_test.iloc[:,i].dtypes == object:
        le.fit(list(x_test[x_test.columns.values[i]].values)) 
        x_test[x_test.columns.values[i]] = le.transform(list(x_test[x_test.columns.values[i]].values))

x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null int64
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null int64
LotShape         1460 non-null int64
LandContour      1460 non-null int64
Utilities        1460 non-null int64
LotConfig        1460 non-null int64
LandSlope        1460 non-null int64
Neighborhood     1460 non-null int64
Condition1       1460 non-null int64
Condition2       1460 non-null int64
BldgType         1460 non-null int64
HouseStyle       1460 non-null int64
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null int64
RoofMatl         1460 non-null int64
Exterior1st      1460 non-null int64
Exterior2nd      1460 non-null int64
MasVnrType       1460 non-null int64
Mas

In [19]:
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,0,4,0,...,61,0,0,0,0,0,2,2008,8,4
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,0,5,2007,8,4
2,60,3,68.0,11250,1,0,3,0,4,0,...,42,0,0,0,0,0,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,0,0,...,35,272,0,0,0,0,2,2006,8,0
4,60,3,84.0,14260,1,0,3,0,2,0,...,84,0,0,0,0,0,12,2008,8,4


# 特徴量の削減

In [20]:
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(score_func=f_regression, k=5)#回帰の場合はscore_func=f_classifです
selector.fit(x_train,y_train)
print(selector.get_support())

[False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False  True  True
 False False False False False False False False False False False False
 False False]


In [21]:
x_train_selected=pd.DataFrame({'OverallQual':x_train['OverallQual'],'ExterQual':x_train['ExterQual'],'GrLivArea':x_train['GrLivArea'],'GarageCars':x_train['GarageCars'],'GarageArea':x_train['GarageArea']})
x_test_selected=pd.DataFrame({'OverallQual':x_test['OverallQual'],'ExterQual':x_test['ExterQual'],'GrLivArea':x_test['GrLivArea'],'GarageCars':x_test['GarageCars'],'GarageArea':x_test['GarageArea']})

x_train_selected.head()

Unnamed: 0,OverallQual,ExterQual,GrLivArea,GarageCars,GarageArea
0,7,2,1710,2,548
1,6,3,1262,2,460
2,7,2,1786,2,608
3,7,3,1717,3,642
4,8,2,2198,3,836


# モデルの訓練

ではいよいよモデルを訓練していきましょう。機械学習モデルには様々なパラメータがあり例えば(SVMならCやgammaなど)、最適なものを手動で見つけるのは簡単ではありません。幸いなことにScikit-learnにはグリッドサーチという優れた機能があり、これを使えばモデルのハイパーパラメータが自動で最適化されます。

## 訓練セットを分ける

In [22]:
from sklearn.model_selection import train_test_split
xp_train,xp_test,yp_train,yp_test=train_test_split(x_train_selected,y_train,test_size=0.3,random_state=1) #セットのうち3割をテストセットにする

# グリッドサーチ

実際にグリッドサーチを行います。今回はランダムフォレストとサポートベクター回帰（SVR）を試していきたいと思います。

In [23]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

forest=RandomForestRegressor()
svr=SVR()

parameters_forest={'n_estimators':[100,500,1000,3000],'max_depth':[3,6,12]}
parameters_svr={'C':[0.1,10,1000],'epsilon':[0.01,0.1,0.5]}

In [24]:
from sklearn.model_selection import GridSearchCV
# ランダムフォレスト
grid_forest = GridSearchCV(forest,parameters_forest)
grid_forest.fit(xp_train,yp_train)

# SVR
grid_svr = GridSearchCV(svr,parameters_svr)
grid_svr.fit(xp_train,yp_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 10, 1000], 'epsilon': [0.01, 0.1, 0.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

結果を見てみましょう。性能指標として今回は平均二乗誤差（MSE）を使います。まずはランダムフォレストから。

In [25]:
from sklearn.metrics import mean_squared_error
yp_pred_forest=grid_forest.predict(xp_test)
print(mean_squared_error(yp_test,yp_pred_forest))

1025856063.2550989


SVR

In [26]:
yp_pred_svr=grid_svr.predict(xp_test)
print(mean_squared_error(yp_test,yp_pred_svr))

7389304167.172188


ランダムフォレストの方が断然性能が良さそうなので今回はランダムフォレストを使ってモデルの訓練をしていきましょう。

# モデルを適用する

In [28]:
grid_forest.best_params_

{'max_depth': 6, 'n_estimators': 3000}

In [30]:
best_forest=RandomForestRegressor(max_depth= 6, n_estimators=3000)
best_forest.fit(x_train_selected,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [31]:
result = np.array(best_forest.predict(x_test_selected))
# 提出用ファイルの作成
df_result=pd.DataFrame(result,columns=['SalePrice'])
df_result=pd.concat([test_id,df_result],axis=1)
df_result.to_csv('houseprices.csv', index=False)