# 房價預測
> https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## 01. 讀取dataset

In [1]:
import numpy as np
import pandas as pd

DF = pd.read_csv('train.csv')
DF.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 02. 前處理
### 檢查空值
列出有空值的column名稱

In [8]:
DF.columns[DF.isna().any()].tolist()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

列出缺值數目

In [3]:
NA_columns = DF.columns[DF.isna().any()].tolist()
DF[NA_columns].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

### 選定特徵與標籤

In [2]:
NA_columns = DF.columns[DF.isna().any()].tolist()
train_feature = DF.drop(NA_columns, axis=1)
# 把 target drop掉
train_feature = train_feature.drop(['SalePrice', 'Id'], axis=1)
train_label = DF['SalePrice'].copy()

## 03. One Hot Encoding

In [16]:
train_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 62 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
ExterQual        1460 non-

先使用數值型態的feature

In [17]:
train_feature.select_dtypes(include=[np.number]).columns.tolist()

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [3]:
numeric_DF = train_feature.select_dtypes(include=[np.number]).columns.tolist()
train_feature_num = train_feature[numeric_DF].copy()

### Normalization
資料的標準化用意在於解決overflow的問題, 因為feature眾多收斂的速度不見得相同。

在艾姆斯房價資料集中房價變數（SalePrice）數量級約莫落於數萬美元至數十萬美元之間，而生活空間大小（GrLivArea）數量級約莫落於數千英畝左右，這可能使得在同一個學習速率下，兩個係數收斂的速度差異過大，而導致其中一個係數已經收斂，但另外一個數字仍很緩慢地向低點前進，雖然成本函數遞減的速率已經平緩，迭代後所得到的係數卻與內建函數的相差甚遠，特別是常數項。

In [4]:
# 避免overflow/ inf
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
column_names = list(train_feature_num)
train_feature_num = scaler.fit_transform(train_feature_num)
# 從 ndarray轉回dataframe
train_feature_num = pd.DataFrame(train_feature_num, columns=column_names)
train_feature_num.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.235294,0.03342,0.666667,0.5,0.949275,0.883333,0.125089,0.0,0.064212,0.140098,...,0.38646,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.5
1,0.0,0.038795,0.555556,0.875,0.753623,0.433333,0.173281,0.0,0.121575,0.206547,...,0.324401,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25
2,0.235294,0.046507,0.666667,0.5,0.934783,0.866667,0.086109,0.0,0.185788,0.150573,...,0.428773,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.5
3,0.294118,0.038561,0.666667,0.5,0.311594,0.333333,0.038271,0.0,0.231164,0.123732,...,0.45275,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0
4,0.235294,0.060576,0.777778,0.5,0.927536,0.833333,0.116052,0.0,0.20976,0.187398,...,0.589563,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.5


## 05. Algorithm/ Model
### Linear Regression with SGD approach
![](https://image.slidesharecdn.com/anoverviewofgradientdescentoptimizationalgorithms-170414055411/95/an-overview-of-gradient-descent-optimization-algorithms-6-638.jpg?cb=1492149859)
### Learning Rate的選擇
> https://medium.freecodecamp.org/how-to-pick-the-best-learning-rate-for-your-machine-learning-project-9c28865039a8

在超參數的設置方面，經過多次實驗後發現當learning rate為1時會有較好的成效。

In [5]:
weights = np.ones(len(train_feature_num.columns))
bias = 0

pocket_err = 1000000
pocket_weights  = np.ones(len(train_feature_num.columns))
pocket_bias = 0

learningRate = 1

Iterations = 50
# 紀錄loss變化
loss_record = []

In [8]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# 訓練參數
def train_lr(data, labels, iterations):
    global loss_record, pocket_err, pocket_weights, pocket_bias
    for numIter in range(Iterations):
        # batch size為1, 每筆資料都更新參數
        for i in range(len(data.index)):
            # 改成random資料
            input = data.iloc[i]
            
            global weights, bias
            # 預測價格
            predict_price = np.dot(input, weights) + bias
            
            # 預測值與實際值間的誤差
            loss = predict_price - labels[i]
            
            # 偏微分, 計算梯度
            gradient = np.dot(input.T, loss)
            
            # 參數更新
            weights = weights - (1/len(data.index))*learningRate*gradient
#             weights = weights - learningRate*gradient
            # bias更新
            bias = bias - (1/len(data.index))*learningRate*loss
        
        # 在最後一筆資料時計算現在的 weight的 MSE
        all_predict = weights*data +bias
        RMSE = sqrt(mean_squared_error(labels, all_predict.sum(axis=1)))
        # 紀錄loss
        loss_record.append(RMSE)
        # 如果error最小, 暫存權重
        if RMSE < pocket_err:
            pocket_err = RMSE
            pocket_weights = weights
            pocket_bias = bias

In [9]:
train_lr(train_feature_num, train_label, Iterations)
loss_record

[1170631.603976558,
 1051899.6568562347,
 928133.9853847743,
 815649.4182539565,
 713388.0206056304,
 620231.1179834311,
 535192.5124840579,
 457406.7874995649,
 386118.3561504868,
 320674.92461283406,
 260528.63478947678,
 205254.0706314938,
 154612.48924640374,
 108775.13179591676,
 69258.23746763288,
 43505.83748705057,
 48799.23393006846,
 75227.74365484291,
 105985.04780606352,
 136740.31403983707,
 166501.23674010785,
 195023.88147983595,
 222279.6897263083,
 248312.74226505728,
 273192.3767455057,
 296995.07023831323,
 319796.91938536963,
 341670.44176354184,
 362683.292986419,
 382897.880685349,
 402371.40936803154,
 421156.13023703435,
 439299.68175049033,
 456845.4616571788,
 473832.99950589257,
 490298.3136842105,
 506274.24526356644,
 521790.76548384235,
 536875.2562419138,
 551552.7643338152,
 565846.2309261957,
 579776.6980755632,
 593363.494233702,
 606624.4006690998,
 619575.8006540937,
 632232.8131495068,
 644609.4125835837,
 656718.5361824099,
 668572.1801718011,
 680

In [26]:
# remove (1/len(data.index)), loss小一點
train_lr(train_feature_num, train_label, Iterations)
loss_record

[16870870254.0167,
 14721720401.203047,
 14410981272.466742,
 14465882990.790627,
 14583453764.575897,
 14697298801.516237,
 14794477087.197018,
 14874380155.016512,
 14939107869.5738,
 14991140644.015417]

## 06. Predict

In [10]:
def predict(data):
    prediction = []
    global pocket_weights, pocket_bias
    for i in range(len(data.index)):
        input = data.iloc[i]
        predict_price = np.dot(input, pocket_weights) + pocket_bias
        prediction.append(predict_price)
    
    return prediction

In [11]:
testDF = pd.read_csv('test.csv')
# training data沒空值的columns, 不代表在testing data也會沒有
test_feature = testDF.drop(NA_columns, axis=1)
test_feature = test_feature.drop(['Id'], axis=1)
test_feature_num = test_feature[test_feature.select_dtypes(include=[np.number]).columns.tolist()].copy()

# fill in NA
test_feature_num = test_feature_num.fillna(test_feature_num.mean())

test_column_names = list(test_feature_num)
test_feature_num = scaler.fit_transform(test_feature_num)
test_feature_num = pd.DataFrame(test_feature_num, columns=test_column_names)

y_pred = predict(test_feature_num)

In [17]:
test_feature_num.isnull().sum()

Id               0
MSSubClass       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
BsmtFinSF1       1
BsmtFinSF2       1
BsmtUnfSF        1
TotalBsmtSF      1
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     2
BsmtHalfBath     2
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       1
GarageArea       1
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [12]:
DF_submit = pd.DataFrame(
    {'Id': testDF['Id'],
     'SalePrice': y_pred
    })

DF_submit.to_csv('submit_lr.csv', index=False)
DF_submit.head()

Unnamed: 0,Id,SalePrice
0,1461,106822.721638
1,1462,125665.995753
2,1463,185655.52449
3,1464,195663.863488
4,1465,162725.278019
