# 房價預測
> https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## 01. 讀取dataset

In [1]:
import numpy as np
import pandas as pd

DF = pd.read_csv('train.csv')
DF.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 02. 前處理
### 檢查空值
列出有空值的column名稱

In [8]:
DF.columns[DF.isna().any()].tolist()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

列出缺值數目

In [3]:
NA_columns = DF.columns[DF.isna().any()].tolist()
DF[NA_columns].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

### 選定特徵與標籤

In [4]:
train_feature = DF.drop(NA_columns, axis=1)
train_label = DF['SalePrice'].copy()

## 03. One Hot Encoding

In [16]:
train_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 62 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
ExterQual        1460 non-

先使用數值型態的feature

In [17]:
train_feature.select_dtypes(include=[np.number]).columns.tolist()

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [5]:
numeric_DF = train_feature.select_dtypes(include=[np.number]).columns.tolist()
train_feature_num = train_feature[numeric_DF].copy()

## 04. Normalization

## 05. Algorithm/ Model
### Linear Regression with SGD approach
![](https://image.slidesharecdn.com/anoverviewofgradientdescentoptimizationalgorithms-170414055411/95/an-overview-of-gradient-descent-optimization-algorithms-6-638.jpg?cb=1492149859)

In [14]:
weights = np.zeros(len(train_feature_num.columns))
bias = 0

learningRate = 0.01

Iterations = 1
# 紀錄loss變化
loss_record = []

In [15]:
# 訓練參數
def train_lr(data, labels, iterations):
    global loss_record
    for numIter in range(Iterations):
        # batch size為1, 每筆資料都更新參數
        for i in range(len(data.index)):
            global weights, bias
            # 預測價格
            predict_price = np.dot(data.iloc[i], weights)
            # 處理inf
            print(predict_price)
            
            # 預測值與實際值間的誤差
            loss = predict_price - labels[i]
            MSE = np.mean(loss**2)
            
            # 偏微分, 計算梯度
            gradient = np.dot(data.iloc[i].T, loss)
            
            # 參數更新
            weights = weights - learningRate*gradient
        
        # 紀錄loss
        loss_record.append(MSE)
            

In [16]:
train_lr(train_feature_num, train_label, Iterations)
loss_record

  


[nan]

## 06. Predict

In [None]:
def predict(data):
    prediction = []
     for i in range(len(data.index)):

In [None]:
DF_submit = pd.DataFrame(
    {'Id': answer['Id'],
     'SalePrice': y_pred
    })

DF_submit.to_csv('submit_lr.csv', index=False)
DF_submit.head()