# House Prices: Advanced Regression Techniques

## 1. Getting data with Pandas

In [28]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

In [29]:
# Read the training and test CSV files from the local `all/` directory.
train_data = pd.read_csv('all/train.csv')
test_data  = pd.read_csv('all/test.csv')

In [30]:
# Separate the regression target from the training features.
train_label = train_data['SalePrice']
# Use `columns=` instead of the positional axis (`drop('SalePrice', 1)`):
# the positional form was deprecated in pandas 1.3 and removed in pandas 2.0.
train_data_input = train_data.drop(columns='SalePrice')
# Stack train and test features so that cleaning and encoding are applied to
# both at once. Row labels are intentionally NOT reset here: later cells slice
# this frame by position and rely on the test rows keeping their own labels.
all_data   = pd.concat([train_data_input, test_data], axis=0)

print('Data Shape:\n')
print('Train Data \t\t:', train_data.shape)
print('Train Data(Input)\t:',train_data_input.shape)
print('Test  Data(Input)\t:',test_data.shape)
print('All Data \t\t:',all_data.shape)

Data Shape:

Train Data 		: (1460, 81)
Train Data(Input)	: (1460, 80)
Test  Data(Input)	: (1459, 80)
All Data 		: (2919, 80)


## 2. Data exploration

In [31]:
all_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,2919.0,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,...,2918.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,1460.0,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,...,472.874572,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737
std,842.787043,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,...,215.394815,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,730.5,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,...,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,1460.0,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,...,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2189.5,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,...,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0


In [32]:
# Per-column count of missing values, largest first, keeping only the columns
# that actually have at least one missing entry.
missing_data = (
    all_data.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda null_counts: null_counts > 0]
)

In [33]:
# Count missing values once per column, most-affected columns first.
count        = all_data.isnull().sum().sort_values(ascending=False)
# Percentage of rows that are missing. Deriving it from `count` avoids
# recomputing the null mask and guarantees both series share one index order.
percent_relt = count / len(all_data) * 100

# Side-by-side table of absolute and relative missingness per column.
missing_data_perc  = pd.concat([count, percent_relt], axis=1, keys = ['Count', 'Percent(%)'])
missing_data_perc.head(40)

Unnamed: 0,Count,Percent(%)
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
FireplaceQu,1420,48.646797
LotFrontage,486,16.649538
GarageCond,159,5.447071
GarageQual,159,5.447071
GarageYrBlt,159,5.447071
GarageFinish,159,5.447071


## 3. Prepare data

## 3.1 Drop Missing Data

In [34]:
# Drop every feature with more than 25% missing values.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
sparse_columns = missing_data_perc[missing_data_perc['Percent(%)'] > 25].index
all_data = all_data.drop(columns=sparse_columns)

# Features additionally excluded by hand, removed in a single call.
manually_dropped_columns = [
    'LotArea',
    'YearBuilt',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
]
all_data = all_data.drop(columns=manually_dropped_columns)

## 3.2 Fill Missing Data

In [35]:
# Replacement value for missing entries, per column.
# Each column is filled from ITSELF: the previous version filled
# `BsmtHalfBath` from `BsmtFullBath`, which silently overwrote every half-bath
# count with the full-bath count.
fill_values = {
    'LotFrontage':  0,
    'GarageCond':   'NA',
    'GarageType':   'NA',
    'GarageFinish': 'NA',
    'GarageQual':   'NA',
    'BsmtExposure': 'NA',
    'BsmtFinType2': 'NA',
    'BsmtFinType1': 'NA',
    'BsmtCond':     'NA',
    'BsmtQual':     'NA',
    'MasVnrArea':   0,
    'MasVnrType':   'None',
    'Electrical':   'SBrkr',
    'MSZoning':     'RM',
    'Functional':   'Typ',
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'Utilities':    'AllPub',
    'SaleType':     'Oth',
    'Exterior2nd':  'Other',
    'Exterior1st':  'Other',
    'KitchenQual':  'TA',
    'GarageCars':   0,
}

# Explicit per-column assignment keeps the KeyError if a column is absent.
for column_name, fill_value in fill_values.items():
    all_data[column_name] = all_data[column_name].fillna(fill_value)

In [36]:
# Fill missing garage construction years with a single fixed year.
# NOTE(review): 1978 presumably approximates the typical GarageYrBlt value —
# confirm against `all_data['GarageYrBlt'].describe()`.
# (The bare `.describe()` call that used to sit here was not the last
# expression of the cell, so its result was discarded.)
GARAGE_YEAR_FILL = 1978
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(GARAGE_YEAR_FILL)

In [37]:
# Report which share of rows has a zero in MiscVal and in PoolArea.
misc_val_values  = all_data['MiscVal'].values
pool_area_values = all_data['PoolArea'].values

misc_zero_percent = 100 * (misc_val_values == 0).sum() / misc_val_values.shape[0]
pool_zero_percent = 100 * (pool_area_values == 0).sum() / pool_area_values.shape[0]

print('MiscVal  == 0 percent -> %3.2f%%' % misc_zero_percent)
print('PoolArea == 0 percent -> %3.2f%%' % pool_zero_percent)

MiscVal  == 0 percent -> 96.47%
PoolArea == 0 percent -> 99.55%


In [38]:
# Remove the two features that are zero for almost every row.
# `columns=` replaces the positional axis argument, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
all_data = all_data.drop(columns=['MiscVal', 'PoolArea'])

# Largest per-column count of remaining missing values.
print(all_data.isnull().sum().max())

0


In [13]:
# Partition the column names by dtype in a single pass: object-typed columns
# are treated as categorical, everything else as numerical.
numerical_data, categorical_data = [], []
for column_name, column_dtype in all_data.dtypes.items():
    if column_dtype == 'object':
        categorical_data.append(column_name)
    else:
        numerical_data.append(column_name)

# `Id` is a row identifier, not a feature.
numerical_data.remove('Id')
print('Number of numerical data   : ', len(numerical_data))
print('Number of categorical data : ', len(categorical_data))

Number of numerical data   :  31
Number of categorical data :  38


## 3.3 Standardize Data

In [14]:
# One-hot encode the categorical columns.
all_data = pd.get_dummies(all_data, columns=categorical_data)
# Fitted scalers, one per scaled column, in column order (`Id` excluded).
# The list used to be created but never filled, so the fitted scalers were lost.
scalers  = []

# Standardize every column except the identifier, one column at a time.
for name in all_data.columns:
    if name == 'Id':
        continue
    # StandardScaler expects a 2-D (n_samples, 1) array.
    column_values = all_data[name].values.astype(np.float32).reshape(-1, 1)
    scaler = StandardScaler()
    # Flatten back to 1-D so the assignment is an ordinary column assignment.
    all_data[name] = scaler.fit_transform(column_values).ravel()
    scalers.append(scaler)

In [15]:
all_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,OverallQual,OverallCond,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,0.216075,0.646183,-0.507284,0.896833,0.529034,-0.293025,-0.934165,-0.443078,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,-0.873616,0.664158,-0.063185,2.188279,-0.395604,-0.567016,-0.293025,-0.629284,0.477463,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.067331,0.305692,0.646183,-0.507284,0.848965,0.338903,-0.293025,-0.287999,-0.297968,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.302568,0.066714,0.646183,-0.507284,-0.682812,-0.567016,-0.293025,-0.046824,-0.669812,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.067331,0.783647,1.355551,-0.507284,0.753229,1.390216,-0.293025,-0.160586,0.212184,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


## 3.4 Get Train Data and Test Data

In [16]:
# Split the combined frame back by position: the training rows were
# concatenated first, so there are exactly as many of them as there are labels.
# Deriving the boundary from `train_label` replaces the hard-coded 1460.
n_train_rows = len(train_label)
train_data = all_data.iloc[:n_train_rows]
test_data  = all_data.iloc[n_train_rows:]

## 4. Model

In [17]:
def getPrediction(W, b, input_data):
    """Return the linear prediction: sum of `W * input_data` plus `b`.

    W          -- weights, multiplied element-wise with `input_data`
    b          -- bias added to the summed products
    input_data -- feature values
    """
    weighted_inputs = W * input_data
    return np.sum(weighted_inputs) + b

def RMSE(Y, pred, m=None):
    """Root mean squared error between log(1 + Y) and log(1 + pred).

    Y    -- true values (array-like)
    pred -- predicted values (array-like, same shape as `Y`)
    m    -- unused; accepted only so existing three-argument calls keep working

    Returns a single float.
    """
    # Convert up front so plain lists are accepted as well as arrays/Series.
    actual = np.asarray(Y, dtype=np.float64)
    predicted = np.asarray(pred, dtype=np.float64)
    # log1p(x) is the numerically accurate form of log(x + 1) for small x.
    log_residual = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.square(log_residual)))

def train(input_, err, lr):
    """Apply one update step to the module-level weights `W` and bias `b`.

    input_ -- feature values for the current sample
    err    -- error term for the current sample
    lr     -- learning rate

    `W` and `b` are rebound to new values (not modified in place).
    """
    global W, b
    step = lr * err
    W = W + step * input_
    b = b + step

## 5. Training

In [18]:
# Training rows as a plain NumPy array (all columns, including `Id`).
train_data_values = train_data.values

In [19]:
# Target values as a NumPy array.
train_label_values = train_label.values
# Feature matrix: every column after the first one, cast to float64.
train_input = train_data_values[:, 1:].astype(np.float64)

In [20]:
# n = number of training rows, m = number of feature columns.
n, m = train_input.shape

# Random forest of 100 trees, each limited to depth 2; fixed seed so the fit
# is reproducible.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)
regr.fit(train_input, train_label)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
# Predict on the same rows the model was fitted on, so the score below
# measures training fit rather than held-out error.
pred_ar = regr.predict(train_input)
# The third argument (`m`) is accepted by RMSE but does not affect the result.
RMSE(train_label, pred_ar, m)

0.2427536270707509

## 6. Testing

In [22]:
# Test rows are everything after the training rows in the combined frame;
# the boundary comes from the number of labels instead of a hard-coded 1460.
test_data  = all_data.iloc[len(train_label):]

# Feature matrix: every column after the first one, cast once to float32
# (the previous float64 -> float32 double cast produced the same values).
test_input = test_data.values[:, 1:].astype(np.float32)

In [23]:
train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,OverallQual,OverallCond,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,0.216075,0.646183,-0.507284,0.896833,0.529034,-0.293025,-0.934165,-0.443078,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,-0.873616,0.664158,-0.063185,2.188279,-0.395604,-0.567016,-0.293025,-0.629284,0.477463,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.067331,0.305692,0.646183,-0.507284,0.848965,0.338903,-0.293025,-0.287999,-0.297968,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.302568,0.066714,0.646183,-0.507284,-0.682812,-0.567016,-0.293025,-0.046824,-0.669812,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.067331,0.783647,1.355551,-0.507284,0.753229,1.390216,-0.293025,-0.160586,0.212184,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [24]:
# Rebind n and m to the test matrix dimensions (rows, feature columns).
n, m         = test_input.shape
# Predict the target for every test row with the fitted model.
pred_test_ar = regr.predict(test_input)
# (A commented-out `np.matmul(test_input, W) + b` prediction used to sit here.)

In [25]:
# Give the predictions the same row labels as `test_data` so the column-wise
# concat below pairs each Id with its own prediction. Without an explicit
# index the predictions are labelled 0..n-1 and only line up if `test_data`
# happens to carry exactly those labels.
pred_pd = pd.DataFrame(pred_test_ar, columns=['SalePrice'], index=test_data.index)
out     = pd.concat([test_data['Id'], pred_pd], axis=1)

Unnamed: 0,Id,SalePrice
0,1461,133699.697602
1,1462,139831.537353
2,1463,151419.346014
3,1464,156733.893538
4,1465,266935.655564
5,1466,156733.893538
6,1467,143025.203070
7,1468,156257.645202
8,1469,196713.228307
9,1470,134832.894096


In [26]:
# Write the Id/SalePrice table to disk without the DataFrame index column.
out.to_csv('all/random_forest.csv',index=False)
# Read the file back into `tt1` — presumably a sanity check of the written
# file; the variable is not used afterwards in this notebook.
tt1 = pd.read_csv('all/random_forest.csv')