# House Prices: Advanced Regression Techniques

## 1. Getting data with Pandas

In [161]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

In [162]:
train_data = pd.read_csv('all/train.csv')
test_data  = pd.read_csv('all/test.csv')

In [163]:
train_label = train_data['SalePrice']
train_data_input = train_data.drop('SalePrice',1)
all_data   = pd.concat([train_data_input, test_data], axis=0)

print(train_data.shape)
print(train_data_input.shape)
print(test_data.shape)
print(all_data.shape)

(1460, 81)
(1460, 80)
(1459, 80)
(2919, 80)


In [164]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 2. Data exploration

In [165]:
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,10,2009,WD,Normal
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,0,,,Shed,350,11,2009,WD,Normal
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2008,WD,Abnorml
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,1,2008,WD,Normal


In [166]:
train_data['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

## 3. Prepare data

In [167]:
missing_data = all_data.isnull().sum().sort_values(ascending=False)
missing_data = missing_data[missing_data > 0]

In [168]:
count        = all_data.isnull().sum().sort_values(ascending=False)
percent_relt = all_data.isnull().sum() / all_data.isnull().count()
percent_relt = percent_relt.sort_values(ascending=False)*100

missing_data_perc  = pd.concat([count, percent_relt], axis=1, keys = ['Count', 'Percent(%)'])
missing_data_perc.head(50)

Unnamed: 0,Count,Percent(%)
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
FireplaceQu,1420,48.646797
LotFrontage,486,16.649538
GarageCond,159,5.447071
GarageQual,159,5.447071
GarageYrBlt,159,5.447071
GarageFinish,159,5.447071


In [169]:
all_data = all_data.drop((missing_data_perc[missing_data_perc['Percent(%)'] > 25]).index, 1)

all_data = all_data.drop('LotArea', 1)
all_data = all_data.drop('YearBuilt', 1)
all_data = all_data.drop('BsmtFinSF1', 1)
#all_data = all_data.drop('MasVnrArea', 1)

In [170]:
#all_data['FireplaceQu']  = all_data['FireplaceQu'].fillna('NA')

#all_data['LotArea']  = all_data['LotArea'].fillna(0)

all_data['LotFrontage']  = all_data['LotFrontage'].fillna(0)
all_data['GarageCond']   = all_data['GarageCond'].fillna('NA')
all_data['GarageType']   = all_data['GarageType'].fillna('NA')
all_data['GarageFinish'] = all_data['GarageFinish'].fillna('NA')
all_data['GarageQual']   = all_data['GarageQual'].fillna('NA')
all_data['BsmtExposure'] = all_data['BsmtExposure'].fillna('NA')
all_data['BsmtFinType2'] = all_data['BsmtFinType2'].fillna('NA')
all_data['BsmtFinType1'] = all_data['BsmtFinType1'].fillna('NA')
all_data['BsmtCond']     = all_data['BsmtCond'].fillna('NA')
all_data['BsmtQual']     = all_data['BsmtQual'].fillna('NA')
all_data['MasVnrArea']   = all_data['MasVnrArea'].fillna(0)
all_data['MasVnrType']   = all_data['MasVnrType'].fillna('None')
all_data['Electrical']   = all_data['Electrical'].fillna('SBrkr')
all_data['MSZoning']     = all_data['MSZoning'].fillna('RM')
all_data['Functional']   = all_data['Functional'].fillna('Typ')
all_data['BsmtFullBath'] = all_data['BsmtFullBath'].fillna(0)
all_data['BsmtHalfBath'] = all_data['BsmtFullBath'].fillna(0)
all_data['Utilities']    = all_data['Utilities'].fillna('AllPub')
all_data['GarageArea']   = all_data['GarageArea'].fillna(0)
all_data['BsmtFinSF2']   = all_data['BsmtFinSF2'].fillna(0)
all_data['BsmtUnfSF']    = all_data['BsmtUnfSF'].fillna(0)
all_data['SaleType']     = all_data['SaleType'].fillna('Oth')
all_data['Exterior2nd']  = all_data['Exterior2nd'].fillna('Other')
all_data['Exterior1st']  = all_data['Exterior1st'].fillna('Other')
all_data['KitchenQual']  = all_data['KitchenQual'].fillna('TA')
all_data['GarageCars']   = all_data['GarageCars'].fillna(0)
all_data['TotalBsmtSF']  = all_data['TotalBsmtSF'].fillna(0)


In [171]:
all_data['GarageYrBlt'].describe()
all_data['GarageYrBlt']  = all_data['GarageYrBlt'].fillna(1978)

In [172]:
print('MiscVal  == 0 percent -> %3.2f%%'%(100*(all_data['MiscVal'].values == 0).sum() / all_data['MiscVal'].values.shape[0] ))
print('PoolArea == 0 percent -> %3.2f%%'%(100*(all_data['PoolArea'].values == 0).sum() / all_data['PoolArea'].values.shape[0] ))

MiscVal  == 0 percent -> 96.47%
PoolArea == 0 percent -> 99.55%


In [173]:
all_data = all_data.drop('MiscVal', 1)
all_data = all_data.drop('PoolArea', 1)

print(all_data.isnull().sum().max())

0


In [174]:
numerical_data    = [var for var in all_data.columns if all_data.dtypes[var] != 'object']
categorical_data  = [var for var in all_data.columns if all_data.dtypes[var] == 'object']

numerical_data.remove('Id')
#numerical_data.remove('SalePrice')
print('Number of numerical data   : ', len(numerical_data))
print('Number of categorical data : ', len(categorical_data))

Number of numerical data   :  31
Number of categorical data :  38


In [175]:
all_data = pd.get_dummies(all_data, columns=categorical_data)
scalers  = []
#all_data = all_data.astype(np.float32)
for name in all_data:
    if name == 'Id':
        continue
    x = all_data[name].values.astype(np.float32)
    x = x.reshape(-1,1)
    scaler = StandardScaler()
    scaler.fit(x)
    #scalers.append(scaler)
    all_data[name] = scaler.transform(x)

In [176]:
all_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,OverallQual,OverallCond,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,0.216075,0.646183,-0.507284,0.896833,0.529034,-0.293025,-0.934165,-0.443078,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,-0.873616,0.664158,-0.063185,2.188279,-0.395604,-0.567016,-0.293025,-0.629284,0.477463,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.067331,0.305692,0.646183,-0.507284,0.848965,0.338903,-0.293025,-0.287999,-0.297968,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.302568,0.066714,0.646183,-0.507284,-0.682812,-0.567016,-0.293025,-0.046824,-0.669812,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.067331,0.783647,1.355551,-0.507284,0.753229,1.390216,-0.293025,-0.160586,0.212184,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [177]:
train_data = all_data[:1460]
test_data = all_data[1460:]

## 4. Model

In [178]:
def getPrediction(W, b, input_data):
    return np.sum(W*input_data) + b

def RMSE(Y, pred, m):
    return np.sqrt(np.mean(np.square(np.log(Y+1)-np.log(pred+1))))

def train(input_, err, lr):
    global W, b
    W = W  + lr * err * input_
    b = b  + lr * err    

## 5. Training

In [179]:
train_data_values = train_data.values

In [180]:
train_input = train_data_values[:, 1:]
#train_label = train_data_values[:,-1]
train_label_values = train_label.values
train_input = train_input.astype(np.float64)

In [181]:
n, m = train_input.shape
regr = ElasticNet(random_state=0, max_iter = 1000)
regr.fit(train_input, train_label)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [182]:
pred_ar = regr.predict(train_input)
RMSE(train_label, pred_ar, m)

0.11566551457771446

## 6. Testing

In [183]:
test_data  = all_data[1460:]
test_data.head()


test_input = test_data.values.astype(np.float64)[:,1:]
#test_input       = test_data_values[:, 1:]
test_input       = test_input.astype(np.float32)

In [184]:
train_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,OverallQual,OverallCond,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,0.216075,0.646183,-0.507284,0.896833,0.529034,-0.293025,-0.934165,-0.443078,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,-0.873616,0.664158,-0.063185,2.188279,-0.395604,-0.567016,-0.293025,-0.629284,0.477463,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.067331,0.305692,0.646183,-0.507284,0.848965,0.338903,-0.293025,-0.287999,-0.297968,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.302568,0.066714,0.646183,-0.507284,-0.682812,-0.567016,-0.293025,-0.046824,-0.669812,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.067331,0.783647,1.355551,-0.507284,0.753229,1.390216,-0.293025,-0.160586,0.212184,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [185]:
n, m         = test_input.shape
pred_test_ar = regr.predict(test_input)
#pred_test_ar = np.matmul(test_input,W) + b

In [186]:
pred_pd = pd.DataFrame(pred_test_ar, columns=['SalePrice'])
out     = pd.concat([test_data['Id'],pred_pd], axis=1)
out

Unnamed: 0,Id,SalePrice
0,1461,111072.848847
1,1462,156790.287009
2,1463,181029.518510
3,1464,195958.250441
4,1465,201018.939576
5,1466,172799.459511
6,1467,174251.007037
7,1468,165869.002042
8,1469,194903.167467
9,1470,119415.502217


In [187]:
out.to_csv('all/elastic_net.csv',index=False)
tt1 = pd.read_csv('all/elastic_net.csv')

In [150]:
tt1

Unnamed: 0,Id,SalePrice
0,1461,113085.739782
1,1462,158183.857596
2,1463,182935.534984
3,1464,196396.220594
4,1465,200543.871974
5,1466,172425.328542
6,1467,172891.064887
7,1468,165414.766769
8,1469,194629.676856
9,1470,117489.481587
