# House Prices: Advanced Regression Techniques

## 1. Getting data with Pandas

In [174]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [175]:
train_data = pd.read_csv('all/train.csv')
test_data  = pd.read_csv('all/test.csv')

In [176]:
train_label = train_data['SalePrice']
train_data_input = train_data.drop('SalePrice',1)
all_data   = pd.concat([train_data_input, test_data], axis=0)

print(train_data.shape)
print(train_data_input.shape)
print(test_data.shape)
print(all_data.shape)

(1460, 81)
(1460, 80)
(1459, 80)
(2919, 80)


In [177]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 2. Data exploration

In [178]:
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,10,2009,WD,Normal
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,0,,,Shed,350,11,2009,WD,Normal
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2008,WD,Abnorml
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,1,2008,WD,Normal


In [179]:
train_data['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

## 3. Prepare data

In [180]:
missing_data = all_data.isnull().sum().sort_values(ascending=False)
missing_data = missing_data[missing_data > 0]

In [181]:
count        = all_data.isnull().sum().sort_values(ascending=False)
percent_relt = all_data.isnull().sum() / all_data.isnull().count()
percent_relt = percent_relt.sort_values(ascending=False)*100

missing_data_perc  = pd.concat([count, percent_relt], axis=1, keys = ['Count', 'Percent(%)'])
missing_data_perc.head(50)

Unnamed: 0,Count,Percent(%)
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
FireplaceQu,1420,48.646797
LotFrontage,486,16.649538
GarageCond,159,5.447071
GarageQual,159,5.447071
GarageYrBlt,159,5.447071
GarageFinish,159,5.447071


In [182]:
all_data = all_data.drop((missing_data_perc[missing_data_perc['Percent(%)'] > 25]).index, 1)

all_data = all_data.drop('LotArea', 1)
all_data = all_data.drop('YearBuilt', 1)
all_data = all_data.drop('BsmtFinSF1', 1)

In [187]:
all_data['LotFrontage']  = all_data['LotFrontage'].fillna(0)
all_data['GarageCond']   = all_data['GarageCond'].fillna('NA')
all_data['GarageType']   = all_data['GarageType'].fillna('NA')
all_data['GarageFinish'] = all_data['GarageFinish'].fillna('NA')
all_data['GarageQual']   = all_data['GarageQual'].fillna('NA')
all_data['BsmtExposure'] = all_data['BsmtExposure'].fillna('NA')
all_data['BsmtFinType2'] = all_data['BsmtFinType2'].fillna('NA')
all_data['BsmtFinType1'] = all_data['BsmtFinType1'].fillna('NA')
all_data['BsmtCond']     = all_data['BsmtCond'].fillna('NA')
all_data['BsmtQual']     = all_data['BsmtQual'].fillna('NA')
all_data['MasVnrArea']   = all_data['MasVnrArea'].fillna(0)
all_data['MasVnrType']   = all_data['MasVnrType'].fillna('None')
all_data['Electrical']   = all_data['Electrical'].fillna('SBrkr')
all_data['MSZoning']     = all_data['MSZoning'].fillna('RM')
all_data['Functional']   = all_data['Functional'].fillna('Typ')
all_data['BsmtFullBath'] = all_data['BsmtFullBath'].fillna(0)
all_data['BsmtHalfBath'] = all_data['BsmtFullBath'].fillna(0)
all_data['Utilities']    = all_data['Utilities'].fillna('AllPub')
all_data['GarageArea']   = all_data['GarageArea'].fillna(0)
all_data['BsmtFinSF2']   = all_data['BsmtFinSF2'].fillna(0)
all_data['BsmtUnfSF']    = all_data['BsmtUnfSF'].fillna(0)
all_data['SaleType']     = all_data['SaleType'].fillna('Oth')
all_data['Exterior2nd']  = all_data['Exterior2nd'].fillna('Other')
all_data['Exterior1st']  = all_data['Exterior1st'].fillna('Other')
all_data['KitchenQual']  = all_data['KitchenQual'].fillna('TA')
all_data['GarageCars']   = all_data['GarageCars'].fillna(0)
all_data['TotalBsmtSF']  = all_data['TotalBsmtSF'].fillna(0)

In [188]:
all_data['GarageYrBlt'].describe()
all_data['GarageYrBlt']  = all_data['GarageYrBlt'].fillna(1978)

In [190]:
print('MiscVal  == 0 percent -> %3.2f%%'%(100*(all_data['MiscVal'].values == 0).sum() / all_data['MiscVal'].values.shape[0] ))
print('PoolArea == 0 percent -> %3.2f%%'%(100*(all_data['PoolArea'].values == 0).sum() / all_data['PoolArea'].values.shape[0] ))

In [192]:
all_data = all_data.drop('MiscVal', 1)
all_data = all_data.drop('PoolArea', 1)

print(all_data.isnull().sum().max())

0


In [193]:
numerical_data    = [var for var in all_data.columns if all_data.dtypes[var] != 'object']
categorical_data  = [var for var in all_data.columns if all_data.dtypes[var] == 'object']

numerical_data.remove('Id')
#numerical_data.remove('SalePrice')
print('Number of numerical data   : ', len(numerical_data))
print('Number of categorical data : ', len(categorical_data))

Number of numerical data   :  14
Number of categorical data :  38


In [194]:
all_data = pd.get_dummies(all_data, columns=categorical_data)
#encoders = []
#for i, name in enumerate(categorical_data):
    #train_data[name].get_dummies()
    #pd.get_dummies(train_data[name])
    #print(pd.get_dummies(train_data[name]).shape)
    #x = train_data[name].values
    #encoders.append(LabelEncoder())
    #label_enc = LabelEncoder()
    #x = label_enc.fit_transform(x)
    #x[:,0] = encoders[i].fit_transform(x[:,])
    #oneH_enc = OneHotEncoder(categorical_features = None)
    #encoders.append(enc)
    #x = oneH_enc.fit_transform(x).toarray()
    #train_data[name] = oneH_enc.fit_transform(x).toarray()
    #encoders.append(LabelEncoder())
    #train_data[name] = encoders[i].fit_transform(x)
#    train_data[name] = pd.get_dummies(train_data[name])
scalers  = []
#all_data = all_data.astype(np.float32)
for name in all_data:
    if name == 'Id':
        continue
    x = all_data[name].values.astype(np.float32)
    x = x.reshape(-1,1)
    scaler = StandardScaler()
    scaler.fit(x)
    #scalers.append(scaler)
    all_data[name] = scaler.transform(x)

In [195]:
all_data.head()

Unnamed: 0,Id,LotFrontage,OverallQual,OverallCond,YearRemodAdd,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.216075,0.646183,-0.507284,0.896833,1.087334,1.087334,0.781366,1.232599,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,0.664158,-0.063185,2.188279,-0.395604,-0.818929,-0.818929,0.781366,-0.756321,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.305692,0.646183,-0.507284,0.848965,1.087334,1.087334,0.781366,1.232599,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.066714,0.646183,-0.507284,-0.682812,1.087334,1.087334,-1.027363,-0.756321,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.783647,1.355551,-0.507284,0.753229,1.087334,1.087334,0.781366,1.232599,1.385655,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [196]:
train_data = all_data[:1460]
test_data = all_data[1460:]

## 4. Model

In [197]:
def getPrediction(W, b, input_data):
    return np.sum(W*input_data) + b

def RMSE(Y, pred, m):
    return np.sqrt(np.mean(np.square(np.log(Y+1)-np.log(pred+1))))

def train(input_, err, lr):
    global W, b
    W = W  + lr * err * input_
    b = b  + lr * err    

## 5. Training

In [198]:
train_data_values = train_data.values

In [199]:
train_input = train_data_values[:, 1:]
#train_label = train_data_values[:,-1]
train_label_values = train_label.values
train_input = train_input.astype(np.float64)

In [202]:
n, m = train_input.shape
print('n = %d, m = %d'%(n,m))
W = np.random.normal(0.0, 0.001, m)
b = 0.0
def linear_regression(train_input, train_label):
    global W, b
    n, m    = train_input.shape
    pred_ar = np.ndarray(shape=(n,), dtype=np.float32)
    for i in range(1, 901):
        for id in range(n):
            pred = getPrediction(W, b, train_input[id])
            err  = train_label_values[id] - pred
            train(train_input[id], err, 1e-5)

        #pred_ar[pred_ar < 0] = 0
        
        if (i%50 == 0):
            pred_ar = np.matmul(train_input,W) + b
            cc = pred_ar[pred_ar < 0].sum()
            loss = RMSE(train_label_values, pred_ar, m)
            print("ep : %5d loss = %.6f, count = %d"%(i, loss, cc))

linear_regression(train_input, train_label)

n = 1460, m = 258


  """


ep :    50 loss = nan, count = -1698934
ep :   100 loss = nan, count = -157641
ep :   150 loss = nan, count = -1025
ep :   200 loss = 0.176210, count = 0
ep :   250 loss = 0.148101, count = 0
ep :   300 loss = 0.137922, count = 0
ep :   350 loss = 0.133644, count = 0
ep :   400 loss = 0.131594, count = 0
ep :   450 loss = 0.130511, count = 0
ep :   500 loss = 0.129903, count = 0
ep :   550 loss = 0.129552, count = 0
ep :   600 loss = 0.129347, count = 0
ep :   650 loss = 0.129232, count = 0
ep :   700 loss = 0.129171, count = 0
ep :   750 loss = 0.129144, count = 0
ep :   800 loss = 0.129140, count = 0
ep :   850 loss = 0.129149, count = 0
ep :   900 loss = 0.129167, count = 0


## 6. Testing

In [203]:
test_data  = all_data[1460:]
test_data.head()


test_input = test_data.values.astype(np.float64)[:,1:]
#test_input       = test_data_values[:, 1:]
test_input       = test_input.astype(np.float32)

In [204]:
train_data.head()

Unnamed: 0,Id,LotFrontage,OverallQual,OverallCond,YearRemodAdd,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.216075,0.646183,-0.507284,0.896833,1.087334,1.087334,0.781366,1.232599,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,2,0.664158,-0.063185,2.188279,-0.395604,-0.818929,-0.818929,0.781366,-0.756321,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,3,0.305692,0.646183,-0.507284,0.848965,1.087334,1.087334,0.781366,1.232599,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,4,0.066714,0.646183,-0.507284,-0.682812,1.087334,1.087334,-1.027363,-0.756321,0.169927,...,-0.052423,-0.298629,-0.052423,0.395018,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,5,0.783647,1.355551,-0.507284,0.753229,1.087334,1.087334,0.781366,1.232599,1.385655,...,-0.052423,-0.298629,-0.052423,0.395018,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [205]:
n, m         = test_input.shape
pred_test_ar = np.matmul(test_input,W) + b

In [206]:
pred_pd = pd.DataFrame(pred_test_ar, columns=['SalePrice'])
out     = pd.concat([test_data['Id'],pred_pd], axis=1)
out

Unnamed: 0,Id,SalePrice
0,1461,101910.681458
1,1462,139103.651217
2,1463,160103.773905
3,1464,186975.270432
4,1465,201175.302763
5,1466,166927.197831
6,1467,169668.818425
7,1468,165651.285767
8,1469,194145.948323
9,1470,113866.972708


In [207]:
out.to_csv('all/regr_lin1.csv',index=False)
#tt1 = pd.read_csv('all/regr_lin.csv')

In [208]:
pd.read_csv('all/regr_lin.csv')

Unnamed: 0,Id,SalePrice
0,1461,111987.949920
1,1462,159006.541023
2,1463,181751.497798
3,1464,199622.458834
4,1465,201850.044658
5,1466,173166.025728
6,1467,174004.580737
7,1468,166031.810086
8,1469,195481.759694
9,1470,119611.517105
