# Scikit Learn Neural Network
Overview of Implementation
1. <a href="#section1">Import Dataset</a>
2. <a href="#section2">Cleaning the Data for Model Training</a>
3. <a href="#section3">Multi-layer Perceptron (MLP) regressor. </a>



In [1]:
# Import libraries

# math library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

## <a id='section1'>Import Dataset</a>

In [2]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## <a id='section2'>Cleaning the Data for Model Training</a>
Remove the NA data and perform One Hot Encoding

In [3]:
nullData = [['LotFrontage', 259], ['MasVnrArea', 8], ['Electrical', 1], ['GarageYrBlt', 81]]
n = len(train)
treshold = 0.1
drop = []

print('Drop feature - too many nulls:')
for i in nullData:
    if i[1]/n > treshold: # Arbitrary treshold: 10%
        print(i[0])
        train.drop(columns=[i[0]], inplace=True)
    else:
        drop.append(i[0])
        
print('Remove data point:')
print(drop)
train.dropna(subset=drop, inplace=True)

train

Drop feature - too many nulls:
LotFrontage
Remove data point:
['MasVnrArea', 'Electrical', 'GarageYrBlt']


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
#One-Hot encoding
categoricalcolumns = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
train1 = pd.get_dummies(train, columns= categoricalcolumns, prefix= categoricalcolumns)
print(train1)

        Id  LotArea  YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  \
0        1     8450       2003          2003       196.0         706   
1        2     9600       1976          1976         0.0         978   
2        3    11250       2001          2002       162.0         486   
3        4     9550       1915          1970         0.0         216   
4        5    14260       2000          2000       350.0         655   
...    ...      ...        ...           ...         ...         ...   
1455  1456     7917       1999          2000         0.0           0   
1456  1457    13175       1978          1988       119.0         790   
1457  1458     9042       1941          2006         0.0         275   
1458  1459     9717       1950          1996         0.0          49   
1459  1460     9937       1965          1965         0.0         830   

      BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  1stFlrSF  ...  SaleType_ConLw  \
0              0        150          856       856  ...     

## <a id='section3'>Multi-layer Perceptron (MLP) regressor</a>
This model optimizes the squared-loss using stochastic gradient descent.

In [5]:
#Applying Standardization (Z-Score method)
continuous = ['LotArea', 'YearBuilt', 'YearRemodAdd', 
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces', 
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
features_stand=train1.copy()
for i in continuous:
    # fit on training data column
    scale = StandardScaler().fit(train1[[i]])
    # transform training data column
    features_stand[i] = scale.transform(features_stand[[i]])
labels = train1['SalePrice']
features_stand.drop(['Id', 'SalePrice'], axis=1, inplace=True)
train_X_stan, test_X_stan, train_Y_stan, test_Y_stan = train_test_split(features_stand,labels,test_size=0.2, random_state=0)

## sklearn neural network

In [18]:
for i in range(29,40):
    regr = MLPRegressor(random_state=i,solver='adam', max_iter=10**(200),early_stopping=True).fit(train_X_stan,train_Y_stan)
    print("i =",i,"\ttrain acc = ",regr.score(train_X_stan,train_Y_stan),"\ttest acc = ",regr.score(test_X_stan,test_Y_stan))


i = 29 	train acc =  0.7644708703647427 	test acc =  0.6924860806161166
i = 30 	train acc =  0.8050701721860402 	test acc =  0.6998313911119793
i = 31 	train acc =  0.7749747025618977 	test acc =  0.6939349148990162
i = 32 	train acc =  0.7116076861807802 	test acc =  0.6601688627539135
i = 33 	train acc =  0.7474265797771464 	test acc =  0.6840731150524375
i = 34 	train acc =  0.8039941973223602 	test acc =  0.717852100810236
i = 35 	train acc =  0.77772263504749 	test acc =  0.6949311618346734
i = 36 	train acc =  0.7650489346333753 	test acc =  0.6946535482113954
i = 37 	train acc =  0.810411694693462 	test acc =  0.6993865097574492
i = 38 	train acc =  0.7988795310630322 	test acc =  0.7005922741528048
i = 39 	train acc =  0.7668348583705872 	test acc =  0.6905050715149762


kaggle test

In [13]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [14]:
#One-Hot encoding
categoricalcolumns = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
test1 = pd.get_dummies(test, columns= categoricalcolumns, prefix= categoricalcolumns)
print(test1)

        Id  LotFrontage  LotArea  YearBuilt  YearRemodAdd  MasVnrArea  \
0     1461         80.0    11622       1961          1961         0.0   
1     1462         81.0    14267       1958          1958       108.0   
2     1463         74.0    13830       1997          1998         0.0   
3     1464         78.0     9978       1998          1998        20.0   
4     1465         43.0     5005       1992          1992         0.0   
...    ...          ...      ...        ...           ...         ...   
1454  2915         21.0     1936       1970          1970         0.0   
1455  2916         21.0     1894       1970          1970         0.0   
1456  2917        160.0    20000       1960          1996         0.0   
1457  2918         62.0    10441       1992          1992         0.0   
1458  2919         74.0     9627       1993          1994        94.0   

      BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  ...  SaleType_ConLw  \
0          468.0       144.0      270.0       

In [15]:
test1.drop(['Id','LotFrontage'], axis=1, inplace=True)

In [16]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 301 entries, LotArea to SaleCondition_Partial
dtypes: float64(10), int64(22), uint8(269)
memory usage: 748.1 KB


In [36]:
kaggle_test=test1.copy()
kaggle_test.to_numpy()
values={test1.columns[0]: np.nanmean(kaggle_test[test1.columns[0]])}
for i in range(1,301):
    col_name=test1.columns[i]
    values.update({col_name: np.nanmean(kaggle_test[col_name])})
kaggle_test=test1.fillna(value=values)
continuous = ['LotArea', 'YearBuilt', 'YearRemodAdd', 
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces', 
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
features_stand=kaggle_test.copy()
for i in continuous:
    # fit on training data column
    scale = StandardScaler().fit(kaggle_test[[i]])
    # transform training data column
    features_stand[i] = scale.transform(features_stand[[i]])
print(features_stand.shape)

(1459, 301)


In [50]:
#removing cloumns
i=0
while i<len(features_stand.columns):
    train_column_name = features_stand.columns[i]
    if train_column_name not in train_X_stan.columns:
        print(i)
        features_stand.drop([train_column_name], axis=1, inplace=True)
    else:
        i+=1
print(features_stand.shape)

(1459, 296)


In [51]:
#removing cloumns
i=0
while i<len(features_stand.columns):
    train_column_name = train_X_stan.columns[i]
    if train_column_name not in features_stand.columns:   
        features_stand.insert(i, train_column_name, np.zeros(1459))
    else:
        i+=1
print(features_stand.shape)

(1459, 314)


In [52]:
kaggle_test_pred=regr.predict(features_stand)
df=pd.DataFrame(data=kaggle_test_pred,columns=["SalePrice"])
df.to_csv("submission.csv")

submission scored 0.19841. 6052/7479