In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from scipy import stats
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statistics import mean, median, mode, stdev
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import minimize
from sklearn.model_selection import KFold

from sklearn.neural_network import MLPRegressor

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Import dataset

In [2]:
train = pd.read_csv('train.csv')

Cleaning null + OHE

In [3]:
nullData = [['LotFrontage', 259], ['MasVnrArea', 8], ['Electrical', 1], ['GarageYrBlt', 81]]
n = len(train)
treshold = 0.1
drop = []

print('Drop feature - too many nulls:')
for i in nullData:
    if i[1]/n > treshold: # Arbitrary treshold: 10%
        print(i[0])
        train.drop(columns=[i[0]], inplace=True)
    else:
        drop.append(i[0])
        
print('Remove data point:')
print(drop)
train.dropna(subset=drop, inplace=True)

Drop feature - too many nulls:
LotFrontage
Remove data point:
['MasVnrArea', 'Electrical', 'GarageYrBlt']


In [4]:
#One-Hot encoding
categoricalcolumns = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
train0 = pd.get_dummies(train, columns= categoricalcolumns, prefix= categoricalcolumns)

In [5]:
y = train0['SalePrice']
X = train0.drop(['Id', 'SalePrice'], axis=1)

train_X0, test_X0, train_Y0, test_Y0 = train_test_split(X,y,test_size=0.2, random_state=0)

## Adam
note: Nadam, adamax, and ftrl give better scores than adam for bare set <br>
nadam was the best so i only kept that


In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X0.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X0, train_Y0)

In [None]:
y_pred = model.predict(train_X0)
print('Train mse loss\t:', mean_squared_error(train_Y0, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y0, np.squeeze(y_pred)))

y_pred = model.predict(test_X0)
print('Test mse loss\t:', mean_squared_error(test_Y0, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y0, np.squeeze(y_pred)))

## Nadam

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X0.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Nadam()
model.compile(opt, loss='mse')
model.fit(train_X0, train_Y0)

In [None]:
y_pred = model.predict(train_X0)
print('Train mse loss\t:', mean_squared_error(train_Y0, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y0, np.squeeze(y_pred)))

y_pred = model.predict(test_X0)
print('Test mse loss\t:', mean_squared_error(test_Y0, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y0, np.squeeze(y_pred)))

### Try fit to quadratic curve

In [None]:
# data features
d = 2
poly = PolynomialFeatures(d)
train_X0_2 = poly.fit_transform(train_X0)
test_X0_2 = poly.fit_transform(test_X0)

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X0_2.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X0_2, train_Y0)

y_pred = model.predict(train_X0_2)
print('Train mse loss\t:', mean_squared_error(train_Y0, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y0, np.squeeze(y_pred)))

y_pred = model.predict(test_X0_2)
print('Test mse loss\t:', mean_squared_error(test_Y0, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y0, np.squeeze(y_pred)))

## Try with pre-processing

### train_ONE

In [6]:
train1 = pd.read_csv('train_ONE.csv')

y = train1['SalePrice']
X = train1.drop(['Id', 'SalePrice'], axis=1)

train_X1, test_X1, train_Y1, test_Y1 = train_test_split(X,y,test_size=0.2, random_state=0)

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X1.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X1, train_Y1)

In [None]:
y_pred = model.predict(train_X1)
print('Train mse loss\t:', mean_squared_error(train_Y1, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y1, np.squeeze(y_pred)))

y_pred = model.predict(test_X1)
print('Test mse loss\t:', mean_squared_error(test_Y1, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y1, np.squeeze(y_pred)))

### train_TWO

In [7]:
train2 = pd.read_csv('train_TWO.csv')

y = train2['SalePrice']
X = train2.drop(['Id', 'SalePrice'], axis=1)

train_X2, test_X2, train_Y2, test_Y2 = train_test_split(X,y,test_size=0.2, random_state=0)

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X2.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X2, train_Y2)

In [None]:
y_pred = model.predict(train_X2)
print('Train mse loss\t:', mean_squared_error(train_Y2, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y2, np.squeeze(y_pred)))

y_pred = model.predict(test_X2)
print('Test mse loss\t:', mean_squared_error(test_Y2, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y2, np.squeeze(y_pred)))

### train_THREE

In [12]:
train3 = pd.read_csv('train_ONE.csv')

y = train3['SalePrice']
X = train3.drop(['Id', 'SalePrice'], axis=1)

train_X3, test_X3, train_Y3, test_Y3 = train_test_split(X,y,test_size=0.2, random_state=0)

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X3.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X3, train_Y3)

In [None]:
y_pred = model.predict(train_X3)
print('Train mse loss\t:', mean_squared_error(train_Y3, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y3, np.squeeze(y_pred)))

y_pred = model.predict(test_X3)
print('Test mse loss\t:', mean_squared_error(test_Y3, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y3, np.squeeze(y_pred)))

### train_FOUR

In [8]:
train4 = pd.read_csv('train_ONE.csv')

y = train4['SalePrice']
X = train4.drop(['Id', 'SalePrice'], axis=1)

train_X4, test_X4, train_Y4, test_Y4 = train_test_split(X,y,test_size=0.2, random_state=0)

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(train_X4.shape[1])))
model.add(tf.keras.layers.Dense(1, activation='relu'))
opt = keras.optimizers.Adam()
model.compile(opt, loss='mse')
model.fit(train_X4, train_Y4)

In [None]:
y_pred = model.predict(train_X4)
print('Train mse loss\t:', mean_squared_error(train_Y4, np.squeeze(y_pred)))
print('Train R2 score\t:', r2_score(train_Y4, np.squeeze(y_pred)))

y_pred = model.predict(test_X4)
print('Test mse loss\t:', mean_squared_error(test_Y4, np.squeeze(y_pred)))
print('Test R2 score\t:', r2_score(test_Y4, np.squeeze(y_pred)))

## sklearn.neural_network.MLPRegressor

### adam

In [9]:
# adam
# bare dataset

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='adam', max_iter=1000, early_stopping=True)
    mlp.fit(train_X0, train_Y0)
    pred_train = mlp.predict(train_X0)
    pred_test = mlp.predict(test_X0)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y0, pred_train), 'Test R2 =', r2_score(test_Y0, pred_test)) ######

Random state = 0 Train R2 = 0.7295460469776351 Test R2 = 0.45046427765185615
Random state = 1 Train R2 = 0.7277783756888094 Test R2 = 0.48091148271190154
Random state = 2 Train R2 = 0.6953451716997927 Test R2 = 0.5174298602736237
Random state = 3 Train R2 = 0.7262295211694022 Test R2 = 0.49965730564788113
Random state = 4 Train R2 = 0.7322077171278982 Test R2 = 0.500123428134436
Random state = 5 Train R2 = -0.33102801775673885 Test R2 = -0.38831696502427127




Random state = 6 Train R2 = 0.7569073334274237 Test R2 = 0.461798306866935
Random state = 7 Train R2 = 0.7242219904256835 Test R2 = 0.47975200950713603




Random state = 8 Train R2 = 0.7542678051177171 Test R2 = 0.4134154892439431
Random state = 9 Train R2 = 0.7320497456435788 Test R2 = 0.518487655953835


### lbfgs
so far the best is the second pre-processed dataset

In [10]:
# lbfgs
# bare dataset

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=5000, early_stopping=True)
    mlp.fit(train_X0, train_Y0)
    pred_train = mlp.predict(train_X0)
    pred_test = mlp.predict(test_X0)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y0, pred_train), 'Test R2 =', r2_score(test_Y0, pred_test)) ######

Random state = 0 Train R2 = 0.8587334554787459 Test R2 = 0.6761791463038842


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Random state = 1 Train R2 = 0.9074902723397903 Test R2 = 0.7867161118093207
Random state = 2 Train R2 = 0.7452373695200438 Test R2 = 0.465747221158349
Random state = 3 Train R2 = 0.7599232738030033 Test R2 = 0.40072143293153417


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Random state = 4 Train R2 = 0.9157438580960772 Test R2 = -0.037134754264871006
Random state = 5 Train R2 = 0.8266749149276514 Test R2 = 0.08139724388343428
Random state = 6 Train R2 = 0.8043323176134047 Test R2 = 0.4246620396388394
Random state = 7 Train R2 = 0.8636536530759139 Test R2 = 0.7628268094572761
Random state = 8 Train R2 = 0.8035541110881056 Test R2 = -0.08030611616825167
Random state = 9 Train R2 = 0.9254563334663202 Test R2 = 0.4194772365937188


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [17]:
# lbfgs
# first preprocessed dataset
# - Dealing with null data
# - Dealing with highly-skewed categorical data
# - One Hot Encoding
# - Feature selection

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=10000, early_stopping=True)
    mlp.fit(train_X1, train_Y1)
    pred_train = mlp.predict(train_X1)
    pred_test = mlp.predict(test_X1)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y1, pred_train), 'Test R2 =', r2_score(test_Y1, pred_test)) ######

Random state = 0 Train R2 = 0.9231909642657724 Test R2 = 0.24168169584472654
Random state = 1 Train R2 = 0.7531296813946199 Test R2 = 0.46346911484632347
Random state = 2 Train R2 = 0.765860900452468 Test R2 = 0.544041127449048
Random state = 3 Train R2 = 0.852735550469968 Test R2 = 0.44426874247877646
Random state = 4 Train R2 = 0.9109144990937464 Test R2 = -0.5261561078022301
Random state = 5 Train R2 = 0.8574242027617756 Test R2 = 0.7368702504039029
Random state = 6 Train R2 = 0.7514403698474917 Test R2 = 0.4939897973487072
Random state = 7 Train R2 = 0.7513309298821966 Test R2 = 0.3767765407757532
Random state = 8 Train R2 = 0.8251306757451707 Test R2 = 0.3729223069566453
Random state = 9 Train R2 = 0.9103509051081109 Test R2 = -0.019755386462180224


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [16]:
# lbfgs
# second preprocessed dataset
# - Dealing with null data
# - Dealing with highly-skewed categorical data
# - One Hot Encoding
# - Dealing with highly-skewed continuous data
# - Feature selection

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=10000, early_stopping=True)
    mlp.fit(train_X2, train_Y2)
    pred_train = mlp.predict(train_X2)
    pred_test = mlp.predict(test_X2)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y2, pred_train), 'Test R2 =', r2_score(test_Y2, pred_test)) ######

Random state = 0 Train R2 = 0.8643043985923857 Test R2 = 0.7942276515543296
Random state = 1 Train R2 = 0.8442834677298046 Test R2 = 0.7957329785168132
Random state = 2 Train R2 = 0.895542196130147 Test R2 = 0.8111112478490636
Random state = 3 Train R2 = 0.8587324888410194 Test R2 = 0.7871884884938685
Random state = 4 Train R2 = 0.8792210453477525 Test R2 = 0.8056393247710293
Random state = 5 Train R2 = 0.8513296713989102 Test R2 = 0.7831328133668556
Random state = 6 Train R2 = 0.8581849422719614 Test R2 = 0.7869016420908056
Random state = 7 Train R2 = 0.8896445895817151 Test R2 = 0.8109421773679459
Random state = 8 Train R2 = 0.8882416588369575 Test R2 = 0.8186683626838979
Random state = 9 Train R2 = 0.9101370608074594 Test R2 = 0.8363431902412719


In [13]:
# lbfgs
# third preprocessed dataset
# - Dealing with null data
# - Dealing with highly-skewed categorical data
# - One Hot Encoding
# - Dealing with highly-skewed continuous data
# - Normalisation
# - Feature selection

# After running, random state 5 seems to be the best

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=10000, early_stopping=True)
    mlp.fit(train_X3, train_Y3)
    pred_train = mlp.predict(train_X3)
    pred_test = mlp.predict(test_X3)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y3, pred_train), 'Test R2 =', r2_score(test_Y3, pred_test)) ######

Random state = 0 Train R2 = 0.9231909642657724 Test R2 = 0.24168169584472654
Random state = 1 Train R2 = 0.7531296813946199 Test R2 = 0.46346911484632347
Random state = 2 Train R2 = 0.765860900452468 Test R2 = 0.544041127449048
Random state = 3 Train R2 = 0.852735550469968 Test R2 = 0.44426874247877646
Random state = 4 Train R2 = 0.9109144990937464 Test R2 = -0.5261561078022301
Random state = 5 Train R2 = 0.8574242027617756 Test R2 = 0.7368702504039029
Random state = 6 Train R2 = 0.7514403698474917 Test R2 = 0.4939897973487072
Random state = 7 Train R2 = 0.7513309298821966 Test R2 = 0.3767765407757532
Random state = 8 Train R2 = 0.8251306757451707 Test R2 = 0.3729223069566453
Random state = 9 Train R2 = 0.9103509051081109 Test R2 = -0.019755386462180224


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [14]:
# lbfgs
# fourth preprocessed dataset
# - Dealing with null data
# - Dealing with highly-skewed categorical data
# - One Hot Encoding
# - Dealing with highly-skewed continuous data
# - Z-scoring
# - Feature selection

for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=10000, early_stopping=True)
    mlp.fit(train_X4, train_Y4)
    pred_train = mlp.predict(train_X4)
    pred_test = mlp.predict(test_X4)
    print('Random state =', i, 'Train R2 =', r2_score(train_Y4, pred_train), 'Test R2 =', r2_score(test_Y4, pred_test)) ######

Random state = 0 Train R2 = 0.9231909642657724 Test R2 = 0.24168169584472654
Random state = 1 Train R2 = 0.7531296813946199 Test R2 = 0.46346911484632347
Random state = 2 Train R2 = 0.765860900452468 Test R2 = 0.544041127449048
Random state = 3 Train R2 = 0.852735550469968 Test R2 = 0.44426874247877646
Random state = 4 Train R2 = 0.9109144990937464 Test R2 = -0.5261561078022301
Random state = 5 Train R2 = 0.8574242027617756 Test R2 = 0.7368702504039029
Random state = 6 Train R2 = 0.7514403698474917 Test R2 = 0.4939897973487072
Random state = 7 Train R2 = 0.7513309298821966 Test R2 = 0.3767765407757532
Random state = 8 Train R2 = 0.8251306757451707 Test R2 = 0.3729223069566453
Random state = 9 Train R2 = 0.9103509051081109 Test R2 = -0.019755386462180224


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Test data


In [83]:
import researchpy as rp
from scipy.stats import skew 


In [164]:
test = pd.read_csv('test.csv')

## Dealing with null
LotFrontage: drop column<br>
MasVnrArea : set na to 0<br>
Electrical: drop the whole electrical_NA column after OHE (since this is a categorical data)<br>
- it so happens that this test set in electrical has no NA so we can just ignore this yay<br>

GarageYrBlt: set to mean of the rest<br>

In [167]:
test.drop(columns='LotFrontage', inplace=True)

test['MasVnrArea'] = test['MasVnrArea'].fillna(0)

# mean = test['GarageYrBlt'].mean()
# test['GarageYrBlt'] = test['GarageYrBlt'].fillna(mean)

### One Hot Encoding

In [168]:
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

test = pd.get_dummies(test, columns= categorical, prefix= categorical)
test

Unnamed: 0,Id,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1,1462,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
2,1463,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
3,1464,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
4,1465,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,1936,1970,1970,0.0,0.0,0.0,546.0,546.0,546,...,0,0,0,1,0,0,0,0,1,0
1455,2916,1894,1970,1970,0.0,252.0,0.0,294.0,546.0,546,...,0,0,0,1,1,0,0,0,0,0
1456,2917,20000,1960,1996,0.0,1224.0,0.0,0.0,1224.0,1224,...,0,0,0,1,1,0,0,0,0,0
1457,2918,10441,1992,1992,0.0,337.0,0.0,575.0,912.0,970,...,0,0,0,1,0,0,0,0,1,0


In [169]:
col = list(train2.columns)
col.remove('SalePrice')
test1 = test[col]
test1

Unnamed: 0,GrLivArea,GarageCars,GarageArea,TotRmsAbvGrd,FullBath,TotalBsmtSF,1stFlrSF,YearBuilt,Foundation_PConc,YearRemodAdd,...,MSZoning_RM,MasVnrType_None,OverallQual_5,HeatingQC_TA,GarageType_Detchd,Foundation_CBlock,BsmtQual_TA,GarageFinish_Unf,KitchenQual_TA,Id
0,896,1.0,730.0,5,1,882.0,896,1961,0,1961,...,0,1,1,1,0,1,1,1,1,1461
1,1329,1.0,312.0,6,1,1329.0,1329,1958,0,1958,...,0,0,0,1,0,1,1,1,0,1462
2,1629,2.0,482.0,6,2,928.0,928,1997,1,1998,...,0,1,1,0,0,0,0,0,1,1463
3,1604,2.0,470.0,7,2,926.0,926,1998,1,1998,...,0,0,0,0,0,0,1,0,0,1464
4,1280,2.0,506.0,5,2,1280.0,1280,1992,1,1992,...,0,1,0,0,0,0,0,0,0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1092,0.0,0.0,5,1,546.0,546,1970,0,1970,...,1,1,0,0,0,1,1,0,1,2915
1455,1092,1.0,286.0,6,1,546.0,546,1970,0,1970,...,1,1,0,1,0,1,1,1,1,2916
1456,1224,2.0,576.0,7,1,1224.0,1224,1960,0,1996,...,0,1,1,0,1,1,1,1,1,2917
1457,970,0.0,0.0,6,1,912.0,970,1992,1,1992,...,0,1,1,1,0,0,0,0,1,2918


In [173]:
for index in test1:
    if test1[index].isna().sum() != 0 :
        mean = test1[index].mean()
        test1[index] = test1['GarageYrBlt'].fillna(mean)

In [174]:
Id = test1['Id']
test1 = test1.drop(columns=['Id'])

In [175]:
print(test1.shape)
print(train_X2.shape)

(1459, 73)
(550, 73)


In [200]:
pred = []
for i in range(10):
    mlp = MLPRegressor(random_state=i, solver='lbfgs', max_iter=10000, early_stopping=True)
    mlp.fit(train_X2, train_Y2)
    pred.append(mlp.predict(test1))

In [201]:
pred

[array([1164188.34303469, 1115113.96035348, 1284735.95708143, ...,
        1142476.98181597,  634904.37505995, 1297630.03115401]),
 array([608861.48681033, 628466.05926551, 689012.73993047, ...,
        616396.02201571, 327442.63183479, 725794.80644294]),
 array([621829.52757806, 584650.513996  , 692310.63539531, ...,
        250396.56619153, 278030.72907326, 593901.82947317]),
 array([615464.14436427, 663228.17732403, 607066.45579366, ...,
        685220.35858846, 355957.44569263, 601334.41103905]),
 array([637338.06427847, 729383.28435822, 766730.85597865, ...,
        816645.16766059, 357361.75242826, 859794.24607246]),
 array([ 890300.09535237,  854801.79074125,  977188.57989816, ...,
         845542.6821454 ,  483978.12680326, 1009606.61558934]),
 array([1105488.51108531, 1134160.40371465, 1227797.77390536, ...,
        1174087.74492319,  366065.18100683, 1221528.68101294]),
 array([651221.11977861, 681976.98252856, 748083.0327975 , ...,
        655437.12837074, 372646.89104149, 7

In [204]:
for i in pred:
    print(i[0])

1164188.3430346937
608861.4868103322
621829.5275780605
615464.1443642668
637338.0642784663
890300.0953523688
1105488.511085309
651221.1197786143
625768.1924085672
7910600.887689075


try 1 2 3 4 5 7 8

In [219]:
submit = pd.DataFrame(columns=['SalePrice'], data=pred[8])
submit.insert(0, 'Id', Id)
submit

Unnamed: 0,Id,SalePrice
0,1461,625768.192409
1,1462,708663.846076
2,1463,697967.612855
3,1464,725702.960970
4,1465,609262.856739
...,...,...
1454,2915,210529.029742
1455,2916,539502.644756
1456,2917,798940.042981
1457,2918,304696.628956


In [220]:
submit.to_csv('submit.csv', index=False)