# IMPORTING RELEVANT LIBRARIES

In [20]:
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt
import sklearn 
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('Data/train.csv')
df = pd.DataFrame(data)

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# using the info from the EDA done to
# the data set we will keep only the relevant features for training the machine learning algorithms 
# Hightly corelated features according to the EDA done
# OverallQual, YearBuilt, YearRemodAdd, MasVnrArea, TotalBsmtSF, GrLivArea, FullBath, KitchenAbvGr, TotRmsAbvGrd
# ,FirePlaces, GerageYrBlt, GerageCars, GerageArea

selected_cols = [  'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea', 'FullBath', 'KitchenAbvGr', 'TotRmsAbvGrd'
,'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'SalePrice' ]

df = df.drop(columns=[col for col in df.columns if col not in selected_cols])
df.head()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,FullBath,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,SalePrice
0,7,2003,2003,196.0,856,1710,2,1,8,0,2003.0,2,548,208500
1,6,1976,1976,0.0,1262,1262,2,1,6,1,1976.0,2,460,181500
2,7,2001,2002,162.0,920,1786,2,1,6,1,2001.0,2,608,223500
3,7,1915,1970,0.0,756,1717,1,1,7,1,1998.0,3,642,140000
4,8,2000,2000,350.0,1145,2198,2,1,9,1,2000.0,3,836,250000


In [5]:
# replacing any NaN values with the mean of the data
df = df.fillna(data.mean())

  df = df.fillna(data.mean())


In [6]:
# spliting the exsisting dataframe into X and y ie features and output dataframes

X = df.drop(columns = ['SalePrice'])
Y = df['SalePrice']

In [7]:
print('X shape :', X.shape)
print('Y shape :', Y.shape)

X shape : (1460, 13)
Y shape : (1460,)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle= True, random_state = 42)

In [9]:
print(x_train.head())
print(y_train.head())

      OverallQual  YearBuilt  YearRemodAdd  MasVnrArea  TotalBsmtSF  \
254             5       1957          1957         0.0         1314   
1066            6       1993          1994         0.0          799   
638             5       1910          1950         0.0          796   
799             5       1937          1950       252.0          731   
380             5       1924          1950         0.0         1026   

      GrLivArea  FullBath  KitchenAbvGr  TotRmsAbvGrd  Fireplaces  \
254        1314         1             1             5           0   
1066       1571         2             1             7           1   
638         796         1             1             4           0   
799        1768         1             1             7           2   
380        1691         2             1             6           1   

      GarageYrBlt  GarageCars  GarageArea  
254   1957.000000           1         294  
1066  1993.000000           2         380  
638   1978.506164         

In [10]:
print('X_train shape :', x_train.shape[1])
print('Y_train shape :', y_train.shape)

X_train shape : 13
Y_train shape : (1168,)


# NN based model

In [11]:
if x_train.isnull().values.any() or x_test.isnull().values.any() or y_train.isnull().values.any():
    raise ValueError("Input data contains NaN values.")

In [12]:
# processing the data using standard scalar from sklearn
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

In [13]:
print(X_test)

[[-0.08893368 -0.25978931  0.87347031 ... -0.65139094 -1.05654384
  -1.00601401]
 [ 1.37408806  0.75122223  0.48746451 ...  0.6386082   0.29509165
   1.11715856]
 [-0.82044456 -1.43386723 -1.6838181  ... -2.14945444  0.29509165
  -0.55104846]
 ...
 [ 0.64257719  1.10996826  0.96997176 ...  1.09634982  0.29509165
  -0.31408723]
 [-1.55195543 -1.00989465 -1.6838181  ... -1.60848707 -1.05654384
  -0.85435882]
 [-1.55195543 -0.03149638 -0.71880361 ...  0.6386082   0.29509165
  -0.06290834]]


In [14]:
NN_Model = Sequential()

# The first layer with the input layer
NN_Model.add(Dense(128, activation = 'relu', input_shape = [X_train.shape[1]]))
# Hidden layer 1
NN_Model.add(Dense(64, activation = 'relu'))
NN_Model.add(Dropout(0.2))
# Hidden layer 2
NN_Model.add(Dense(32, activation = 'relu'))
NN_Model.add(Dropout(0.2))
# Hidden layer 3
NN_Model.add(Dense(16, activation = 'relu'))
NN_Model.add(Dropout(0.2))
# Output layer
NN_Model.add(Dense(1))


In [15]:
NN_Model.compile(loss=tf.keras.losses.mae, optimizer='adam', metrics = ['mae'])

In [16]:
NN_Model.fit(X_train, y_train, batch_size = 32, epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2bb8ffc0b80>

In [17]:
y_pred = NN_Model.predict(X_test[:20])

In [18]:
mae = mean_absolute_error(y_test[:20], y_pred)
mse = mean_squared_error(y_test[:20], y_pred)
print("Mean Absolute error :", mae)
print("Mean sq error :", mse)

Mean Absolute error : 18165.890625
Mean sq error : 746173104.3732421


In [19]:
print(f"True price: {y_test[:20]}\nPredicted price: {tf.squeeze(y_pred)}")


True price: 892     154500
1105    325000
413     115000
522     159000
1036    315500
614      75500
218     311500
1160    146000
649      84500
887     135500
576     145000
1252    130000
1061     81000
567     214000
1108    181000
1113    134500
168     183500
1102    135000
1120    118400
67      226000
Name: SalePrice, dtype: int64
Predicted price: [134088.03  317373.7   123162.82  162463.3   291338.38   89404.04
 220211.17  148513.48   88235.79  117410.79  129255.86  109617.984
 136930.9   210461.02  183117.5   119985.78  198364.92  122984.97
 128763.58  205510.86 ]


# Linear Regression Model

In [25]:
LR_model = LinearRegression()

LR_model.fit(x_train, y_train)

In [26]:
y_pred_LR = LR_model.predict(x_test)

In [27]:
mse_LR = mean_squared_error(y_test, y_pred_LR)
print("mean sq error : ", mse_LR)

mean sq error :  1468063831.2396834
