# Importing libraries and Dataset

In [1]:
import pandas as pd
import statsmodels.api  as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#import external functions 
import Quality
import mapper 
import missing_data
import Encoder

In [2]:
#sklearn libraries
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import cross_val_predict

In [3]:
#Data set
test = pd.read_csv("test.csv", index_col="Id")
df = pd.read_csv("train.csv", index_col="Id")
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))
print(df.columns)
print("\n")
print(df._get_numeric_data().columns)

Length of Test = 79
Length of Train =  80
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQua

# Data Processing

In [4]:
#Handling the missing data
df = missing_data.deal_with_missing_values(df)
test = missing_data.deal_with_missing_values(test)
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))
##Checking the Quality of the data
quality = Quality.quality_check(df, "SalePrice")
print(quality)

## Mapping the category data
df, mapp = mapper.mapping(df)
test, mapp = mapper.mapping(test)
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))
print(mapp)

Length of Test = 79
Length of Train =  80
               SalePrice       skew        kurt
SalePrice       1.000000   1.882876    6.536282
OverallQual     0.790982   0.216944    0.096293
GrLivArea       0.708624   1.366560    4.895121
GarageCars      0.640409  -0.342549    0.220998
GarageArea      0.623431   0.179981    0.917067
TotalBsmtSF     0.613581   1.524255   13.250483
1stFlrSF        0.605852   1.376757    5.745841
FullBath        0.560664   0.036562   -0.857043
TotRmsAbvGrd    0.533723   0.676341    0.880762
YearBuilt       0.522897  -0.613461   -0.439552
YearRemodAdd    0.507101  -0.503562   -1.272245
MasVnrArea      0.472614   2.677616   10.141416
Fireplaces      0.466929   0.649565   -0.217237
BsmtFinSF1      0.386420   1.685503   11.118236
LotFrontage     0.334901   2.384950   21.848165
WoodDeckSF      0.324413   1.541376    2.992951
2ndFlrSF        0.319334   0.813030   -0.553464
OpenPorchSF     0.315856   2.364342    8.490336
HalfBath        0.284108   0.675897   -1.07692

In [5]:
##ONE Hot encoding using dummy
df, dummies = Encoder.one_hot_encode(df)
test, dummies_test = Encoder.one_hot_encode(test)
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))
l = missing_data.find_col_with_missing_data(df)
print(l)

Total Columns: 250
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 250 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(3), int64(54), uint8(193)
memory usage: 936.7 KB
None
Total Columns: 235
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Columns: 235 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(11), int64(45), uint8(179)
memory usage: 904.8 KB
None
Length of Test = 235
Length of Train =  250
[]


#### Note that the size of test and df is different.  

In [6]:
dummies = dummies.values
dummies_test = dummies_test.values
M = list(set(dummies) - set(dummies_test))
for i in M:
    test[i] = 0
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))

Length of Test = 250
Length of Train =  250


# CONVERTING YEAR DATA INTO AGE

In [7]:
def age_Conversion(X):
    return X.apply(lambda x: 0 if x == 0 else (2020-x) )
df["GarageYrBlt"] = age_Conversion(df["GarageYrBlt"])
df["YearRemodAdd"] = age_Conversion(df["YearRemodAdd"])
df["YearBuilt"] = age_Conversion(df["YearBuilt"])
df["YrSold"] = age_Conversion(df["YrSold"])

In [8]:
test["GarageYrBlt"] = age_Conversion(test["GarageYrBlt"])
test["YearRemodAdd"] = age_Conversion(test["YearRemodAdd"])
test["YearBuilt"] = age_Conversion(test["YearBuilt"])
test["YrSold"] = age_Conversion(test["YrSold"])

In [9]:
def add_fet(X):
    X["Remod"] = 2
    X.loc[(X.YearBuilt==X.YearRemodAdd), ['Remod']] = 0
    X.loc[(X.YearBuilt!=X.YearRemodAdd), ['Remod']] = 1
    X.Remod
    X["Age"] = X.YearRemodAdd - X.YrSold # sice I convert both to age
    X["IsNew"] = 2
    X.loc[(X.YearBuilt==X.YrSold), ['IsNew']] = 1
    X.loc[(X.YearBuilt!=X.YrSold), ['IsNew']] = 0
    return X
df = add_fet(df)
test = add_fet(test)

# FEATURE ENGINEERING

In [10]:
def fet_Engineering(X):
    X["Garage_Area_Car"] = X["GarageCars"] * X["GarageArea"]

    X['TotalBsmtSF_x_Bsm'] = X.TotalBsmtSF * X['1stFlrSF']

    TotalArea = ["GrLivArea", "TotalBsmtSF", "WoodDeckSF", "MasVnrArea", "GarageArea", "OpenPorchSF", "3SsnPorch", "ScreenPorch", "EnclosedPorch", "PoolArea" ]
    X["TotalArea"] = X.GrLivArea + X.TotalBsmtSF + X.WoodDeckSF + X.MasVnrArea + X.GarageArea + X.OpenPorchSF + X["3SsnPorch"] + X.ScreenPorch + X.EnclosedPorch + X.PoolArea
    #df.drop(TotalArea, axis=1, inplace=True)

    X['LotAreaMultSlope'] = X.LotArea * X.LandSlope
    #df.drop([ "MiscVal", "GarageCars", "1stFlrSF", "LotArea", "LandSlope", "Utilities"], axis=1, inplace=True)
    
    return X

df = fet_Engineering(df)
test = fet_Engineering(test)

# Handling Skewd data

In [11]:
quality2 = Quality.quality_check(df, "SalePrice")

mapp.remove("Utilities")

#returns a list of cols where |skewd| > 1
#return a list of cols where |kurt| > 3
def get_skewd_kurts_cols(X):
    L1 = X[(X['skew'] > 1) | (X['skew'] < -1) ].index.tolist()
    L2 = X[(X['kurt'] > 3) | (X['kurt'] < -3) ].index.tolist()
    return L1,L2
skewd, kurts = get_skewd_kurts_cols(quality2)


## Handling the skewd data 
import numpy as np
def handle_skewd_data(L, X):
    for col in L:
        try: 
            X[col] = np.log1p(X[col])
        except:
            pass
            
    return X

temp_df = df[mapp]
temp_test = test[mapp]
df = handle_skewd_data(skewd, df)
test = handle_skewd_data(skewd, test)##Uncomment if you want to handle the skewd data
df[mapp] = temp_df
test[mapp] = temp_test

quality3 = Quality.quality_check(df, "SalePrice")
print(quality3)

                   SalePrice      skew      kurt
SalePrice           1.000000  0.121347  0.809519
TotalArea           0.838092 -0.341445  1.978518
OverallQual         0.817185  0.216944  0.096293
GrLivArea           0.730254 -0.006140  0.281988
GarageCars          0.680625 -0.342549  0.220998
...                      ...       ...       ...
GarageType_Detchd  -0.388638  1.065652 -0.865574
MasVnrType_None    -0.395389 -0.373853 -1.862787
YearRemodAdd       -0.565608  0.503562 -1.272245
Age                -0.568136  0.502489 -1.266028
YearBuilt          -0.586570  0.613461 -0.439552

[257 rows x 3 columns]


# Split the data

In [12]:
y = df[['SalePrice']]
df.drop(['SalePrice'], axis=1, inplace=True)
X = df

X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.15, random_state=0)

cols = X.columns
num_cols = X._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))

In [13]:
type(X_train)
X_train._get_numeric_data().columns

Index(['LotFrontage', 'LotArea', 'Street', 'LotShape', 'Utilities',
       'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       ...
       'SaleCondition_Family', 'SaleCondition_Normal', 'SaleCondition_Partial',
       'Remod', 'Age', 'IsNew', 'Garage_Area_Car', 'TotalBsmtSF_x_Bsm',
       'TotalArea', 'LotAreaMultSlope'],
      dtype='object', length=256)

In [14]:
def Backward_Elimination(y, X, sl):
    cols = X.columns.values
    ini = len(cols)
    col_vars = X.shape[1]
    for i in range (0, col_vars):
        Regressor = sm.OLS(y, X).fit()
        maxVar = max(Regressor.pvalues)
        if maxVar > sl:
            for j in range(0, col_vars-i):
                if (Regressor.pvalues[j].astype(float) == maxVar):
                    cols = np.delete(cols, j)
                    X = X.loc[:, cols]
        
    print('\nSelect {:d} features from {:d} by best p-values.'.format(len(cols), ini))
    print(Regressor.summary())
    return cols

In [15]:
colum = Backward_Elimination(y_train, X_train, 0.051)


Select 133 features from 256 by best p-values.
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.944
Model:                            OLS   Adj. R-squared:                  0.938
Method:                 Least Squares   F-statistic:                     157.3
Date:                Tue, 23 Feb 2021   Prob (F-statistic):               0.00
Time:                        20:49:46   Log-Likelihood:                 1152.5
No. Observations:                1241   AIC:                            -2063.
Df Residuals:                    1120   BIC:                            -1443.
Df Model:                         120                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [16]:
len(colum)

133

# Modeling

In [17]:
model = GradientBoostingRegressor(n_estimators=1000, random_state=0)
model.fit(X_train[colum], y_train)
y_pred_test=model.predict(X_test[colum])

  y = column_or_1d(y, warn=True)


In [18]:
y_inv_test = np.expm1(y_test)
y_inv_pred = np.expm1(y_pred_test)

mean_absolute_error(y_inv_pred, y_inv_test)

15828.686650696656

## Neural Network model using Pytorch

In [19]:
##Import libraries for neural network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### Prepare data for pytorch

In [20]:
X_train =X_train[colum].values
y_train = y_train.values
X_test =X_test[colum].values
y_test = y_test.values

##########
test = test[colum].values
test = torch.Tensor(test)

In [21]:
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train)
training_set = TensorDataset(X_train, y_train)

X_test = torch.Tensor(X_test)
y_test = torch.Tensor(y_test)
testing_set = TensorDataset(X_test, y_test)

In [22]:
print("{}, {}, {}, {}".format(X_train.dtype, X_test.dtype, y_train.dtype, y_test.dtype))

torch.float32, torch.float32, torch.float32, torch.float32


In [23]:
trainset = DataLoader(training_set, batch_size=10, shuffle=True)
testset = DataLoader(testing_set, batch_size=10)
testLoader = DataLoader(test, batch_size=10)

In [24]:
X_train.shape

torch.Size([1241, 133])

In [51]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(133, 64) # input, output for a layer
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 16) 
        self.fc5 = nn.Linear(16, 1)# 1 Target Variable
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        
        return x

net = Net()
print(net)

Net(
  (fc1): Linear(in_features=133, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=16, bias=True)
  (fc5): Linear(in_features=16, out_features=1, bias=True)
)


In [52]:
for data in testset:
    X, y = data
    net.zero_grad()
    output = net(X)
    print(output)
    break

tensor([[0.2786],
        [0.8461],
        [0.3112],
        [0.7304],
        [0.6371],
        [0.0000],
        [0.0000],
        [0.3937],
        [1.9989],
        [0.1992]], grad_fn=<ReluBackward0>)


In [55]:
import time
optimizer = optim.Adam(net.parameters(), lr=0.001)
EPOCHS = 100
start = time.time()
for epoch in range(EPOCHS):
    for data in trainset:
        X,y = data
        net.zero_grad()
        output = net(X)
        loss = F.mse_loss(output, y)
        loss.backward()
        optimizer.step()
    print(loss)
end = time.time()
print(end - start)

tensor(0.0012, grad_fn=<MseLossBackward>)
tensor(0.0129, grad_fn=<MseLossBackward>)
tensor(0.1412, grad_fn=<MseLossBackward>)
tensor(0.0020, grad_fn=<MseLossBackward>)
tensor(0.0207, grad_fn=<MseLossBackward>)
tensor(0.0027, grad_fn=<MseLossBackward>)
tensor(0.0145, grad_fn=<MseLossBackward>)
tensor(0.0055, grad_fn=<MseLossBackward>)
tensor(0.1250, grad_fn=<MseLossBackward>)
tensor(0.0061, grad_fn=<MseLossBackward>)
tensor(0.0006, grad_fn=<MseLossBackward>)
tensor(6.7387e-06, grad_fn=<MseLossBackward>)
tensor(0.0005, grad_fn=<MseLossBackward>)
tensor(0.0172, grad_fn=<MseLossBackward>)
tensor(0.0010, grad_fn=<MseLossBackward>)
tensor(0.0024, grad_fn=<MseLossBackward>)
tensor(0.0158, grad_fn=<MseLossBackward>)
tensor(0.0198, grad_fn=<MseLossBackward>)
tensor(0.0015, grad_fn=<MseLossBackward>)
tensor(0.0215, grad_fn=<MseLossBackward>)
tensor(0.0255, grad_fn=<MseLossBackward>)
tensor(0.0133, grad_fn=<MseLossBackward>)
tensor(0.0005, grad_fn=<MseLossBackward>)
tensor(0.0236, grad_fn=<MseLos

In [56]:
correct = 0
total = 0
with torch.no_grad():
    for data in testset:
        X,y = data
        output = net(X)
        for idx, i in enumerate(output):
            print(i, y[idx])            

tensor([12.2818]) tensor([12.2092])
tensor([11.9567]) tensor([11.7981])
tensor([11.7835]) tensor([11.6082])
tensor([12.5043]) tensor([12.1653])
tensor([11.5114]) tensor([11.3851])
tensor([11.2753]) tensor([11.3504])
tensor([12.5107]) tensor([12.5529])
tensor([11.9353]) tensor([11.8565])
tensor([14.1181]) tensor([13.5211])
tensor([12.0717]) tensor([11.9104])
tensor([12.3079]) tensor([12.2496])
tensor([12.0259]) tensor([11.8271])
tensor([12.5798]) tensor([12.3239])
tensor([11.7174]) tensor([11.7199])
tensor([11.8513]) tensor([11.6886])
tensor([12.0229]) tensor([11.8845])
tensor([12.4655]) tensor([12.1548])
tensor([11.6343]) tensor([11.7248])
tensor([11.9627]) tensor([11.9141])
tensor([12.0131]) tensor([11.9512])
tensor([11.8219]) tensor([12.0197])
tensor([12.0005]) tensor([11.8810])
tensor([11.2970]) tensor([11.6082])
tensor([12.0843]) tensor([12.0668])
tensor([12.4201]) tensor([12.1281])
tensor([12.3520]) tensor([12.0317])
tensor([12.2570]) tensor([12.0867])
tensor([11.3373]) tensor([11

In [57]:
test_pred = torch.LongTensor()
for data in testLoader:
    X = data
    output = net(X)
    test_pred = torch.cat((test_pred, output), dim=0)

In [58]:
test_pred.shape

torch.Size([1459, 1])

In [59]:
submission = test_pred.detach().numpy()

In [60]:
submission = np.expm1(submission)

In [61]:
submission_file = pd.read_csv("sample_submission.csv")
submission_file.iloc[:, 1] = np.floor(submission)
submission_file.to_csv("House_price_submission_nn.csv", index=False)