# Importing libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api  as sm # For p value calculations

#import external functions 
import Quality # To Check the quality of the data
import mapper # For Label encoding
import missing_data # To Handle Missing data
import Encoder # For OneHotEncoding

In [2]:
#sklearn libraries
from sklearn.model_selection import train_test_split

In [3]:
# Read both CSV files, using the "Id" column as the row index.
test = pd.read_csv("test.csv", index_col="Id")
df = pd.read_csv("train.csv", index_col="Id")

# Report how many columns each frame has (train carries one extra: the target).
for label, frame in (("Length of Test = {}", test), ("Length of Train =  {}", df)):
    print(label.format(frame.shape[1]))

# List every training column, then only the ones pandas treats as numeric.
print(df.columns)
print("\n")
print(df._get_numeric_data().columns)

Length of Test = 79
Length of Train =  80
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQua

# Data Processing

In [4]:
# Handle missing data in both frames with the project helper.
# (What the helper imputes or drops is defined in missing_data.py, not here.)
df = missing_data.deal_with_missing_values(df)
test = missing_data.deal_with_missing_values(test)

# Check the quality of the training data against the target column; the printed
# table below shows correlation with SalePrice plus skew and kurtosis per feature.
quality = Quality.quality_check(df, "SalePrice")
print(quality)

# Label encoding via the project helper; it returns the encoded frame and `mapp`.
# NOTE(review): `mapp` from the train call is overwritten by the test call, so
# the value printed and used later comes from the test frame only. Presumably
# both calls return the same column list - confirm against mapper.py.
df, mapp = mapper.mapping(df)
test, mapp = mapper.mapping(test)
print(mapp)

               SalePrice       skew        kurt
SalePrice       1.000000   1.882876    6.536282
OverallQual     0.790982   0.216944    0.096293
GrLivArea       0.708624   1.366560    4.895121
GarageCars      0.640409  -0.342549    0.220998
GarageArea      0.623431   0.179981    0.917067
TotalBsmtSF     0.613581   1.524255   13.250483
1stFlrSF        0.605852   1.376757    5.745841
FullBath        0.560664   0.036562   -0.857043
TotRmsAbvGrd    0.533723   0.676341    0.880762
YearBuilt       0.522897  -0.613461   -0.439552
YearRemodAdd    0.507101  -0.503562   -1.272245
MasVnrArea      0.472614   2.677616   10.141416
Fireplaces      0.466929   0.649565   -0.217237
BsmtFinSF1      0.386420   1.685503   11.118236
LotFrontage     0.334901   2.384950   21.848165
WoodDeckSF      0.324413   1.541376    2.992951
2ndFlrSF        0.319334   0.813030   -0.553464
OpenPorchSF     0.315856   2.364342    8.490336
HalfBath        0.284108   0.675897   -1.076927
LotArea         0.263843  12.207688  203

In [5]:
# One-hot encode both frames with the project helper. Each call returns the
# encoded frame plus a second value (`dummies` / `dummies_test`) that the next
# cell treats as the collection of generated dummy-column names.
# The "Total Columns" / info() text in the output appears to be printed by the
# helper itself - see Encoder.py.
df, dummies = Encoder.one_hot_encode(df)
test, dummies_test = Encoder.one_hot_encode(test)

# Sanity check: list any training columns that still contain missing data.
miss = missing_data.find_col_with_missing_data(df)
print(miss) # just for assurance

Total Columns: 250
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 250 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(3), int64(54), uint8(193)
memory usage: 936.7 KB
None
Total Columns: 235
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Columns: 235 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(11), int64(45), uint8(179)
memory usage: 904.8 KB
None
[]


#### Note that test and df have different numbers of columns. This is because some categorical features in the test set do not contain every category seen in the training set, so one-hot encoding produces fewer dummy columns for it. Here I create those missing columns in test and fill them with 0.

In [6]:
# Dummy-column names produced for train and test. np.asarray (instead of
# `.values`) keeps this cell re-runnable: once the names are plain arrays a
# second `.values` access would raise AttributeError.
dummies = np.asarray(dummies)
dummies_test = np.asarray(dummies_test)

# Dummy columns that exist for train but not for test, kept in train order so
# the columns are appended to `test` deterministically (iterating a set
# difference yields a hash-dependent order that can change between kernels).
test_dummy_names = set(dummies_test)
M = [name for name in dummies if name not in test_dummy_names]
for i in M:
    test[i] = 0  # category never occurs in test, so its indicator is all zeros

# NOTE(review): dummy columns present only in `test` are not handled here.
print("Length of Test = {}".format(len(test.columns)))
print("Length of Train =  {}".format(len(df.columns)))

Length of Test = 250
Length of Train =  250


# CONVERTING YEAR DATA INTO AGE

In [7]:
def age_Conversion(X, ref_year=2020):
    """Convert a Series of calendar years into ages relative to ``ref_year``.

    Parameters
    ----------
    X : pandas.Series
        Year values. A value of 0 is treated as a placeholder and is kept as 0
        instead of being converted.
    ref_year : int, default 2020
        Year the ages are measured from (``age = ref_year - year``).

    Returns
    -------
    pandas.Series
        Same index as ``X``; 0 where ``X`` is 0, otherwise ``ref_year - X``.
        Missing values stay missing.
    """
    # Vectorised form of `0 if x == 0 else ref_year - x`: keep X where it is 0,
    # otherwise substitute the computed age.
    return X.where(X == 0, ref_year - X)
# Replace each year column of the training frame with its age equivalent.
for col in ("GarageYrBlt", "YearRemodAdd", "YearBuilt", "YrSold"):
    df[col] = age_Conversion(df[col])

In [8]:
# Same year-to-age conversion for the test frame.
for col in ("GarageYrBlt", "YearRemodAdd", "YearBuilt", "YrSold"):
    test[col] = age_Conversion(test[col])

# FEATURE ENGINEERING

In [9]:
def add_fet(X):
    """Add the ``Remod``, ``Age`` and ``IsNew`` columns to ``X`` in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``YearBuilt``, ``YearRemodAdd`` and ``YrSold``. In this
        notebook those columns have already been converted from years to ages.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with three new columns:

        * ``Remod`` - 1 where ``YearBuilt`` differs from ``YearRemodAdd``, else 0.
        * ``Age``   - ``YearRemodAdd - YrSold``.
        * ``IsNew`` - 1 where ``YearBuilt`` equals ``YrSold``, else 0.
    """
    built = X["YearBuilt"]
    remodelled = X["YearRemodAdd"]
    sold = X["YrSold"]

    # Boolean masks cast straight to 0/1; no sentinel value needs overwriting.
    X["Remod"] = (built != remodelled).astype("int64")
    X["Age"] = remodelled - sold  # since both were converted to ages beforehand
    X["IsNew"] = (built == sold).astype("int64")
    return X
# Derive Remod / Age / IsNew for the training frame and the test frame.
df, test = add_fet(df), add_fet(test)

In [10]:
def fet_Engineering(X):
    """Add four interaction / aggregate features to ``X`` in place.

    New columns:

    * ``Garage_Area_Car``   - ``GarageCars * GarageArea``
    * ``TotalBsmtSF_x_Bsm`` - ``TotalBsmtSF * 1stFlrSF``
    * ``TotalArea``         - sum of the ten area columns listed below
    * ``LotAreaMultSlope``  - ``LotArea * LandSlope``

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding every source column named above; a missing column
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the four columns added.
    """
    # Columns are always accessed by name: attribute access cannot express
    # names such as "1stFlrSF" or "3SsnPorch".
    area_parts = [
        "GrLivArea", "TotalBsmtSF", "WoodDeckSF", "MasVnrArea", "GarageArea",
        "OpenPorchSF", "3SsnPorch", "ScreenPorch", "EnclosedPorch", "PoolArea",
    ]

    X["Garage_Area_Car"] = X["GarageCars"] * X["GarageArea"]
    X["TotalBsmtSF_x_Bsm"] = X["TotalBsmtSF"] * X["1stFlrSF"]

    # Left-to-right addition, so a missing value in any part yields a missing
    # total (unlike DataFrame.sum, which skips NaN by default).
    total_area = X[area_parts[0]]
    for part in area_parts[1:]:
        total_area = total_area + X[part]
    X["TotalArea"] = total_area

    X["LotAreaMultSlope"] = X["LotArea"] * X["LandSlope"]
    return X

# Add the engineered features to the training frame and the test frame.
df, test = fet_Engineering(df), fet_Engineering(test)

# Handling Skewed data

What is skewed data?
Data is called skewed when the curve of its statistical distribution is distorted to the left or to the right. In a normal distribution the graph is symmetric, meaning that there are about as many data values on the left side of the median as on the right side.

How does it affect the model?
In skewed data, the tail region may act as an outlier for the statistical model, and outliers adversely affect a model’s performance, especially for regression-based models.

In [11]:
# Recompute the quality table now that the engineered features exist.
quality2 = Quality.quality_check(df, "SalePrice")

# "Utilities" is considered useless, so drop it from the label-encoded column
# list. The membership guard keeps this cell re-runnable: list.remove raises
# ValueError when the name has already been removed.
if "Utilities" in mapp:
    mapp.remove("Utilities")

def get_skewd_kurts_cols(X, skew_limit=1, kurt_limit=3):
    """List the features whose skew or kurtosis exceeds a limit in magnitude.

    Parameters
    ----------
    X : pandas.DataFrame
        Table indexed by feature name with ``'skew'`` and ``'kurt'`` columns.
    skew_limit : float, default 1
        A feature is reported as skewed when ``|skew| > skew_limit``.
    kurt_limit : float, default 3
        A feature is reported as heavy-tailed when ``|kurt| > kurt_limit``.

    Returns
    -------
    tuple of (list, list)
        ``(skewed_features, high_kurtosis_features)``, each in the row order
        of ``X``. Rows with a missing statistic are never reported.
    """
    skewed = X.index[X['skew'].abs() > skew_limit].tolist()
    heavy_tailed = X.index[X['kurt'].abs() > kurt_limit].tolist()
    return skewed, heavy_tailed
# Keep only the skewed-feature list; the high-kurtosis list is discarded.
skewd, _ = get_skewd_kurts_cols(quality2)


def handle_skewd_data(L, X):
    """Apply ``log1p`` in place to every listed column that can take it.

    Parameters
    ----------
    L : iterable of str
        Names of the columns to transform.
    X : pandas.DataFrame
        Frame to transform; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Best-effort by design. A listed column is skipped when it is absent from
    ``X`` (this notebook passes one list to both train and test, and test has
    no target column) or when its dtype is not numeric. These explicit checks
    replace a bare ``except: pass`` that also hid unrelated errors.
    Values below -1 become NaN under ``log1p``.
    """
    for col in L:
        if col not in X.columns:
            continue  # column does not exist in this frame
        if not pd.api.types.is_numeric_dtype(X[col]):
            continue  # log1p is undefined for non-numeric data
        X[col] = np.log1p(X[col])

    return X

# Snapshot the `mapp` columns before transforming, so they can be restored
# afterwards: any of them that also appear in `skewd` would otherwise be
# log-transformed along with the rest.
temp_df = df[mapp]
temp_test = test[mapp]
# log1p every column listed in `skewd` (which may include the target column).
df = handle_skewd_data(skewd, df)
test = handle_skewd_data(skewd, test)  # comment out these two calls to skip skew handling
# Write the untouched snapshot back over the `mapp` columns.
df[mapp] = temp_df
test[mapp] = temp_test

# Re-run the quality check to see the effect on skew and kurtosis.
quality3 = Quality.quality_check(df, "SalePrice")
print(quality3)

                   SalePrice      skew      kurt
SalePrice           1.000000  0.121347  0.809519
TotalArea           0.838092 -0.341445  1.978518
OverallQual         0.817185  0.216944  0.096293
GrLivArea           0.730254 -0.006140  0.281988
GarageCars          0.680625 -0.342549  0.220998
...                      ...       ...       ...
GarageType_Detchd  -0.388638  1.065652 -0.865574
MasVnrType_None    -0.395389 -0.373853 -1.862787
YearRemodAdd       -0.565608  0.503562 -1.272245
Age                -0.568136  0.502489 -1.266028
YearBuilt          -0.586570  0.613461 -0.439552

[257 rows x 3 columns]


# Split the data

In [12]:
# Target as a single-column frame; whatever remains in df is a predictor.
y = df[['SalePrice']]
df.drop(columns=['SalePrice'], inplace=True)
X = df

# Hold out 15% of the rows for validation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

# Column bookkeeping: all columns, the numeric ones, and the non-numeric remainder.
cols = X.columns
num_cols = X._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))

In [13]:
# Print the numeric training columns, then display the training split's shape.
# (A bare `type(X_train)` was dropped: it was not the cell's last expression,
# so its value was computed and never shown.)
print(X_train._get_numeric_data().columns)
X_train.shape

Index(['LotFrontage', 'LotArea', 'Street', 'LotShape', 'Utilities',
       'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       ...
       'SaleCondition_Family', 'SaleCondition_Normal', 'SaleCondition_Partial',
       'Remod', 'Age', 'IsNew', 'Garage_Area_Car', 'TotalBsmtSF_x_Bsm',
       'TotalArea', 'LotAreaMultSlope'],
      dtype='object', length=256)


(1241, 256)

### Feature Elimination using P Value

What is p value?
The p-value is used as an alternative to rejection points to provide the smallest level of significance at which the null hypothesis would be rejected. A smaller p-value means that there is stronger evidence in favor of the alternative hypothesis.

In [14]:
def Backward_Elimination(y, X, sl):
    """Select features by backward elimination on OLS p-values.

    Fits an ordinary-least-squares model, removes the feature with the largest
    p-value if that p-value exceeds ``sl``, and repeats until every remaining
    feature has a p-value of at most ``sl``.

    Parameters
    ----------
    y : array-like
        Regression target.
    X : pandas.DataFrame
        Candidate features. No intercept column is added here.
    sl : float
        Significance level; a feature whose p-value is above it gets removed.

    Returns
    -------
    numpy.ndarray
        Names of the retained columns, in their original order.

    Notes
    -----
    Prints how many features were kept and the summary of the final model.
    The summary always describes a model fitted on exactly the returned
    columns.
    """
    cols = X.columns.values
    ini = len(cols)
    Regressor = None

    while len(cols) > 0:
        Regressor = sm.OLS(y, X).fit()
        pvalues = Regressor.pvalues

        # Series.max skips NaN; an all-NaN result compares False and stops too.
        worst_p = pvalues.max()
        if not worst_p > sl:
            break  # every remaining feature is significant: done

        # Remove the offender by label rather than by position, so the mask
        # cannot drift out of step with `cols`. Ties are resolved by refitting.
        worst = pvalues.idxmax()
        cols = cols[cols != worst]
        X = X.loc[:, cols]

    print('\nSelect {:d} features from {:d} by best p-values.'.format(len(cols), ini))
    # With no columns left (or none given) there is no matching model to show.
    if Regressor is not None and len(cols) > 0:
        print(Regressor.summary())
    return cols

In [15]:
# Select features on the training split only. 0.051 is the significance level:
# features whose OLS p-value is above it are eliminated.
colum = Backward_Elimination(y_train, X_train, 0.051)
len(colum)


Select 133 features from 256 by best p-values.
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.944
Model:                            OLS   Adj. R-squared:                  0.938
Method:                 Least Squares   F-statistic:                     157.3
Date:                Thu, 25 Feb 2021   Prob (F-statistic):               0.00
Time:                        01:50:00   Log-Likelihood:                 1152.5
No. Observations:                1241   AIC:                            -2063.
Df Residuals:                    1120   BIC:                            -1443.
Df Model:                         120                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

133

###  Converting DataFrames into Numpy arrays and saving it in .npy format

In [16]:
# Keep only the selected features and turn every frame into a NumPy array.
# The hold-out split is named *_val from here on.
X_train = X_train[colum].to_numpy()
y_train = y_train.to_numpy()
X_val = X_test[colum].to_numpy()
y_val = y_test.to_numpy()
test = test[colum].to_numpy()

In [17]:
# Persist each array next to the notebook in .npy format.
for filename, array in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('test.npy', test),
):
    np.save(filename, array)