# Steps in Feature Engineering
    # 1) Missing values
    # 2) Temporal variables
    # 3) Categorical variables: remove rare labels
    # 4) Standardise the values of the variables to the same range

In [1]:
# Importing Libraries
import pandas as pd # to read and pre-process the data
import numpy as np # to work with [single to multi dimensional] arrays
import seaborn as sns # to handle statistical function and create visualization graph
import matplotlib.pyplot as plt # to create visualization graph
%matplotlib inline


# Libraries for Feature Selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [2]:
# "House Prices" data is good to study, because it has 81 features / columns

# Show every column when a DataFrame is displayed: None removes pandas'
# cap on the number of columns rendered.
# Uses the public `pd.set_option` entry point rather than reaching it through
# the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [3]:
# Read the table produced by the earlier feature-engineering step
# (referred to as "Video 35") from the working directory, then display it.
data = pd.read_csv(filepath_or_buffer='X_train.csv')
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,3,-1.030153,-0.236599,1,2,0,1,1,0,0,14,2,1,3,5,7,5,2003,2003,0,0,10,10,2,196.0,2,3,4,3,3,1,6,706,5,0,150,856,2,4,1,3,-0.435347,854,0,-0.361169,1,0,2,1,3,1,2,8,4,0,1,4,2003.0,2,2,548,2,3,2,0,61,0,0,0,0,0,3,2,0,2,2008,2,3,-0.085097,0,0,0
1,2,20,3,-0.940688,-0.228594,1,2,0,1,1,2,0,11,1,1,3,3,6,8,1976,1976,0,0,4,3,1,0.0,1,3,2,3,3,4,4,978,5,0,284,1262,2,4,1,3,-0.391729,0,0,-0.391729,0,1,2,0,3,1,1,6,4,1,3,4,1976.0,2,2,460,2,3,2,298,0,0,0,0,0,0,3,2,0,5,2007,2,3,-0.090069,0,0,0
2,3,60,3,-1.009373,-0.218951,1,2,1,1,1,0,0,14,2,1,3,5,7,5,2001,2002,0,0,10,10,2,162.0,2,3,4,3,3,2,6,486,5,0,434,920,2,4,1,3,-0.426813,866,0,-0.357020,1,0,2,1,3,1,2,6,4,1,3,4,2001.0,2,2,608,2,3,2,0,42,0,0,0,0,0,3,2,0,9,2008,2,3,-0.082644,0,0,0
3,4,70,3,-1.069119,-0.228917,1,2,1,1,1,1,0,16,2,1,3,5,7,5,1915,1970,0,0,2,4,1,0.0,1,3,1,2,4,1,4,216,5,0,540,756,2,3,1,3,-0.421752,756,0,-0.360777,1,0,1,0,3,1,2,7,4,1,4,2,1998.0,1,3,642,2,3,2,0,35,272,0,0,0,0,3,2,0,2,2006,2,0,-0.099666,0,0,0
4,5,60,3,-0.921742,-0.205134,1,2,1,1,1,2,0,22,2,1,3,5,8,5,2000,2000,0,0,10,10,2,350.0,2,3,4,3,3,3,6,655,5,0,490,1145,2,4,1,3,-0.402145,1053,0,-0.337925,1,0,2,1,4,1,2,9,4,1,3,4,2000.0,2,3,836,2,3,2,192,84,0,0,0,0,0,3,2,0,12,2008,2,3,-0.078742,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,-1.052817,-0.240776,1,2,0,1,1,0,0,13,2,1,3,5,6,5,1999,2000,0,0,10,10,1,0.0,1,3,4,3,3,1,5,0,5,0,953,953,2,4,1,3,-0.422717,694,0,-0.364794,0,0,2,1,3,1,1,7,4,1,3,4,1999.0,2,2,460,2,3,2,0,40,0,0,0,0,0,3,2,0,8,2007,2,3,-0.091395,0,0,0
1456,1457,20,3,-0.917252,-0.209669,1,2,0,1,1,0,0,12,2,1,3,3,6,6,1978,1988,0,0,7,7,4,119.0,1,3,2,3,3,1,4,790,2,163,589,1542,2,2,1,3,-0.343196,0,0,-0.343196,1,0,2,0,3,1,1,7,3,2,3,4,1978.0,1,2,500,2,3,2,349,0,0,0,0,0,0,2,2,0,2,2010,2,3,-0.084842,0,0,0
1457,1458,70,3,-1.023031,-0.232322,1,2,0,1,1,0,0,16,2,1,3,5,7,9,1941,2006,0,0,9,9,1,0.0,3,2,3,2,4,1,6,275,5,0,877,1152,2,4,1,3,-0.398160,1152,0,-0.332384,0,0,2,0,4,1,2,9,4,2,4,4,1941.0,2,1,252,2,3,2,0,60,0,0,0,0,0,4,1,2500,5,2010,2,3,-0.076544,0,0,0
1458,1459,20,3,-1.009373,-0.227845,1,2,0,1,1,0,0,8,2,1,3,3,5,6,1950,1996,2,0,4,3,1,0.0,1,3,2,2,3,2,6,49,2,1029,0,1078,2,3,1,2,-0.408767,0,0,-0.408767,1,0,1,0,2,1,2,5,4,0,1,4,1950.0,1,1,240,2,3,2,366,0,112,0,0,0,0,3,2,0,4,2010,2,3,-0.099099,0,0,0


In [4]:
# Target (dependent variable) y: SalePrice, kept as a one-column DataFrame.
y_train = data.loc[:, ['SalePrice']]
y_train

Unnamed: 0,SalePrice
0,-0.085097
1,-0.090069
2,-0.082644
3,-0.099666
4,-0.078742
...,...
1455,-0.091395
1456,-0.084842
1457,-0.076544
1458,-0.099099


In [5]:
# Predictors (independent variables) X: every column except the row
# identifier 'Id' and the target 'SalePrice'.
X_train = data.drop(columns=['Id', 'SalePrice'])
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,60,3,-1.030153,-0.236599,1,2,0,1,1,0,0,14,2,1,3,5,7,5,2003,2003,0,0,10,10,2,196.0,2,3,4,3,3,1,6,706,5,0,150,856,2,4,1,3,-0.435347,854,0,-0.361169,1,0,2,1,3,1,2,8,4,0,1,4,2003.0,2,2,548,2,3,2,0,61,0,0,0,0,0,3,2,0,2,2008,2,3,0,0,0
1,20,3,-0.940688,-0.228594,1,2,0,1,1,2,0,11,1,1,3,3,6,8,1976,1976,0,0,4,3,1,0.0,1,3,2,3,3,4,4,978,5,0,284,1262,2,4,1,3,-0.391729,0,0,-0.391729,0,1,2,0,3,1,1,6,4,1,3,4,1976.0,2,2,460,2,3,2,298,0,0,0,0,0,0,3,2,0,5,2007,2,3,0,0,0
2,60,3,-1.009373,-0.218951,1,2,1,1,1,0,0,14,2,1,3,5,7,5,2001,2002,0,0,10,10,2,162.0,2,3,4,3,3,2,6,486,5,0,434,920,2,4,1,3,-0.426813,866,0,-0.357020,1,0,2,1,3,1,2,6,4,1,3,4,2001.0,2,2,608,2,3,2,0,42,0,0,0,0,0,3,2,0,9,2008,2,3,0,0,0
3,70,3,-1.069119,-0.228917,1,2,1,1,1,1,0,16,2,1,3,5,7,5,1915,1970,0,0,2,4,1,0.0,1,3,1,2,4,1,4,216,5,0,540,756,2,3,1,3,-0.421752,756,0,-0.360777,1,0,1,0,3,1,2,7,4,1,4,2,1998.0,1,3,642,2,3,2,0,35,272,0,0,0,0,3,2,0,2,2006,2,0,0,0,0
4,60,3,-0.921742,-0.205134,1,2,1,1,1,2,0,22,2,1,3,5,8,5,2000,2000,0,0,10,10,2,350.0,2,3,4,3,3,3,6,655,5,0,490,1145,2,4,1,3,-0.402145,1053,0,-0.337925,1,0,2,1,4,1,2,9,4,1,3,4,2000.0,2,3,836,2,3,2,192,84,0,0,0,0,0,3,2,0,12,2008,2,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,-1.052817,-0.240776,1,2,0,1,1,0,0,13,2,1,3,5,6,5,1999,2000,0,0,10,10,1,0.0,1,3,4,3,3,1,5,0,5,0,953,953,2,4,1,3,-0.422717,694,0,-0.364794,0,0,2,1,3,1,1,7,4,1,3,4,1999.0,2,2,460,2,3,2,0,40,0,0,0,0,0,3,2,0,8,2007,2,3,0,0,0
1456,20,3,-0.917252,-0.209669,1,2,0,1,1,0,0,12,2,1,3,3,6,6,1978,1988,0,0,7,7,4,119.0,1,3,2,3,3,1,4,790,2,163,589,1542,2,2,1,3,-0.343196,0,0,-0.343196,1,0,2,0,3,1,1,7,3,2,3,4,1978.0,1,2,500,2,3,2,349,0,0,0,0,0,0,2,2,0,2,2010,2,3,0,0,0
1457,70,3,-1.023031,-0.232322,1,2,0,1,1,0,0,16,2,1,3,5,7,9,1941,2006,0,0,9,9,1,0.0,3,2,3,2,4,1,6,275,5,0,877,1152,2,4,1,3,-0.398160,1152,0,-0.332384,0,0,2,0,4,1,2,9,4,2,4,4,1941.0,2,1,252,2,3,2,0,60,0,0,0,0,0,4,1,2500,5,2010,2,3,0,0,0
1458,20,3,-1.009373,-0.227845,1,2,0,1,1,0,0,8,2,1,3,3,5,6,1950,1996,2,0,4,3,1,0.0,1,3,2,2,3,2,6,49,2,1029,0,1078,2,3,1,2,-0.408767,0,0,-0.408767,1,0,1,0,2,1,2,5,4,0,1,4,1950.0,1,1,240,2,3,2,366,0,112,0,0,0,0,3,2,0,4,2010,2,3,0,0,0


In [6]:
# Feature selection with Lasso (L1-penalised linear regression) wrapped in
# SelectFromModel, to discard features the model does not rely on.
# Lasso and SelectFromModel are already imported in the notebook's first cell,
# so they are not imported again here.
#
# alpha=0.005 is the penalty strength: the larger alpha is, the more
# coefficients are pushed to zero and the fewer features are selected.
# random_state=0 pins the seed; in scikit-learn's Lasso it only influences the
# fit when selection='random' (the default 'cyclic' update order is
# deterministic), so it is kept here purely for explicitness.
#
# SelectFromModel keeps a feature when the absolute value of its fitted
# coefficient reaches the selection threshold. With `threshold` left unset and
# an L1-penalised estimator such as Lasso, scikit-learn uses 1e-5, so features
# with tiny but non-zero coefficients are dropped as well; the number of
# selected features can therefore be smaller than the number of non-zero
# coefficients.
#
# NOTE(review): X_train as displayed above appears to mix small-valued columns
# (e.g. LotFrontage, LotArea) with raw-scale ones (e.g. YearBuilt, GarageArea).
# The L1 penalty is scale-sensitive, so confirm that the upstream scaling step
# covered every feature before trusting the selection.
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
feature_sel_model.fit(X_train, y_train)


SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [7]:
# Boolean mask with one entry per column of the data the selector was fitted on:
#   True  -> the feature was kept by the selector
#   False -> the feature was left out and can be skipped
# Counting the True entries gives the number of selected features.
feature_sel_model.get_support(indices=False)

array([ True, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [8]:
# Report how many features exist, how many were selected, and how many Lasso
# coefficients are exactly zero.

# Column labels (a pandas Index) of the features kept by the selector.
selected_feat = X_train.columns[feature_sel_model.get_support()]

n_total = X_train.shape[1]
n_selected = len(selected_feat)
# Exact zeros only; coefficients that are non-zero but below the selector's
# threshold are not counted here, so this need not equal n_total - n_selected.
n_zero_coef = np.sum(feature_sel_model.estimator_.coef_ == 0)

print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))
print('features with coefficients shrank to zero: {}'.format(n_zero_coef))

total features: 82
selected features: 8
features with coefficients shrank to zero: 62


In [9]:
# Display the column labels of the features kept by the selector (a pandas Index).
selected_feat

Index(['MSSubClass', 'Neighborhood', 'YearBuilt', 'YearRemodAdd', '2ndFlrSF',
       'GarageYrBlt', 'GarageArea', 'ScreenPorch'],
      dtype='object')

In [10]:
# Restrict X_train to the selected columns and display the result.
# X_train is rebound to the reduced frame, so it no longer holds the full set
# of columns after this cell; re-run the cell that builds X_train before
# re-running the selection cells.
X_train = X_train.loc[:, selected_feat]
X_train

Unnamed: 0,MSSubClass,Neighborhood,YearBuilt,YearRemodAdd,2ndFlrSF,GarageYrBlt,GarageArea,ScreenPorch
0,60,14,2003,2003,854,2003.0,548,0
1,20,11,1976,1976,0,1976.0,460,0
2,60,14,2001,2002,866,2001.0,608,0
3,70,16,1915,1970,756,1998.0,642,0
4,60,22,2000,2000,1053,2000.0,836,0
...,...,...,...,...,...,...,...,...
1455,60,13,1999,2000,694,1999.0,460,0
1456,20,12,1978,1988,0,1978.0,500,0
1457,70,16,1941,2006,1152,1941.0,252,0
1458,20,8,1950,1996,0,1950.0,240,0
