In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# for the yeo-johnson transformation
import scipy.stats as stats

# Reproducibility

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer

# Show every column when a dataframe is displayed (None removes the limit).
# Use the public pd.set_option entry point rather than the accidental
# pd.pandas self-reference.
pd.set_option('display.max_columns', None)

In [2]:
# Read the engineered train and test feature sets.
#
# Both CSV files were built and saved in the previous notebook;
# run that notebook first if the files do not exist yet.

X_train, X_test = pd.read_csv('xtrain.csv'), pd.read_csv('xtest.csv')

# preview the first rows of the train set
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,0.75,0.75,0.461171,0,1,1.0,0.333333,1.0,1,0.0,0.0,0.863636,0.4,1,0.75,0.6,0.777778,0.5,0.014706,0.04918,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,0.666667,0.0,0.002835,0.0,0,0.673479,0.239935,1.0,1.0,1,1.0,0.55976,0.0,0,0.52325,0.0,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.0,0.0,0.75,0.018692,1.0,0.75,0.430183,0.5,0.5,1.0,0.116686,0.032907,0,0,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0
1,0.75,0.75,0.456066,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.444444,0.75,0.360294,0.04918,0.0,0,0.6,0.6,0.666667,0.03375,0.666667,0.5,0.5,0.333333,0.666667,0.0,1.0,0.142807,0.0,0,0.114724,0.17234,1.0,1.0,1,1.0,0.434539,0.0,0,0.406196,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.25,1.0,0.0,0.0,0.75,0.457944,0.5,0.25,0.220028,0.5,0.5,1.0,0.0,0.0,0,0,0,0.0,0,0.75,1.0,0,0.636364,0.666667,0.75,0,0,0
2,0.916667,0.75,0.394699,0,1,1.0,0.0,0.333333,1,0.0,0.0,0.954545,0.4,1,1.0,0.6,0.888889,0.5,0.036765,0.098361,1.0,0,0.3,0.2,0.666667,0.2575,1.0,0.5,1.0,1.0,0.666667,0.0,0.0,0.080794,0.0,0,0.601951,0.286743,1.0,1.0,1,1.0,0.627205,0.0,0,0.586296,0.333333,0.0,0.666667,0.0,0.25,0.333333,1.0,0.333333,1.0,0.333333,0.8,0.75,0.046729,0.5,0.5,0.406206,0.5,0.5,1.0,0.228705,0.149909,0,0,0,0.0,0,0.0,1.0,0,0.090909,0.666667,0.75,0,0,0
3,0.75,0.75,0.445002,0,1,1.0,0.666667,0.666667,1,0.0,0.0,0.454545,0.4,1,0.75,0.6,0.666667,0.5,0.066176,0.163934,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,1.0,0.0,0.25567,0.0,0,0.018114,0.242553,1.0,1.0,1,1.0,0.56692,0.0,0,0.529943,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.25,1.0,0.333333,0.4,0.75,0.084112,0.5,0.5,0.362482,0.5,0.5,1.0,0.469078,0.045704,0,0,0,0.0,0,0.0,1.0,0,0.636364,0.666667,0.75,1,0,0
4,0.75,0.75,0.577658,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.555556,0.5,0.323529,0.737705,0.0,0,0.6,0.7,0.666667,0.17,0.333333,0.5,0.5,0.333333,0.666667,0.0,0.75,0.086818,0.0,0,0.434278,0.233224,1.0,0.75,1,1.0,0.549026,0.0,0,0.513216,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.0,0.333333,0.8,0.75,0.411215,0.5,0.5,0.406206,0.5,0.5,1.0,0.0,0.0,0,1,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0


In [3]:
# show the whole training set; pandas elides the middle rows for display
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,0.750000,0.75,0.461171,0,1,1.0,0.333333,1.000000,1,0.0,0.0,0.863636,0.4,1,0.75,0.6,0.777778,0.50,0.014706,0.049180,0.0,0,1.0,1.0,0.333333,0.000000,0.666667,0.5,1.0,0.666667,0.666667,0.666667,0.00,0.002835,0.0,0,0.673479,0.239935,1.0,1.00,1,1.0,0.559760,0.000000,0,0.523250,0.000000,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.00,0.000000,0.0,0.75,0.018692,1.0,0.75,0.430183,0.5,0.5,1.0,0.116686,0.032907,0,0,0,0.0,0,0.00,1.0,0,0.545455,0.666667,0.75,0,0,0
1,0.750000,0.75,0.456066,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.444444,0.75,0.360294,0.049180,0.0,0,0.6,0.6,0.666667,0.033750,0.666667,0.5,0.5,0.333333,0.666667,0.000000,1.00,0.142807,0.0,0,0.114724,0.172340,1.0,1.00,1,1.0,0.434539,0.000000,0,0.406196,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.250000,1.00,0.000000,0.0,0.75,0.457944,0.5,0.25,0.220028,0.5,0.5,1.0,0.000000,0.000000,0,0,0,0.0,0,0.75,1.0,0,0.636364,0.666667,0.75,0,0,0
2,0.916667,0.75,0.394699,0,1,1.0,0.000000,0.333333,1,0.0,0.0,0.954545,0.4,1,1.00,0.6,0.888889,0.50,0.036765,0.098361,1.0,0,0.3,0.2,0.666667,0.257500,1.000000,0.5,1.0,1.000000,0.666667,0.000000,0.00,0.080794,0.0,0,0.601951,0.286743,1.0,1.00,1,1.0,0.627205,0.000000,0,0.586296,0.333333,0.0,0.666667,0.0,0.250,0.333333,1.000000,0.333333,1.00,0.333333,0.8,0.75,0.046729,0.5,0.50,0.406206,0.5,0.5,1.0,0.228705,0.149909,0,0,0,0.0,0,0.00,1.0,0,0.090909,0.666667,0.75,0,0,0
3,0.750000,0.75,0.445002,0,1,1.0,0.666667,0.666667,1,0.0,0.0,0.454545,0.4,1,0.75,0.6,0.666667,0.50,0.066176,0.163934,0.0,0,1.0,1.0,0.333333,0.000000,0.666667,0.5,1.0,0.666667,0.666667,1.000000,0.00,0.255670,0.0,0,0.018114,0.242553,1.0,1.00,1,1.0,0.566920,0.000000,0,0.529943,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.250000,1.00,0.333333,0.4,0.75,0.084112,0.5,0.50,0.362482,0.5,0.5,1.0,0.469078,0.045704,0,0,0,0.0,0,0.00,1.0,0,0.636364,0.666667,0.75,1,0,0
4,0.750000,0.75,0.577658,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.555556,0.50,0.323529,0.737705,0.0,0,0.6,0.7,0.666667,0.170000,0.333333,0.5,0.5,0.333333,0.666667,0.000000,0.75,0.086818,0.0,0,0.434278,0.233224,1.0,0.75,1,1.0,0.549026,0.000000,0,0.513216,0.000000,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.00,0.333333,0.8,0.75,0.411215,0.5,0.50,0.406206,0.5,0.5,1.0,0.000000,0.000000,0,1,0,0.0,0,0.00,1.0,0,0.545455,0.666667,0.75,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,1.000000,0.75,0.504203,0,1,1.0,0.000000,0.333333,1,0.0,0.0,1.000000,0.4,1,0.75,1.0,0.777778,0.50,0.073529,0.180328,0.0,0,1.0,1.0,0.666667,0.420625,0.666667,0.5,1.0,0.666667,0.666667,0.333333,0.00,0.206060,0.0,0,0.041338,0.204910,1.0,1.00,1,1.0,0.504851,0.586004,0,0.692428,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.500000,1.00,0.333333,0.8,0.75,0.093458,0.5,0.75,0.603667,0.5,0.5,1.0,0.000000,0.234004,0,0,1,0.0,0,0.00,1.0,0,0.545455,0.666667,0.75,0,0,0
1310,0.750000,0.75,0.388581,0,1,1.0,0.000000,0.333333,1,0.0,0.0,0.272727,0.4,1,0.75,0.6,0.333333,0.75,0.441176,0.262295,0.0,0,1.0,0.6,0.333333,0.000000,0.333333,0.5,0.5,0.666667,0.666667,0.000000,0.75,0.078313,0.0,0,0.290293,0.174632,1.0,0.50,1,1.0,0.439537,0.000000,0,0.410869,0.000000,0.0,0.666667,0.0,0.250,0.333333,0.666667,0.166667,0.25,0.000000,0.0,0.75,0.130841,0.0,0.50,0.307475,0.5,0.5,1.0,0.338390,0.000000,0,0,0,0.0,0,0.00,1.0,0,0.090909,0.666667,0.75,0,0,0
1311,0.250000,0.25,0.434909,0,1,1.0,0.000000,0.333333,1,0.0,0.0,0.272727,0.8,1,0.25,0.2,0.555556,0.50,0.235294,0.540984,0.0,0,1.0,1.0,0.333333,0.000000,0.333333,0.5,0.0,0.333333,0.666667,0.000000,0.00,0.000000,0.0,0,0.000000,0.000000,1.0,0.50,1,1.0,0.519487,0.311966,0,0.615356,0.000000,0.0,0.666667,0.0,0.500,0.666667,0.333333,0.500000,1.00,0.000000,0.0,0.75,0.299065,0.0,0.50,0.380113,0.5,0.5,1.0,0.000000,0.000000,0,0,0,0.0,0,0.00,1.0,0,0.272727,0.666667,0.75,0,0,0
1312,0.916667,0.75,0.445002,0,1,1.0,0.000000,0.333333,1,0.0,0.0,0.636364,0.4,1,1.00,0.6,0.666667,0.50,0.022059,0.049180,0.0,0,1.0,1.0,0.666667,0.011250,0.666667,0.5,1.0,0.666667,0.666667,1.000000,0.00,0.000000,0.0,0,0.638179,0.224877,1.0,1.00,1,1.0,0.582551,0.000000,0,0.544554,0.000000,0.0,0.666667,0.0,0.250,0.333333,0.666667,0.416667,1.00,0.333333,0.6,0.75,0.028037,1.0,0.50,0.296192,0.5,0.5,1.0,0.166861,0.036563,0,0,0,0.0,0,0.00,1.0,0,0.818182,0.666667,0.75,1,0,0


In [4]:
# Impute missing values with mean in X_train
# Dataset = pd.read_csv('xtrain.csv')
# mean_value = Dataset['BsmtFinType1'].mean()

# To fill the missing values with that mean use
# Dataset['BsmtFinType1'].fillna(value=mean_value, inplace=True)
# print('Updated Dataframe')
# print(Dataset) 

In [5]:
# X_train = pd.read_csv('xtrain.csv')

# mean_value = X_train['BsmtFinType1'].mean()

# X_train['BsmtFinType1'].fillna(value=0, inplace=True)
# print('Updated X_train')
# print(X_train) 

In [6]:
# preview the first rows of the training set again
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,0.75,0.75,0.461171,0,1,1.0,0.333333,1.0,1,0.0,0.0,0.863636,0.4,1,0.75,0.6,0.777778,0.5,0.014706,0.04918,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,0.666667,0.0,0.002835,0.0,0,0.673479,0.239935,1.0,1.0,1,1.0,0.55976,0.0,0,0.52325,0.0,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.0,0.0,0.75,0.018692,1.0,0.75,0.430183,0.5,0.5,1.0,0.116686,0.032907,0,0,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0
1,0.75,0.75,0.456066,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.444444,0.75,0.360294,0.04918,0.0,0,0.6,0.6,0.666667,0.03375,0.666667,0.5,0.5,0.333333,0.666667,0.0,1.0,0.142807,0.0,0,0.114724,0.17234,1.0,1.0,1,1.0,0.434539,0.0,0,0.406196,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.25,1.0,0.0,0.0,0.75,0.457944,0.5,0.25,0.220028,0.5,0.5,1.0,0.0,0.0,0,0,0,0.0,0,0.75,1.0,0,0.636364,0.666667,0.75,0,0,0
2,0.916667,0.75,0.394699,0,1,1.0,0.0,0.333333,1,0.0,0.0,0.954545,0.4,1,1.0,0.6,0.888889,0.5,0.036765,0.098361,1.0,0,0.3,0.2,0.666667,0.2575,1.0,0.5,1.0,1.0,0.666667,0.0,0.0,0.080794,0.0,0,0.601951,0.286743,1.0,1.0,1,1.0,0.627205,0.0,0,0.586296,0.333333,0.0,0.666667,0.0,0.25,0.333333,1.0,0.333333,1.0,0.333333,0.8,0.75,0.046729,0.5,0.5,0.406206,0.5,0.5,1.0,0.228705,0.149909,0,0,0,0.0,0,0.0,1.0,0,0.090909,0.666667,0.75,0,0,0
3,0.75,0.75,0.445002,0,1,1.0,0.666667,0.666667,1,0.0,0.0,0.454545,0.4,1,0.75,0.6,0.666667,0.5,0.066176,0.163934,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,1.0,0.0,0.25567,0.0,0,0.018114,0.242553,1.0,1.0,1,1.0,0.56692,0.0,0,0.529943,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.25,1.0,0.333333,0.4,0.75,0.084112,0.5,0.5,0.362482,0.5,0.5,1.0,0.469078,0.045704,0,0,0,0.0,0,0.0,1.0,0,0.636364,0.666667,0.75,1,0,0
4,0.75,0.75,0.577658,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.555556,0.5,0.323529,0.737705,0.0,0,0.6,0.7,0.666667,0.17,0.333333,0.5,0.5,0.333333,0.666667,0.0,0.75,0.086818,0.0,0,0.434278,0.233224,1.0,0.75,1,1.0,0.549026,0.0,0,0.513216,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.0,0.333333,0.8,0.75,0.411215,0.5,0.5,0.406206,0.5,0.5,1.0,0.0,0.0,0,1,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0


In [7]:
# plain-text dump of the training set (print bypasses the rich table display)
print(X_train)

      MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0       0.750000      0.75     0.461171        0       1    1.0  0.333333   
1       0.750000      0.75     0.456066        0       1    1.0  0.333333   
2       0.916667      0.75     0.394699        0       1    1.0  0.000000   
3       0.750000      0.75     0.445002        0       1    1.0  0.666667   
4       0.750000      0.75     0.577658        0       1    1.0  0.333333   
...          ...       ...          ...      ...     ...    ...       ...   
1309    1.000000      0.75     0.504203        0       1    1.0  0.000000   
1310    0.750000      0.75     0.388581        0       1    1.0  0.000000   
1311    0.250000      0.25     0.434909        0       1    1.0  0.000000   
1312    0.916667      0.75     0.445002        0       1    1.0  0.000000   
1313    1.000000      0.75     0.376033        0       1    1.0  1.000000   

      LandContour  Utilities  LotConfig  LandSlope  Neighborhood  Condition

In [12]:
# Report the number of missing values per column, then impute any that remain.
print(X_train.isnull().sum())

# The fill values are learned from the train set only and then applied to
# both sets, so train and test receive the same treatment and no statistic
# is derived from the test data.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

X_train.head()

MSSubClass        0
MSZoning          0
LotFrontage       0
LotArea           0
Street            0
                 ..
SaleType          0
SaleCondition     0
LotFrontage_na    0
MasVnrArea_na     0
GarageYrBlt_na    0
Length: 81, dtype: int64


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,0.75,0.75,0.461171,0,1,1.0,0.333333,1.0,1,0.0,0.0,0.863636,0.4,1,0.75,0.6,0.777778,0.5,0.014706,0.04918,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,0.666667,0.0,0.002835,0.0,0,0.673479,0.239935,1.0,1.0,1,1.0,0.55976,0.0,0,0.52325,0.0,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.0,0.0,0.75,0.018692,1.0,0.75,0.430183,0.5,0.5,1.0,0.116686,0.032907,0,0,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0
1,0.75,0.75,0.456066,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.444444,0.75,0.360294,0.04918,0.0,0,0.6,0.6,0.666667,0.03375,0.666667,0.5,0.5,0.333333,0.666667,0.0,1.0,0.142807,0.0,0,0.114724,0.17234,1.0,1.0,1,1.0,0.434539,0.0,0,0.406196,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.25,1.0,0.0,0.0,0.75,0.457944,0.5,0.25,0.220028,0.5,0.5,1.0,0.0,0.0,0,0,0,0.0,0,0.75,1.0,0,0.636364,0.666667,0.75,0,0,0
2,0.916667,0.75,0.394699,0,1,1.0,0.0,0.333333,1,0.0,0.0,0.954545,0.4,1,1.0,0.6,0.888889,0.5,0.036765,0.098361,1.0,0,0.3,0.2,0.666667,0.2575,1.0,0.5,1.0,1.0,0.666667,0.0,0.0,0.080794,0.0,0,0.601951,0.286743,1.0,1.0,1,1.0,0.627205,0.0,0,0.586296,0.333333,0.0,0.666667,0.0,0.25,0.333333,1.0,0.333333,1.0,0.333333,0.8,0.75,0.046729,0.5,0.5,0.406206,0.5,0.5,1.0,0.228705,0.149909,0,0,0,0.0,0,0.0,1.0,0,0.090909,0.666667,0.75,0,0,0
3,0.75,0.75,0.445002,0,1,1.0,0.666667,0.666667,1,0.0,0.0,0.454545,0.4,1,0.75,0.6,0.666667,0.5,0.066176,0.163934,0.0,0,1.0,1.0,0.333333,0.0,0.666667,0.5,1.0,0.666667,0.666667,1.0,0.0,0.25567,0.0,0,0.018114,0.242553,1.0,1.0,1,1.0,0.56692,0.0,0,0.529943,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.25,1.0,0.333333,0.4,0.75,0.084112,0.5,0.5,0.362482,0.5,0.5,1.0,0.469078,0.045704,0,0,0,0.0,0,0.0,1.0,0,0.636364,0.666667,0.75,1,0,0
4,0.75,0.75,0.577658,0,1,1.0,0.333333,0.333333,1,0.0,0.0,0.363636,0.4,1,0.75,0.6,0.555556,0.5,0.323529,0.737705,0.0,0,0.6,0.7,0.666667,0.17,0.333333,0.5,0.5,0.333333,0.666667,0.0,0.75,0.086818,0.0,0,0.434278,0.233224,1.0,0.75,1,1.0,0.549026,0.0,0,0.513216,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.0,0.333333,0.8,0.75,0.411215,0.5,0.5,0.406206,0.5,0.5,1.0,0.0,0.0,0,1,0,0.0,0,0.0,1.0,0,0.545455,0.666667,0.75,0,0,0


In [14]:
# Read the target for both splits.
# Reminder: the target values are log transformed.
y_train, y_test = pd.read_csv('ytrain.csv'), pd.read_csv('ytest.csv')

# preview the first target values
y_train.head()

Unnamed: 0,SalePrice
0,12.211
1,11.888
2,12.676
3,12.278
4,12.103


In [22]:
# Feature selection
# -----------------
# Model fitting and feature selection happen together here.
#
# A Lasso regression is wrapped in sklearn's SelectFromModel, which
# keeps the features whose coefficients are non-zero. alpha is the
# penalty strength: the bigger the alpha, the fewer features survive.
#
# random_state is fixed so the run is repeatable.
sel_ = SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))

# fit the Lasso and derive the feature mask from it
sel_.fit(X=X_train, y=y_train)

SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))

In [23]:
# count how many features the selector kept (True entries in the support mask)
sel_.get_support().sum()

37

In [24]:
# let's visualise those features that were selected
# (selected features are marked with True)
sel_.get_support()

array([ True,  True, False, False, False, False,  True,  True, False,
        True, False,  True, False, False, False,  True,  True,  True,
       False,  True,  True, False,  True, False, False, False,  True,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True,  True, False,
        True,  True, False,  True,  True, False, False,  True,  True,
        True,  True,  True, False, False,  True,  True,  True, False,
       False,  True,  True, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False])

In [25]:
# Summarise the outcome of the selection.

# names of the columns the selector kept
selected_feats = X_train.columns[sel_.get_support()]

# report the total columns, the kept columns and the zeroed Lasso coefficients
print('total features: {}'.format(len(X_train.columns)))
print('selected features: {}'.format(selected_feats.size))
print('features with coefficients shrank to zero:{}'.format(
    (sel_.estimator_.coef_ == 0).sum()))


total features: 81
selected features: 37
features with coefficients shrank to zero:44


In [26]:
# names of the columns that were selected
selected_feats

Index(['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig',
       'Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation',
       'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'ScreenPorch', 'SaleCondition'],
      dtype='object')

In [27]:
# persist the selected feature names, one per row, without the index column
pd.Series(data=selected_feats).to_csv(path_or_buf='selected_features.csv', index=False)