In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [5]:
# Forward slashes keep this relative path valid on Windows, Linux and macOS alike
# (the backslash form only resolves on Windows).
file_path = 'data/house_price_pred/train_scaled.csv'
df = pd.read_csv(file_path)

In [6]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(1460, 85)

In [10]:
# Preview the first rows of the loaded frame to inspect its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
0,0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [11]:
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
# Re-inspect the frame after dropping the 'Unnamed: 0' column.
df.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [13]:
# Target vector: the SalePrice column of df.
y_train = df.loc[:, 'SalePrice']
# Number of target values (one per row of df).
y_train.shape[0]

1460

In [14]:
# NOTE(review): this copy is overwritten by the following cell, which rebinds
# x_train to the result of df.drop(...); the assignment here has no lasting effect.
x_train = df.copy()

In [16]:
# Feature matrix: every column of df except the Id column and the SalePrice target.
non_feature_cols = ['Id', 'SalePrice']
x_train = df.drop(columns=non_feature_cols)
x_train.shape

(1460, 82)

In [17]:
# Preview the feature matrix (Id and SalePrice have been removed).
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
0,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,1.0,0.0,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,1.0,0.5,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,1.0,0.0,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,1.0,0.25,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,1.0,0.5,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


### Apply Feature Selection

The larger the alpha, the fewer features will be selected.
SelectFromModel selects features based on the magnitude of the fitted model's coefficients.

In [18]:
# L1-penalised regressor whose coefficients drive the selection;
# random_state is fixed so repeated runs give the same fit.
lasso_estimator = Lasso(alpha=0.005, random_state=42)
feature_sel_model = SelectFromModel(lasso_estimator)
# fit() returns the selector itself, which the notebook displays as the cell output.
feature_sel_model.fit(x_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=42))

In [19]:
# Boolean mask with one entry per x_train column: True where the selector kept the feature.
feature_sel_model.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [30]:
# Count the Lasso coefficients that were shrunk to exactly zero.
# Uses the ndarray's own .sum() so the cell does not depend on `np`,
# which no visible cell of this notebook imports.
(feature_sel_model.estimator_.coef_ == 0).sum()

<IPython.core.display.Javascript object>

61

In [31]:
# Column labels of the features the selector kept (boolean mask -> labels).
sel_features = x_train.columns[feature_sel_model.get_support()]
# Count zeroed coefficients via the ndarray's own .sum() so this cell does not
# depend on `np`, which no visible cell of this notebook imports.
n_zero_coef = (feature_sel_model.estimator_.coef_ == 0).sum()
print(f'Total features : {len(x_train.columns)}')
print('Selected features : ', len(sel_features))
print('Features with coef shring to 0 : ', n_zero_coef)

Total features : 82
Selected features :  21


<IPython.core.display.Javascript object>

Features with coef shring to 0 :  61


In [32]:
# Keep a copy of the current feature matrix, then narrow x_train to the selected columns.
# NOTE(review): x_train is rebound here, so re-running this cell replaces the
# backup with the already-narrowed frame.
x_train_bkup = x_train.copy()
x_train = x_train_bkup.loc[:, sel_features]
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,...,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.235294,0.75,0.636364,0.666667,0.883333,0.0,0.75,0.25,1.0,1.0,...,0.577712,0.333333,0.666667,0.0,0.2,0.8,0.666667,0.5,1.0,0.75
1,0.0,0.75,0.5,0.555556,0.433333,0.0,0.75,1.0,1.0,1.0,...,0.470245,0.0,0.333333,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
2,0.235294,0.75,0.636364,0.666667,0.866667,0.0,0.75,0.5,1.0,1.0,...,0.593095,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
3,0.294118,0.75,0.727273,0.666667,0.333333,0.0,0.5,0.25,0.75,1.0,...,0.579157,0.333333,0.666667,0.333333,0.8,0.4,0.333333,0.75,1.0,0.0
4,0.235294,0.75,1.0,0.777778,0.833333,0.0,0.75,0.75,1.0,1.0,...,0.666523,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.75,1.0,0.75
