In [1]:
import pandas as pd
from plotly import graph_objects as go
from sklearn.datasets import load_boston
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso

ModuleNotFoundError: No module named 'plotly'

In [2]:
# load data
boston_bunch = load_boston()
df = pd.DataFrame(boston_bunch.data, columns = boston_bunch.feature_names)
df["target"] = boston_bunch.target
df.head()

NameError: name 'load_boston' is not defined

In [59]:
boston_bunch.DESCR

".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000

## Filter Method - Pearsons correlation coefficient

In [60]:
# Pearsons correlation coefficient
corr = df.corr(method="pearson")["target"].sort_values(ascending=False)[1:]
corr

RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: target, dtype: float64

In [61]:
# absolute for positive values
abs_corr = abs(corr) 

relevant_features = abs_corr[abs_corr>0.4]
relevant_features

RM         0.695360
NOX        0.427321
TAX        0.468536
INDUS      0.483725
PTRATIO    0.507787
LSTAT      0.737663
Name: target, dtype: float64

In [62]:
new_df = df[relevant_features.index]
new_df.head()

Unnamed: 0,RM,NOX,TAX,INDUS,PTRATIO,LSTAT
0,6.575,0.538,296.0,2.31,15.3,4.98
1,6.421,0.469,242.0,7.07,17.8,9.14
2,7.185,0.469,242.0,7.07,17.8,4.03
3,6.998,0.458,222.0,2.18,18.7,2.94
4,7.147,0.458,222.0,2.18,18.7,5.33


## Wrapper Method - Recursive feature elimination

In [80]:
# input and output features
X = df.drop("target", axis= 1)
y = df["target"]

# defining model to build
lin_reg = LinearRegression()

# create the RFE model and select 6 attributes
rfe = RFE(lin_reg, 6)
rfe.fit(X, y)

# summarize the selection of the attributes
print(f"Number of selected features: {rfe.n_features_}\n\
Mask: {rfe.support_}\n\
Selected Features:", [feature for feature, rank in zip(X.columns.values, rfe.ranking_) if rank==1],"\n\
Estimator : {rfe.estimator_}")

Number of selected features: 6
Mask: [False False False  True  True  True False  True False False  True False
  True]
Selected Features: ['CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT'] 
Estimator : {rfe.estimator_}


## Embedded Method - Lasso Regression

In [87]:
lasso = Lasso() 
lasso.fit(X, y)

# perform feature selection
keep_cols = [feature for feature, weight in zip(X.columns.values, lasso.coef_) if weight != 0]
keep_cols

['CRIM', 'ZN', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

In [2]:
fig = go.Figure(
        go.Waterfall(name= "Lasso Coefficients",
                     orientation= "h",
                     y = X.columns.values,
                     x = lasso.coef_
                     ))

fig.update_layout(title = "Coefficients of Lasso Regression Model")

fig.show()

NameError: name 'go' is not defined