In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('housing.csv')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.feature_selection import VarianceThreshold

In [4]:
data = data.dropna()

In [5]:
x = data.drop(['median_house_value', 'ocean_proximity'], axis=1)
y = data['median_house_value']

In [6]:
selector = VarianceThreshold(threshold=0.9)
selected = pd.DataFrame(selector.fit_transform(x))
selected.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462


# Univariate Selections - Select K best

## Case1: Y is categorical 

In [7]:
x1 = x.drop(['longitude', 'latitude'], axis=1)
y1 = data['ocean_proximity']

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
kbest = SelectKBest(chi2, k=4).fit_transform(x1, y1)
selected = pd.DataFrame(kbest)
selected.head()

Unnamed: 0,0,1,2,3
0,41.0,880.0,322.0,126.0
1,21.0,7099.0,2401.0,1138.0
2,52.0,1467.0,496.0,177.0
3,52.0,1274.0,558.0,219.0
4,52.0,1627.0,565.0,259.0


## Case 2: Y is numerical 

In [9]:
x = data.drop(['median_house_value', 'ocean_proximity'], axis=1)
y = data['median_house_value']
from sklearn.feature_selection import f_regression
pd.DataFrame(f_regression(x,y))
# First row are the F values and second row are the p values 

Unnamed: 0,0,1,2,3,4,5,6,7
0,42.19522,436.5537,234.0896,369.5705,50.56317,13.085778,86.40233,18398.896423
1,8.450466e-11,6.132893e-96,1.4961340000000002e-52,1.221172e-81,1.191968e-12,0.000298,1.611514e-20,0.0


# Recursive Feature Elimination/ Backward Elmination

In [10]:
from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=6, verbose=True)
rfe.fit(x, y)
print(rfe.ranking_)
print(x.columns)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
[1 1 1 3 1 2 1 1]
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')


In [44]:
from sklearn.feature_selection import RFECV
selector = RFECV(estimator = LinearRegression(), step=1, cv=5)
selector = selector.fit(x, y)
selector.support_

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [45]:
selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1])

# SelectFromModel with LassoCV

In [11]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
sfm = SelectFromModel(estimator=LassoCV(), threshold=0.25)
sfm.fit(x, y)
n_features = sfm.transform(x).shape[1]
n_features

6

In [12]:
x.shape

(20433, 8)

In [13]:
sfm.estimator_.coef_

array([-0.00000000e+00, -0.00000000e+00,  1.56047485e+03, -1.28885816e+00,
       -5.34234938e+00, -4.33488131e+01,  1.61783542e+02,  3.37178691e+04])

In [14]:
selected = pd.DataFrame(sfm.transform(x))
selected.head()

Unnamed: 0,0,1,2,3,4,5
0,41.0,880.0,129.0,322.0,126.0,8.3252
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,52.0,1467.0,190.0,496.0,177.0,7.2574
3,52.0,1274.0,235.0,558.0,219.0,5.6431
4,52.0,1627.0,280.0,565.0,259.0,3.8462


In [19]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
sfm = SelectFromModel(estimator=LogisticRegression())
sfm.fit(x1, y1)
n_features = sfm.transform(x1).shape[1]
n_features

1

In [24]:
# pd.DataFrame(sfm.estimator_.coef_)

Unnamed: 0,0,1,2,3,4,5
0,0.008642,-0.00091,-6.4e-05,0.000842,0.002882,0.410533
1,-0.058581,0.001601,-0.003627,-0.000202,-0.004964,-0.890677
2,-0.016969,-0.001388,0.008529,-0.003056,-0.00362,-0.85781
3,0.045476,0.00036,-0.005241,-0.001665,0.007797,-0.056374
4,-0.004018,-8.2e-05,-0.000392,-0.000509,0.001968,0.020268


# Cross Validation

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
cv = cross_val_score(LinearRegression(), x, y, cv=10)
cv

array([0.53115461, 0.64316928, 0.48933681, 0.55966335, 0.6039088 ,
       0.55396891, 0.45344879, 0.48182744, 0.51554848, 0.62247673])

In [27]:
cv.mean()

0.5454503198403671

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
cv = cross_val_score(rf, x, y, cv=10)
cv

array([-0.01498997,  0.64508713,  0.70215616,  0.63191712,  0.75661436,
        0.61086687,  0.49412919,  0.52871874,  0.3837235 ,  0.62773315])

In [57]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True)

In [70]:
for train,test in skf.split(x1, y1):
    print(train)
    print(test)
    x_train, y_train = x1[x.index.isin(train)], y1[y.index.isin(train)]
    x_test, y_test = x1[x.index.isin(test)], y1[y.index.isin(test)]

[    0     1     2 ... 20430 20431 20432]
[    3     4    10 ... 20423 20427 20428]
[    3     4    10 ... 20423 20427 20428]
[    0     1     2 ... 20430 20431 20432]


In [71]:
x_train.shape
y_train.shape

(10093,)

In [75]:
y_train.value_counts()/len(y_train)

<1H OCEAN     0.441593
INLAND        0.314872
NEAR OCEAN    0.130586
NEAR BAY      0.112850
ISLAND        0.000099
Name: ocean_proximity, dtype: float64

In [69]:
x_test.shape
y_test.shape

(10127,)

In [76]:
y_test.value_counts()/len(y_test)

<1H OCEAN     0.444499
INLAND        0.314258
NEAR OCEAN    0.129255
NEAR BAY      0.111593
ISLAND        0.000395
Name: ocean_proximity, dtype: float64

# Grid Search CV 

In [31]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}, 
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]
rf = RandomForestRegressor(random_state=100)

In [42]:
grid_search = GridSearchCV(rf, param_grid, cv=5,
                          return_train_score=True,
                          n_jobs=-1)
grid_search.fit(x,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=100,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_features': [2, 4, 6, 8],
   

In [43]:
grid_search.best_params_

{'max_features': 2, 'n_estimators': 30}