In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing

# load the California housing regression dataset
data = fetch_california_housing()

# feature matrix, target vector and the feature names that label the columns
X, y = data["data"], data["target"]
col_names = data["feature_names"]

# wrap the feature matrix in a dataframe so columns can be addressed by name
df = pd.DataFrame(X, columns=col_names)
# add a column that is, by construction, highly correlated with MedInc
df["MedInc_Sqrt"] = np.sqrt(df["MedInc"])
# pairwise Pearson correlation of every column (displayed as the cell output)
df.corr()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_Sqrt
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,0.018415
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,0.015266
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.084303
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.015569
MedInc_Sqrt,0.984329,-0.132797,0.326688,-0.06691,0.018415,0.015266,-0.084303,-0.015569,1.0


In [None]:
from sklearn.feature_selection import VarianceThreshold

# learn the per-column variances first, then apply the 0.1 threshold to X
var_thresh = VarianceThreshold(threshold=0.1)
var_thresh.fit(X)
transformed_data = var_thresh.transform(X)

In [None]:
# preview the first five rows of the dataframe
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_Sqrt
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,2.885342
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,2.881215
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,2.693956
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,2.375521
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,1.961173


In [None]:
# first row of the variance-thresholded data
transformed_data[0]

array([   8.3252    ,   41.        ,    6.98412698,    1.02380952,
        322.        ,    2.55555556,   37.88      , -122.23      ])

# **Select K-Best and select percentile**

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile

class UnivariateFeatureSelction:
    """
    Thin wrapper that builds the right scikit-learn univariate selector.

    An integer ``n_features`` builds a ``SelectKBest`` that keeps that many
    features; a float ``n_features`` is read as a fraction (0.1 means 10%)
    and builds a ``SelectPercentile``. The selector is stored in
    ``self.selection``.

    NOTE: the class name keeps its original spelling ("Selction") so that
    existing callers keep working.
    """

    def __init__(self, n_features, problem_type, scoring):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from
        scikit-learn.
        :param n_features: SelectPercentile if float else SelectKBest
        :param problem_type: classification or regression
        :param scoring: scoring function, string
        :raises ValueError: unknown problem_type, scoring that is not valid
            for the problem type, or a float n_features outside [0, 1]
        :raises TypeError: n_features is neither int nor float
        """
        # for a given problem type, there are only
        # a few valid scoring methods
        # you can extend this with your own custom
        # methods if you wish
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        elif problem_type == "regression":
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }
        else:
            # a misspelled problem type used to fall through to regression
            # silently; fail loudly instead
            raise ValueError("Invalid problem type")

        # raise exception if we do not have a valid scoring method
        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        # if n_features is int, we use selectkbest
        # if n_features is float, we use selectpercentile
        if isinstance(n_features, int):
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features,
            )
        elif isinstance(n_features, float):
            if not 0.0 <= n_features <= 1.0:
                raise ValueError(
                    "n_features as a float must be a fraction in [0, 1]"
                )
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                # round before converting: 0.29 * 100 is 28.999999999999996
                # and plain int() would truncate it to 28
                percentile=int(round(n_features * 100)),
            )
        else:
            raise TypeError("Invalid type of feature")

    def fit(self, X, y):
        """
        Fit the wrapped selector on X and y.
        :return: whatever the wrapped selector's ``fit`` returns
        """
        return self.selection.fit(X, y)

    def transform(self, X):
        """
        Reduce X to the features chosen by the wrapped selector.
        :return: result of the wrapped selector's ``transform``
        """
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """
        Fit the wrapped selector on X and y and reduce X in one step.
        :return: result of the wrapped selector's ``fit_transform``
        """
        return self.selection.fit_transform(X, y)


In [None]:
# keep all but three of the features, scored with f_regression
ufs = UnivariateFeatureSelction(
    n_features=X.shape[1] - 3,
    problem_type="regression",
    scoring="f_regression",
)

# fit the selector and reduce X to the selected columns in one step
X_transformed = ufs.fit_transform(X, y)

In [None]:
# first row of the data after univariate selection
X_transformed[0]

array([ 8.3252    , 41.        ,  6.98412698,  1.02380952, 37.88      ])

# **Greedy Feature Selection**

In [None]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.datasets import make_classification

class GreedyFeatureSelection:
    """
    A simple and custom class for greedy feature selection.
    You will need to modify it quite a bit to make it suitable
    for your dataset.

    Starting from an empty set, each round adds the single feature that
    gives the best score together with the features already chosen. The
    search stops when a round scores lower than the one before it, or
    when every feature has been used.
    """
    def evaluate_score(self, X, y):
        """
        This function evaluates model on data and returns
        Area Under ROC Curve (AUC)
        NOTE: We fit the data and calculate AUC on same data.
        WE ARE OVERFITTING HERE.
        But this is also a way to achieve greedy selection.
        k-fold will take k times longer.
        If you want to implement it in really correct way,
        calculate OOF AUC and return mean AUC over k folds.
        This requires only a few lines of change and has been
        shown a few times in this book.
        :param X: training data
        :param y: targets
        :return: overfitted area under the roc curve
        """
        # fit the logistic regression model,
        # and calculate AUC on same data
        # again: BEWARE
        # you can choose any model that suits your data
        model = linear_model.LogisticRegression()
        model.fit(X, y)

        # probability of the positive class for every training row
        predictions = model.predict_proba(X)[:, 1]
        return metrics.roc_auc_score(y, predictions)

    def _feature_selection(self, X, y):
        """
        This function does the actual greedy selection
        :param X: data, numpy array
        :param y: targets, numpy array
        :return: (best scores, best features)
        """
        # initialize good features list
        # and best scores to keep track of both
        good_features = []
        best_scores = []

        # calculate the number of features
        num_features = X.shape[1]

        # set when the loop ends because the score went down
        stopped_on_drop = False

        while True:
            # initialize best feature and score of this round;
            # -inf so that any score, even 0 or a negative one, can win
            this_feature = None
            best_score = float("-inf")

            # loop over all features
            for feature in range(num_features):
                # features that are already selected are not candidates
                if feature in good_features:
                    continue
                # selected features are all good features till now
                # and current feature
                selected_features = good_features + [feature]
                # remove all other features from data
                xtrain = X[:, selected_features]
                # calculate the score, in our case, AUC
                score = self.evaluate_score(xtrain, y)
                # if score is greater than the best score
                # of this round, change best score and best feature
                if score > best_score:
                    this_feature = feature
                    best_score = score

            # no candidate was selected: every feature is already in use
            # (or none produced a comparable score). Without this exit the
            # loop would spin forever, because nothing changes any more.
            if this_feature is None:
                break

            good_features.append(this_feature)
            best_scores.append(best_score)

            # once at least three features have been picked, stop as soon
            # as the newest round scores lower than the previous one
            if len(best_scores) > 2 and best_scores[-1] < best_scores[-2]:
                stopped_on_drop = True
                break

        # the last feature is removed only when it is the one that made the
        # score drop; if we simply ran out of features, all of them stay
        if stopped_on_drop:
            return best_scores[:-1], good_features[:-1]
        return best_scores, good_features

    def __call__(self, X, y):
        """
        Call function will call the class on a set of arguments
        :param X: data, numpy array
        :param y: targets, numpy array
        :return: (X restricted to the selected columns, scores per round)
        """
        # select features, return scores and selected indices
        scores, features = self._feature_selection(X, y)
        # transform data with selected features
        return X[:, features], scores

In [None]:
# generate binary classification data
# random_state is fixed so that the generated data, and therefore the
# features picked by the greedy search below, are the same on every run
X, y = make_classification(n_samples=1000, n_features=100, random_state=42)
print(X[0, :], y[0])

[ 1.53479760e-02 -1.19909265e+00 -9.09681553e-01  2.13261366e+00
 -1.68707919e+00  5.32921279e-02 -3.51710261e-01 -9.60139610e-01
  1.46161172e+00 -1.77634265e+00 -5.94704615e-01  9.64702218e-01
 -1.02668670e+00  1.73226069e+00 -3.15540874e-01  9.40121708e-01
 -4.39332663e-01 -2.04273432e+00  5.98435365e-01  2.57030519e-01
 -4.82338940e-02  8.29484343e-01 -1.28051502e+00 -1.37759225e+00
 -2.20971484e-01  5.46477664e-01  1.18888915e-01 -1.43823632e-01
 -1.29710638e+00 -5.18765268e-01  3.22957925e-01 -4.02554124e-02
  1.21642391e+00 -7.70240457e-01 -3.67132646e-01 -1.00872122e+00
  1.14103271e+00  7.56997959e-01 -2.10151892e-01  1.30467013e-01
  7.09791819e-01 -2.53405780e+00 -1.61314451e+00  5.85807458e-01
 -8.41441251e-01 -8.56608572e-02  7.31229647e-01  1.66420759e+00
 -3.36637940e-01  1.96978723e-01  7.37041470e-01 -3.43494189e-01
 -1.83811248e-01  1.79179793e-01 -3.27011700e-01  1.18739807e+00
  3.79889060e-01  1.34410405e+00  2.25325809e-01 -3.52210492e-01
 -1.02433717e+00 -4.18736

In [None]:
# run the greedy search: returns X restricted to the selected
# columns together with the score recorded for each selection round
greedy_selector = GreedyFeatureSelection()
X_transformed, scores = greedy_selector(X, y)

In [None]:
# number of columns kept by the greedy selection
X_transformed.shape[1]

61

# **Recursive feature elimination**

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing

# load the California housing regression dataset
data = fetch_california_housing()

# feature matrix, target vector and feature names
X, y = data["data"], data["target"]
col_names = data["feature_names"]

# plain linear regression is the estimator RFE uses to rank features
model = LinearRegression()

# recursive feature elimination configured to keep three features
rfe = RFE(estimator=model, n_features_to_select=3)

# fit the selector, then keep only the selected columns of X
X_transformed = rfe.fit(X, y).transform(X)

In [None]:
# first row of the data after recursive feature elimination
X_transformed[0]

array([   8.3252,   37.88  , -122.23  ])