# Feature Engineering

In [None]:
# polynomial features :
from sklearn import preprocessing
# Degree-2 polynomial expander: interaction_only=False keeps the pure powers
# as well as the cross terms, include_bias=False leaves out the constant column.
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# learn the term layout from df and expand df in a single step
poly_feats = pf.fit_transform(df)

# wrap the expanded array in a dataframe whose columns are named f_1 ... f_n
num_feats = poly_feats.shape[1]
feat_names = [f"f_{i + 1}" for i in range(num_feats)]
df_transformed = pd.DataFrame(poly_feats, columns=feat_names)

In [None]:
# Bin the numerical column f_1 at two resolutions (10 and 100 bins).
# labels=False stores the integer bin index instead of an interval label.
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)

In [None]:
# applying log to reduce variance
# np.log1p evaluates log(1 + x) over the whole column at once (no per-row
# Python lambda) and stays accurate for values close to zero.
# NOTE: np must already be imported when this cell runs.
np.log1p(df["f_3"]).var()

In [2]:
import numpy as np
from sklearn import impute

# 10 samples x 6 features of random integers, stored as floats so that NaN
# can be written into the array. randint's upper bound is exclusive, so the
# values fall in 1..14.
X = np.random.randint(1, 15, (10, 6)).astype(float)

# knock out 10 distinct cells (flat positions drawn without replacement)
missing_idx = np.random.choice(X.size, 10, replace=False)
X.ravel()[missing_idx] = np.nan

# fill each missing value from its 2 nearest neighbours
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[10. ,  6. , 13. ,  8.5,  7.5, 14. ],
       [ 2. , 11. ,  5. ,  9. ,  7. , 11. ],
       [12. ,  3. ,  7. ,  5. , 14. , 10. ],
       [ 4. , 14. , 13. ,  6. ,  5. ,  6. ],
       [ 6.5,  5. ,  2. ,  2. ,  3. ,  3. ],
       [ 7. , 12. ,  2. ,  3. ,  7. ,  1. ],
       [10. , 12. ,  2. ,  3. ,  4. ,  2. ],
       [14. ,  4. ,  1. ,  1. , 12. ,  5. ],
       [13. , 10. , 14. ,  8. ,  1. , 11. ],
       [ 6. , 12. ,  2. ,  2. ,  2. ,  2. ]])

In [None]:
# One shared encoder and one shared imputer for the cells below.
# NOTE(review): neither OrdinalEncoder nor KNN is imported in the visible
# notebook - presumably sklearn.preprocessing.OrdinalEncoder and a
# third-party KNN imputer; confirm and add the imports.
encoder = OrdinalEncoder()
imputer = KNN()

# categorical columns that get ordinal-encoded before imputation
cat_cols = [
    'embarked',
    'class1',
    'deck1',
    'who',
    'embark_town',
    'sex',
    'adult_male',
    'alive',
    'alone',
]

def encode(data):
    '''Ordinal-encode the non-null entries of a Series, leaving nulls in place.

    The module-level ``encoder`` is re-fitted on the non-null values of
    ``data`` and the resulting codes are written back over exactly those
    positions, so missing entries stay missing.

    :param data: pandas Series holding one categorical column
    :return: the same Series object, modified in place
    '''
    # positions that actually carry a category
    present = data.notnull()
    # an all-null (or empty) column has nothing to encode; scikit-learn
    # estimators reject zero-sample input, so hand the column back untouched
    if not present.any():
        return data
    # the encoder expects a 2-D array of shape (n_samples, 1)
    values = np.asarray(data[present]).reshape(-1, 1)
    # encode data
    codes = encoder.fit_transform(values)
    # flatten the (n_samples, 1) result and overwrite only the non-null slots
    data.loc[present] = codes.ravel()
    return data

# Encode every categorical column. The Series returned by encode() is assigned
# back explicitly so the frame is updated even when impute_data[columns] hands
# out a copy (e.g. under pandas copy-on-write) rather than a view.
for columns in cat_cols:
    impute_data[columns] = encode(impute_data[columns])

# impute the missing values, round the imputed values to whole codes, and
# rebuild a dataframe with the original column names
encode_data = pd.DataFrame(
    np.round(imputer.fit_transform(impute_data)),
    columns=impute_data.columns,
)

# Feature selection

In [None]:
from sklearn.feature_selection import VarianceThreshold
# placeholder: replace the Ellipsis with the real feature matrix
data = ...
# learn the per-column variances, then drop the near-constant columns
var_thresh = VarianceThreshold(threshold=0.1)
var_thresh.fit(data)
transformed_data = var_thresh.transform(data)
# transformed_data keeps only the columns whose variance is above
# the 0.1 threshold

Univariate feature selection is nothing but scoring each feature against a given target.
Mutual information, the ANOVA F-test and chi2 are some of the most popular
methods for univariate feature selection. There are two ways of using these in scikit-learn:
- SelectKBest: keeps the top-k scoring features
- SelectPercentile: keeps the top-scoring features that fall within a percentage
specified by the user

It must be noted that you can use chi2 only for data which is non-negative in nature. 

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile

class UnivariateFeatureSelction:
    """
    Thin wrapper that picks a scikit-learn univariate selector
    (SelectKBest or SelectPercentile) and a scoring function from
    simple arguments, and forwards fit / transform / fit_transform to it.
    """

    def __init__(self, n_features, problem_type, scoring):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from
        scikit-learn.
        :param n_features: SelectPercentile if float else SelectKBest
        :param problem_type: classification or regression
        :param scoring: scoring function, string
        :raises ValueError: if scoring is not valid for the problem type
        :raises TypeError: if n_features is neither int nor float
        """
        # for a given problem type, there are only
        # a few valid scoring methods
        # you can extend this with your own custom
        # methods if you wish
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        else:
            # anything that is not "classification" is treated as regression
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }

        # raise exception if we do not have a valid scoring method
        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        # if n_features is int, we use selectkbest
        # if n_features is float, we use selectpercentile
        # please note that it is int in both cases in sklearn
        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            # round instead of truncating: 0.29 * 100 is 28.999... in
            # floating point and must become 29, not 28
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=int(round(n_features * 100)),
            )
        else:
            raise TypeError("Invalid type of feature")

    # same fit function
    def fit(self, X, y):
        """Fit the wrapped selector and return whatever its fit returns."""
        return self.selection.fit(X, y)

    # same transform function
    def transform(self, X):
        """Reduce X to the selected features using the fitted selector."""
        return self.selection.transform(X)

    # same fit_transform function
    def fit_transform(self, X, y):
        """Fit the wrapped selector on (X, y) and return the reduced X."""
        return self.selection.fit_transform(X, y)

#### Example :

In [None]:
# a float n_features selects by percentage: 0.1 keeps the top 10% of the
# features, scored here with f_regression
# NOTE(review): X and y are not defined in this cell - they must come from
# an earlier cell of the session; confirm which data is intended
ufs = UnivariateFeatureSelction(n_features=0.1, problem_type="regression", scoring="f_regression")

# fit the selector, then reduce X to the selected columns
ufs.fit(X, y)
X_transformed = ufs.transform(X)

###  recursive feature elimination

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing

# California housing: a regression dataset
data = fetch_california_housing()
X, y = data["data"], data["target"]
col_names = data["feature_names"]

# plain linear regression is the estimator RFE refits at every step
model = LinearRegression()

# recursive feature elimination that stops at 3 features
rfe = RFE(estimator=model, n_features_to_select=3)

# fit RFE, then keep only the selected columns of X
X_transformed = rfe.fit(X, y).transform(X)

#### greedy feature selection.

In [None]:
# greedy.py
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.datasets import make_classification

class GreedyFeatureSelection:
    """
    A simple and custom class for greedy feature selection.
    You will need to modify it quite a bit to make it suitable
    for your dataset.

    Features are added one at a time: each round tries every unused
    feature together with the ones already chosen and keeps the one
    that gives the highest score.
    """
    def evaluate_score(self, X, y):
        """
        This function evaluates model on data and returns
        Area Under ROC Curve (AUC)
        NOTE: We fit the data and calculate AUC on same data.
        WE ARE OVERFITTING HERE.
        But this is also a way to achieve greedy selection.
        k-fold will take k times longer.
        If you want to implement it in really correct way,
        calculate OOF AUC and return mean AUC over k folds.
        This requires only a few lines of change and has been
        shown a few times in this book.
        :param X: training data
        :param y: targets
        :return: overfitted area under the roc curve
        """
        # fit the logistic regression model,
        # and calculate AUC on same data
        # again: BEWARE
        # you can choose any model that suits your data
        model = linear_model.LogisticRegression()
        model.fit(X, y)
        # probability of the positive class
        predictions = model.predict_proba(X)[:, 1]
        auc = metrics.roc_auc_score(y, predictions)
        return auc

    def _feature_selection(self, X, y):
        """
        This function does the actual greedy selection
        :param X: data, numpy array
        :param y: targets, numpy array
        :return: (best scores, best features)

        Selection stops when, after more than two rounds, the newest
        score is lower than the previous one; that last, non-improving
        feature is then dropped from the result. It also stops when
        every feature has been used or when no remaining feature
        scores above 0; in those cases everything selected so far is
        returned.
        """
        # initialize good features list
        # and best scores to keep track of both
        good_features = []
        best_scores = []

        # calculate the number of features
        num_features = X.shape[1]

        # each round adds exactly one feature, so bounding the loop by the
        # feature count guarantees termination (the previous unbounded loop
        # spun forever once the features ran out without a score drop)
        while len(good_features) < num_features:
            # initialize best feature and score of this loop
            this_feature = None
            best_score = 0
            # loop over all features
            for feature in range(num_features):
                # if feature is already in good features, skip this for loop
                if feature in good_features:
                    continue
                # selected features are all good features till now and current feature
                selected_features = good_features + [feature]
                # remove all other features from data
                xtrain = X[:, selected_features]
                # calculate the score, in our case, AUC
                score = self.evaluate_score(xtrain, y)
                # if score is greater than the best score
                # of this loop, change best score and best feature
                if score > best_score:
                    this_feature = feature
                    best_score = score

            # no candidate scored above 0: nothing more can be added
            if this_feature is None:
                break

            # we have selected a feature: add it to the good feature list
            # and update best scores list
            good_features.append(this_feature)
            best_scores.append(best_score)

            # if the newest feature made the score worse, stop and drop it:
            # the last entry is the one that did not improve the score
            if len(best_scores) > 2 and best_scores[-1] < best_scores[-2]:
                return best_scores[:-1], good_features[:-1]

        # features exhausted (or no usable candidate): keep everything selected
        return best_scores, good_features

    def __call__(self, X, y):
        """
        Call function will call the class on a set of arguments
        :param X: data, numpy array
        :param y: targets, numpy array
        :return: (X restricted to the selected columns, best scores)
        """
        # select features, return scores and selected indices
        scores, features = self._feature_selection(X, y)
        # transform data with selected features
        return X[:, features], scores
if __name__ == "__main__":
    # synthetic binary classification problem: 1000 samples, 100 features
    X, y = make_classification(n_samples=1000, n_features=100)
    # calling the selector returns the reduced data and the best scores
    selector = GreedyFeatureSelection()
    X_transformed, scores = selector(X, y)

The greedy feature selection, implemented this way, returns the scores and a list of
feature indices.

### Feature importance

In [None]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
# Diabetes data: a regression dataset where we predict diabetes progression
# after one year based on some features
data = load_diabetes()
X, y = data["data"], data["target"]
col_names = data["feature_names"]

# initialize the model and fit it
model = RandomForestRegressor()
model.fit(X, y)

# order the features from least to most important so the longest bar ends
# up at the top of the horizontal bar chart
# NOTE(review): np and plt are not imported in this cell - presumably numpy
# and matplotlib.pyplot from elsewhere in the session; confirm
importances = model.feature_importances_
idxs = np.argsort(importances)
bar_positions = range(len(idxs))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[idxs], align='center')
plt.yticks(bar_positions, [col_names[i] for i in idxs])
plt.xlabel('Random Forest Feature Importance')
plt.show()

## Select from model

In [None]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
# Diabetes data: a regression dataset where we predict diabetes progression
# after one year based on some features
data = load_diabetes()
X, y = data["data"], data["target"]
col_names = data["feature_names"]

# initialize the model
model = RandomForestRegressor()

# let SelectFromModel fit the model and keep the features it selects
sfm = SelectFromModel(estimator=model)
X_transformed = sfm.fit_transform(X, y)

# boolean mask with one entry per input feature: True where it was kept
support = sfm.get_support()

# print the names of the selected features
selected_names = [name for name, kept in zip(col_names, support) if kept]
print(selected_names)