In [8]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.base import clone

np.set_printoptions(suppress = True) # no scientific notation

In [9]:
datasource = "datasets/titanic.csv"
print(os.path.exists(datasource))

True


In [10]:
df = pd.read_csv(datasource).sample(frac = 1).reset_index(drop = True)
df.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,survived
0,1,1,70.0,1,1,71.0,1,0
1,3,1,31.0,0,0,7.25,1,0
2,3,0,36.0,0,0,7.8792,2,0
3,3,1,30.0,0,0,8.05,1,0
4,2,1,29.0,1,0,21.0,1,0


In [11]:
X = np.array(df.iloc[:, :-1])

In [12]:
y = np.array(df["survived"])

In [13]:
print(X.shape)

(890, 7)


In [14]:
print(y.shape)

(890,)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## χ² feature selection

* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html

In [16]:
selector = SelectKBest(chi2, k = 5) # top 5 features

In [17]:
selector.fit(X, y)

SelectKBest(k=5, score_func=<function chi2 at 0x000002121F9DDB70>)

In [18]:
print("χ² statistic:", selector.scores_)

χ² statistic: [   25.8018668     84.16856435    15.32604597     2.60751209    13.72335723
  4679.1690005      6.00473871]


In [19]:
print("Column indices:", selector.get_support(True))

Column indices: [0 1 2 4 5]


In [21]:
selectedColumnNames = np.array(df.columns[selector.get_support(True)])
print("Column names:", selectedColumnNames)

Column names: ['pclass' 'sex' 'age' 'parch' 'fare']


In [24]:
X_train_selected = selector.transform(X_train)
print(X_train_selected.shape)

(712, 5)


In [25]:
X_test_selected = selector.transform(X_test)
print(X_test_selected.shape)

(178, 5)


In [32]:
# This is the selected score
model = GaussianNB()
model.fit(X_train_selected, y_train)
selectedFeaturesScore = model.score(X_test_selected, y_test)
print("Selected features score:", selectedFeaturesScore)

Selected features score: 0.792134831461


In [31]:
# This is the score without feature selection
model = GaussianNB()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print("Score without feature selection:", score)

Score without feature selection: 0.769662921348


## Statistical Significance of features

In [33]:
chi2_sklearn, pvalue_sklearn = chi2(X_train, y_train)
print(pvalue_sklearn)

[ 0.00000714  0.          0.00013118  0.02470079  0.00279461  0.
  0.01796204]


In [34]:
chi2_sklearn, pvalue_sklearn = chi2(X_train_selected, y_train)
print(pvalue_sklearn)

[ 0.00000714  0.          0.00013118  0.00279461  0.        ]


## Mutual information
Mutual information is generally considered a robust measure of dependence. It could apply to both regression and clasification problems. 

* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif
* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html#sklearn.feature_selection.mutual_info_regression

In [38]:
def mutual_info_session(X_train, y_train):
    selector = SelectKBest(mutual_info_classif, k = 3)
    selector.fit(X_train, y_train)
    print(selector.get_support(True))
    
    model = GaussianNB()
    model.fit(selector.transform(X_train), y_train)
    return model.score(selector.transform(X_test), y_test)

In [39]:
mutual_info_session(X_train, y_train)

[0 1 5]


0.7921348314606742

## Forward selection
FS is an iterative method in which we start with having no feature in the model. In each iteration, we keep adding the feature which best improves the model

In [40]:
class ForwardSelector(object):
    def __init__(self, estimator):
        self.estimator = estimator
        
    def fit(self, X, y, k):
        selected = np.zeros(X.shape[1]).astype(bool) # holds indicators of whether each feature is selected
        
        score = lambda X_features: clone(self.estimator).fit(X_features, y).score(X_features, y)
        # fit and score model based on some subset of features
        
        selected_indices = lambda: list(np.flatnonzero(selected))
        
        while np.sum(selected) < k: # keep looping until k features are selected
            rest_indices = list(np.flatnonzero(~selected)) # indices to unselected columns
            scores = list()
            
            for i in rest_indices:
                feature_subset = selected_indices() + [i]
                s = score(X[:, feature_subset])
                scores.append(s)
            idx_to_add = rest_indices[np.argmax(scores)]
            selected[idx_to_add] = True
        self.selected = selected.copy()
        return self
    
    def transform(self, X):
        return X[:, self.selected]
    
    def get_support(self, indices = False):
        return np.flatnonzero(self.selected) if indices else self.selected

In [44]:
def forward_selection_session(X_train, y_train):
    model = GaussianNB()
    selector = ForwardSelector(model)
    selector.fit(X_train, y_train, 5)
    print(selector.get_support(True))
    
    model = GaussianNB()
    model.fit(selector.transform(X_train), y_train)
    return model.score(selector.transform(X_test), y_test)

In [45]:
forward_selection_session(X_train, y_train)

[1 2 3 4 5]


0.8033707865168539

## Recursive feature elimination
RFE is an even more greedy algo provided by sklearn. It's good at performing feature subset with high efficiency. The importance of each feature is obtain either through a coef_ attribute or through a feature_importances_ attribute. So in order for RFE to work, the model is required to provide either of these attributes

In [51]:
def rfe_session(X_train, y_train):
    from sklearn.svm import SVC
    model = SVC(kernel = "linear")
    selector = RFE(model, 5)
    selector.fit(X_train, y_train)
    print(selector.get_support(True))
    
    model = SVC(kernel = "linear")
    model.fit(selector.transform(X_train), y_train)
    return model.score(selector.transform(X_test), y_test)

In [52]:
rfe_session(X_train, y_train)

[0 1 3 4 6]


0.8258426966292135