In [64]:
! pip install skfeature-chappers

Collecting skfeature-chappers
  Downloading skfeature_chappers-1.1.0-py3-none-any.whl (66 kB)
[?25l[K     |█████                           | 10 kB 25.1 MB/s eta 0:00:01[K     |█████████▉                      | 20 kB 31.6 MB/s eta 0:00:01[K     |██████████████▉                 | 30 kB 29.9 MB/s eta 0:00:01[K     |███████████████████▊            | 40 kB 23.4 MB/s eta 0:00:01[K     |████████████████████████▊       | 51 kB 15.1 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 61 kB 15.3 MB/s eta 0:00:01[K     |████████████████████████████████| 66 kB 2.9 MB/s 
Installing collected packages: skfeature-chappers
Successfully installed skfeature-chappers-1.1.0


In [68]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import (
    RFE,
    SelectFpr,
    SelectFromModel,
    SelectKBest,
    SelectPercentile,
    SequentialFeatureSelector,
    VarianceThreshold,
    chi2,
    mutual_info_classif,
    mutual_info_regression,
)
from sklearn.linear_model import LogisticRegression

from skfeature.function.similarity_based import fisher_score, lap_score
from skfeature.utility import construct_W

In [30]:
# Load the Pima Indians diabetes CSV. The file ships without a header row,
# so the column labels are supplied explicitly: eight predictors plus the
# target column `class`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(url, header=None, names=names)

# Preview the first five rows as a sanity check.
dataframe.head(5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [66]:
# Split the raw value matrix into predictors and target:
# X = the first eight columns, Y = the ninth column (`class`).
# Both are sliced from the same `.values` array so they share its dtype.
X, Y = dataframe.values[:, :8], dataframe.values[:, 8]

# Unsupervised feature selection (the target Y is not used)

In [38]:
# Feature selection with VarianceThreshold.
# The cut-off is p * (1 - p) with p = 0.8, i.e. the variance of a Bernoulli
# variable that takes the same value 80% of the time.
# NOTE(review): that rule of thumb is defined for boolean features; the
# columns here are numeric measurements -- confirm the cut-off is intended.
P_DOMINANT = .8
selector = VarianceThreshold(threshold=P_DOMINANT * (1 - P_DOMINANT))
X_new = selector.fit_transform(X)

# Boolean mask over the input columns: True means the column was kept.
print(selector.get_support())

In [None]:
# Laplacian score.
# First build the affinity matrix W over the samples (5 nearest neighbours,
# euclidean metric, heat-kernel weights with t=1), then score every feature
# against that graph.
kwargs_W = dict(
    metric="euclidean",
    neighbor_mode="knn",
    weight_mode="heat_kernel",
    k=5,
    t=1,
)
W = construct_W.construct_W(X, **kwargs_W)

score = lap_score.lap_score(X, W=W)
print(score)

# Supervised feature selection (uses the target Y)

In [9]:
# Univariate selection with the chi-squared statistic, two variants.
# Both variants write to `best_features` / `X_new`, so only the second
# (percentile-based) result is left in those names after this cell.

# Variant 1: keep the 4 highest-scoring features.
best_features = SelectKBest(chi2, k=4)
X_new = best_features.fit(X, Y).transform(X)

# Variant 2: keep the top 20 percent of the features.
best_features = SelectPercentile(chi2, percentile=20)
X_new = best_features.fit(X, Y).transform(X)

In [49]:
# Select the top 4 features by mutual information with the target.
# Y is the binary `class` label, so the classification estimator
# (`mutual_info_classif`) is the appropriate one; `mutual_info_regression`
# is meant for a continuous target.
# The estimator is randomised, so a fixed random_state is bound to make the
# selected mask reproducible across runs.
selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=4)
selector.fit(X, Y)

# Boolean mask over the input columns: True means the column was selected.
print(selector.get_support())

[ True  True False False False  True False  True]


In [61]:
# Compare the shape of the full feature matrix with the shape of the most
# recently computed reduced matrix (whatever `X_new` currently holds).
for label, matrix in (("Old features = ", X), ("New features = ", X_new)):
    print(label, matrix.shape)

Old features =  (768, 8)
New features =  (768, 1)


In [59]:
# Recursive Feature Elimination (RFE):
# fit a logistic regression and drop one feature per step (step=1)
# until 5 features remain.
model = LogisticRegression(max_iter=1000)
selector = RFE(estimator=model, n_features_to_select=5, step=1).fit(X, Y)

# Report how many features were kept, the boolean selection mask, and the
# per-feature ranking.
for template, value in (
    ("Num Features: %s", selector.n_features_),
    ("Selected Features: %s", selector.support_),
    ("Feature Ranking: %s", selector.ranking_),
):
    print(template % (value))

Num Features: 5
Selected Features: [ True  True False False False  True  True  True]
Feature Ranking: [1 1 2 4 3 1 1 1]


In [58]:
# Sequential Feature Selection, backward direction:
# logistic regression evaluated with 10-fold cross-validation,
# stopping once 3 features are left.
sfs_selector = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),
    n_features_to_select=3,
    direction='backward',
    cv=10,
)
sfs_selector.fit(X, Y)

# Boolean mask over the input columns: True means the column was kept.
print(sfs_selector.get_support())

[False  True False False False  True False  True]


In [60]:
# SelectFromModel: fit a logistic regression and keep the features whose
# importance passes the selector's threshold.
selector = SelectFromModel(LogisticRegression(max_iter=1000)).fit(X, Y)
X_new = selector.transform(X)

# Show the cut-off value the fitted selector actually applied.
print("threshold_ = ", selector.threshold_)

threshold_ =  0.14310081135922267


In [67]:
# Fisher score of every feature with respect to the class labels Y.
# NOTE(review): the captured output is a permutation of the column indices
# 0..7, which suggests this call returns a ranking rather than raw scores --
# confirm against the skfeature-chappers documentation.
score = fisher_score.fisher_score(
    X,
    Y,
)
print(score)

[6 2 0 7 1 3 4 5]


In [None]:
# SelectFpr
# FPR test stands for False Positive Rate test.
# It controls the total amount of false detections.
# Here the chi-squared test is used with alpha=0.01.
fpr_filter = SelectFpr(chi2, alpha=0.01)
X_new = fpr_filter.fit_transform(X, Y)