In [None]:
""" 
What? Feature selection.

Feature selection is a process where you automatically select 
those features in your data that contribute most to the prediction 
variable or output in which you are interested. Having irrelevant 
features in your data can decrease the accuracy of many models,
especially linear algorithms like linear and logistic regression. 

Three benefits are:
    [1] Reduces Overfitting
    [2] Improves Accuracy
    [3] Reduces Training Time
   
"""

In [1]:
# Import python modules
from pandas import read_csv
from numpy import set_printoptions
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from IPython.display import Markdown, display
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Additional functions
def myPrint(string, c = "blue"):
    """Display a message as bold, colored Markdown text.

    The message is wrapped in ``**...**`` (Markdown bold) inside an HTML
    ``<span>`` whose CSS color is ``c``, and the result is rendered with
    ``display(Markdown(...))``.

    Parameters
    ----------
    string : object
        Message to show. It is formatted into the template, so numbers and
        other non-string objects can be passed directly.
    c : str, optional
        CSS color used for the text (default ``"blue"``).
    """
    # Formatting (instead of '**' + string + '**') avoids a TypeError when
    # the message is not a str; for str input the result is unchanged.
    bold_text = "**{}**".format(string)
    colorstr = "<span style='color:{}'>{}</span>".format(c, bold_text)
    display(Markdown(colorstr))

def printPythonModuleVersion():
    """Print the version of the main scientific Python modules.

    A heading is shown with ``myPrint`` and then, for each module in turn,
    one line of the form ``name: version`` is printed. A module that cannot
    be imported is reported as ``name: not installed`` instead of aborting
    the whole listing with an ImportError.
    """
    # Imported locally, like the other imports this helper performs.
    import importlib

    myPrint("Checking main python modules version")
    module_names = (
        "scipy",
        "numpy",
        "matplotlib",
        "pandas",
        "statsmodels",
        "sklearn",
        "xgboost",
    )
    for module_name in module_names:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # Optional packages may be absent; keep listing the others.
            print('%s: not installed' % module_name)
            continue
        print('%s: %s' % (module_name, getattr(module, '__version__', 'unknown')))

# Print the versions of the main python modules for this run.
printPythonModuleVersion()

<span style='color:blue'>**Checking main python modules version**</span>

scipy: 1.5.4
numpy: 1.19.4
matplotlib: 3.3.2
pandas: 1.1.4
statsmodels: 0.12.1
sklearn: 0.23.2
xgboostn: 1.2.1


In [3]:
# Load the CSV, assigning the column names ourselves via `names`
filename = './datasetCollections/pima-indians-diabetes.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(filename, names=names)
array = dataframe.values

# The first 8 columns are the inputs; the 9th column ('class') is the label
X, Y = array[:, :8], array[:, 8]

# Report the shapes, the column names and the frame itself
myPrint("Checking size of input and labels")
print("Input's shape: ", X.shape)
print("label's shape: ", Y.shape)
myPrint("Input names")
print(dataframe.columns)
myPrint("Print dataframe")
print(dataframe)

<span style='color:blue'>**Checking size of input and labels**</span>

Input's shape:  (768, 8)
label's shape:  (768,)


<span style='color:blue'>**Input names**</span>

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')


<span style='color:blue'>**Print dataframe**</span>

     preg  plas  pres  skin  test  mass   pedi  age  class
0       6   148    72    35     0  33.6  0.627   50      1
1       1    85    66    29     0  26.6  0.351   31      0
2       8   183    64     0     0  23.3  0.672   32      1
3       1    89    66    23    94  28.1  0.167   21      0
4       0   137    40    35   168  43.1  2.288   33      1
..    ...   ...   ...   ...   ...   ...    ...  ...    ...
763    10   101    76    48   180  32.9  0.171   63      0
764     2   122    70    27     0  36.8  0.340   27      0
765     5   121    72    23   112  26.2  0.245   30      0
766     1   126    60     0     0  30.1  0.349   47      1
767     1    93    70    31     0  30.4  0.315   23      0

[768 rows x 9 columns]


In [24]:
# [1] UNIVARIATE SELECTION

"""
Statistical tests can be used to select those features 
that have the strongest relationship with the output variable. 
The scikit-learn library provides the SelectKBest class 
that can be used with a suite of different statistical tests
to select a specific number of features. The example below uses 
the chi-squared (chi2) statistical test for non-negative features 
to select 4 of the best features
"""

# Score each feature against the label with chi-squared and keep the best 4
test = SelectKBest(k = 4, score_func = chi2)
fit = test.fit(X, Y)

# Print the per-feature scores with 3 digits of precision
set_printoptions(precision=3)

print(fit.scores_)

# Keep only the selected columns and preview the first 5 rows
features = fit.transform(X)
print(features[:5])

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [60]:
# [2] RECURSIVE FEATURE ELIMINATION

"""
The Recursive Feature Elimination (or RFE) works by recursively removing
attributes and building a model on those attributes that remain.
The example below uses RFE with the logistic regression algorithm to 
select the top 3 features.
"""

# Recursive feature elimination driven by logistic regression, keeping 3 features
model = LogisticRegression(max_iter = 500)
rfe = RFE(estimator = model, n_features_to_select = 3)
fit = rfe.fit(X, Y)

myPrint("No of features selected")
print(fit.n_features_)
myPrint("Complete map")
# zip stops at the shortest input, so the trailing 'class' column name
# (which has no support/ranking entry) is skipped.
for column, selected, rank in zip(dataframe.columns, fit.support_, fit.ranking_):
    print("Feature: ", column, " selected? ", selected, " rank? ", rank)


<span style='color:blue'>**No of features selected**</span>

3


<span style='color:blue'>**Complete map**</span>

Feature:  preg  selected?  True  rank?  1
Feature:  plas  selected?  False  rank?  2
Feature:  pres  selected?  False  rank?  4
Feature:  skin  selected?  False  rank?  6
Feature:  test  selected?  False  rank?  5
Feature:  mass  selected?  True  rank?  1
Feature:  pedi  selected?  True  rank?  1
Feature:  age  selected?  False  rank?  3


In [66]:
# [3] FEATURE EXTRACTION WITH PCA

"""
Principal Component Analysis (or PCA) uses linear algebra to transform the 
dataset into a compressed form. Generally this is called a data reduction
technique. A property of PCA is that you can choose the number of dimensions
or principal components in the transformed result. In the example below, we 
use PCA and select 3 principal components.

Note: the dataset has 8 input features, so 8 dimensions are reduced to 3.
"""

# feature extraction: fit a PCA that keeps 3 principal components of X
pca = PCA(n_components = 3)
fit = pca.fit(X)
# summarize components
# The ratios are interpolated into the message; passing them as a second
# print() argument left a literal "%s" in the output.
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))
print(fit.components_)
# Shape of the component matrix, followed by the shape of the input for comparison
print(fit.components_.shape)
print(X.shape)

Explained Variance: %s [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]
(3, 8)
(768, 8)


In [72]:
# [4] FEATURE IMPORTANCE WITH EXTRA TREES CLASSIFIER

"""
Ensembles of decision trees like Random Forest and Extra Trees 
can be used to estimate the importance of features.
"""

# feature extraction
# Extra Trees is randomized; a fixed seed (arbitrary value) keeps the reported
# importances identical across "Restart & Run All".
model = ExtraTreesClassifier(random_state = 7)
model.fit(X, Y)
print(model.feature_importances_)

myPrint("Complete map")
# Running total of the importances printed so far.
cumulative_importance = 0
# zip stops at the shortest input, so the trailing 'class' column name
# (which has no importance entry) is skipped.
for column, importance in zip(dataframe.columns, model.feature_importances_):
    print("Feature: -->>", column, "<<-- level of importance? ", importance)
    cumulative_importance += importance
    print("Cumulative importance value: ", cumulative_importance)


[0.111 0.239 0.102 0.08  0.072 0.137 0.117 0.141]


<span style='color:blue'>**Complete map**</span>

Feature: -->> preg <<-- level of importance?  0.1110532119993402
Cumulative importance value:  0.1110532119993402
Feature: -->> plas <<-- level of importance?  0.23938614723873058
Cumulative importance value:  0.3504393592380708
Feature: -->> pres <<-- level of importance?  0.10236910037265962
Cumulative importance value:  0.4528084596107304
Feature: -->> skin <<-- level of importance?  0.07998599838504351
Cumulative importance value:  0.532794457995774
Feature: -->> test <<-- level of importance?  0.07196102594014515
Cumulative importance value:  0.6047554839359192
Feature: -->> mass <<-- level of importance?  0.13665768360416575
Cumulative importance value:  0.7414131675400849
Feature: -->> pedi <<-- level of importance?  0.11720534875446005
Cumulative importance value:  0.858618516294545
Feature: -->> age <<-- level of importance?  0.14138148370545522
Cumulative importance value:  1.0000000000000002
