#### Univariate Feature Selection

In [33]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# ---- Load the data file ----
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)   # column labels are supplied by `names`
array = dataframe.values

# Columns 0-7 are the independent variables; column 8 ('class') is the target.
X, Y = array[:, :8], array[:, 8]

# ---- Univariate selection ----
# One chi-squared test per feature. H0: the feature is independent of the class.
# A large score is evidence against H0 (reject); a small score gives no reason
# to reject it. k=4 keeps the four highest-scoring features.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)
features = fit.transform(X)                   # X reduced to the 4 kept columns

# ---- Report ----
set_printoptions(precision=3)                 # cosmetic only: 3 decimals in printed arrays
print(fit.scores_)                            # one score per column of X, in `names` order

# Score functions available in sklearn.feature_selection:
#   regression:     f_regression, mutual_info_regression
#   classification: chi2, f_classif, mutual_info_classif

# names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# Four highest scores in this cell's printed output ---> plas, test, mass, age
# ('class' is the target, so it is never a candidate feature.)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [34]:
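# [preg,    plas,     pres,    skin,   test,     mass,      pedi,   age ]      (class is the target, so it has no score)
# [111.52  1411.887   17.605   53.108  2175.565  127.669    5.393  181.304 ]
# These are all chi-squared scores (a higher score means a stronger dependence on class)
# Important = plas, test, mass, age      (the four features kept with k=4)
# Not imp.  = pedi, pres                 (the two lowest scores)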

In [35]:
# Display the reduced feature matrix built by the SelectKBest cell.
# Columns are presumably plas, test, mass, age (the four highest chi-squared
# scores) -- confirm with fit.get_support() before `fit` is rebound by a later cell.
features

# `features` was assigned from fit.transform(X), so calling that again gives the same array.

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

#### Recursive Feature Elimination

In [36]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


# ---- Load the data file ----
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X, Y = array[:, :8], array[:, 8]      # first 8 columns -> predictors, column 8 -> target

# ---- Recursive feature elimination ----
# RFE fits the estimator repeatedly, dropping the weakest feature each round,
# until n_features_to_select features are left.
# max_iter=400: presumably raised above the default so the solver converges
# on this unscaled data -- TODO confirm.
model = LogisticRegression(max_iter=400)
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, Y)                   # NOTE: rebinds `fit`; it no longer refers to the SelectKBest result

# The kept subset is the one RFE judges most useful to the logistic-regression model.
# Per the fit.support_ output shown further down ---> preg, plas, mass, pedi


In [37]:

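# [ True,   True,    False,  False,  False,  True,  True,  False]     <- RFE support mask (fit.support_)
# [preg,    plas,    pres,   skin,   test,   mass,  pedi,  age,  class]
# [0.058    0.306    0.083   0.016   0.042   0.255  0.127  0.113]     <- NOTE: not RFE output; same values as the decision-tree importances quoted later
# Selected (True) ---> preg, plas, mass, pedi     (age is the next best: RFE rank 2)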

In [38]:
# Number of features RFE kept. `fit` here is the fitted RFE object from the
# RFE cell (the name was rebound there; it is no longer the SelectKBest fit).
fit.n_features_

4

In [39]:
# Boolean mask over the 8 input columns, in the order of names[:8]:
# True = kept by RFE, False = eliminated.
fit.support_

array([ True,  True, False, False, False,  True,  True, False])

In [40]:
# RFE rank for each input column, in the order of names[:8]:
# 1 = selected; a larger number means the feature was eliminated earlier.
fit.ranking_

array([1, 1, 3, 5, 4, 1, 1, 2])

#### Feature Importance using Decision Tree

In [41]:
# Feature Importance with a Decision Tree Classifier
from pandas import read_csv
from sklearn.tree import  DecisionTreeClassifier
# Load the data file; column labels are supplied by `names`.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]    # first 8 columns: predictors
Y = array[:,8]      # column 8 ('class'): target

# Feature extraction: fit a single decision tree and read its impurity-based
# feature importances (one value per column of X, in `names` order).
# random_state is fixed so the importances are reproducible: an unseeded tree
# permutes the features at every split, so tied splits can resolve differently
# and the importances change slightly from run to run.
model = DecisionTreeClassifier(random_state=0)
model.fit(X, Y)
# Printed precision follows the current numpy print options
# (the chi-squared cell earlier in this notebook sets precision=3).
print(model.feature_importances_)

[0.063 0.334 0.082 0.012 0.046 0.221 0.12  0.122]


In [42]:
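# [preg,    plas,    pres,   skin,   test,   mass,  pedi,  age,   class]
# [0.058    0.306    0.083   0.016   0.042   0.255  0.127  0.113]
# (Importances copied from an earlier run. They differ slightly from the printed output,
#  likely because a decision tree without a fixed random_state can vary between runs.)
# Important ---> plas, mass, pedi, age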

### Compare the three methods
Which features are important?


In [43]:
# Chi-squared (SelectKBest)  ---> plas, test, mass, age
# RFE (logistic regression)  ---> preg, plas, mass, pedi      (age is next: RFE rank 2)
# Decision tree importances  ---> plas, mass, pedi, age

# Final important features   ---> plas, preg, pedi, mass, age