<a href="https://colab.research.google.com/github/kalyannith2/Deployment/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Univariate Feature Selection

In [None]:
# Univariate feature selection: score every input column against the class
# label with the chi-squared test and keep the k highest-scoring columns.
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data; `names` supplies the nine column labels passed to read_csv
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# columns 0-7 are the input features, column 8 ('class') is the target
X = array[:,0:8]
Y = array[:,8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)   # k = number of top-scoring columns to keep
fit = test.fit(X, Y)
# summarize scores (one chi-squared score per input column, printed to 3 decimals)
set_printoptions(precision=3)
print(fit.scores_)
# reduce X to the k selected columns
features = fit.transform(X)
print(features)


# Score functions that can be passed as score_func:
# For regression: f_regression, mutual_info_regression
# For classification: chi2, f_classif, mutual_info_classif

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 ...
 [121.  112.   26.2  30. ]
 [126.    0.   30.1  47. ]
 [ 93.    0.   30.4  23. ]]


In [None]:
# Preview the first rows of the loaded data to check the column labels.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Tabulate the chi-squared score of each feature, highest score first.
# NOTE: `fit` must still be the fitted SelectKBest object here; the RFE cell
# further down rebinds `fit`, so run this cell before that one.
import pandas as pd
# feature labels in the same order as the columns of X
fn= ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
df=pd.DataFrame(fit.scores_,index=fn)
# the single unnamed column is labelled 0, hence by=0
df.sort_values(by=0,ascending=False)

Unnamed: 0,0
test,2175.565273
plas,1411.887041
age,181.303689
mass,127.669343
preg,111.519691
skin,53.10804
pres,17.605373
pedi,5.392682


#### Recursive Feature Elimination

In [None]:
# Feature selection with Recursive Feature Elimination (RFE) around a
# logistic-regression estimator.
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data; `names` supplies the nine column labels passed to read_csv
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# columns 0-7 are the input features, column 8 ('class') is the target
X = array[:,0:8]
Y = array[:,8]
# feature extraction
# NOTE(review): max_iter=400 is presumably raised so the solver converges on
# these unscaled features -- confirm.
model = LogisticRegression(max_iter=400)
rfe = RFE(estimator=model, n_features_to_select=4)
# NOTE: this rebinds `fit`, which previously held the SelectKBest result
fit = rfe.fit(X, Y)


In [None]:
# Number of features RFE kept (requested via n_features_to_select=4)
fit.n_features_

np.int64(4)

In [None]:
# Selected features: boolean mask in X's column order, True = feature kept
fit.support_

array([ True,  True, False, False, False,  True,  True, False])

In [None]:
# Feature ranking in X's column order; the selected features have rank 1
fit.ranking_

array([1, 1, 3, 5, 4, 1, 1, 2])

In [None]:
import pandas as pd
# Side-by-side summary of the RFE result: one row per input feature with its
# selection flag and its rank. `fit` must be the fitted RFE object here.
# The mapping is named rfe_summary rather than `dict` so the builtin dict
# type is not shadowed for the rest of the notebook session.
rfe_summary={'fn':['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age'],
      'Support':fit.support_,
      'Ranking':fit.ranking_}
df=pd.DataFrame(rfe_summary)
df

Unnamed: 0,fn,Support,Ranking
0,preg,True,1
1,plas,True,1
2,pres,False,3
3,skin,False,5
4,test,False,4
5,mass,True,1
6,pedi,True,1
7,age,False,2


#### Feature Importance using Decision Tree

In [None]:
# Feature importance from a single decision tree classifier
from pandas import read_csv
from sklearn.tree import  DecisionTreeClassifier
# load data; `names` supplies the nine column labels passed to read_csv
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# columns 0-7 are the input features, column 8 ('class') is the target
X = array[:,0:8]
Y = array[:,8]
# feature extraction
# random_state is pinned because the tree permutes candidate features at each
# split; without a fixed seed the importances can differ from run to run.
model = DecisionTreeClassifier(random_state=0)
model.fit(X, Y)
# one importance value per input column, in X's column order
print(model.feature_importances_)

[0.05  0.332 0.091 0.015 0.037 0.22  0.144 0.111]


In [None]:
# Tabulate the tree's feature importances, most important first.
# NOTE: `model` must still be the fitted decision tree here; a later cell
# rebinds `model` to a SelectKBest object.
import pandas as pd
# feature labels in the same order as the columns of X
fn= ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
df=pd.DataFrame(model.feature_importances_,index=fn)
# the single unnamed column is labelled 0, hence by=0
df.sort_values(by=0,ascending=False)

Unnamed: 0,0
plas,0.332253
mass,0.220437
pedi,0.143667
age,0.111062
pres,0.091039
preg,0.050259
test,0.036637
skin,0.014647


In [None]:
# Load the second dataset; the first CSV column becomes the row index.
# Relies on `pd` imported in an earlier cell.
data=pd.read_csv("PCA.csv",index_col=0)
data.head()

Unnamed: 0_level_0,SAT,Top10,Accept,SFRatio,Expenses,GradRate
Univ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brown,1310,89,22,13,22704,94
CalTech,1415,100,25,6,63575,81
CMU,1260,62,59,9,25026,72
Columbia,1310,76,24,12,31510,88
Cornell,1280,83,33,13,21864,90


In [None]:
# 'Expenses' is the regression target; every other column is a predictor.
# NOTE: this rebinds X, which earlier held the diabetes feature array.
X=data.drop('Expenses',axis=1)
y=data['Expenses']


In [None]:
# Univariate selection for a regression target: score each predictor against
# y with f_regression and keep the 3 highest-scoring columns.
# NOTE: SelectKBest is not imported in this cell; it relies on the import in
# the first univariate-selection cell having been run.
from sklearn.feature_selection import f_regression

# NOTE: this rebinds `model`, which earlier held the decision tree
model=SelectKBest(score_func=f_regression,k=3)
model.fit(X,y)

In [None]:
# Reduce X to the k=3 columns chosen by the fitted selector.
model.transform(X)

array([[1310,   89,   13],
       [1415,  100,    6],
       [1260,   62,    9],
       [1310,   76,   12],
       [1280,   83,   13],
       [1340,   89,   10],
       [1315,   90,   12],
       [1255,   74,   12],
       [1400,   91,   11],
       [1305,   75,    7],
       [1380,   94,   10],
       [1260,   85,   11],
       [1255,   81,   13],
       [1081,   38,   18],
       [1375,   91,    8],
       [1005,   28,   19],
       [1360,   90,   12],
       [1075,   49,   25],
       [1240,   95,   17],
       [1290,   75,   13],
       [1180,   65,   16],
       [1285,   80,   11],
       [1225,   77,   14],
       [1085,   40,   15],
       [1375,   95,   11]])

In [None]:
# Score of each predictor from f_regression, in X's column order.
model.scores_

array([35.495, 13.735, 10.423, 36.167,  4.216])