<a href="https://colab.research.google.com/github/jackty9/Feature_Selection_in_Python/blob/master/Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Numerical input, numerical output

In [None]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import pandas as pd

# generate dataset
X, y = make_regression(n_samples=100, n_features=50, n_informative=10)
#assign column names 
col_list = ['col_' + str(x) for x in range(0,50)]
#create a dataframe table
df = pd.DataFrame(X, columns=col_list)

#feature selection using f_regression 
fs = SelectKBest(score_func=f_regression, k=5)
fit = fs.fit(X,y)
#create df for scores
dfscores = pd.DataFrame(fit.scores_)
#create df for column names
dfcolumns = pd.DataFrame(df.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
#naming the dataframe columns
featureScores.columns = ['Selected_columns','Score_regression'] 
#print 5 best features
print(featureScores.nlargest(5,'Score_regression'))  

   Selected_columns  Score_regression
12           col_12         43.112463
29           col_29         41.179494
22           col_22         20.525630
8             col_8         17.754656
23           col_23         10.996860


# Numerical input, categorical output

In [1]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import pandas as pd

# generate dataset
X, y = make_classification(n_samples=100, n_features=50, n_informative=10)
#assign column names 
col_list = ['col_' + str(x) for x in range(0,50)]
#create a dataframe table
df = pd.DataFrame(X, columns=col_list)

#feature selection using f_classif
fs = SelectKBest(score_func=f_classif, k=5)
fit = fs.fit(X,y)
#create df for scores
dfscores = pd.DataFrame(fit.scores_)
#create df for column names
dfcolumns = pd.DataFrame(df.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
#naming the dataframe columns
featureScores.columns = ['Selected_columns','Score_ANOVA'] 
#print 5 best features
print(featureScores.nlargest(5,'Score_ANOVA'))  

   Selected_columns  Score_ANOVA
49           col_49    45.921815
31           col_31    30.376094
1             col_1    16.660251
25           col_25     8.463626
4             col_4     6.975909


# Categorical input, categorical output

## Chi-squared

In [44]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd

#import raw data, data can be found in Github directory 
df = pd.read_csv("car_data.csv")
X = df1.iloc[:,0:5]
X = pd.get_dummies(X)
y = df1.iloc[:,-1]
y = pd.get_dummies(y)

#feature selection using chi2
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
#create df for scores
dfscores = pd.DataFrame(fit.scores_)
#create df for column names
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
#naming the dataframe columns
featureScores.columns = ['Selected_columns','Score_chi2'] 
#print 5 best features
print(featureScores.nlargest(5,'Score_chi2')) 

   Selected_columns  Score_chi2
12        persons_2  246.585124
1     buy_price_low   86.823364
13        persons_4   69.516298
5   maint_price_low   57.868044
14     persons_more   55.235306


## Mutual Info for classification

In [45]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

#import raw data, data can be found in Github directory 
df1 = pd.read_csv("car_data.csv")
X = df1.iloc[:,0:5]
X = pd.get_dummies(X)
y = df1.iloc[:,-1]

#feature selection using mutual_info_classif
bestfeatures = SelectKBest(score_func=mutual_info_classif, k=5)
fit = bestfeatures.fit(X,y)
#create df for scores
dfscores = pd.DataFrame(fit.scores_)
#create df for column names
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
#naming the dataframe columns
featureScores.columns = ['Selected_columns','Score_MutualInfo']  
#print 5 best features
print(featureScores.nlargest(5,'Score_MutualInfo'))  

     Selected_columns  Score_MutualInfo
12          persons_2          0.151740
17     lug_boot_small          0.024761
7   maint_price_vhigh          0.022919
0      buy_price_high          0.022710
14       persons_more          0.019861
