In [37]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from tabulate import tabulate

In [38]:
X, y = load_breast_cancer(return_X_y=True)

In [43]:

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0, shuffle=True) 


'''Creating Pipeline [Feature Scaling & Classification]'''
pipe = Pipeline([
         ("Feature_Scaling", StandardScaler()),
         ("RandomForest_Classifier", RandomForestClassifier(random_state=1, verbose=0)) ])

pipe.fit(X_train, y_train)


# Accuracy
print("Random Forest Accuracy (Sem Seleção de Atributos): ", '{:.2%}'.format(pipe.score(X_val,y_val)))

Random Forest Accuracy (Sem Seleção de Atributos):  94.74%


In [62]:

X_new = pd.DataFrame(SelectKBest(chi2, k=5).fit_transform(X,y))

#Inspect
X_new.head()


Unnamed: 0,0,1,2,3,4
0,122.8,1001.0,153.4,184.6,2019.0
1,132.9,1326.0,74.08,158.8,1956.0
2,130.0,1203.0,94.03,152.5,1709.0
3,77.58,386.1,27.23,98.87,567.7
4,135.1,1297.0,94.44,152.2,1575.0


In [63]:
# Dataset Split
''' Training = 90% & Validation = 10%  '''
test_size = 0.2
X_train, X_val, y_train, y_val = train_test_split(X_new, y, test_size=test_size, random_state=0, shuffle=True) 

# Feature Scale & Train Model
pipe.fit(X_train, y_train)


# Tabulate Data Empty List
tab_data = []

# Accuracy
#print("Random Forest Accuracy (with feature selection): ", '{:.2%}'.format(pipe.score(X_val,y_val)))
tab_data.append(['Chi2', '{:.2%}'.format(pipe.score(X_val,y_val))])

In [64]:
'''Applying SelectKBest with Scoring Function = f_classif to Abalone dataset to retrieve the **4** best features.'''

X_new = pd.DataFrame(SelectKBest(f_classif, k=4).fit_transform(X,y))

# Dataset Split
''' Training = 90% & Validation = 10%  '''
test_size = 0.1
X_train, X_val, y_train, y_val = train_test_split(X_new, y, test_size=test_size, random_state=0, shuffle=True) 

# Feature Scale & Train Model
pipe.fit(X_train, y_train)

# Accuracy
#print("Random Forest Accuracy (with feature selection): ", '{:.2%}'.format(pipe.score(X_val,y_val)))
tab_data.append(['f_classif', '{:.2%}'.format(pipe.score(X_val,y_val))])

In [65]:


'''Applying SelectKBest with Scoring Function = mutual_info_classif to Abalone dataset to retrieve the **4** best features.'''

X_new = pd.DataFrame(SelectKBest(mutual_info_classif, k=4).fit_transform(X,y))

# Dataset Split
''' Training = 90% & Validation = 10%  '''
test_size = 0.1
X_train, X_val, y_train, y_val = train_test_split(X_new, y, test_size=test_size, random_state=0, shuffle=True) 

# Feature Scale & Train Model
pipe.fit(X_train, y_train)

# Accuracy
#print("Random Forest Accuracy (with feature selection): ", '{:.2%}'.format(pipe.score(X_val,y_val)))
tab_data.append(['mutual_info_classif', '{:.2%}'.format(pipe.score(X_val,y_val))])



In [67]:
# Tabulate Data
print("Acurácia do Random Forest (com SelectKBest):\n\n", tabulate(tab_data, headers=['Scoring_Func', 'Accuracy']))

Acurácia do Random Forest (com SelectKBest):

 Scoring_Func         Accuracy
-------------------  ----------
Chi2                 95.61%
f_classif            94.74%
mutual_info_classif  92.98%
