In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's bundled breast cancer dataset and wrap its pieces in
# DataFrames: one column per measurement, plus a single-column label frame.
bc_data = load_breast_cancer()
bc_features = pd.DataFrame(data=bc_data.data, columns=bc_data.feature_names)
# NOTE(review): scikit-learn documents target 0 as malignant and 1 as benign,
# so the 'IsMalignant' label looks inverted -- confirm before interpreting it.
bc_classes = pd.DataFrame(data=bc_data.target, columns=['IsMalignant'])

In [None]:
# Number of feature columns in the dataset.
bc_features.shape[1]

# Statistics

In [None]:
# Assemble the feature matrix and the 1-D label vector used by the
# feature-selection and modelling cells that follow.
bc_X = bc_features
bc_y = bc_classes.iloc[:, 0].to_numpy(copy=True)
print('Feature set shape:', bc_X.shape)
print('Response class shape:', bc_y.shape)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the labels with the chi-squared statistic and
# mark the 10 highest-scoring ones as selected.
skb = SelectKBest(chi2, k=10)
skb.fit(bc_X, bc_y)

In [None]:
# Pair each feature name with its chi-squared score, then show the ten
# highest-scoring pairs in descending order.
feature_scores = list(zip(bc_data.feature_names, skb.scores_))
sorted(feature_scores, key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Boolean mask over the original columns marking the SelectKBest picks.
select_features_kbest = skb.get_support()
feature_names_kbest = bc_data.feature_names[select_features_kbest]

# Restrict the feature frame to those columns and keep a NumPy copy for modelling.
feature_subset_df = bc_features.loc[:, feature_names_kbest]
bc_SX = np.array(feature_subset_df)

print(bc_SX.shape)
print(feature_names_kbest)

In [None]:
# Preview rows 20-24 of the selected-feature frame, rounded to two decimals.
feature_subset_df.iloc[20:25].round(2)

In [None]:
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Ignore FutureWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Logistic regression with default settings, shared by both evaluations.
lr = LogisticRegression()

# Mean 5-fold accuracy on the complete feature set ...
full_feat_acc = cross_val_score(lr, bc_X, bc_y, scoring='accuracy', cv=5).mean()
# ... and on the SelectKBest subset.
sel_feat_acc = cross_val_score(lr, bc_SX, bc_y, scoring='accuracy', cv=5).mean()

print('Model accuracy statistics with 5-fold cross validation')
print('Model accuracy with complete feature set', bc_X.shape, ':', full_feat_acc)
print('Model accuracy with selected feature set', bc_SX.shape, ':', sel_feat_acc)

In [None]:
# Recursive feature elimination (RFE)
import warnings

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Ignore FutureWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Wrap a fresh logistic regression in RFE, configured with step=1 and a
# target of 10 selected features, and fit it on the full feature set.
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=10, step=1)
rfe.fit(bc_X, bc_y)

In [None]:
# Boolean mask over the original columns marking the features RFE kept.
select_features_rfe = rfe.get_support()
feature_names_rfe = bc_data.feature_names[select_features_rfe]
print(feature_names_rfe)

# Build the model matrix from the RFE subset. The array must come from
# rfe_subset_df: building it from feature_subset_df (the SelectKBest frame)
# would make bc_SX2 a duplicate of bc_SX, so any KBest-vs-RFE comparison
# would score the same features twice.
rfe_subset_df = bc_features[feature_names_rfe]
bc_SX2 = np.array(rfe_subset_df)

In [None]:
# Re-print the SelectKBest feature names so they can be compared by eye with
# the RFE-selected names printed by the previous cell.
print(feature_names_kbest)

In [None]:
# Mean 5-fold cross-validated accuracy of the same classifier on each of the
# two reduced feature matrices.
sel_feat_acc = cross_val_score(lr, bc_SX, bc_y, scoring='accuracy', cv=5).mean()
rfe_feat_acc = cross_val_score(lr, bc_SX2, bc_y, scoring='accuracy', cv=5).mean()

print('Model accuracy statistics with 5-fold cross validation')
print('Model accuracy with KBest selected feature set', bc_SX.shape, ':', sel_feat_acc)
print('Model accuracy with RFE selected feature set', bc_SX2.shape, ':', rfe_feat_acc)

In [None]:
# Model-based selection
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: a random forest is stochastic, so without random_state the
# importance ranking (and anything derived from it) can change on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(bc_X, bc_y)

# Pair each feature name with its importance from the fitted forest.
importance_scores = rfc.feature_importances_
feature_importances = list(zip(bc_data.feature_names, importance_scores))

# Show the ten most important features, highest first.
sorted(feature_importances, key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Evaluate the random-forest-based selection on the same footing as the
# SelectKBest and RFE subsets (10 features each).
# feature_importances holds (name, score) tuples for ALL features in their
# original column order, so rank by score first and keep only the top 10;
# taking every name would just rebuild the full feature set.
top_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)[:10]
feats = [name for name, _ in top_importances]

# Mean 5-fold accuracy of the logistic regression on the selected columns.
Xrfc = bc_features[feats]
np.average(cross_val_score(lr, Xrfc, bc_y, scoring="accuracy", cv=5))