In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier


In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
data=load_breast_cancer()

In [4]:
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [6]:
x=pd.DataFrame(data=data.data,columns=data.feature_names)

In [7]:
x.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
y=data.target

In [9]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=0)

In [10]:
x_train.shape,x_test.shape

((455, 30), (114, 30))

In [11]:
#feature selection using random forest classifier
sel=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1))
sel.fit(x_train,y_train)
sel.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [12]:
features=x_train.columns[sel.get_support()]

In [13]:
features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [14]:
np.mean(sel.estimator_.feature_importances_)

0.03333333333333334

In [15]:
sel.estimator_.feature_importances_

array([0.03699612, 0.01561296, 0.06016409, 0.0371452 , 0.0063401 ,
       0.00965994, 0.0798662 , 0.08669071, 0.00474992, 0.00417092,
       0.02407355, 0.00548033, 0.01254423, 0.03880038, 0.00379521,
       0.00435162, 0.00452503, 0.00556905, 0.00610635, 0.00528878,
       0.09556258, 0.01859305, 0.17205401, 0.05065305, 0.00943096,
       0.01565491, 0.02443166, 0.14202709, 0.00964898, 0.01001304])

In [16]:
selected_features_df = pd.DataFrame({'Feature':list(x_train.columns),
                                     'Scores':sel.estimator_.feature_importances_})
selected_features_df.sort_values(by='Scores', ascending=False)

Unnamed: 0,Feature,Scores
22,worst perimeter,0.172054
27,worst concave points,0.142027
20,worst radius,0.095563
7,mean concave points,0.086691
6,mean concavity,0.079866
2,mean perimeter,0.060164
23,worst area,0.050653
13,area error,0.0388
3,mean area,0.037145
0,mean radius,0.036996


In [17]:
x_train_rfc=sel.transform(x_train)
x_test_rfc=sel.transform(x_test)

In [18]:
def run_randomForest(x_train,x_test,y_train,y_test):
    clf=RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1)
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    #print('Accuracy:',accuracy_score(y_test,y_pred))
    Accuracy=accuracy_score(y_test,y_pred)
    return Accuracy

In [19]:
%%time
run_randomForest(x_train_rfc,x_test_rfc,y_train,y_test)

Wall time: 416 ms


0.9473684210526315

In [20]:
%%time
run_randomForest(x_train,x_test,y_train,y_test)

Wall time: 400 ms


0.9649122807017544

In [21]:
#recursive feature elimination
sel=RFE(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),n_features_to_select=15)
sel.fit(x_train,y_train)


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     oob_score=False, random_state=0, verbose=0,
                                     warm_start=False),
    n_features_to_select=15, step=1, verbose=0)

In [22]:
sel.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False])

In [23]:
features=x_train.columns[sel.get_support()]

In [24]:
features

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [25]:
x_train_rfe=sel.transform(x_train)
x_test_rfe=sel.transform(x_test)

In [26]:
%%time
run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)

Wall time: 414 ms


0.9736842105263158

In [27]:
%%time
run_randomForest(x_train,x_test,y_train,y_test)

Wall time: 415 ms


0.9649122807017544

In [28]:
#rfe using grad boost
sel=RFE(GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=12)
sel.fit(x_train,y_train)

RFE(estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                         criterion='friedman_mse', init=None,
                                         learning_rate=0.1, loss='deviance',
                                         max_depth=3, max_features=None,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=100,
                                         n_iter_no_change=None,
                                         presort='deprecated', random_state=0,
                                         subsample=1.0, tol=0.0001,
                                         validation_frac

In [29]:
features=x_train.columns[sel.get_support()]
features

Index(['mean texture', 'mean smoothness', 'mean concave points',
       'mean symmetry', 'area error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [30]:
x_train_rfe=sel.transform(x_train)
x_test_rfe=sel.transform(x_test)

In [31]:
%%time
run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)

Wall time: 425 ms


0.9736842105263158

In [32]:
%%time
run_randomForest(x_train,x_test,y_train,y_test)

Wall time: 432 ms


0.9649122807017544

In [33]:
'''for index in range(1,31):
    #rfe using grad boost
    sel=RFE(GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=index)
    sel.fit(x_train,y_train)
    x_train_rfe=sel.transform(x_train)
    x_test_rfe=sel.transform(x_test)
    print('selected features:',index)
    run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)
    print()
    '''

"for index in range(1,31):\n    #rfe using grad boost\n    sel=RFE(GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=index)\n    sel.fit(x_train,y_train)\n    x_train_rfe=sel.transform(x_train)\n    x_test_rfe=sel.transform(x_test)\n    print('selected features:',index)\n    run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)\n    print()\n    "

In [34]:
d=[]
for index in range(1,3):
    #rfe using grad boost
    sel=RFE(GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=index)
    sel.fit(x_train,y_train)
    x_train_rfe=sel.transform(x_train)
    x_test_rfe=sel.transform(x_test)
    print('selected features:',index)
    features=x_train.columns[sel.get_support()]
    features_score=sel.estimator_.feature_importances_
    print('features are :',features)
    run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)
    print()
    d=[{'Selected features':index,'feature names':list(features),'Score': run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)}]
    print(d)
    
    
df= pd.DataFrame(d  for index in range(1,3))
    
    
    

selected features: 1
features are : Index(['worst concave points'], dtype='object')

[{'Selected features': 1, 'feature names': ['worst concave points'], 'Score': 0.8771929824561403}]
selected features: 2
features are : Index(['mean concave points', 'worst concave points'], dtype='object')

[{'Selected features': 2, 'feature names': ['mean concave points', 'worst concave points'], 'Score': 0.9035087719298246}]


In [35]:
for index in range(1,31):
    #rfe using grad boost
    sel=RFE(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),n_features_to_select=index)
    sel.fit(x_train,y_train)
    x_train_rfe=sel.transform(x_train)
    x_test_rfe=sel.transform(x_test)
    print('selected features:',index)
    features=x_train.columns[sel.get_support()]
    print('features are :',features)
    
    run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)
    print()

selected features: 1
features are : Index(['worst perimeter'], dtype='object')

selected features: 2
features are : Index(['mean concave points', 'worst perimeter'], dtype='object')

selected features: 3
features are : Index(['mean concave points', 'worst perimeter', 'worst concave points'], dtype='object')

selected features: 4
features are : Index(['mean concave points', 'worst perimeter', 'worst area',
       'worst concave points'],
      dtype='object')

selected features: 5
features are : Index(['mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concave points'],
      dtype='object')

selected features: 6
features are : Index(['mean concavity', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst area', 'worst concave points'],
      dtype='object')

selected features: 7
features are : Index(['mean area', 'mean concavity', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst area', 'worst concave points'],



selected features: 27
features are : Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'radius error', 'texture error',
       'perimeter error', 'area error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

selected features: 28
features are : Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'radius error', 'texture error',
       'perimeter error', 'area error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry