# Contents
- [Data](#data)
- [Decision Tree Model](#dt)
- [Modeling](#model)
    - [Train/Test Split](#tt_splt)
    - [Initial Model](#init)
- [Tuning](#tune)
    - [RandomizedSearch](#rand)
- [Feature Importances](#feat_imp)

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Custom Functions

## Feature Importance Sorting

In [2]:
def feat_sort(values, labels, ret_num='all'):
    '''
    Return dataframe of sorted (by absolute value) feature weights
    values : feature weight values from analysis
    labels : names of each feature
    ret_num : number of top features to return
    '''
    
    df = pd.DataFrame(values, index=labels, columns=['feat_wgt'])
    # drop weights = 0
    df = df[df['feat_wgt'] != 0]
    # note which weights are positive
    df['positive'] = df['feat_wgt'] > 0
    # take absolute value of weights
    df['feat_wgt'] = df['feat_wgt'].apply(abs)
    # sort weights (largest to smalles)
    df.sort_values(by='feat_wgt', ascending=False, inplace=True)
    if ret_num == 'all':
        return(df)
    else:
        return(df.iloc[:ret_num, :])

# Data <a name="data"></a>

## Load Data

In [3]:
from sklearn.datasets import load_breast_cancer

In [4]:
data = load_breast_cancer()

In [5]:
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['cancer'] = data['target']

In [6]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,cancer
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## Define Variables

In [7]:
# define predictor variables (features)
X = df.iloc[:, :-1]
# define dependent variable (target)
y = df.iloc[:, -1]

# Modeling <a name="model"></a>

## Train/Test Split <a name="tt_splt"></a>
Split data into feature training, feature test, target training and target test variables

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# specify test size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Initial Model <a name="init"></a>

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [11]:
# create instance of model
rf = RandomForestClassifier()

In [12]:
# fit model on training data
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
# overall model accuracy
rf.score(X_test, y_test)

0.958041958041958

In [14]:
# model predictions
rf_pred = rf.predict(X_test)

In [15]:
# confusion matrix
metrics.confusion_matrix(y_test, rf_pred)

array([[52,  1],
       [ 5, 85]])

In [16]:
# classification report
print(metrics.classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

          0       0.91      0.98      0.95        53
          1       0.99      0.94      0.97        90

avg / total       0.96      0.96      0.96       143



In [17]:
# assign report output values
precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, rf_pred)

In [18]:
precision

array([0.9122807 , 0.98837209])

In [19]:
recall

array([0.98113208, 0.94444444])

In [20]:
fscore

array([0.94545455, 0.96590909])

In [21]:
support

array([53, 90])

## Tuning <a name="tune"></a>

## RandomizedSearch <a name="rand"></a>

In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [23]:
# try range of values for hyperparameters

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features to consider at every split
max_features = np.arange(1, X.shape[1]+1)
# maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# method of selecting samples for training each tree
bootstrap = [True, False]

In [24]:
# define hyperparameter dictionary
hyperparams = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

In [25]:
# create instance of tuner
rand = RandomizedSearchCV(rf, hyperparams, scoring='accuracy', cv=5, random_state=0)

In [26]:
# fit tuner to data
rand.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', ra

In [27]:
# output parameters for optimal model
rand.best_params_

{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 5,
 'max_depth': 50,
 'bootstrap': False}

In [28]:
# output optimal overall accuracy
rand.best_score_

0.9530516431924883

In [29]:
# create model with optimal hyperparameters
rf_tune = RandomForestClassifier(
    n_estimators = rand.best_params_['n_estimators'],
    max_features = rand.best_params_['max_features'],
    max_depth = rand.best_params_['max_depth'],
    min_samples_split = rand.best_params_['min_samples_split'],
    min_samples_leaf = rand.best_params_['min_samples_leaf'],
    bootstrap = rand.best_params_['bootstrap']
)

In [30]:
# fit and score
rf_tune.fit(X_train, y_train)
rf_tune.score(X_test, y_test)

0.972027972027972

In [31]:
# model predictions
rf_pred = rf_tune.predict(X_test)

In [32]:
# confusion matrix
metrics.confusion_matrix(y_test, rf_pred)

array([[52,  1],
       [ 3, 87]])

In [33]:
# classification report
print(metrics.classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

          0       0.95      0.98      0.96        53
          1       0.99      0.97      0.98        90

avg / total       0.97      0.97      0.97       143



### Within Pipeline

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [35]:
# set pipeline parameters
rf_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])

In [36]:
# fit model using pipeline parameters
rf_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
   ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [37]:
# model predictions
rf_pred = rf_pipe.predict(X_test)

In [38]:
# classification report
print(metrics.classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

          0       0.90      0.98      0.94        53
          1       0.99      0.93      0.96        90

avg / total       0.95      0.95      0.95       143



In [39]:
# define hyperparameter dictionary for pipeline input
hyperparams = {
    'model__n_estimators': n_estimators,
    'model__max_features': max_features,
    'model__max_depth': max_depth,
    'model__min_samples_split': min_samples_split,
    'model__min_samples_leaf': min_samples_leaf,
    'model__bootstrap': bootstrap
}

In [40]:
# tune model pipeline for recall
rf_tune = RandomizedSearchCV(rf_pipe, hyperparams, scoring='accuracy', cv=5, random_state=0)

In [41]:
rf_tune.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
   ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'model__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'model__max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'model__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'model__min_samples_split': [2, 5, 10], 'model__min_samples_leaf': [1, 2, 4], 'model__bootstrap': 

In [42]:
# optimal parameters
rf_tune.best_params_

{'model__n_estimators': 1600,
 'model__min_samples_split': 2,
 'model__min_samples_leaf': 1,
 'model__max_features': 5,
 'model__max_depth': 50,
 'model__bootstrap': False}

In [43]:
# assign best hyperparameter values within pipeline
rf_pipe.set_params(
    model__n_estimators = rand.best_params_['n_estimators'],
    model__max_features = rand.best_params_['max_features'],
    model__max_depth = rand.best_params_['max_depth'],
    model__min_samples_split = rand.best_params_['min_samples_split'],
    model__min_samples_leaf = rand.best_params_['min_samples_leaf'],
    model__bootstrap = rand.best_params_['bootstrap']
)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=50, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
         ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [44]:
# fit model and return predictions
rf_pipe.fit(X_train, y_train)
rf_pred = rf_pipe.predict(X_test)

In [45]:
print(metrics.classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

          0       0.95      0.98      0.96        53
          1       0.99      0.97      0.98        90

avg / total       0.97      0.97      0.97       143



# Feature Importances <a name="feat_imp"></a>

In [46]:
rf_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])

rf_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
   ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [47]:
# feature coefficients
rf_pipe.named_steps['model'].feature_importances_

array([0.00742361, 0.00728235, 0.07445666, 0.12174826, 0.00236278,
       0.00664463, 0.00630278, 0.15815928, 0.00415287, 0.00475633,
       0.00641975, 0.00631274, 0.00195909, 0.00783079, 0.00566631,
       0.01228079, 0.0162066 , 0.00166413, 0.0106305 , 0.00194613,
       0.01036131, 0.01125633, 0.11578921, 0.22832878, 0.00640025,
       0.03286622, 0.06848677, 0.04631829, 0.01398407, 0.00200238])

In [48]:
# feature names
X.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [49]:
# create feature dataframe
feat_sort(rf_pipe.named_steps['model'].feature_importances_, X.columns)

Unnamed: 0,feat_wgt,positive
worst area,0.228329,True
mean concave points,0.158159,True
mean area,0.121748,True
worst perimeter,0.115789,True
mean perimeter,0.074457,True
worst concavity,0.068487,True
worst concave points,0.046318,True
worst compactness,0.032866,True
concavity error,0.016207,True
worst symmetry,0.013984,True
