In [5]:
# library for displaying plots
import matplotlib.pyplot as plt
# display plots in the notebook 
%matplotlib inline

## First, we repeat the load and preprocessing steps

# Load data
from sklearn import datasets
iris = datasets.load_iris()

# Training and test spliting
from sklearn.model_selection import train_test_split

x_iris, y_iris = iris.data, iris.target
# Test set will be the 25% taken randomly
x_train, x_test, y_train, y_test = train_test_split(x_iris, y_iris, test_size=0.25, random_state=33)

# Preprocess: normalize
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Composite estimator: a pipeline that scales the features and then fits a decision tree
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ds', DecisionTreeClassifier()),
])

# Fit the pipeline on the training split
model.fit(x_train, y_train)

# 10-fold cross-validation iterator; shuffling is seeded so the folds are repeatable
cv = KFold(n_splits=10, shuffle=True, random_state=33)

# With no explicit scoring, the estimator's own score method (accuracy) is used
scores = cross_val_score(estimator=model, X=x_iris, y=y_iris, cv=cv)

from scipy.stats import sem
def mean_score(scores):
    """Return a summary string with the mean of ``scores`` and its standard error.

    Both numbers are formatted with three decimal places.
    """
    average = np.mean(scores)
    std_error = sem(scores)
    return "Mean score: {0:.3f} (+/- {1:.3f})".format(average, std_error)
# Report the mean cross-validation score together with its standard error
print(mean_score(scores))

Mean score: 0.940 (+/- 0.021)


In [16]:
# List every parameter name of the pipeline; parameters of a step are exposed as '<step>__<param>'
model.get_params().keys()

dict_keys(['memory', 'steps', 'scaler', 'ds', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'ds__class_weight', 'ds__criterion', 'ds__max_depth', 'ds__max_features', 'ds__max_leaf_nodes', 'ds__min_impurity_decrease', 'ds__min_impurity_split', 'ds__min_samples_leaf', 'ds__min_samples_split', 'ds__min_weight_fraction_leaf', 'ds__presort', 'ds__random_state', 'ds__splitter'])

In [17]:
# To change a parameter of a pipeline step, address it as '<step>__<param>'
model.set_params(ds__class_weight='balanced')

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ds', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [18]:
# Refit the pipeline so the tree is trained with the updated class_weight setting
model.fit(x_train, y_train)
# Look the fitted tree up by its step name through named_steps
my_decision_tree = model.named_steps['ds']
importances = my_decision_tree.feature_importances_
print(importances)

[0.03263434 0.01910853 0.55176597 0.39649117]


In [19]:
# The same estimator can be reached positionally: steps is a list of
# (name, estimator) pairs and the classifier is the last entry.
# The unpack target must be spelled my_decision_tree so the print below
# reads the object obtained here rather than one left over from another cell.
name, my_decision_tree = model.steps[-1]
print(my_decision_tree.feature_importances_)

[0.03263434 0.01910853 0.55176597 0.39649117]


In [20]:
# TUNING: show the current value of every pipeline parameter
model.get_params()

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('ds', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, presort=False, random_state=None,
               splitter='best'))],
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'ds': DecisionTreeClassifier(class_weight='balanced', criterion='gini',
             max_depth=None, max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,


In [21]:
# A bare 'max_depth' is not a parameter of the Pipeline itself, so set_params
# raises ValueError. The error is caught and printed so the demonstration does
# not abort a full notebook run. The tree's depth is addressed through its step
# name instead: model.set_params(ds__max_depth=1)
try:
    model.set_params(max_depth=1)
except ValueError as err:
    print(err)

ValueError: Invalid parameter max_depth for estimator Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ds', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Candidate tree depths to evaluate: 3 through 9
param_grid = dict(max_depth=np.arange(3, 10))

# Exhaustive search over the grid using the default cross-validation setting
gs = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid)
gs.fit(x_train, y_train)

# Summarize the outcome of the grid search
print("Best score: ", gs.best_score_)
print("Best params: ", gs.best_params_)

Best score:  0.9464285714285714
Best params:  {'max_depth': 3}


In [26]:
# Show the mean test score (with twice its standard deviation) for each candidate
results = gs.cv_results_
for mean, std, params in zip(results['mean_test_score'],
                             results['std_test_score'],
                             results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

0.946 (+/-0.075) for {'max_depth': 3}
0.946 (+/-0.075) for {'max_depth': 4}
0.929 (+/-0.024) for {'max_depth': 5}
0.946 (+/-0.075) for {'max_depth': 6}
0.946 (+/-0.075) for {'max_depth': 7}
0.946 (+/-0.075) for {'max_depth': 8}
0.929 (+/-0.024) for {'max_depth': 9}


In [31]:
# Composite estimator: a pipeline that scales the features and then fits a
# decision tree limited to max_depth=3
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ds', DecisionTreeClassifier(max_depth=3)),
])
# Fit the pipeline on the training split
model.fit(x_train, y_train)

# 10-fold cross-validation iterator; shuffling is seeded so the folds are repeatable
cv = KFold(n_splits=10, shuffle=True, random_state=33)

# With no explicit scoring, the estimator's own score method (accuracy) is used
scores = cross_val_score(estimator=model, X=x_iris, y=y_iris, cv=cv)
def mean_score(scores):
    """Summarize ``scores`` as a string holding their mean and standard error."""
    template = "Mean score: {0:.3f} (+/- {1:.3f})"
    return template.format(np.mean(scores), sem(scores))
# Report the mean cross-validation score of the depth-limited pipeline
print(mean_score(scores))


Mean score: 0.947 (+/- 0.022)


In [33]:
# Tune several decision-tree hyper-parameters at once with a cross-validated
# grid search, once per scoring metric, and report each result on the test set

from sklearn.metrics import classification_report

# Grid of parameter values to explore
tuned_parameters = [{'max_depth': np.arange(3, 10),
                     'criterion': ['gini', 'entropy'],
                     'splitter': ['best', 'random'],
                     # 'min_samples_leaf': [2, 5, 10],
                     'class_weight': ['balanced', None],
                     'max_leaf_nodes': [None, 5, 10, 20]
                    }]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # cv=10 requests 10-fold cross-validation; candidates are ranked by the
    # '<score>_weighted' scorer
    gs = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=10, scoring='%s_weighted' % score)
    gs.fit(x_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # Read per-candidate results from cv_results_ (grid_scores_ no longer exists
    # in current scikit-learn). The loop names are chosen so they do not rebind
    # the mean_score helper or the scores list being iterated.
    cv_results = gs.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # predict uses the best estimator found by the search
    y_true, y_pred = y_test, gs.predict(x_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 7, 'max_leaf_nodes': 10, 'splitter': 'random'}

Grid scores on development set:

0.964 (+/-0.092) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': None, 'splitter': 'best'}
0.896 (+/-0.245) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': None, 'splitter': 'random'}
0.953 (+/-0.126) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'splitter': 'best'}
0.943 (+/-0.119) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'splitter': 'random'}
0.950 (+/-0.118) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 10, 'splitter': 'best'}
0.950 (+/-0.118) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 10, 'splitter': 'random'}
0.957 (+/-0.120) for {'c



Best parameters set found on development set:

{'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_leaf_nodes': 5, 'splitter': 'random'}

Grid scores on development set:

0.946 (+/-0.140) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': None, 'splitter': 'best'}
0.902 (+/-0.169) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': None, 'splitter': 'random'}
0.929 (+/-0.155) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'splitter': 'best'}
0.911 (+/-0.182) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'splitter': 'random'}
0.938 (+/-0.137) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 10, 'splitter': 'best'}
0.929 (+/-0.135) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 10, 'splitter': 'random'}
0.938 (+/-0.137) for {'class_weigh

