In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

from module_5 import setup_data

In [2]:
param_grid = [
    {'min_samples_split': [2, 3, 5, 8, 13, 21, 34, 55]},
    {'min_samples_leaf': [2, 3, 5, 8, 13, 21, 34, 55]},
    {'criterion': ['gini', 'entropy', 'log_loss']},
    {'max_features': ['sqrt', 'log2', None]},
    {'max_depth': [2, 3, 5, 8, 13, 21, 34, 55, 89, None]}
]

X_train, X_test, y_train, y_test = setup_data()
scores = ["precision", "recall"]

# What should we optimise for?
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, scoring=scores, refit='recall')
grid_search.fit(X_train, y_train)


           id  malignant  radius_0  texture_0  perimeter_0  area_0  \
0      842302          1     17.99      10.38       122.80  1001.0   
1      842517          1     20.57      17.77       132.90  1326.0   
2    84300903          1     19.69      21.25       130.00  1203.0   
3    84348301          1     11.42      20.38        77.58   386.1   
4    84358402          1     20.29      14.34       135.10  1297.0   
..        ...        ...       ...        ...          ...     ...   
564    926424          1     21.56      22.39       142.00  1479.0   
565    926682          1     20.13      28.25       131.20  1261.0   
566    926954          1     16.60      28.08       108.30   858.1   
567    927241          1     20.60      29.33       140.10  1265.0   
568     92751          0      7.76      24.54        47.92   181.0   

     smoothness_0  compactness_0  concavity_0  concave points_0  ...  \
0         0.11840        0.27760      0.30010           0.14710  ...   
1         0.084

In [3]:
grid_search.best_params_

{'criterion': 'entropy'}

In [4]:
estimator = grid_search.best_estimator_

In [5]:
vars(estimator)

{'criterion': 'entropy',
 'splitter': 'best',
 'max_depth': None,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.0,
 'max_features': None,
 'max_leaf_nodes': None,
 'random_state': None,
 'min_impurity_decrease': 0.0,
 'class_weight': None,
 'ccp_alpha': 0.0,
 'feature_names_in_': array(['id', 'radius_0', 'texture_0', 'perimeter_0', 'area_0',
        'smoothness_0', 'compactness_0', 'concavity_0', 'concave points_0',
        'symmetry_0', 'fractal dimension_0', 'radius_1', 'texture_1',
        'perimeter_1', 'area_1', 'smoothness_1', 'compactness_1',
        'concavity_1', 'concave points_1', 'symmetry_1',
        'fractal dimension_1', 'radius_2', 'texture_2', 'perimeter_2',
        'area_2', 'smoothness_2', 'compactness_2', 'concavity_2',
        'concave points_2', 'symmetry_2', 'fractal dimension_2'],
       dtype=object),
 'n_features_in_': 31,
 'n_outputs_': 1,
 'classes_': array([0, 1]),
 'n_classes_': 2,
 'max_features_': 31,
 'tree_': <sklearn.

In [6]:

# forest = RandomForestClassifier(criterion=estimator.criterion, min_samples_split=estimator.min_samples_split)
kwargs = estimator.get_params()
del kwargs['splitter']
forest = RandomForestClassifier(**kwargs)
forest.fit(X_train, y_train)

In [7]:
vars(forest)

{'base_estimator': DecisionTreeClassifier(),
 'n_estimators': 100,
 'estimator_params': ('criterion',
  'max_depth',
  'min_samples_split',
  'min_samples_leaf',
  'min_weight_fraction_leaf',
  'max_features',
  'max_leaf_nodes',
  'min_impurity_decrease',
  'random_state',
  'ccp_alpha'),
 'bootstrap': True,
 'oob_score': False,
 'n_jobs': None,
 'random_state': None,
 'verbose': 0,
 'warm_start': False,
 'class_weight': None,
 'max_samples': None,
 'criterion': 'entropy',
 'max_depth': None,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.0,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'ccp_alpha': 0.0,
 'feature_names_in_': array(['id', 'radius_0', 'texture_0', 'perimeter_0', 'area_0',
        'smoothness_0', 'compactness_0', 'concavity_0', 'concave points_0',
        'symmetry_0', 'fractal dimension_0', 'radius_1', 'texture_1',
        'perimeter_1', 'area_1', 'smoothness_1', 'compactness_1',
        'concavity_1', 

In [8]:
forest.score(X_test, y_test)

0.965034965034965