In [13]:
import pandas as pd
wine = pd.read_csv('https://bit.ly/wine_csv_data')

In [14]:
data = wine[['alcohol','sugar', 'pH']].to_numpy()
target = wine[['class']]

print(data.shape, target.shape)

(6497, 3) (6497, 1)


In [15]:
from sklearn.model_selection import train_test_split

train_x, train_y, target_x, target_y = train_test_split(data, target, test_size=0.2, 
                                                        random_state=42)


In [16]:
sub_input, val_input, sub_target, val_target = train_test_split(train_x, target_x,
                                                                test_size=0.2, random_state=42)

In [17]:
print(sub_input.shape, val_input.shape)
print(train_y.shape, target_y.shape)

(4157, 3) (1040, 3)
(1300, 3) (1300, 1)


In [18]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


In [19]:
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_x, target_x)
print(scores)

{'fit_time': array([0.00799799, 0.00700116, 0.00699639, 0.00699902, 0.00901747]), 'score_time': array([0.        , 0.0010004 , 0.00099897, 0.00100112, 0.00098538]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [20]:
import numpy as np
print(np.mean(scores['test_score']))

0.855300214703487


In [21]:
from sklearn.model_selection import StratifiedGroupKFold
scores = cross_validate(dt, train_x, target_x, cv=10)
print(np.mean(scores['test_score']))

0.8616407292129834


In [22]:
from sklearn.model_selection import GridSearchCV

params = {'min_impurity_decrease' : [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [23]:
gs.fit(train_x, target_x)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [24]:
dt = gs.best_estimator_
print(dt.score(train_x, target_x))

0.9615162593804117


In [25]:
import pandas as pd
gs_df = pd.DataFrame(gs.cv_results_)


In [26]:
gs_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_impurity_decrease,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009196,0.000748,0.001,1.181556e-06,0.0001,{'min_impurity_decrease': 0.0001},0.869231,0.868269,0.882579,0.86718,0.853705,0.868193,0.009154,1
1,0.008798,0.000401,0.001,8.476443e-07,0.0002,{'min_impurity_decrease': 0.0002},0.871154,0.863462,0.876805,0.854668,0.856593,0.864536,0.008437,5
2,0.0166,0.010558,0.001,2.002716e-06,0.0003,{'min_impurity_decrease': 0.0003},0.869231,0.859615,0.875842,0.850818,0.869105,0.864922,0.008745,4
3,0.009998,0.002099,0.001,2.174712e-06,0.0004,{'min_impurity_decrease': 0.0004},0.869231,0.863462,0.881617,0.848893,0.875842,0.867809,0.01126,2
4,0.007998,2e-06,0.0012,0.0004002818,0.0005,{'min_impurity_decrease': 0.0005},0.865385,0.869231,0.882579,0.849856,0.87103,0.867616,0.01057,3


In [27]:
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [28]:
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [29]:
params = {
    'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
    'max_depth' : range(5,20,1),
    'min_samples_split' : range(2, 100, 10)
}

In [30]:
dc = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(dc, params, n_jobs=-1)
gs.fit(train_x, target_x)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [31]:
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [32]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [33]:
from scipy.stats import uniform, randint

rgen = randint(0, 10)
rgen.rvs(1000)

array([9, 2, 7, 5, 2, 5, 4, 6, 7, 6, 5, 3, 0, 2, 4, 9, 8, 2, 4, 5, 6, 7,
       5, 2, 7, 7, 5, 8, 7, 6, 1, 0, 9, 0, 9, 9, 6, 9, 7, 7, 8, 4, 4, 5,
       0, 9, 0, 0, 2, 8, 8, 5, 5, 0, 7, 1, 0, 7, 4, 1, 8, 8, 7, 6, 2, 9,
       2, 7, 0, 0, 7, 0, 1, 6, 6, 8, 5, 8, 0, 1, 8, 5, 6, 8, 7, 2, 6, 4,
       1, 7, 8, 5, 0, 8, 4, 6, 6, 3, 0, 8, 2, 3, 3, 2, 5, 8, 7, 5, 2, 3,
       5, 0, 6, 1, 4, 1, 0, 5, 4, 3, 1, 0, 5, 8, 7, 1, 2, 5, 7, 4, 7, 0,
       8, 7, 7, 8, 3, 0, 0, 0, 4, 7, 5, 8, 2, 6, 6, 4, 0, 3, 4, 4, 1, 9,
       9, 1, 4, 3, 6, 4, 7, 0, 0, 4, 9, 1, 0, 4, 5, 9, 6, 7, 4, 8, 2, 9,
       9, 6, 0, 7, 6, 1, 8, 6, 0, 8, 7, 5, 9, 3, 3, 9, 9, 1, 2, 2, 9, 5,
       9, 1, 9, 2, 0, 8, 8, 7, 8, 6, 8, 1, 9, 2, 4, 5, 4, 6, 3, 3, 9, 7,
       7, 6, 5, 5, 0, 0, 2, 1, 8, 1, 5, 9, 8, 7, 7, 2, 3, 1, 9, 5, 1, 5,
       9, 0, 6, 8, 2, 0, 8, 4, 6, 2, 3, 4, 9, 8, 1, 9, 0, 6, 9, 7, 1, 1,
       1, 6, 4, 3, 6, 3, 4, 6, 6, 7, 8, 0, 0, 0, 0, 6, 5, 2, 7, 7, 0, 2,
       1, 1, 3, 9, 7, 6, 2, 3, 0, 4, 6, 8, 1, 6, 3,

In [34]:
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([110, 111,  95,  82,  86,  95,  98, 128, 102,  93], dtype=int64))

In [35]:
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.19320479, 0.68250553, 0.83613112, 0.32060179, 0.22846467,
       0.23861339, 0.13586923, 0.98712842, 0.27510052, 0.46291134])

In [36]:
params = {
    'min_impurity_decrease' : uniform(0.0001, 0.001),
    'max_depth' : randint(20, 50),
    'min_samples_split' : randint(2, 25),
    'min_samples_leaf' : randint(2,25)
}

from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, n_iter=100, n_jobs=-1, random_state=42)

gs.fit(train_x, target_x)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000201DA585160>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000201DA3D2F10>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000201DA585820>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000201DA4DD9A0>},
                   random_state=42)

In [37]:
print(gs.best_params_)

{'max_depth': 41, 'min_impurity_decrease': 0.0003439896433790836, 'min_samples_leaf': 7, 'min_samples_split': 7}


In [38]:
gs_df = pd.DataFrame(gs.cv_results_)

In [39]:
gs_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_impurity_decrease,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008802,0.000401,0.001000,1.910922e-06,26,0.000897,16,12,"{'max_depth': 26, 'min_impurity_decrease': 0.0...",0.847115,0.869231,0.884504,0.849856,0.866218,0.863385,0.013682,44
1,0.008801,0.000401,0.001203,3.989377e-04,27,0.000699,8,20,"{'max_depth': 27, 'min_impurity_decrease': 0.0...",0.857692,0.871154,0.884504,0.848893,0.850818,0.862612,0.013444,65
2,0.009399,0.000801,0.001200,4.008059e-04,42,0.000158,22,5,"{'max_depth': 42, 'min_impurity_decrease': 0.0...",0.861538,0.873077,0.884504,0.848893,0.860443,0.865691,0.012128,18
3,0.008800,0.000399,0.001201,4.003787e-04,27,0.000808,23,22,"{'max_depth': 27, 'min_impurity_decrease': 0.0...",0.846154,0.870192,0.881617,0.845043,0.866218,0.861845,0.014200,75
4,0.008999,0.000631,0.001201,3.985434e-04,21,0.000822,7,3,"{'max_depth': 21, 'min_impurity_decrease': 0.0...",0.856731,0.871154,0.881617,0.850818,0.850818,0.862228,0.012216,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.008199,0.000401,0.001001,2.879244e-06,47,0.000679,24,23,"{'max_depth': 47, 'min_impurity_decrease': 0.0...",0.847115,0.870192,0.883542,0.850818,0.856593,0.861652,0.013464,85
96,0.008400,0.000800,0.001000,5.722046e-07,38,0.000566,23,23,"{'max_depth': 38, 'min_impurity_decrease': 0.0...",0.850962,0.870192,0.886429,0.848893,0.856593,0.862614,0.014035,59
97,0.010600,0.000491,0.001400,4.903501e-04,36,0.000137,7,16,"{'max_depth': 36, 'min_impurity_decrease': 0.0...",0.860577,0.856731,0.876805,0.856593,0.861405,0.862422,0.007452,66
98,0.009199,0.000399,0.001200,3.999493e-04,41,0.000637,6,2,"{'max_depth': 41, 'min_impurity_decrease': 0.0...",0.858654,0.870192,0.886429,0.851781,0.851781,0.863767,0.013178,39


In [40]:
gs_df['mean_test_score'].describe()

count    100.000000
mean       0.863587
std        0.002169
min        0.859535
25%        0.861845
50%        0.863190
75%        0.864395
max        0.869543
Name: mean_test_score, dtype: float64

In [41]:
dt = gs.best_estimator_

In [42]:
print(dt.score(val_input, val_target))

0.9
