In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

In [3]:
X,y = make_classification(
    n_features = 10,
    n_samples = 1000,
    n_informative = 4,
    n_redundant=2,
    n_repeated = 0,
    n_classes = 2,
    random_state = 42)

## METHOD: 1 using train test split evaluation

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.25,random_state=42)


from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='gini',max_depth=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.91      0.89       124
           1       0.91      0.86      0.88       126

    accuracy                           0.88       250
   macro avg       0.89      0.88      0.88       250
weighted avg       0.89      0.88      0.88       250



In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.25,random_state=42)


from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='gini',max_depth=5)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.94      0.89       124
           1       0.93      0.83      0.88       126

    accuracy                           0.88       250
   macro avg       0.89      0.88      0.88       250
weighted avg       0.89      0.88      0.88       250



In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.25,random_state=42)


from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy',max_depth=5)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       124
           1       0.94      0.81      0.87       126

    accuracy                           0.88       250
   macro avg       0.88      0.88      0.88       250
weighted avg       0.88      0.88      0.88       250



In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.25,random_state=42)


from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy',max_depth=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       124
           1       0.87      0.87      0.87       126

    accuracy                           0.87       250
   macro avg       0.87      0.87      0.87       250
weighted avg       0.87      0.87      0.87       250



In [12]:
## the above we use both gini and entropy with max depth 5 and 10

## METHOD: 2 using cross val evaluation

In [13]:
from sklearn.model_selection import cross_val_score
cross_val_score(DecisionTreeClassifier(criterion='entropy',max_depth=5),X,y,cv=5)

array([0.925, 0.915, 0.88 , 0.865, 0.855])

In [14]:
cross_val_score(DecisionTreeClassifier(criterion='entropy',max_depth=10),X,y,cv=5)

array([0.905, 0.925, 0.885, 0.86 , 0.87 ])

In [15]:
cross_val_score(DecisionTreeClassifier(criterion='gini',max_depth=5),X,y,cv=5)

array([0.91 , 0.895, 0.895, 0.87 , 0.84 ])

In [16]:
cross_val_score(DecisionTreeClassifier(criterion='gini',max_depth=10),X,y,cv=5)

array([0.875, 0.885, 0.885, 0.835, 0.855])

In [17]:
criterion = ['gini','entropy']
max_depth = [5,10,15]
scores={}
for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion=c,max_depth=d)
        score_list = cross_val_score(clf,X,y,cv=5)
        scores[c + "_"+ str(d)] = np.average(score_list)
scores

{'gini_5': 0.882,
 'gini_10': 0.867,
 'gini_15': 0.8649999999999999,
 'entropy_5': 0.889,
 'entropy_10': 0.8920000000000001,
 'entropy_15': 0.8880000000000001}

### both these methods are time consuming

## GridsearchCV

In [19]:
from sklearn.model_selection import GridSearchCV


In [21]:
clf = GridSearchCV(
    DecisionTreeClassifier(),
    {
        'criterion':['gini','entropy'],
        'max_depth': [5,10,15]
    },
    cv=5,
    return_train_score = False
)

clf.fit(X,y)
clf.cv_results_

{'mean_fit_time': array([0.00620904, 0.00701995, 0.00651593, 0.00705624, 0.00757766,
        0.00719991]),
 'std_fit_time': array([0.00041735, 0.00187706, 0.00037415, 0.00030348, 0.00048519,
        0.00091719]),
 'mean_score_time': array([0.00059919, 0.        , 0.00019979, 0.00040045, 0.00019932,
        0.00020161]),
 'std_score_time': array([0.00048924, 0.        , 0.00039959, 0.00049045, 0.00039864,
        0.00040321]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': 'entropy',

In [22]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006209,0.000417,0.000599,0.000489,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.905,0.895,0.885,0.875,0.845,0.881,0.020591,4
1,0.00702,0.001877,0.0,0.0,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.885,0.895,0.87,0.84,0.86,0.87,0.019235,5
2,0.006516,0.000374,0.0002,0.0004,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.86,0.875,0.865,0.83,0.825,0.851,0.019849,6
3,0.007056,0.000303,0.0004,0.00049,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.92,0.915,0.88,0.87,0.85,0.887,0.026758,3
4,0.007578,0.000485,0.000199,0.000399,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.915,0.93,0.865,0.855,0.885,0.89,0.028636,1
5,0.0072,0.000917,0.000202,0.000403,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.91,0.92,0.88,0.84,0.895,0.889,0.028,2


In [23]:
df[['param_criterion','param_max_depth','mean_test_score']]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.881
1,gini,10,0.87
2,gini,15,0.851
3,entropy,5,0.887
4,entropy,10,0.89
5,entropy,15,0.889


In [24]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 10}

In [25]:
clf.best_estimator_

## GridSearchcsv using 2 models

In [40]:
from sklearn.svm import SVC

In [42]:
model_params = {
    'Decision_tree':{
        'model': DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropy'],
            'max_depth':[5,10,15,20]
        }
        
    },
    'svm':{
        'model': SVC(gamma='auto'),
        'params':{
            'C':[1,10,20,30],
            'kernel':['rbf','linear']
        }
        
    }
}
scores=[]
for key,val in model_params.items():
    clf = GridSearchCV(val['model'],val['params'],cv=5,return_train_score=False)
    clf.fit(X,y)
    scores.append({
        'model':key,
        'best_score':clf.best_score_,
        'best_parmas':clf.best_params_
    })
    
scores

[{'model': 'Decision_tree',
  'best_score': 0.891,
  'best_parmas': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': 0.917,
  'best_parmas': {'C': 1, 'kernel': 'rbf'}}]

In [43]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_parmas
0,Decision_tree,0.891,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.917,"{'C': 1, 'kernel': 'rbf'}"
