In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.datasets import make_gaussian_quantiles
from sklearn import ensemble

In [2]:
X,Y = make_classification(n_samples = 10000, n_features = 12, 
                     n_informative = 10, n_classes = 5, random_state = 0 )
print(X.shape)
print(Y.shape)

(10000, 12)
(10000,)


In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3)

In [4]:
score = np.array([])
for j in np.arange(20) + 2:
    mod_reg = tree.DecisionTreeClassifier(max_depth = j)
    mod_cv = cross_val_score(mod_reg, X_train,Y_train, cv = 10)
    score = np.append(score, np.mean(mod_cv))
    print("j->", j, "Mean->", np.mean(mod_cv))
param = np.argmax(score) + 2
param

j-> 2 Mean-> 0.39931700949436577
j-> 3 Mean-> 0.45116054561905405
j-> 4 Mean-> 0.48715790508429624
j-> 5 Mean-> 0.5451465784813557
j-> 6 Mean-> 0.5701440571105187
j-> 7 Mean-> 0.6065685566305453
j-> 8 Mean-> 0.6227029111401191
j-> 9 Mean-> 0.6341360781194785
j-> 10 Mean-> 0.6440020435640008
j-> 11 Mean-> 0.6530062139406458
j-> 12 Mean-> 0.6457257825220596
j-> 13 Mean-> 0.6460192537299654
j-> 14 Mean-> 0.6424410772537503
j-> 15 Mean-> 0.6440082057635883
j-> 16 Mean-> 0.6460182283558801
j-> 17 Mean-> 0.6361604145864275
j-> 18 Mean-> 0.6394334978501615
j-> 19 Mean-> 0.6418702402084547
j-> 20 Mean-> 0.6397208401324772
j-> 21 Mean-> 0.6397214474648506


11

In [5]:
mod_reg = tree.DecisionTreeClassifier(max_depth = param)
mod_reg_fit = mod_reg.fit(X_train,Y_train)
error = mod_reg_fit.score(X_test,Y_test)
print(param)
print("Error - > ", error)

#mod = tree.DecisionTreeRegressor(max_depth = 3)
#mod.fit(X_train, Y_train)
#plt.figure(figsize = (12,12))
#tree.plot_tree(mod)
#plt.show()  

11
Error - >  0.67


In [6]:
Y_predict = mod_reg_fit.predict(X_train)
print(classification_report(Y_train, Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      1402
           1       0.85      0.89      0.87      1387
           2       0.92      0.91      0.91      1407
           3       0.92      0.87      0.90      1409
           4       0.86      0.87      0.86      1395

    accuracy                           0.88      7000
   macro avg       0.88      0.88      0.88      7000
weighted avg       0.89      0.88      0.88      7000



In [7]:
Y_predict = mod_reg_fit.predict(X_test)
print(classification_report(Y_test, Y_predict))

              precision    recall  f1-score   support

           0       0.68      0.67      0.68       596
           1       0.64      0.66      0.65       613
           2       0.70      0.69      0.70       598
           3       0.68      0.66      0.67       592
           4       0.65      0.66      0.66       601

    accuracy                           0.67      3000
   macro avg       0.67      0.67      0.67      3000
weighted avg       0.67      0.67      0.67      3000



In [8]:
score = np.array([])
for j in np.arange(8) + 2:
    mod_reg = ensemble.GradientBoostingClassifier(max_depth = j)
    mod_cv = cross_val_score(mod_reg, X_train,Y_train, 
                             cv = 10)
    score = np.append(score, np.mean(mod_cv))
    print("j->", j, "Mean->", np.mean(mod_cv))
param = np.argmax(score) + 2
param

j-> 2 Mean-> 0.6828583735088314
j-> 3 Mean-> 0.7451456673121358
j-> 4 Mean-> 0.7798581240898675
j-> 5 Mean-> 0.8082801146191736
j-> 6 Mean-> 0.820714674802838
j-> 7 Mean-> 0.8208608072687351
j-> 8 Mean-> 0.8257153130667753
j-> 9 Mean-> 0.8245716337547812


8

In [9]:
mod_reg_boost = ensemble.GradientBoostingClassifier(max_depth = param)
mod_reg_boost_fit = mod_reg_boost.fit(X_train,Y_train)
error = mod_reg_boost_fit.score(X_test,Y_test)
print(param)
print("Error - > ", error)

8
Error - >  0.8343333333333334


In [10]:
Y_predict = mod_reg_boost_fit.predict(X_train)
print(classification_report(Y_train, Y_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1402
           1       1.00      1.00      1.00      1387
           2       1.00      1.00      1.00      1407
           3       1.00      1.00      1.00      1409
           4       1.00      1.00      1.00      1395

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000



In [11]:
Y_predict = mod_reg_boost_fit.predict(X_test)
print(classification_report(Y_test, Y_predict))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       596
           1       0.81      0.83      0.82       613
           2       0.87      0.84      0.86       598
           3       0.86      0.84      0.85       592
           4       0.80      0.84      0.82       601

    accuracy                           0.83      3000
   macro avg       0.84      0.83      0.83      3000
weighted avg       0.84      0.83      0.83      3000



In [12]:
score = np.array([])
for j in np.arange(20) + 2:
    mod_reg = ensemble.RandomForestClassifier(max_depth = j,
                                             max_features = 4,
                                             n_estimators = 200)
    mod_cv = cross_val_score(mod_reg, X_train,Y_train, 
                             cv = 10)
    score = np.append(score, np.mean(mod_cv))
    print("j->", j, "Mean->", np.mean(mod_cv))
param = np.argmax(score) + 2
param

j-> 2 Mean-> 0.4740290715469115
j-> 3 Mean-> 0.5338843282619978
j-> 4 Mean-> 0.5988818073792428
j-> 5 Mean-> 0.6547292592779459
j-> 6 Mean-> 0.6932951959040592
j-> 7 Mean-> 0.7322937874216658
j-> 8 Mean-> 0.7537259307284544
j-> 9 Mean-> 0.775004003094442
j-> 10 Mean-> 0.7865783571819859
j-> 11 Mean-> 0.8001555759783014
j-> 12 Mean-> 0.8071578696268435
j-> 13 Mean-> 0.8138676958695406
j-> 14 Mean-> 0.8137264801370401
j-> 15 Mean-> 0.8151564816327846
j-> 16 Mean-> 0.8197203786595584
j-> 17 Mean-> 0.814298924748005
j-> 18 Mean-> 0.8171564763422559
j-> 19 Mean-> 0.8172930136422447
j-> 20 Mean-> 0.822441829204311
j-> 21 Mean-> 0.8197279352647199


20

In [13]:
mod_reg_rf = ensemble.RandomForestClassifier(max_depth = param,
                                             max_features = 4,
                                             n_estimators = 100)
mod_reg_rf_fit = mod_reg_rf.fit(X_train,Y_train)
error = mod_reg_rf_fit.score(X_test,Y_test)
print(param)
print("Error - > ", error)

20
Error - >  0.8286666666666667


In [14]:
Y_predict = mod_reg_rf_fit.predict(X_train)
print(classification_report(Y_train, Y_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1402
           1       1.00      1.00      1.00      1387
           2       1.00      1.00      1.00      1407
           3       1.00      1.00      1.00      1409
           4       1.00      1.00      1.00      1395

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000



In [15]:
Y_predict = mod_reg_rf_fit.predict(X_test)
print(classification_report(Y_test, Y_predict))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       596
           1       0.82      0.83      0.82       613
           2       0.85      0.84      0.85       598
           3       0.86      0.81      0.83       592
           4       0.79      0.85      0.81       601

    accuracy                           0.83      3000
   macro avg       0.83      0.83      0.83      3000
weighted avg       0.83      0.83      0.83      3000



In [16]:
score = np.array([])
for j in np.arange(20) + 2:
    mod_reg = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = j), 
                                         n_estimators = 100)
    mod_cv = cross_val_score(mod_reg, X_train,Y_train, 
                             cv = 10)
    score = np.append(score, np.mean(mod_cv))
    print("j->", j, "Mean->", np.mean(mod_cv))
param = np.argmax(score) + 2
param

j-> 2 Mean-> 0.64057920233902
j-> 3 Mean-> 0.677572423930647
j-> 4 Mean-> 0.6887253714850747
j-> 5 Mean-> 0.7172912443458394
j-> 6 Mean-> 0.7612737077897223
j-> 7 Mean-> 0.7894312180161278
j-> 8 Mean-> 0.810275627098985
j-> 9 Mean-> 0.8211457121534081
j-> 10 Mean-> 0.8302912594794828
j-> 11 Mean-> 0.8328651592332077
j-> 12 Mean-> 0.8351463918746738
j-> 13 Mean-> 0.8341463953632988
j-> 14 Mean-> 0.8368694761868388
j-> 15 Mean-> 0.8415733779001819
j-> 16 Mean-> 0.8387280754131436
j-> 17 Mean-> 0.8382817180434939
j-> 18 Mean-> 0.8395766432580215
j-> 19 Mean-> 0.837430713496613
j-> 20 Mean-> 0.7657628340349174
j-> 21 Mean-> 0.7151988680533717


15

In [17]:
mod_reg_ada = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = param), 
                                         n_estimators = 100)
mod_reg_ada_fit = mod_reg_ada.fit(X_train,Y_train)
error = mod_reg_ada_fit.score(X_test,Y_test)
print(param)
print("Error - > ", error)

15
Error - >  0.8466666666666667
