In [27]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
SEED = 42

In [3]:
X, y = load_boston(return_X_y=True)

In [4]:
#kfold
kfold = KFold(n_splits=5, shuffle=True, random_state=19)
kfold.get_n_splits(X, y);

In [5]:
# standadization
std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)

### Comparar os resultados dos seguintes algoritmos: DecisionTree, RandomForest e Gradient Boosting
#### Regressão

In [6]:
learning_rate =  [0.1, 0.05, 0.01]
n_estimators = [50, 100, 200]
max_depth = [3, 5, 7]

In [7]:
# RandomForestRegressor
rfr_rmse = []
rfr_mae = []

gbr_rmse = []
gbr_mae = []

dtr_rmse = []
dtr_mae = []

modelos_gbr = dict()
for train_index, test_index in kfold.split(X,y):
    for estimator in n_estimators:
        for depth in max_depth:
            # Random Forest
            randomForestRegressor = RandomForestRegressor(max_depth=depth, random_state=SEED, n_estimators=estimator, n_jobs=-1)
            randomForestRegressor.fit(X[train_index], y[train_index])
            y_pred = randomForestRegressor.predict(X[test_index])
            rfr_rmse.append(np.sqrt(metrics.mean_squared_error(y[test_index], y_pred)))
            rfr_mae.append(metrics.mean_absolute_error(y[test_index], y_pred))
            # Gradiente Boosting
            for rate in learning_rate:
                gbr = GradientBoostingRegressor(learning_rate=rate, n_estimators=estimator,random_state=SEED)
                gbr.fit(X[train_index], y[train_index])
                y_pred = gbr.predict(X[test_index])
                gbr_rmse.append(np.sqrt(metrics.mean_squared_error(y[test_index], y_pred)))
                gbr_mae.append(metrics.mean_absolute_error(y[test_index], y_pred))
print("# Random Forest")
print(np.mean(rfr_rmse))
print(np.mean(rfr_mae))

print("\n# Gradiente Boosting")
print(np.mean(gbr_rmse))
print(np.mean(gbr_mae))

# DecisionTreeRegressor
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = DecisionTreeRegressor(random_state=SEED)
    clf.fit(X_train, y_train);
    y_pred_gd = clf.predict(X_test)
    rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred_gd))
    mae = metrics.mean_absolute_error(y_test, y_pred_gd)
    dtr_rmse.append(rmse)
    dtr_mae.append(mae)
print("\n# DecisionTreeRegressor")
print(np.mean(dtr_rmse))
print(np.mean(dtr_mae)) 

# Random Forest
3.452894799895121
2.4528569883897884

# Gradiente Boosting
3.690215043177053
2.7070969185016533

# DecisionTreeRegressor
4.429473517786514
2.913828382838284


In [8]:
# o melhor modelo foi o randomForestRegressor
randomForestRegressor = RandomForestRegressor(max_depth=5, random_state=SEED, n_estimators=100, n_jobs=-1)
randomForestRegressor.fit(X[train_index], y[train_index])
randomForestRegressor.feature_importances_

array([4.06645591e-02, 1.82781632e-04, 3.41046471e-03, 2.34851532e-04,
       1.19450984e-02, 4.14251230e-01, 9.35151803e-03, 4.79532318e-02,
       3.41338613e-03, 9.63349633e-03, 8.56043283e-03, 3.81017192e-03,
       4.46588778e-01])

#### Classificação

In [16]:
X,y = load_breast_cancer(return_X_y=True) # Dataset: breast cancer wisconsin dataset

In [17]:
std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)

In [33]:
dtc_rmse = []
dtc_mae = []

rfc_rmse = []
rfc_mae = []

gbc_rmse = []
gbc_mae = []
# DecisionTreeClassifier
for train_index, test_index in kfold.split(X, y):
    dtc = DecisionTreeClassifier(random_state=SEED)
    dtc.fit(X[train_index], y[train_index])
    y_pred = dtc.predict(X[test_index])
    rmse = np.sqrt(metrics.mean_squared_error(y[test_index],y_pred))
    mae = metrics.mean_absolute_error(y[test_index],y_pred)
    dtc_rmse.append(rmse)
    dtc_mae.append(mae)
    for estimator in n_estimators:
        for depth in max_depth:
            # RandomForestClassifier
            rfc = RandomForestClassifier(n_estimators=estimator, max_depth=depth, random_state=SEED, n_jobs=-1)
            rfc.fit(X[train_index], y[train_index])
            y_pred = rfc.predict(X[test_index])
            rmse = np.sqrt(metrics.mean_squared_error(y[test_index],y_pred))
            mae = metrics.mean_absolute_error(y[test_index],y_pred)
            rfc_rmse.append(rmse)
            rfc_mae.append(mae)
            for rate in learning_rate:
                # GradientBoostingClassifier
                gbc = GradientBoostingClassifier(learning_rate=rate, n_estimators=estimator, max_depth=depth, random_state=SEED)
                gbc.fit(X[train_index], y[train_index])
                y_pred = gbc.predict(X[test_index])
                rmse = np.sqrt(metrics.mean_squared_error(y[test_index],y_pred))
                mae = metrics.mean_absolute_error(y[test_index],y_pred)
                gbc_rmse.append(rmse)
                gbc_mae.append(mae)
print("# DecisionTreeClassifier")
print(np.mean(dtc_rmse))
print(np.mean(dtc_mae))
print("\n# RandomForestClassifier")
print(np.mean(rfc_rmse))
print(np.mean(rfc_mae))
print("\n# GradientBoostingClassifier")
print(np.mean(gbc_rmse))
print(np.mean(gbc_mae))

# DecisionTreeClassifier
0.24838620074695766
0.06334420121099207

# RandomForestClassifier
0.20894175200325343
0.04528454863806518

# GradientBoostingClassifier
0.2503443717222367
0.06520841599245575


In [36]:
# o melhor modelo foi o randomForestClassifier
randomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED, n_jobs=-1)
randomForestClassifier.fit(X[train_index], y[train_index])
randomForestClassifier.feature_importances_

array([0.03398677, 0.01510743, 0.08334399, 0.05962119, 0.0042937 ,
       0.01100051, 0.05899307, 0.05891376, 0.00353153, 0.00390844,
       0.0211925 , 0.00214323, 0.01178205, 0.03952919, 0.00244982,
       0.00396661, 0.00458008, 0.00238788, 0.00318195, 0.00384533,
       0.07456312, 0.01939381, 0.10111154, 0.16413222, 0.00822359,
       0.01232358, 0.03576346, 0.14167411, 0.0091443 , 0.00591121])