In [9]:
import pickle as pk
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Fetch the 'mnist_784' dataset from OpenML, then pull the feature table (X)
# and the labels (y) out of the returned bunch.
mnist_data = fetch_openml('mnist_784')
X = mnist_data["data"]
y = mnist_data["target"]

In [None]:
# Optional sanity check, left disabled: uncomment to render sample 170 as a 28x28 image.
# plt.imshow(X.to_numpy()[170, :].reshape(28, 28))

In [3]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
# Labels are cast to int before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.astype(int),
    test_size=0.2,
    random_state=42,
)

#### Árvore de decisão

In [4]:
# Candidate hyper-parameters for the decision tree (3 x 3 x 3 = 27 combinations).
param_grid_tree = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}
tree_clf = DecisionTreeClassifier(random_state=42)

# Exhaustive 3-fold cross-validated search scored on accuracy, run on all available cores.
grid_search_tree = GridSearchCV(
    estimator=tree_clf,
    param_grid=param_grid_tree,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_tree.fit(X_train, y_train)

print("Best parameters:", grid_search_tree.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   3.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   3.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   3.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=10; total time=   3.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=10; total time=   3.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=10; total time=   3.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=20; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=20; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=20; total time=   3.2s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=2; total time=   3.2s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=2; total time=   3.4s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=2; total time=   3.4s
[CV] END m

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [5]:
# Predict the held-out labels with the best tree selected by the grid search
# (GridSearchCV.predict delegates to this same refit estimator).
y_pred_tree = grid_search_tree.best_estimator_.predict(X_test)

In [6]:
# Share of test samples the tuned decision tree labels correctly.
acc_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
acc_tree

0.8742142857142857

In [10]:
# Persist the best decision tree found by the grid search.
# The target directory is created first so the cell does not fail with
# FileNotFoundError when '../modelos' is missing (e.g. on a fresh checkout).
tree_model_path = Path('../modelos/model_tree.pkl')
tree_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(tree_model_path, 'wb') as pkl:
    pk.dump(grid_search_tree.best_estimator_, pkl)

#### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate hyper-parameters for the random forest (3 x 3 x 3 x 3 = 81 combinations).
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
rf_clf = RandomForestClassifier(random_state=42)

# Same search protocol as the decision tree: 3-fold CV, accuracy scoring, all cores.
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

print("Best parameters:", grid_search_rf.best_params_)

In [7]:
# Evaluate the tuned random forest on the held-out test set.
y_pred_rf = grid_search_rf.predict(X_test)
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

print(f"Random Forest accuracy: {acc_rf:.2f}")

Random Forest accuracy: 0.94


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


#### XGBoost

In [11]:
import xgboost as xgb

# Fixed (untuned) XGBoost configuration: 100 boosting rounds, trees up to depth 10.
# NOTE(review): learning_rate=1 is well above the library default; confirm this is intentional.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=1,
    random_state=42,
    n_jobs=-1,
    verbosity=1,
)
xgb_clf.fit(X_train, y_train)

In [12]:
# Evaluate the XGBoost classifier on the held-out test set.
y_pred_xgb = xgb_clf.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
# The label previously read "Random Forest" (copy-paste slip); this score belongs to XGBoost.
print(f"XGBoost accuracy: {acc_xgb:.2f}")

Random Forest accuracy: 0.97


In [13]:
# Persist the fitted XGBoost classifier.
# The target directory is created first so the cell does not fail with
# FileNotFoundError when '../modelos' is missing (e.g. on a fresh checkout).
xgb_model_path = Path('../modelos/model_xgb.pkl')
xgb_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(xgb_model_path, 'wb') as pkl:
    pk.dump(xgb_clf, pkl)

#### Visualização da Árvore de Decisão e Medida de Impureza

In [None]:
# Visualize the decision tree trained in the previous exercise, using tools such as Scikit-learn or Graphviz.
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the best tree from the grid search on an explicit 20x10 inch axes;
# filled=True colors each node.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(grid_search_tree.best_estimator_, filled=True, ax=ax)
plt.show()

In [17]:
# Analyze how the impurity measure affects which attribute is chosen to split the data at each tree node.
# Grab the fitted tree's low-level structure, then its per-node split feature index and threshold arrays.
tree = grid_search_tree.best_estimator_.tree_
feature_indices, thresholds = tree.feature, tree.threshold

In [18]:
# Walk every node of the fitted tree and report the split rule of internal nodes only.
for i in range(tree.node_count):
    # A node whose left and right child ids are equal is a leaf: nothing to report.
    if tree.children_left[i] == tree.children_right[i]:
        continue
    print(f"Node {i}: Split on feature {feature_indices[i]} at threshold {thresholds[i]}")

Node 0: Split on feature 350 at threshold 126.5
Node 1: Split on feature 435 at threshold 0.5
Node 2: Split on feature 597 at threshold 1.5
Node 3: Split on feature 486 at threshold 0.5
Node 4: Split on feature 404 at threshold 0.5
Node 5: Split on feature 538 at threshold 1.0
Node 6: Split on feature 153 at threshold 0.5
Node 7: Split on feature 460 at threshold 21.0
Node 8: Split on feature 542 at threshold 84.5
Node 9: Split on feature 157 at threshold 5.0
Node 10: Split on feature 379 at threshold 164.0
Node 11: Split on feature 582 at threshold 163.5
Node 12: Split on feature 622 at threshold 3.5
Node 13: Split on feature 733 at threshold 80.0
Node 14: Split on feature 488 at threshold 104.0
Node 18: Split on feature 244 at threshold 235.5
Node 21: Split on feature 412 at threshold 233.0
Node 24: Split on feature 633 at threshold 126.0
Node 25: Split on feature 326 at threshold 107.0
Node 27: Split on feature 713 at threshold 133.5
Node 31: Split on feature 386 at threshold 64.0
N