In [None]:
# upgrade matplotlib on Colab
# !pip install matplotlib --upgrade

# Unidad 3 - Clasificación

## Árboles de decisión

- Divide el espacio n-dimensional de las variables aleatorias
  - Hace sucesivas divisiones
    - Cada división parte el espacio en dos
    - La división se da por el valor de una única variable.
    - Cada una de las subregiones puede volver a subdividirse.
  - Luego de una serie de divisiones quedan definidos espacios que se
    asignan a una categoría.
  - Durante el entrenamiento
    - Se eligen las variables y los valores de corte de cada división


In [None]:
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt

# Noise added around every cluster centre: independent Gaussian noise with
# variance 0.1 on each variable. The earlier matrix [[0, 0.1], [0.1, 0]] had a
# zero diagonal and eigenvalues +0.1 / -0.1, so it was not positive
# semi-definite and therefore not a valid covariance matrix.
# NOTE(review): isotropic noise is the assumed intent - confirm.
noise_cov = [[0.1, 0], [0, 0.1]]
# Fixed seed so the data set is identical every time the cell is re-run.
noise_seed = 0

# Class 0: four cluster centres, 25 points per centre.
data_a = np.array(
  [[0, 0]] * 25 +
  [[1, 1]] * 25 +
  [[2, 0.5]] * 25 +
  [[3, 0]] * 25
)
data_a2 = st.multivariate_normal.rvs(
  [0, 0],
  noise_cov,
  size = 100,
  random_state = noise_seed
)
data_a = data_a + data_a2

# Class 1: four different cluster centres, 25 points per centre.
data_b = np.array(
  [[-1.5, -0.5]] * 25 +
  [[1.1, -0.5]] * 25 +
  [[-0.3, -1]] * 25 +
  [[-1.4, 0.7]] * 25
)
data_b2 = st.multivariate_normal.rvs(
  [0, 0],
  noise_cov,
  size = 100,
  # A different seed, otherwise both classes would get the same noise sample.
  random_state = noise_seed + 1
)
data_b = data_b + data_b2

# Rows 0-99 belong to class 0 and rows 100-199 to class 1.
# np.vstack replaces np.row_stack, which is a deprecated alias in NumPy 2.0.
data = np.vstack((data_a, data_b))

# Scatter of the whole data set: column 0 on the x axis, column 1 on the
# y axis, coloured by class (first 100 rows -> 0, last 100 rows -> 1).
plt.scatter(data[:, 0], data[:, 1], c=[0] * 100 + [1] * 100)
plt.xlabel("Var 1")
plt.ylabel("Var 2")

In [None]:

# Same scatter as the previous cell, overlaid with axis-aligned cuts drawn by
# hand. Earlier cuts are redrawn in gray as new ones are added in red; the
# call order below is kept so the lines stack in the same order.
plt.xlabel("Var 1")
plt.ylabel("Var 2")
plt.scatter(data[:, 0], data[:, 1], c=[0] * 100 + [1] * 100)

# Segments that are drawn more than once, as (x values, y values).
cut_var2_0 = ([-2, 4], [0, 0])
cut_var1_high = ([1.8, 1.8], [-1.5, 0])
cut_var1_low = ([-0.80, -0.80], [0, 1.5])


def mark_cut(label, xy, xytext):
  """Label a cut: right-aligned text at ``xytext`` with an arrow to ``xy``."""
  plt.annotate(
    label,
    xy=xy,
    xytext=xytext,
    arrowprops={"arrowstyle": "->"},
    ha="right",
  )


# First cut
plt.plot(*cut_var2_0, color="red")
mark_cut("$Var2 >= 0$", (3.5, 0), (4, 1.5))

# Second cut
plt.plot(*cut_var2_0, color="gray")
plt.plot(*cut_var1_high, color="red")
mark_cut("$Var1 >= 1.8$", (1.8, -0.75), (4, -1.5))

# Third cut
plt.plot(*cut_var2_0, color="gray")
plt.plot(*cut_var1_high, color="gray")
plt.plot(*cut_var1_low, color="red")
mark_cut("$Var1 >= -0.8$", (-0.8, 0.75), (4, -0.7))

# Remaining cuts (not annotated)
plt.plot(*cut_var2_0, color="gray")
plt.plot(*cut_var1_high, color="gray")
plt.plot([-2, 1.8], [-0.4, -0.4], color="gray")
plt.plot([-0.8, -0.8], [0, -0.4], color="red")
plt.plot([0.70, 0.70], [0, -0.4], color="red")
plt.plot(*cut_var1_low, color="red")

Veamos cómo se hace en Python.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the synthetic data (rows 0-99 are class 0, rows
# 100-199 are class 1). random_state is fixed because scikit-learn permutes
# the features at every split, so equally good splits could otherwise be
# picked differently from one run to the next.
dt = DecisionTreeClassifier(random_state=0)
fitted = dt.fit(data, [0]*100 + [1]*100)

In [None]:
# Predict the training points themselves. The estimator is referenced as `dt`
# (as the plotting cells do) instead of the shared name `fitted`, which later
# cells rebind to other models; that would break this cell when it is re-run
# out of order.
predicted = dt.predict(data)
predicted

In [None]:
# True only when every training point is assigned its own label
# (100 zeros followed by 100 ones).
np.array_equal(predicted, np.repeat([0, 1], 100))

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Shade the plane with the class the tree predicts at each location
# (semi-transparent), then draw the training points on top.
DecisionBoundaryDisplay.from_estimator(
  estimator=dt, X=data, alpha=0.3, response_method="predict"
)
plt.scatter(data[:, 0], data[:, 1], c=[0] * 100 + [1] * 100)


In [None]:
from sklearn.tree import plot_tree

# Draw the tree fitted on the synthetic data on its own 10x8 figure, with
# nodes filled in colour.
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
plot_tree(decision_tree=dt, ax=axes, filled=True)
fig.tight_layout()

In [None]:
from sklearn import datasets

# Load the iris data set as pandas objects and keep the combined frame,
# which includes the "target" column used for training further down.
iris = datasets.load_iris(as_frame=True)
df = iris.frame

# Show the frame as the cell output.
df

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import pandas as pd

# Hold out part of the rows for testing (train_test_split's default
# proportion). Both random_state values are fixed so the split, the fitted
# tree and the reported accuracy are the same on every run.
train, test = train_test_split(df, random_state=0)
tree = DecisionTreeClassifier(random_state=0)

# Train on every column except "target", which is the label.
fitted = tree.fit(train.drop(columns=["target"]), train["target"])

predicted = fitted.predict(test.drop(columns=["target"]))

# Count correct (True) and wrong (False) predictions on the test rows.
prediction = pd.Series(predicted==test["target"]).value_counts()
# .get() returns 0 instead of raising KeyError when there is no True entry,
# i.e. when not a single prediction is correct.
accuracy = prediction.get(True, 0) / prediction.sum()

print(f"Accuracy: {accuracy}")


In [None]:
from sklearn.tree import plot_tree

# Plot the iris tree through `tree` (the estimator that was fitted) rather
# than the shared name `fitted`, which the random forest cell rebinds; with
# `fitted` this cell fails if it is re-run after that cell.
plot_tree(
  tree,
  filled = True
)

### RandomForest

- Se construyen muchos árboles de decisión
- Se elige por votación simple.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

iris = load_iris()

X = iris.data
y = iris.target

# Split testing and training data. The seed is fixed so the hold-out set,
# and with it every score printed below, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create Random Forest Classifier with 100 trees. The forest is seeded too:
# seeding only the K-fold splitter still left the scores changing between runs.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)

# Create 5-fold cross-validation (shuffled, seeded)
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

# Perform cross-validation on the training portion only, so the test set
# stays untouched until the final evaluation.
cv_scores = cross_val_score(
  rf_classifier,
  X_train,
  y_train,
  cv=k_fold,
  scoring = "accuracy"
)

# Cross-validation scores: one accuracy per fold, then their mean and
# standard deviation.
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
print("Mean CV deviation:", cv_scores.std())

# Train the model on the whole training set and predict the hold-out set
fitted = rf_classifier.fit(X_train, y_train)
predicted = fitted.predict(X_test)

# Accuracy of that model on the hold-out set
acc = accuracy_score(y_test, predicted)
print("Full model accuracy:", acc)
