In [None]:
# Upgrade matplotlib on Colab
# !pip install matplotlib --upgrade

## Métodos de clasificación

### Análisis de discriminantes lineales

- Es un método para clasificar elementos
  - Cada variable aleatoria se asocia con una dimensión geométrica
- Divide el espacio n-dimensional
  - por medio de líneas rectas (planos, hiperplanos)
- Los sectores que se obtienen separan las categorías.

In [None]:
import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np

# Two synthetic 2-D Gaussian classes centred at (1, 1) and (-1, -1).
# A covariance matrix must be symmetric positive semi-definite. The previous
# [[0.0, 0.1], [0.1, 0.0]] has eigenvalues +0.1 and -0.1, so it is not a valid
# covariance and SciPy's multivariate_normal rejects it. An isotropic
# covariance (variance 0.1 on each axis, no correlation) is used instead.
rng = np.random.default_rng(0)  # fixed seed so the samples are reproducible
cov = [[0.1, 0.0], [0.0, 0.1]]

data_a = st.multivariate_normal.rvs([1, 1], cov, size=100, random_state=rng)
data_b = st.multivariate_normal.rvs([-1, -1], cov, size=100, random_state=rng)
# data_c = st.multivariate_normal.rvs([1, -1], cov, size=100, random_state=rng)

# Stack the classes row-wise: rows 0-99 are data_a, rows 100-199 are data_b.
# np.vstack is used because np.row_stack is a deprecated alias of it.
joint_data = np.vstack(
    [
        data_a,
        data_b,
        # data_c
    ]
)

plt.scatter(
    joint_data[:, 0],
    joint_data[:, 1],
)
plt.xlim(-4, 4)
plt.ylim(-4, 4)


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Class labels aligned with the rows of joint_data: the first 100 rows are
# class 1 and the next 100 are class 2 (append 100 threes for a third class).
labels = [1] * 100 + [2] * 100  # + [3] * 100

lda = LinearDiscriminantAnalysis()
fitted = lda.fit(joint_data, labels)


Podemos predecir las categorías de nuevos datos.

In [None]:
# Predict a class for every row of joint_data; the resulting array is the
# cell's displayed output.
predicted_labels = fitted.predict(joint_data)
predicted_labels

Podemos extraer los datos de las rectas

In [None]:

# Report the parameters of the fitted linear boundary.
intercepts = fitted.intercept_
coefficients = fitted.coef_
print(f"Los valores de la/s ordenadas al origen es/son: {intercepts}")
print(f"Los valores de los coeficientes son: {coefficients}")


También podemos extraer los puntos medios de cada categoría

In [None]:
# Report the per-class mean vectors stored on the fitted model.
class_means = fitted.means_
print(f"Las medias de las categorías son:\n{class_means}")


Las rectas que describen los planos se definen como:
- $0 = a + b_1 \times X_1 + b_2 \times X_2 + \ldots$

In [None]:
# The boundary satisfies 0 = a + b1*X1 + b2*X2, which rearranges to
# X2 = -(b1/b2)*X1 - a/b2. Slope and offset are read from row 0 of the model.
# intercept_ is indexed like coef_ so that m1 is a scalar: dividing the whole
# intercept_ array only broadcast correctly while it held a single value.
m = fitted.coef_[0, 0] / fitted.coef_[0, 1]
m1 = fitted.intercept_[0] / fitted.coef_[0, 1]
# m1 = fitted.intercept_[2] / fitted.coef_[2, 1]


def line_div(x):
    """Return the X2 coordinate of the decision boundary at X1 = x."""
    return -m * x - m1


# Data points coloured by their true class (first 100 rows vs. next 100).
plt.scatter(
    joint_data[:, 0],
    joint_data[:, 1],
    c=[0] * 100 + [1] * 100  # [2] * 100
)
plt.xlim(-4, 4)
plt.ylim(-4, 4)

# Draw the boundary across the visible X1 range.
xs = np.linspace(-4, 4, 100)
plt.plot(
    xs,
    line_div(xs),
    color="red"
)
plt.text(
    1, 3,
    s="$X_2 = -\\frac{b_1}{b_2}\\times X_1 - \\frac{a}{b_2}$",
    fontsize=16
)
plt.xlabel("$X_1$")
plt.ylabel("$X_2$")
plt.tight_layout()

Ejemplo con el dataset Iris

In [None]:
import sklearn.datasets as datasets

# Load Iris with pandas containers and keep the bundle's combined DataFrame
# (the code below reads its "target" column as the class label).
iris = datasets.load_iris(as_frame=True)
df = iris.frame

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# Hold out part of the Iris rows for evaluation. random_state pins the shuffle
# so the accuracy computed from this split is the same on every run.
train, test = train_test_split(df, random_state=0)

# Fit on the training rows (every column except "target" is a feature) and
# predict the class of the held-out rows.
lda = LinearDiscriminantAnalysis()
fitted = lda.fit(train.drop(columns=["target"]), train["target"])
predicted_classes = fitted.predict(test.drop(columns=["target"]))

In [None]:
# Element-wise hit/miss flags for the held-out rows.
is_correct = predicted_classes == test["target"]
prediction_table = is_correct.value_counts()

# The mean of the boolean flags is the fraction of correct predictions.
# Unlike prediction_table[True], it does not raise KeyError when no
# prediction is correct (value_counts omits labels that never occur).
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy*100:0.2f}%")

### Análisis de discriminantes cuadráticos

In [None]:

import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np

# Three synthetic 2-D Gaussian clusters: one centred at (0, 1.3) and two
# centred at (-1, 0) and (1, 0).
# A covariance matrix must be symmetric positive semi-definite. The previous
# matrices had zeros on the diagonal and a positive off-diagonal, giving one
# negative eigenvalue, so SciPy's multivariate_normal rejects them. Isotropic
# covariances with the same magnitudes (0.1 and 0.2) are used instead.
rng = np.random.default_rng(0)  # fixed seed so the samples are reproducible
cov_a = [[0.1, 0.0], [0.0, 0.1]]
cov_bc = [[0.2, 0.0], [0.0, 0.2]]

data_a = st.multivariate_normal.rvs([0, 1.3], cov_a, size=100, random_state=rng)
data_b = st.multivariate_normal.rvs([-1, 0], cov_bc, size=100, random_state=rng)
data_c = st.multivariate_normal.rvs([1, 0], cov_bc, size=100, random_state=rng)

# Stack row-wise: rows 0-99 are data_a, 100-199 data_b, 200-299 data_c.
# np.vstack is used because np.row_stack is a deprecated alias of it.
joint_data = np.vstack(
    [
        data_a,
        data_b,
        data_c
    ]
)

# Two colours only: data_a versus data_b and data_c together.
plt.scatter(
    joint_data[:, 0],
    joint_data[:, 1],
    c=[0] * 100 + [1] * 200
)
plt.xlim(-4, 4)
plt.ylim(-4, 4)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# Append the class label as a third column: the first 100 rows are class 0
# and the remaining 200 are class 1. Slicing the two feature columns first
# keeps the cell idempotent, so re-running it does not stack yet another
# label column onto joint_data.
joint_data = np.column_stack([joint_data[:, :2], [0] * 100 + [1] * 200])

# random_state pins the shuffle so the evaluation is reproducible.
train, test = train_test_split(joint_data, random_state=0)

qda = QuadraticDiscriminantAnalysis()

# Fit on the training split only. Fitting on all of joint_data would let the
# model see the test rows and make the held-out accuracy optimistic.
fitted = qda.fit(train[:, [0, 1]], train[:, 2])

predicted = fitted.predict(test[:, [0, 1]])

In [None]:
import pandas as pd
# Hit/miss flags: column 2 of the test split holds the true class label.
is_correct = predicted == test[:, 2]
prediction_table = pd.Series(is_correct).value_counts()

print(prediction_table)

# The mean of the boolean flags is the fraction of correct predictions.
# Unlike prediction_table[True], it does not raise KeyError when no
# prediction is correct (value_counts omits labels that never occur).
accuracy = is_correct.mean()
print(f"accuracy = {accuracy}")

Podemos visualizar el área de decisión

In [None]:
# Build a regular 50 x 50 lattice covering the square [-4, 4] x [-4, 4].
axis_ticks = np.linspace(-4, 4, 50)
grid_x, grid_y = np.meshgrid(axis_ticks, axis_ticks)
xy = np.array([grid_x.ravel(), grid_y.ravel()])  # shape (2, 2500)
grid_points = xy.T  # one (x, y) lattice point per row

# Colour each lattice point by whether the decision function is positive.
positive_side = fitted.decision_function(grid_points) > 0
plt.scatter(
    grid_points[:, 0],
    grid_points[:, 1],
    c=positive_side,
    alpha=0.3,
    s=5,
)

# Overlay the data: the first 100 rows in red, the remaining 200 in blue.
plt.scatter(
    joint_data[:, 0],
    joint_data[:, 1],
    c=["red"] * 100 + ["blue"] * 200,
)


In [None]:
import pandas as pd

# Toy data set: a binary group label followed by three categorical predictors.
data = [
    [0, "A1", "B1", "C1"],
    [0, "A1", "B1", "C1"],
    [0, "A1", "B1", "C2"],
    [0, "A1", "B1", "C2"],
    [0, "A1", "B1", "C1"],
    [0, "A2", "B1", "C2"],
    [1, "A1", "B3", "C1"],
    [1, "A1", "B3", "C2"],
    [1, "A1", "B1", "C1"],
    [1, "A2", "B1", "C2"],
    [1, "A2", "B1", "C1"],
    [1, "A2", "B1", "C2"],
    [1, "A2", "B3", "C1"],
    [1, "A2", "B3", "C2"],
]

df = pd.DataFrame(
    data=data,
    columns=["Group", "VarA", "VarB", "VarC"]
)

# One-hot encode the categorical predictors. Each variable has two levels, so
# the full encoding yields pairs of indicator columns that always sum to 1,
# i.e. perfectly collinear inputs. drop_first=True keeps one indicator per
# variable and removes that redundancy.
df = pd.get_dummies(df, drop_first=True)

# Select predictors by name rather than by column position.
features = df.drop(columns="Group")

lda = LinearDiscriminantAnalysis()

lda.fit(features, df["Group"])

predicted = lda.predict(features)

# Row-by-row check of the predictions against the true group (cell output).
predicted == df["Group"]