# Outliers

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Load the three sample datasets from text files in the working directory.
d1 = np.loadtxt("outlier_1d.txt")
d2 = np.loadtxt("outlier_2d.txt")
d3 = np.loadtxt("outlier_curve.txt")
print(d1.shape, d2.shape)

# First figure: d1 values on the x axis against random vertical jitter centred
# on y=7 (spread 0.2) so the points do not collapse onto a single line, plus
# the first two columns of d2. The jitter is drawn from a locally seeded
# generator so the figure is identical on every Restart & Run All and the
# global np.random state is left untouched.
rng = np.random.default_rng(0)
plt.scatter(d1, rng.normal(7, 0.2, size=d1.size), s=1, alpha=0.5)
plt.scatter(d2[:, 0], d2[:, 1], s=5)
plt.show();
# Second figure: d3 drawn as a line, first column on x and second column on y.
plt.plot(d3[:, 0], d3[:, 1]);

# Básico

In [None]:
# Imported at the top of the cell so the dependency is visible before any work runs.
from scipy.stats import norm

# z-score rejection on d1: distance of each value from the sample mean,
# measured in units of the sample standard deviation.
media, std = np.mean(d1), np.std(d1)
z_score = np.abs((d1 - media) / std)
umbral = 3
buenos = z_score < umbral  # True for the points that are kept

print(f"Rechazar {(~buenos).sum()} puntos")
# Two-sided tail probability of a standard normal beyond +/- umbral, in percent.
# The message interpolates `umbral` so it stays correct if the threshold is changed.
print(f"z-score de {umbral} corresponde a la probabilidad de {100 * 2 * norm.sf(umbral):0.2f}%")

# Random y positions are only a visual aid to spread the points vertically;
# a locally seeded generator keeps the figure reproducible across runs.
rng = np.random.default_rng(0)
visual_scatter = rng.normal(size=d1.size)
plt.scatter(d1[buenos], visual_scatter[buenos], s=2, label="Good", color="#4CAF50")
plt.scatter(d1[~buenos], visual_scatter[~buenos], s=8, label="Bad", color="#F44336")
plt.legend();

In [None]:
from scipy.stats import multivariate_normal as mn

# Describe d2 with a single multivariate Gaussian: the column-wise mean and the
# covariance between columns (np.cov treats rows as variables, hence the transpose).
media = d2.mean(axis=0)
cov = np.cov(d2.T)
# Keep the points whose density under that Gaussian exceeds 0.01 / 100.
buenos = mn.pdf(d2, mean=media, cov=cov) > 0.01 / 100

# Kept points in green, rejected points larger and in red.
plt.scatter(d2[buenos][:, 0], d2[buenos][:, 1], s=2, label="Good", color="#4CAF50")
plt.scatter(d2[~buenos][:, 0], d2[~buenos][:, 1], s=8, label="Bad", color="#F44336")
plt.legend();

# Valores atípicos en el ajuste de curvas

In [None]:
# Split the curve data into its two columns.
xs, ys = np.transpose(d3)
# Least-squares polynomial of degree 5 fitted to all points, then evaluated
# back at the same x positions.
p = np.polyfit(xs, ys, 5)
ps = np.polyval(p, xs)
plt.plot(xs, ys, ".", markersize=1, label="Datos")
plt.plot(xs, ps, label="Mal ajuste del polinomio")
plt.legend();

In [None]:
# Iterative outlier rejection on the curve data: fit a polynomial, discard the
# points lying too far above it, and refit on the survivors. Runs at most
# 5 passes and stops early once a pass discards nothing.
grado = 5
x, y = xs.copy(), ys.copy()  # work on copies so xs / ys are left as they were
for i in range(5):
    p = np.polyfit(x, y, deg=grado)
    ps = np.polyval(p, x)
    # One-sided cut: only points 3 or more above the fit are rejected; points
    # far below the fit are kept.
    # NOTE(review): use np.abs(y - ps) < 3 if low outliers should be removed too.
    buenos = y - ps < 3

    x_mala, y_mala = x[~buenos], y[~buenos]
    x, y = x[buenos], y[buenos]

    plt.plot(x, y, ".", label="Datos Usados", ms=1)
    # `i` is the pass counter, not the polynomial degree (which is fixed), so
    # the label reports both. ps[buenos] is the fit already evaluated at the
    # surviving x values, so the polynomial is not evaluated a second time.
    plt.plot(x, ps[buenos], label=f"Polinomio Ajustado de grado {grado} (iteración {i})")
    plt.plot(x_mala, y_mala, ".", label="Datos No Usados", ms=5, c="r")
    plt.legend()
    plt.show()

    # Converged: this pass rejected no points.
    if (~buenos).sum() == 0:
        break

# Automatizándolo

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor over d2 with 20 neighbours and contamination=0.005.
lof = LocalOutlierFactor(contamination=0.005, n_neighbors=20)
# Points labelled 1 by fit_predict are kept as good
# (per the scikit-learn docs: 1 marks inliers, -1 marks outliers).
good = np.equal(lof.fit_predict(d2), 1)
# Kept points in green, rejected points larger and in red.
plt.scatter(d2[good][:, 0], d2[good][:, 1], s=2, label="Buenos", color="#4CAF50")
plt.scatter(d2[~good][:, 0], d2[~good][:, 1], s=8, label="Malos", color="#F44336")
plt.legend();