In [1]:
from sklearn import preprocessing
import numpy as np

In [2]:
# Load the data: a small 3x3 training matrix used in the first standardization example.
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

In [3]:
# Data cleaning: standardization.
# The scaler's default options (centering and scaling both enabled) are spelled out
# so the reader can see exactly which transformation is being applied.
standardizer = preprocessing.StandardScaler(with_mean=True, with_std=True)

# Learn the scaling parameters from X_train and apply them to it in a single call.
X_train_std = standardizer.fit_transform(X_train)

In [4]:
# Show the results: print the standardized training matrix produced by fit_transform.
print(X_train_std)

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [5]:
# Example showing that the results are the same when "fit" and "transform"
# are called separately instead of through "fit_transform".
X_train_2 = X_train.copy()

# Per the scikit-learn estimator API, fit() returns the fitted scaler itself;
# the name `f` is kept because the test-set cell further down calls f.transform.
f = standardizer.fit(X_train)

X_train_std_2 = f.transform(X_train_2)
print(X_train_std_2)

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [6]:
# Check whether the two standardized matrices are equal.
# Floating-point results are compared within a tolerance (np.allclose) rather than
# bit-for-bit, and the shapes are compared explicitly first, because NumPy's
# comparison helpers would otherwise accept shapes that are merely broadcastable.
shapes_match = X_train_std.shape == X_train_std_2.shape
print(shapes_match and np.allclose(X_train_std, X_train_std_2))

True


In [7]:
# Standardize a new test data set.
# ONLY "transform" must be applied here, NOT "fit": the test data is scaled with
# the parameters already learned by the fitted scaler `f`.
X_test = np.array(
    [
        [1.0, -24.0, 2.0],
        [400.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

X_test_std = f.transform(X_test)
print(X_test_std)

[[ 0.00000000e+00 -2.93938769e+01  1.33630621e+00]
 [ 4.88673204e+02  0.00000000e+00 -2.67261242e-01]
 [-1.22474487e+00  1.22474487e+00 -1.06904497e+00]]


#### Ejemplo para DETECTAR OUTLIERS a partir de la estandarización 

In [8]:
# Define a larger data matrix and manually place two outliers at positions
# [9, 1] and [14, 2] (row, column; zero-based).
# Note: this rebinds the name X_train used by the earlier 3x3 example.
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [0.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
        [1.0, -1.0, 2.0],
        [1.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
        [1.0, -1.0, 2.0],
        [1.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
        [1.0, 300.0, 2.0],  # outlier at [9, 1]
        [1.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
        [1.0, -1.0, 2.0],
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 300.0],  # outlier at [14, 2]
        [1.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
        [1.0, -1.0, 2.0],
        [1.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ],
    dtype=float,
)

In [9]:
# Standardize the larger matrix.
# A fresh scaler is created (defaults written out explicitly); this rebinds the
# names `standardizer` and `X_train_std` used by the earlier example.
standardizer = preprocessing.StandardScaler(with_mean=True, with_std=True)
X_train_std = standardizer.fit_transform(X_train)

print(X_train_std)

[[ 0.81649658 -0.24630899 -0.2035961 ]
 [-1.22474487 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]
 [ 0.81649658 -0.24630899 -0.2035961 ]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]
 [ 0.81649658 -0.24630899 -0.2035961 ]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]
 [ 0.81649658  4.35859824 -0.2035961 ]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]
 [ 0.81649658 -0.24630899 -0.2035961 ]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116   4.35818119]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]
 [ 0.81649658 -0.24630899 -0.2035961 ]
 [ 0.81649658 -0.2310103  -0.23421206]
 [-1.22474487 -0.2157116  -0.24952003]]


In [10]:
# Outliers are the values whose standardized score is more than 3 units away from
# zero. The absolute value is used so that values far BELOW the mean are flagged
# too, not only values far above it.
OUTLIER_THRESHOLD = 3

# np.where on a 2-D boolean mask returns the row indices and the column indices
# of the entries that are True.
[x, y] = np.where(np.abs(X_train_std) > OUTLIER_THRESHOLD)

print('Outliers detectados en las coordenadas: ')
for row, col in zip(x, y):
    print('[', row, ',', col, ']')


Outliers detectados en las coordenadas: 
[ 9 , 1 ]
[ 14 , 2 ]
