<a href="https://colab.research.google.com/github/javilledo/machine-learning/blob/master/notebooks/t10_01_an%C3%A1lisis_de_componentes_principales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Análisis de Componentes Principales - Paso a Paso

* Normalizar los datos por columnas (para cada una de las m columnas)
* Obtener los vectores y valores propios a partir de la matriz de covarianzas o de correlaciones, o incluso mediante la descomposición en valores singulares (Singular Value Decomposition)
* Ordenar los valores propios en orden descendente y quedarnos con los *p* que se correspondan a los *p-mayores* y así disminuir el número de variables del dataset (p < m)
* Construir la matriz de proyección W a partir de los p vectores propios
* Transformar el dataset original X a través de W para así obtener los datos proyectados en el subespacio de dimensión *p*, que será Y

In [1]:
import pandas as pd

In [2]:
# Read the iris CSV straight from the course repository on GitHub.
url = 'https://raw.githubusercontent.com/javilledo/python-ml-course/master/datasets/iris/iris.csv'
df = pd.read_csv(filepath_or_buffer = url)
# Preview the first five rows (the default size of head()).
df.head(n = 5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [65]:
# Columns 0-3 of the frame become the feature matrix X; column 4 becomes the label vector y.
X, y = df.iloc[:, :4].values, df.iloc[:, 4].values
(X, y)

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [4]:
# Sanity-check the dimensions of the feature matrix and the label vector.
# The labels are stored in lowercase `y`; uppercase `Y` is only created further down as the
# PCA projection, so using it here raises NameError when the notebook is run top to bottom.
X.shape, y.shape

((150, 4), (150,))

In [5]:
# Install a pinned legacy plotly release in the notebook environment.
# NOTE(review): presumably pinned because the `plotly.plotly` module imported in the next cell
# is not shipped by the 4.x release this replaces — confirm before changing the version.
!pip install plotly==2.5.1

Collecting plotly==2.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/a7/3d/4dcdbafc9d5c01f468d41999cd9ab733f38e9ea4e4bea5a62841fedf5f0e/plotly-2.5.1.tar.gz (24.9MB)
[K     |████████████████████████████████| 24.9MB 1.5MB/s 
Building wheels for collected packages: plotly
  Building wheel for plotly (setup.py) ... [?25l[?25hdone
  Created wheel for plotly: filename=plotly-2.5.1-cp36-none-any.whl size=24951979 sha256=773a412a0d0f7a57f4992aefbc78266c23707683148e2c96b8538102988bfc16
  Stored in directory: /root/.cache/pip/wheels/33/be/39/f82c0f53ea29777fdc29afaf7bfad87442488a280662d355fb
Successfully built plotly
[31mERROR: cufflinks 0.17.3 has requirement plotly>=4.1.1, but you'll have plotly 2.5.1 which is incompatible.[0m
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-2.5.1


In [6]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

In [7]:
import os
from getpass import getpass

# Never commit the Plotly API key: take it from the environment, or ask for it interactively
# when the variable is not set. A key that was ever stored in the notebook should be revoked.
plotly_username = os.environ.get('PLOTLY_USERNAME', 'javilledo')
plotly_api_key = os.environ.get('PLOTLY_API_KEY') or getpass('Plotly API key: ')
tls.set_credentials_file(username = plotly_username, api_key = plotly_api_key)

In [8]:
# Overlaid per-species histograms of the four raw features, one x-axis per feature.
traces = []

# Per-feature legend switch; every entry is False, so no trace adds a legend entry.
legend = {0: False, 1: False, 2: False, 3: False}

colors = {'setosa': 'rgb(255,127,20)', 'versicolor': 'rgb(31, 220, 120)', 'virginica': 'rgb(44, 50, 180)'}

for col in range(4):
  for key in colors:
    # The species labels live in lowercase `y`; uppercase `Y` does not exist yet at this point
    # (it is created later as the PCA projection), so masking with it breaks a fresh run.
    traces.append(Histogram(x = X[y == key, col], opacity = 0.7, xaxis = 'x%s'%(col+1), marker = Marker(color = colors[key]), name = key, showlegend = legend[col]))

data = Data(traces)
# Each feature gets its own horizontal slice (domain) of the figure.
layout = Layout(barmode = 'overlay', 
                xaxis = XAxis(domain = [0, 0.25], title = 'Longitud sépalos (cm)'),
                xaxis2 = XAxis(domain = [0.3, 0.5], title = 'Anchura sépalos (cm)'),
                xaxis3 = XAxis(domain = [0.55, 0.75], title = 'Longitud pétalos (cm)'),
                xaxis4 = XAxis(domain = [0.8,1.0], title = 'Anchura pétalos (cm)'),
                yaxis = YAxis(title = 'Número de ejemplares'),
                title = 'Distribución de los rasgos de las diferentes flores iris')

fig = Figure(data = data, layout = layout)
py.iplot(fig)

PlotlyError: ignored

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Standardise each feature column with StandardScaler: fit the scaler on X, then transform X.
X_std = StandardScaler().fit(X).transform(X)
X_std

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [11]:
# Same overlaid per-species histograms as for the raw data, now on the standardised features.
traces = []

# Per-feature legend switch; every entry is False, so no trace adds a legend entry.
legend = {0: False, 1: False, 2: False, 3: False}

colors = {'setosa': 'rgb(255,127,20)', 'versicolor': 'rgb(31, 220, 120)', 'virginica': 'rgb(44, 50, 180)'}

for col in range(4):
  for key in colors:
    # The species labels live in lowercase `y`; uppercase `Y` does not exist yet at this point
    # (it is created later as the PCA projection), so masking with it breaks a fresh run.
    traces.append(Histogram(x = X_std[y == key, col], opacity = 0.7, xaxis = 'x%s'%(col+1), marker = Marker(color = colors[key]), name = key, showlegend = legend[col]))

data = Data(traces)
# The plotted values are standardised (unitless), so the axis titles must not claim centimetres.
layout = Layout(barmode = 'overlay', 
                xaxis = XAxis(domain = [0, 0.25], title = 'Longitud sépalos (estandarizada)'),
                xaxis2 = XAxis(domain = [0.3, 0.5], title = 'Anchura sépalos (estandarizada)'),
                xaxis3 = XAxis(domain = [0.55, 0.75], title = 'Longitud pétalos (estandarizada)'),
                xaxis4 = XAxis(domain = [0.8,1.0], title = 'Anchura pétalos (estandarizada)'),
                yaxis = YAxis(title = 'Número de ejemplares'),
                title = 'Distribución de los rasgos de las diferentes flores iris')

fig = Figure(data = data, layout = layout)
py.iplot(fig)

PlotlyError: ignored

### 1 - Calculamos la descomposición de valores y vectores propios
#### a) Matriz de Covarianza

$\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^{n}(x_{ij} - \overline{x}_j)(x_{ik} - \overline{x}_k)$

$\Sigma = \frac{1}{n-1}((X - \overline{x})^T(X - \overline{x}))$

$\overline{x} = \frac{1}{n}\sum_{i=1}^n x_i \in \mathbb R^m$

In [12]:
import numpy as np

In [13]:
# Column-wise mean of the standardised data.
mean_vect = X_std.mean(axis = 0)
mean_vect

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [15]:
# Sample covariance by hand: centred data transposed times centred data, divided by (n - 1).
cov_matrix = np.dot((X_std - mean_vect).T, X_std - mean_vect) / (len(X_std) - 1)
cov_matrix

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [16]:
# Cross-check with NumPy's estimator; rowvar=False treats each column as one variable.
np.cov(X_std, rowvar = False)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [17]:
# Eigen-decomposition of the covariance matrix; the eigenvectors are the columns of eig_vectors.
eig_vals, eig_vectors = np.linalg.eig(cov_matrix)
(eig_vals, eig_vectors)

(array([2.93808505, 0.9201649 , 0.14774182, 0.02085386]),
 array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
        [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
        [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
        [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]]))

#### b) Usando la Matriz de Correlaciones

In [18]:
# The correlation matrix is the covariance matrix normalised by the standard deviations.
# rowvar=False treats each column of X_std as one variable.
corr_matrix = np.corrcoef(X_std, rowvar = False)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [19]:
# Eigen-decomposition of the correlation matrix; this overwrites the covariance-based results.
eig_vals, eig_vectors = np.linalg.eig(corr_matrix)
(eig_vals, eig_vectors)

(array([2.91849782, 0.91403047, 0.14675688, 0.02071484]),
 array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
        [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
        [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
        [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]]))

In [20]:
# Correlation coefficients are scale-free (always within [-1, 1]), so the correlation matrix
# of the raw data X equals the one computed from the standardised data.
corr_matrix = np.corrcoef(X, rowvar = False)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

#### c) Singular Value Decomposition

In [21]:
# SVD of the transposed standardised data (features x samples).
# u: left singular vectors as columns; they match the eigenvectors above up to sign.
# s: singular values, not eigenvalues; s**2 / (n - 1) gives the covariance eigenvalues.
# w: right singular vectors as rows.
u, s, w = np.linalg.svd(np.transpose(X_std))
(u, s, w)

(array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
        [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
        [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
        [-0.56485654, -0.06694199, -0.63427274,  0.52359713]]),
 array([20.92306556, 11.7091661 ,  4.69185798,  1.76273239]),
 array([[ 1.08239531e-01,  9.94577561e-02,  1.12996303e-01, ...,
         -7.27030413e-02, -6.56112167e-02, -4.59137323e-02],
        [-4.09957970e-02,  5.75731483e-02,  2.92000319e-02, ...,
         -2.29793601e-02, -8.63643414e-02,  2.07800179e-03],
        [ 2.72186462e-02,  5.00034005e-02, -9.42089147e-03, ...,
         -3.84023516e-02, -1.98939364e-01, -1.12588405e-01],
        ...,
        [ 5.43380310e-02,  5.12936114e-03,  2.75184277e-02, ...,
          9.89532683e-01, -1.41206665e-02, -8.30595907e-04],
        [ 1.96438400e-03,  8.48544595e-02,  1.78604309e-01, ...,
         -1.25488246e-02,  9.52049996e-01, -2.19201906e-02],
        [ 2.46978090e-03,  5.83496936e-03,  1

### 2 - Las componentes principales

In [22]:
 # Every eigenvector must have unit length.
 # np.linalg.eig returns the eigenvectors as COLUMNS, so iterate over the transpose;
 # looping over the matrix itself would measure its rows instead.
 for eig_vector in eig_vectors.T:
   print('La longitud del vector propio es', np.linalg.norm(eig_vector))

La longitud del vector propio es 0.9999999999999994
La longitud del vector propio es 1.0
La longitud del vector propio es 1.0
La longitud del vector propio es 1.0000000000000002


In [23]:
# Pair the absolute value of each eigenvalue with its eigenvector (the matching column).
# The eigenvalues happened to come out ordered, but that is not guaranteed.
eigen_pairs = [(np.abs(val), eig_vectors[:, idx]) for idx, val in enumerate(eig_vals)]
eigen_pairs

[(2.918497816531994,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680692,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557131487,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642861998,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [24]:
# Sort the (eigenvalue, eigenvector) pairs from the largest to the smallest eigenvalue.
# Compare on the eigenvalue only: plain tuple comparison falls through to the ndarray when two
# eigenvalues tie, and comparing arrays raises "truth value of an array ... is ambiguous".
eigen_pairs.sort(key = lambda pair: pair[0], reverse = True)
eigen_pairs

[(2.918497816531994,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680692,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557131487,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642861998,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [27]:
# List the eigenvalue of every pair, in the order produced by the sort above.
print('Valores propios en orden descendente:')
for eig_val, _eig_vec in eigen_pairs:
  print(eig_val)

Valores propios en orden descendente:
2.918497816531994
0.9140304714680692
0.14675687557131487
0.02071483642861998


In [32]:
# Each eigenvalue is the variance captured along its principal component,
# so adding them up gives the total variance to be explained.
total_sum = sum(val for val in eig_vals)
total_sum

3.999999999999998

In [36]:
# Percentage of the total variance explained by each component, largest first,
# followed by the running (cumulative) percentage.
var_exp = [(val / total_sum) * 100 for val in sorted(eig_vals, reverse = True)]
cum_var_exp = np.cumsum(var_exp)
(var_exp, cum_var_exp)

([72.96244541329989,
  22.850761786701742,
  3.6689218892828737,
  0.5178709107154997],
 array([ 72.96244541,  95.8132072 ,  99.48212909, 100.        ]))

In [40]:
# Bars show the variance explained by each component; the line shows the cumulative total.
plot1 = Bar(showlegend = False,
            y = var_exp,
            x = ['CP %s' %i for i in range(1, 5)])
plot2 = Scatter(name = '% de Varianza Explicada Acumulada',
                showlegend = True,
                y = cum_var_exp,
                x = ['CP %s' %i for i in range(1,5)])
data = Data([plot1, plot2])
layout = Layout(title = 'Porcentaje de variabilidad explicada por cada componente principal',
                xaxis = XAxis(title = 'Componentes principales'),
                yaxis = YAxis(title = '% de Varianza Explicada'))
fig = Figure(layout = layout, data = data)
py.iplot(fig)

PlotlyError: ignored

In [45]:
# Two components are kept; per the cumulative percentages they explain about 95.8% of the variance.
# Each of the first two eigenvectors is reshaped into a 4x1 column and the columns are
# placed side by side, giving the 4x2 projection matrix W.
W = np.hstack([eigen_pairs[k][1].reshape(4, 1) for k in range(2)])
W

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [57]:
# Alternative to the hstack construction: stack the two eigenvectors as rows, then transpose.
np.array([eigen_pairs[0][1].tolist(), eigen_pairs[1][1].tolist()]).T

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [47]:
# The inner dimensions must agree for the product with W to be defined.
(X.shape, W.shape)

((150, 4), (4, 2))

### 3 - Proyectando las variables en el nuevo subespacio vectorial

$Y = X_{\text{std}} \cdot W, \quad X_{\text{std}} \in M(\mathbb R)_{150, 4}, \quad W \in M(\mathbb R)_{4, 2}, \quad Y \in M(\mathbb R)_{150, 2}$

In [64]:
# Project the standardised samples onto the two retained components.
Y = np.dot(X_std, W)
Y

array([[-2.26470281, -0.4800266 ],
       [-2.08096115,  0.67413356],
       [-2.36422905,  0.34190802],
       [-2.29938422,  0.59739451],
       [-2.38984217, -0.64683538],
       [-2.07563095, -1.48917752],
       [-2.44402884, -0.0476442 ],
       [-2.23284716, -0.22314807],
       [-2.33464048,  1.11532768],
       [-2.18432817,  0.46901356],
       [-2.1663101 , -1.04369065],
       [-2.32613087, -0.13307834],
       [-2.2184509 ,  0.72867617],
       [-2.6331007 ,  0.96150673],
       [-2.1987406 , -1.86005711],
       [-2.26221453, -2.68628449],
       [-2.2075877 , -1.48360936],
       [-2.19034951, -0.48883832],
       [-1.898572  , -1.40501879],
       [-2.34336905, -1.12784938],
       [-1.914323  , -0.40885571],
       [-2.20701284, -0.92412143],
       [-2.7743447 , -0.45834367],
       [-1.81866953, -0.08555853],
       [-2.22716331, -0.13725446],
       [-1.95184633,  0.62561859],
       [-2.05115137, -0.24216355],
       [-2.16857717, -0.52714953],
       [-2.13956345,

In [67]:
# Scatter plot of the projected samples (first vs. second component), one trace per species.
results = []

for name in ('setosa', 'versicolor', 'virginica'):
  # Boolean mask on the label vector picks the rows of Y that belong to this species.
  result = Scatter(x = Y[y == name, 0], y = Y[y == name, 1], mode = 'markers', name = name, marker = Marker(size = 12, line = Line(color = 'rgba(220, 220, 220, 0.15)', width = 0.5), opacity = 0.8 ))
  results.append(result)

data = Data(results)
# Scatter traces are 2-D, so the axis titles belong on the layout's own xaxis/yaxis.
# `scene` only configures the axes of 3-D traces, so titles placed there never show on this plot.
layout = Layout(showlegend = True,
                xaxis = XAxis(title = 'Componente principal 1'),
                yaxis = YAxis(title = 'Componente principal 2'))
fig = Figure(data = data, layout = layout)
py.iplot(fig)

PlotlyError: ignored