In [52]:
import numpy as np
from sklearn.decomposition import PCA
from bokeh.plotting import figure, output_notebook, show
from bokeh.palettes import Category10
# Notebook-wide configuration.
N = 50  # sample size used for the 2-D example below
np.set_printoptions(suppress=True, precision=3)  # 3 decimals, no scientific notation
output_notebook()  # send Bokeh output to the notebook

## Simple Example

In [53]:
# Draw N points from a correlated 2-D Gaussian and plot them.
# A locally seeded generator is used so that the sample -- and every number
# derived from it in the following cells -- is identical on each run and on
# each re-execution of this cell.
SEED = 0
rng = np.random.default_rng(SEED)
x = rng.multivariate_normal([0, 0], [[2, -2], [-2, 6]], size=N)
x = x - x.mean(axis=0)  # centre the sample on the origin
p = figure(title='Random sample from multivariate normal distribution with principal components', match_aspect=True)
p.scatter(x=x[:, 0], y=x[:, 1])


In [54]:
# Eigen-decomposition of x^T x for the centred 2-D sample.
q = x.T @ x
# NOTE: np.linalg.eigh returns (eigenvalues, eigenvectors). In this cell V
# therefore holds the eigenvalues and L holds the eigenvectors as columns,
# which is the opposite of the naming used in the "Dimensionality Reduction"
# section further down.
V, L = np.linalg.eigh(q)
print(V, L, sep='\n')

[ 53.746 293.585]
[[-0.951 -0.309]
 [-0.309  0.951]]


In [55]:
# Points along each column of L, scaled by a common parameter, so that each
# direction can be drawn as a straight segment through the origin.
xaxis = np.linspace(-4, 4, num=10)
ux, uy = (xaxis * component for component in L[:, 0])  # first column of L
vx, vy = (xaxis * component for component in L[:, 1])  # second column of L

In [56]:
# Overlay both directions on the existing scatter figure and render it.
for axis_x, axis_y in ((ux, uy), (vx, vy)):
    p.line(x=axis_x, y=axis_y, color='green')
show(p)

## Dimensionality Reduction

In [57]:
# Read the comma-separated data file into a 2-D array and show its shape.
data = np.loadtxt("data/simulated_pca_data.csv", delimiter=',')
data.shape

(200, 16)

In [58]:
# Subtract each column's mean, then form data^T data from the centred array.
column_means = data.mean(axis=0)
data = data - column_means
Q = data.T @ data

In [59]:
# Scatter plot of column 0 against column 1 of the centred data.
first_feature, second_feature = data[:, 0], data[:, 1]
p = figure(title="First two features of the data")
p.scatter(x=first_feature, y=second_feature)
show(p)

In [60]:
# Eigen-decomposition of the symmetric matrix Q.
# L: eigenvalues in ascending order; V: matching eigenvectors as columns.
# This rebinds the names L and V that the "Simple Example" section used.
L, V = np.linalg.eigh(Q, UPLO='L')

In [61]:
# Line plot of the eigenvalues against their position in L.
eigenvalue_index = list(range(len(L)))
p = figure(title="Eigenvalues")
p.line(x=eigenvalue_index, y=L)
show(p)

In [62]:
# NOTE(review): this first expression is evaluated but never shown -- a cell
# only echoes its last expression.
V.shape
# Last two columns of V, i.e. the eigenvectors paired with the two largest
# eigenvalues (np.linalg.eigh returns eigenvalues in ascending order).
V[:,-2:]

array([[ 0.152, -0.116],
       [-0.017,  0.081],
       [-0.162, -0.122],
       [ 0.057, -0.089],
       [ 0.261,  0.   ],
       [-0.398,  0.01 ],
       [-0.038, -0.06 ],
       [-0.249, -0.043],
       [ 0.247,  0.212],
       [ 0.167, -0.162],
       [-0.203, -0.048],
       [ 0.466, -0.222],
       [ 0.173, -0.239],
       [ 0.305,  0.097],
       [ 0.43 ,  0.258],
       [-0.043,  0.834]])

In [63]:
# Project the centred data onto the last two columns of V.
# Because np.linalg.eigh sorts eigenvalues in ascending order, column 0 of the
# result belongs to the second-largest eigenvalue and column 1 to the largest.
# NOTE(review): this assignment rebinds the name PCA, which the first cell
# imported from sklearn.decomposition; the class is unreachable afterwards.
top_two = V[:, -2:]
PCA = data @ top_two

In [64]:
# Scatter plot of the two projected coordinates on fixed, symmetric axes.
plot_limits = (-3, 3)
p = figure(title="First Two Principal Components", x_range=plot_limits, y_range=plot_limits)
p.scatter(x=PCA[:, 0], y=PCA[:, 1])
show(p)

In [74]:

# Projection scatter with the loadings of the first 8 features drawn on top.
#
# The scatter coordinates come from PCA = data @ V[:, -2:], so the loading of
# feature i in this plane is the pair (V[i, -2], V[i, -1]). The loadings must
# be read from those same two columns; columns 0 and 1 of V belong to the two
# SMALLEST eigenvalues (np.linalg.eigh sorts ascending) and do not describe
# the plotted axes.
#
# Each loading is drawn as a segment from the origin to the scaled loading
# vector, which keeps both its direction (sign) and its relative length, and
# avoids dividing by a component that may be zero.
N_LOADINGS = 8      # number of leading features to draw; also the palette size
LOADING_SCALE = 3   # a unit-length loading reaches the edge of the (-3, 3) axes

p = figure(title="First Two Principal Components with Loadings from first 8 features", x_range=(-3, 3), y_range=(-3, 3))
p.scatter(x=PCA[:, 0], y=PCA[:, 1])
for i in range(N_LOADINGS):
    lx = [0, LOADING_SCALE * V[i, -2]]
    ly = [0, LOADING_SCALE * V[i, -1]]
    p.line(x=lx, y=ly, legend_label="feature {}".format(i), color=Category10[N_LOADINGS][i], line_width=3)
show(p)

In [66]:
# `colors` is not assigned anywhere else in this notebook, so on a fresh kernel
# this cell raised NameError. It is defined here with the value recorded in
# the saved output.
# NOTE(review): presumably these are the palette sizes that Category10 can be
# indexed by -- confirm, or delete this scratch cell if it is no longer needed.
colors = list(range(3, 11))
colors

[3, 4, 5, 6, 7, 8, 9, 10]

In [69]:
# Show the 8-colour entry of the Category10 palette (the one indexed by the
# loadings plot).
Category10[8]

('#1f77b4',
 '#ff7f0e',
 '#2ca02c',
 '#d62728',
 '#9467bd',
 '#8c564b',
 '#e377c2',
 '#7f7f7f')

In [75]:
# Full singular value decomposition of the centred data.
# full_matrices=True (the default) is spelled out: P is square in the number
# of rows of data, and the third result is square in the number of columns.
# NOTE(review): this rebinds Q, which previously held data.T @ data. Re-running
# the earlier np.linalg.eigh(Q) cell after this one would decompose the wrong
# matrix.
P, D, Q = np.linalg.svd(data, full_matrices=True)

In [76]:
# Shape of the first array returned by np.linalg.svd(data).
P.shape

(200, 200)

In [77]:
# Shape of the third array returned by np.linalg.svd(data). At this point Q is
# that array, not the earlier data.T @ data matrix of the same name.
Q.shape

(16, 16)

In [78]:
# Second array returned by np.linalg.svd(data): the singular values.
D


array([22.833, 19.594, 14.744, 11.131,  4.939,  4.622,  4.487,  4.457,
        4.302,  4.163,  4.096,  3.919,  3.829,  3.624,  3.293,  2.965])