## Weights

For every principal component (PC), PCA gives one weight per original feature. Each weight tells us how much that feature contributes to the PC: a PC score is the weighted sum of the centered feature values.

In [15]:
from sklearn.cluster import KMeans
import sklearn.datasets
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load Iris with pandas containers (as_frame=True) so columns keep their names.
dataset = sklearn.datasets.load_iris(as_frame=True)

# Full feature table: this is what PCA is fit on below.
X_all = dataset['data']

# Sepal-only subset of the same table, plus the target column.
X = X_all[['sepal length (cm)', 'sepal width (cm)']]
y = dataset['target']

# Fit PCA with default settings on every feature, then project the same
# rows onto the fitted components.
pca = PCA()
pca.fit(X_all)
X_pca = pca.transform(X_all)

In [16]:
# Convert the fitted components to a labelled DataFrame:
# one row per principal component, one column per original feature.
import pandas as pd

# Row labels are generated from the number of fitted components rather
# than hard-coded, so this cell does not raise a length-mismatch error
# when PCA is fit with a different number of components or features.
components = pd.DataFrame(
    pca.components_,
    columns=X_all.columns,
    index=[f'PC{i + 1}' for i in range(len(pca.components_))],
)

components


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
PC1,0.361387,-0.084523,0.856671,0.358289
PC2,0.656589,0.730161,-0.173373,-0.075481
PC3,-0.58203,0.597911,0.076236,0.545831
PC4,0.315487,-0.319723,-0.479839,0.753657


In [17]:
# Subtract each column's mean so every feature is centered on zero.
# Row 0 of this table supplies the inputs for the next cell's calculation.
X_all.sub(X_all.mean(), axis='columns')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.743333,0.442667,-2.358,-0.999333
1,-0.943333,-0.057333,-2.358,-0.999333
2,-1.143333,0.142667,-2.458,-0.999333
3,-1.243333,0.042667,-2.258,-0.999333
4,-0.843333,0.542667,-2.358,-0.999333
...,...,...,...,...
145,0.856667,-0.057333,1.442,1.100667
146,0.456667,-0.557333,1.242,0.700667
147,0.656667,-0.057333,1.442,0.800667
148,0.356667,0.342667,1.642,1.100667


In [18]:
# PC1 score for the first row, worked by hand: multiply each centered
# feature value by its PC1 weight and add the four products together.
# Values are copied (rounded) from the two tables displayed above.
(
    -0.743333 * 0.3613     # sepal length (cm)
    + 0.4426 * -0.0845     # sepal width (cm)
    + -2.358 * 0.8566      # petal length (cm)
    + -0.9993 * 0.3582     # petal width (cm)
)

-2.6837779729

In [19]:
# Manually calculating PCA with numpy
import numpy as np
# Center every column; the principal directions are the eigenvectors of
# the covariance matrix of the centered data.
nums = X_all - X_all.mean()

# A covariance matrix is symmetric, so use eigh rather than the general
# eig solver: eigh always returns real eigenvalues (in ascending order)
# with orthonormal eigenvectors. The sign of each eigenvector is
# arbitrary, so a column may come out negated relative to another
# solver's result.
vals, vecs = np.linalg.eigh(nums.cov())

# Positions that sort the eigenvalues ascending; reversed downstream to
# order the components from largest to smallest variance.
idxs = pd.Series(vals).argsort()

# Eigenvalues from largest to smallest = variance captured by PC1, PC2, ...
explained_variance = pd.Series(sorted(vals, reverse=True))

def set_columns(df_):
    """Relabel the columns of ``df_`` as PC1, PC2, ... and return it.

    The frame is modified in place and the same object is returned, so
    the function can be used as a step in a ``.pipe`` chain.
    """
    n_cols = len(df_.columns)
    df_.columns = ['PC' + str(k) for k in range(1, n_cols + 1)]
    return df_

# Eigenvectors as columns (rows labelled by feature), reordered so the
# largest-eigenvalue direction comes first, then named PC1, PC2, ...
comps = set_columns(
    pd.DataFrame(vecs, index=nums.columns).iloc[:, idxs[::-1]]
)

# Project the centered data onto the components to get the PC scores.
pcas = nums @ comps
pcas

Unnamed: 0,PC1,PC2,PC3,PC4
0,-2.684126,-0.319397,-0.027915,0.002262
1,-2.714142,0.177001,-0.210464,0.099027
2,-2.888991,0.144949,0.017900,0.019968
3,-2.745343,0.318299,0.031559,-0.075576
4,-2.728717,-0.326755,0.090079,-0.061259
...,...,...,...,...
145,1.944110,-0.187532,0.177825,0.426196
146,1.527167,0.375317,-0.121898,0.254367
147,1.764346,-0.078859,0.130482,0.137001
148,1.900942,-0.116628,0.723252,0.044595


In [22]:
# Use plotly to plot the first three components
import plotly.express as px
# Colour by species name instead of the integer target codes: plotly
# express draws numeric colour values on a continuous colour scale,
# while string values get one discrete colour and legend entry per class.
species = y.map(dict(enumerate(dataset['target_names']))).rename('species')
fig = px.scatter_3d(pcas, x='PC1', y='PC2', z='PC3', color=species,
                    title='Iris: first three principal components')
fig.show()