# Example Computations

In [None]:
# Numerical and tabular data libraries.
import numpy as np
import pandas as pd

# Plotting. jtplot.style is applied once here with ticks and grid enabled
# and '--' gridlines, before any figure is created in the cells below.
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style(ticks=True, grid=True, gridlines='--')

# scikit-learn estimators (KMeans is imported but not used in the visible cells).
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## Prepping a Toy Dataset

In [None]:
# Load the credit-card CSV and use the CUST_ID column as the row index.
creditCardDf = pd.read_csv('./data/CC GENERAL.csv', index_col='CUST_ID')

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
creditCardDf.shape

In [None]:
# Per-column summary: the dtype of each column next to its count of nulls.
pd.concat(
    [
        creditCardDf.dtypes.rename('type'),
        creditCardDf.isna().sum().rename('num_null'),
    ],
    axis=1,
)

In [None]:
# Fill missing values: 0 for MINIMUM_PAYMENTS, the column mean for CREDIT_LIMIT.
#
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on creditCardDf[col]. That form is chained
# assignment on an intermediate column object: it emits a FutureWarning on
# recent pandas 2.x and, with Copy-on-Write enabled, leaves creditCardDf
# unchanged.
creditCardDf['MINIMUM_PAYMENTS'] = creditCardDf['MINIMUM_PAYMENTS'].fillna(0)
creditCardDf['CREDIT_LIMIT'] = creditCardDf['CREDIT_LIMIT'].fillna(
    creditCardDf['CREDIT_LIMIT'].mean()
)

## PCA

### Doing it Manually

In [None]:
# Extract the frame's contents as a plain NumPy array (rows = customers).
X = creditCardDf.to_numpy()

In [None]:
# Center the data: subtract each column's mean from that column.
X = X - np.mean(X, axis=0)

In [None]:
# Covariance matrix of the columns of X (X is expected to be mean-centered
# already). Dividing by n - 1 rather than n gives the unbiased sample
# covariance, which is the normalisation scikit-learn's PCA uses for
# explained_variance_; with n the eigenvalues would differ from sklearn's
# by a factor of n / (n - 1).
Sigma = X.T @ X / (X.shape[0] - 1)

In [None]:
# Eigen-decomposition of Sigma: eigValues holds the eigenvalues and the
# columns of eigVectors are the corresponding eigenvectors.
# NOTE(review): np.linalg.eig does not promise any particular ordering of the
# eigenvalues. Sigma is symmetric, so np.linalg.eigh plus an explicit sort
# may be a better fit here -- confirm before changing, since later cells
# discuss the resulting order.
eigValues, eigVectors = np.linalg.eig(Sigma)

In [None]:
# Plot the manually computed eigenvalues in the order eig returned them.
fig, ax = plt.subplots()
ax.plot(eigValues)

### Comparing to PCA package

In [None]:
# Fit scikit-learn's PCA (default settings) on the same matrix X.
pca = PCA()
pca = pca.fit(X)

In [None]:
# Overlay the manual eigenvalues (line) and sklearn's explained variances
# (points) on one set of axes for a visual comparison.
fig, ax = plt.subplots()
manualIdx = range(len(eigValues))
sklearnIdx = range(len(pca.explained_variance_))
ax.plot(manualIdx, eigValues, label='manual')
ax.scatter(sklearnIdx, pca.explained_variance_, label='sklearn')
ax.set_title('Eigenvalues are identical')
ax.legend()

In [None]:
# Heatmap of the element-wise absolute difference between sklearn's
# components (transposed so components are columns) and the manual eigenvectors.
fig, ax = plt.subplots()
absDiff = np.abs(pca.components_.T - eigVectors)
heatmap = ax.imshow(absDiff)
fig.colorbar(heatmap, ax=ax)
ax.set_title('Eigenvectors seem to mostly agree')

In [None]:
# Change-of-basis matrix between the two eigenvector sets: perm solves
#     eigVectors @ perm = pca.components_.T
# i.e. it expresses sklearn's components in the basis of the manual
# eigenvectors. np.linalg.solve is used instead of forming the explicit
# inverse and multiplying; it yields the same matrix with less round-off
# and less work.
perm = np.linalg.solve(eigVectors, pca.components_.T)

# Show the matrix as a heatmap with a colour scale.
plt.figure()
plt.imshow(perm)
plt.colorbar()
plt.title('They differ by a signed permutation matrix.')

I get the signed part, since the sign of an eigenvector is arbitrary.  Shouldn't the eigenvectors be more or less in the same order, though?  I guess one of the eigenspaces has dimension 2, which is surprising to me.  Oh, duh: the matrix is not full rank, so the eigenvectors in the kernel probably get assigned eigenvalues that are below the threshold for floating-point arithmetic error.  (Note also that `np.linalg.eig` does not guarantee that the eigenvalues it returns are sorted, which by itself can account for a permutation.)