# Principal component analysis
Unsupervised dimensionality reduction

## Method 1: Using NumPy

### Load data, split data, standardization

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt

# Load the wine dataset that ships with scikit-learn.
dataObj = load_wine()
X, y = dataObj.data, dataObj.target

# Tabular view of the features, with the class label as the first column.
df = pd.DataFrame(X, columns=dataObj.feature_names)
df.insert(0, "Class", y)

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

### Standardize the original dataset

In [None]:
# Learn the per-feature mean and scale from the training split only, then
# apply that same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Show the standardized training data as a table.
dfOri = pd.DataFrame(data=X_train_std)
display(dfOri)

### Construct covariance matrix

In [None]:
# np.cov treats each row as a variable by default, so pass the transposed
# array (features along rows, samples along columns).
# Note: np.cov normalizes by N - 1 by default, so the diagonal is close to,
# but not exactly, 1 for data standardized with the population std.
cov_mat = np.cov(X_train_std.T)

dfCov = pd.DataFrame(data=cov_mat)
display(dfCov)

In [None]:
import seaborn as sns

# Pairwise correlations between the standardized training features.
corr = dfOri.corr()

# Diverging colormap with the colour scale pinned to [-1, 1].
palette = sns.diverging_palette(220, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap=palette, square=True, ax=ax, vmin=-1, vmax=1)

### Eigendecomposition of the covariance matrix.

In [None]:
# A covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh is designed for symmetric/Hermitian input, always returns real
# eigenvalues, and is numerically more stable. eig can return complex
# results with tiny imaginary parts caused by round-off.
# eigh returns the eigenvalues in ascending order; each eigenvector
# (a column of eigen_vecs) is only determined up to its sign.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)

In [None]:
# Order the eigenpairs from largest to smallest eigenvalue: argsort gives
# ascending indices, so reverse them.
idx = np.argsort(eigen_vals)[::-1]

# Apply the same permutation to the eigenvalues and to the eigenvector
# columns so that each pair stays aligned.
eigen_vals, eigen_vecs = eigen_vals[idx], eigen_vecs[:, idx]



In [None]:
# Eigenvectors: one per column, in the same order as eigen_vals.
display(pd.DataFrame(data=eigen_vecs))

In [None]:
# Eigenvalues, shown as a single-row table (one column per eigenvalue).
display(pd.DataFrame(data=eigen_vals[np.newaxis, :]))

### Total and explained variance

In [None]:
# Total variance is the sum of all eigenvalues. Use the vectorized ndarray
# sum rather than the Python builtin, which iterates element by element.
tot = eigen_vals.sum()

# Fraction of the total variance captured by each component, and the
# running total over the components.
var_explained = eigen_vals / tot
cum_var_explained = np.cumsum(var_explained)



In [None]:
# 1-based component numbers derived from the data, so the plot keeps
# working if the number of features changes (previously hard-coded to 13).
component_ids = range(1, len(var_explained) + 1)

# Bars: variance ratio of each component; step line: cumulative ratio.
plt.bar(component_ids, var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_ids, cum_var_explained, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/pca1.png', dpi=300)
plt.show()

### Transformation matrix, W

In [None]:
# W holds the first two eigenvector columns. Integer-array indexing returns
# a copy, so W does not alias eigen_vecs.
w = eigen_vecs[:, np.arange(2)]
display(pd.DataFrame(data=w))

### Transforming training data

In [None]:
# Project the standardized training samples onto the columns of W.
X_train_pca = np.dot(X_train_std, w)
display(pd.DataFrame(data=X_train_pca))

### Visualizing training data

In [None]:
# plot_reduced_dim comes from the project-local PlotFunction3 module.
# NOTE(review): presumably it plots the 2-D projection coloured by class
# label, with the third argument used as a title/label; confirm against
# PlotFunction3.
from PlotFunction3 import plot_reduced_dim
plot_reduced_dim(X_train_pca, y_train, "PCA")

### Note
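- The transformation matrix $W$ may come out with the signs of some of its columns flipped; the projected data are then simply mirrored along the corresponding axes.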
- If $v$ is an eigenvector of a matrix $\Sigma$, we have $\Sigma v = \lambda v,$ where $\lambda$ is our eigenvalue.
- Then $-v$ is also an eigenvector that has the same eigenvalue, since $\Sigma(-v) = -\Sigma v = -\lambda v = \lambda(-v).$

In [None]:
# Re-project the training data with each possible sign flip of the two
# eigenvectors: first column only, second column only, then both.
for signs in ((-1, 1), (1, -1), (-1, -1)):
    # Multiplying column-wise by +/-1 builds a flipped copy; w is untouched.
    w2 = w * np.array(signs)
    X_train_pca = np.dot(X_train_std, w2)
    plot_reduced_dim(X_train_pca, y_train, "PCA")



## Method 2: Using scikit-learn

In [None]:
from sklearn.decomposition import PCA

# n_components=None (the default) keeps every component, which is what we
# need to inspect the full explained-variance spectrum.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_std)

# Fraction of the variance explained by each component.
print(pca.explained_variance_ratio_)

In [None]:
# 1-based component numbers derived from the fitted model, so the plot
# keeps working if the number of components changes (previously
# hard-coded to 13).
ratios = pca.explained_variance_ratio_
component_ids = range(1, len(ratios) + 1)

# Bars: variance ratio of each component; step line: cumulative ratio.
plt.bar(component_ids, ratios, alpha=0.5, align='center')
plt.step(component_ids, np.cumsum(ratios), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.show()

In [None]:
# Refit PCA keeping only the first two principal components.
pca = PCA(n_components=2)
# The components are learned from the training data only ...
X_train_pca = pca.fit_transform(X_train_std)
# ... and the test data is projected with that already-fitted model.
X_test_pca = pca.transform(X_test_std)

In [None]:
# Plot the two-component scikit-learn projection of the training data using
# the plot_reduced_dim helper imported earlier from PlotFunction3.
plot_reduced_dim(X_train_pca, y_train, "PCA")

## Training with logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (default settings) on the
# PCA-projected training data; fit() returns the fitted estimator itself.
lr = LogisticRegression().fit(X_train_pca, y_train)

In [None]:
# plot_decision_surface2 comes from the project-local PlotFunction2 module.
# NOTE(review): presumably it draws the classifier's decision regions
# together with the train and test points; confirm against PlotFunction2.
from PlotFunction2 import plot_decision_surface2

plot_decision_surface2(X_train_pca, X_test_pca, y_train, y_test, lr)