# Principal Component Analysis
This includes import files and basic cleaning required for Principal Component Analysis. We remove the categorical variables and include only the quantitative variables.

In [13]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [40]:
cols = [('latitude'), ('longitude'), ('depth'), ('Gap'), ('dmin'), ('rms'), 
    ('horizontalError'), ('depthError'), ('magError'), ('magNST')]
X = pd.read_csv('Xpca.csv', names=cols)  
Y = pd.read_csv('Y_query.csv')
#Y.astype(float)
#X.dtypes
X[cols] = X[cols].apply(pd.to_numeric, axis=1, errors='coerce')
X = X.drop(X.index[0])
#X.head()
#X.dtypes
#X.fillna(0)

all_inf_or_nan = X.isin([np.inf, -np.inf, np.nan]).all(axis='columns')
#X[~all_inf_or_nan]

X = X.fillna(value=0) # null => 0
#assert that there are no missing values
assert pd.notnull(X).all().all()
#X.info(null_counts=True)
#Y.dtypes

Split testing data from training data.

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)  
#print(X)
#X.dtypes

Standardizing the input variables to have uniformity 

In [5]:
scaler = StandardScaler()
# Fit on training set only
scaler.fit(X_train)
# Apply transform to both the training set and the test set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

PCA is performed using Sklearn package. Shape of transformed variable is also printed.

In [44]:
pca = PCA()  
X_train = pca.fit_transform(X_train)  
X_test = pca.transform(X_test)  
#print(pca.n_components_)
#print(fit.components_)
#print(X.feature_names)
#print(X.shape)
#print(X_test.shape)
#print(X_test)
#print(X)

Explained variance and explained variance ratio are calculated

In [7]:
#ex_variance = np.var(X_test, axis=0)
#ex_variance_ratio = ex_variance / np.sum(ex_variance)
#print(ex_variance_ratio)
#print(ex_variance)

[0.23038235 0.17982504 0.13206089 0.10831807 0.09987504 0.08549434
 0.05002188 0.04506632 0.03783222 0.03112384]
[2.34254575 1.82847505 1.34280549 1.1013866  1.01553725 0.86931314
 0.50862643 0.45823787 0.38468096 0.31646965]


PCA Components and expalined variance are found

In [45]:
# access values and vectors
#print(pca.components_)
#print(pca.explained_variance_)
# transform data
#B = pca.transform(X)
#print(B)

Number of Principal Components 

In [9]:
#print(pca.n_components_)

10


Data is Centerized

In [10]:
X_tilde = X - np.mean(X, axis=0)  
#X_tilde = X - np.mean(X, axis=1)

#assert X.shape == X_tilde.shape
print('Shape of the centerized data:', X_tilde.shape)

Shape of the centerized data: (8582, 10)


Covariance matrix, Eigen Vectors, U Calculation

In [15]:
# Covariance matrix
covmat = np.dot(X_tilde.T, X_tilde) / X.shape[1]

# Compute u'_i, which is stored in the variable v
w, v = np.linalg.eig(covmat)

# Compute u_i from u'_i, and store it in the variable U
#U = np.dot(X_tilde,v)
U = np.dot(X_tilde, v[0:v.shape[1]])
           
# Normalize u_i, i.e., each column of U
U /= (np.linalg.norm(U, axis=0))

# Evaluate eigenvalues
ratios = w / np.sum(w)
print('PC1 explains {}% of the total variance'.format(ratios[0]))
print('PC2 explains {}% of the total variance'.format(ratios[1]))
print('First 100 PCs explains {}% of the total variance\n'.format(sum(ratios[:100])))

# Evaluate U
print('Shape of U:\n', U.shape)
print('First 5 elements of first column of U:\n', U[:5, 0])
print('First 5 elements of last column of U:\n', U[:5, -1])

PC1 explains 0.5570657818007323% of the total variance
PC2 explains 0.2884794436333058% of the total variance
First 100 PCs explains 0.9999999999999998% of the total variance

Shape of U:
 (8582, 10)
First 5 elements of first column of U:
 [ 0.01579272  0.01369365  0.0377011  -0.00403405  0.00303744]
First 5 elements of last column of U:
 [ 0.00357306  0.0001198  -0.01514307  0.00513204 -0.00469218]
