# Principal Component Analysis

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Feature columns of Xpca.csv, in file order.
cols = ['latitude', 'longitude', 'depth', 'Gap', 'dmin', 'rms',
        'horizontalError', 'depthError', 'magError', 'magNST']

# Passing `names=cols` makes pandas load the first line of the file as an
# ordinary data row (presumably the header line - confirm against the CSV);
# that row is removed below.
X = pd.read_csv('Xpca.csv', names=cols)
Y = pd.read_csv('Y_query.csv')

# Convert column by column; anything that is not numeric becomes NaN.
# (Column-wise conversion avoids building one Series per row.)
X[cols] = X[cols].apply(pd.to_numeric, errors='coerce')
X = X.drop(X.index[0])

# Mask of rows made up entirely of inf/NaN values. It is computed for
# inspection only and is not applied to X in this cell.
all_inf_or_nan = X.isin([np.inf, -np.inf, np.nan]).all(axis='columns')

X = X.fillna(value=0)  # null => 0
assert pd.notnull(X).all().all()  # Assert that there are no missing values
# `show_counts` replaces the `null_counts` keyword, which was removed in pandas 2.0.
X.info(show_counts=True)
Y.dtypes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8582 entries, 1 to 8582
Data columns (total 10 columns):
latitude           8582 non-null float64
longitude          8582 non-null float64
depth              8582 non-null float64
Gap                8582 non-null float64
dmin               8582 non-null float64
rms                8582 non-null float64
horizontalError    8582 non-null float64
depthError         8582 non-null float64
magError           8582 non-null float64
magNST             8582 non-null float64
dtypes: float64(10)
memory usage: 737.5 KB


mag    float64
dtype: object

In [3]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

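Standardize the features to zero mean and unit variance so that no single feature dominates the principal components because of its scale. The scaler is fit on the training set only and then applied to both sets.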

In [4]:
# Learn the scaling parameters from the training set only
scaler = StandardScaler().fit(X_train)
# Apply that same scale to both the training and the test set
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Perform PCA using scikit-learn and print the shape of the transformed data.

In [5]:
# PCA() is created without arguments; it is fitted on the training set and
# the fitted model is then reused to project the test set.
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Report the shape of the full raw frame next to the shape of the projected test set
for _label, _table in (("Old shape:", X), ("New shape:", X_test)):
    print(_label, _table.shape)

Old shape: (8582, 10)
New shape: (1717, 10)


In [6]:
# Variance of the projected test set along each principal component ...
ex_variance = X_test.var(axis=0)
# ... and each component's share of the total of those variances
ex_variance_ratio = ex_variance / ex_variance.sum()
print(ex_variance_ratio)

[0.23038235 0.17982504 0.13206089 0.10831807 0.09987504 0.08549434
 0.05002188 0.04506632 0.03783222 0.03112384]


In [7]:
# Inspect the fitted PCA: component vectors and the variance along each of them
print("PCA Components\n", pca.components_)
print("\nExplained Variance\n", pca.explained_variance_)
# The PCA was fitted on standardized features, so the raw frame has to go
# through the same fitted scaler before it can be projected.
print("\nTransform data\n", pca.transform(scaler.transform(X)))

PCA Components
 [[-0.12683304  0.25710222  0.23004825  0.44442858  0.24055806 -0.25210326
   0.50366723  0.41845215  0.27810982 -0.19955123]
 [-0.04748508  0.3180444   0.33899192 -0.24531625  0.17510828 -0.16686814
   0.19695073  0.06207758 -0.55811845  0.55574278]
 [-0.46605003  0.13241605 -0.5172654   0.06143812  0.47088956 -0.08623054
   0.16035993 -0.47900257 -0.08930326 -0.0402332 ]
 [-0.53410117 -0.54371934  0.35655066 -0.43707055  0.16876909  0.10219045
   0.13466239  0.12027561  0.07303658 -0.15519589]
 [ 0.38969757 -0.1450206   0.02402129  0.11898722  0.54035309  0.69281142
   0.17280818  0.03384374 -0.03729166  0.08945386]
 [ 0.45421554 -0.45084226 -0.01840439 -0.0587368   0.39448784 -0.63714903
  -0.04445055 -0.09031406  0.08995233  0.08268809]
 [-0.25120076 -0.35418036 -0.30084315  0.31630195 -0.14393185  0.03739132
   0.01418855  0.27694621  0.17514169  0.70086468]
 [-0.05942838  0.01714044 -0.37726782 -0.08066831  0.23219949 -0.04985425
  -0.35673355  0.67093888 -0.389662

In [8]:
# Number of principal components held by the fitted PCA object
pca.n_components_

10

In [9]:
# Centered data: subtract each column's mean from the raw (unscaled) frame
X_tilde = X - X.mean(axis=0)

X_tilde.shape

(8582, 10)

In [10]:
# Manual PCA on the centered data in X_tilde.
X_centered = np.asarray(X_tilde)

# Sample covariance matrix (features x features). The normalizer is the
# number of samples minus one, not the number of features.
covmat = np.dot(X_centered.T, X_centered) / (X_centered.shape[0] - 1)

# Eigen values, Eigen vectors (u'_i).
# covmat is symmetric, so eigh is the appropriate solver: it returns real
# eigenvalues in ascending order. Reorder them so that PC1 is the direction
# with the largest variance.
w, v = np.linalg.eigh(covmat)
order = np.argsort(w)[::-1]
w = w[order]
v = v[:, order]

# u_i: projection of the centered data onto each eigenvector
U = np.dot(X_centered, v)

# Normalize u_i (each column is scaled to unit Euclidean norm)
U /= np.linalg.norm(U, axis=0)

# Evaluate Eigen values: share of the total variance carried by each component
percents = w / np.sum(w) * 100
for pc in range(0, percents.shape[0]):
    print('PC{} explains {}% of the total variance'.format(pc + 1, percents[pc]))
print('First {} PCs explains {}% of the total variance\n'.format(percents.shape[0], sum(percents)))

# Evaluate U
print('Shape of U:\n', U.shape)
print('First 5 elements of first column of U:\n', U[:5, 0])
print('First 5 elements of last column of U:\n', U[:5, -1])

PC1 explains 55.70657818007323% of the total variance
PC2 explains 28.84794436333058% of the total variance
PC3 explains 10.978440765551637% of the total variance
PC4 explains 3.5617322922138643% of the total variance
PC5 explains 0.7706529851187598% of the total variance
PC6 explains 0.0660278360961915% of the total variance
PC7 explains 0.042811930244362514% of the total variance
PC8 explains 0.025472501699184566% of the total variance
PC9 explains 0.00032478390961535774% of the total variance
PC10 explains 1.43617625688934e-05% of the total variance
First 10 PCs explains 100.0% of the total variance

Shape of U:
 (8582, 10)
First 5 elements of first column of U:
 [ 0.01579272  0.01369365  0.0377011  -0.00403405  0.00303744]
First 5 elements of last column of U:
 [ 0.00357306  0.0001198  -0.01514307  0.00513204 -0.00469218]
