# What is PCA?

<div  style="color:blue;font-family:Candara,arial,helvetica;line-height:20px"><strong>

## Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components.
<img src="https://image.slidesharecdn.com/81ac93fsyq9qa8sdsvoq-signature-7f7e9932c1ac6134e9030bbdd4cfa1c5606f58d572ef5d7da58609957cc1fc67-poli-160617131027/95/neural-networks-principal-component-analysis-pca-8-638.jpg?cb=1467151737" alt="drawing" width="600" height="300"/>  
    
### Step 1 Centre the data
### Step 2 Create Variance/Covariance matrix
### Step 3 Project vector doing matrix transformation
### Step 4 Compute the eigenvectors and eigenvalues and choose the eigenvector with the highest eigenvalue as the first principal component

</strong></div> 

# Model Implementation without PCA

In [2]:
# ----------------------------------------------------------------------
# Implement Principal Component Analysis (PCA) for the Breast Cancer
# prediction and compare results
# ----------------------------------------------------------------------

# Imports used by this cell (later cells reuse train_test_split,
# RandomForestClassifier and confusion_matrix from here).
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Load the dataset and create X (features) and Y (one-column 'type' target)
lbc = load_breast_cancer()

X = pd.DataFrame(lbc['data'], columns=lbc['feature_names'])
Y = pd.DataFrame(lbc['target'], columns=['type'])

# --------------------------------------
# Perform the prediction Without PCA
# --------------------------------------

# Split the dataset into train and test (30% held out, stratified on Y;
# random_state fixed so the split is reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234, stratify=Y
)

# Default Random Forest object.
# Y_train is a one-column DataFrame; flatten it to 1-D for fit(), otherwise
# scikit-learn warns that a column-vector y was passed where a 1-D array
# was expected.
rfc1 = RandomForestClassifier(random_state=1234)
rfc1.fit(X_train, Y_train.values.ravel())
Y_predict1 = rfc1.predict(X_test)

# Score and evaluate the model on the held-out test set
cm1 = confusion_matrix(Y_test, Y_predict1)
score1 = rfc1.score(X_test, Y_test)

print(cm1)
print(score1)

[[ 61   3]
 [  4 103]]
0.9590643274853801




# Transform the features into principal components

In [3]:
# Standardize the data so every feature has mean zero
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

# Check the mean of the centered data. A bare expression in the middle of a
# cell is evaluated and thrown away, so assert the invariant instead
# (tolerance allows for floating-point round-off).
assert abs(X_scaled.mean(axis=0)).max() < 1e-9, "scaled features are not centered"

# Import PCA and fit the data to create PCAs (keep the first 5 components)
# NOTE(review): both the scaler and the PCA are fit on every row of X,
# including rows a later cell holds out as the test set, so the PCA score
# reported there may be optimistic -- consider fitting on the training
# split only.
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

In [6]:
# Show the standardized feature array produced by the scaler
display(X_scaled)

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [7]:
# Show the PCA-transformed data (one column per retained component)
display(X_pca)

array([[ 9.19283683,  1.94858305, -1.12316523,  3.63372578, -1.19511579],
       [ 2.3878018 , -3.76817174, -0.52929287,  1.1182647 ,  0.62177604],
       [ 5.73389628, -1.07517379, -0.55174812,  0.91208462, -0.1770836 ],
       ...,
       [ 1.25617928, -1.90229671,  0.56273034, -2.08922603,  1.80999247],
       [10.37479406,  1.67201011, -1.87702957, -2.35602936, -0.03374011],
       [-5.4752433 , -0.67063679,  1.49044322, -2.29915734, -0.18470358]])

# Model Implementation with PCA

In [8]:
# Split the PCA-transformed data into train and test, with the same
# test_size / random_state / stratify settings as the split without PCA.
# Note: this rebinds X_train, X_test, Y_train and Y_test to the PCA split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_pca, Y, test_size=0.3, random_state=1234, stratify=Y
)

# Default Random Forest object.
# Y_train is a one-column DataFrame; flatten it to 1-D for fit(), otherwise
# scikit-learn warns that a column-vector y was passed where a 1-D array
# was expected.
rfc2 = RandomForestClassifier(random_state=1234)
rfc2.fit(X_train, Y_train.values.ravel())
Y_predict2 = rfc2.predict(X_test)

# Score and evaluate the model using the transformed data
cm2 = confusion_matrix(Y_test, Y_predict2)
score2 = rfc2.score(X_test, Y_test)

  


In [9]:
# Report the confusion matrix and the accuracy for the PCA model,
# one per line
print(cm2, score2, sep="\n")

[[ 61   3]
 [  0 107]]
0.9824561403508771
