## Assignment 2:
### Visualizing High Dimensional Data
#### Kendall Stauffer (U0688677)

In [144]:
# importing important libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

### Question 1:
 - Implement PCA as a function and provide a commented version of it. Feel free to compute eigenvectors, eigenvalues etc. using numpy or other api functionality.
 - Plot (with a scatter plot) the iris dataset using your PCA implementation. Color each of the species differently. On a separate plot provide a scatter plot of the language api (sklearn) PCA for comparison.
 - Run K-Means on these results with k=2 and plot the results color according to cluster
 - Now create an alternate PCA function where you do not center the data. Using a scatter plot, show the results. Again, color each of the species differently.
 - What is the effect of neglecting to center the data? What type of data would not work well for PCA and why?
 

In [129]:
 def compute_PCA(data, scale_data = True):
        """
        Params data: A dataframe or array that contains the data for pca.
        Params scale_data: if set to True, data will be scaled with mean center scaling.
        Return: Returns a python dictionary that contains the calculated percent variance and decomposed values.
        """
        # if scaled data = True do mean center scaling
        if scale_data:
            # import mean center scaling
            from sklearn.preprocessing import StandardScaler
            # initialize the scaler
            scaler = StandardScaler()
            # fit the data to scaler
            scaler.fit(data)
            # scale the data
            temp = scaler.transform(data)
        else:
            # don't scale data
            temp = data
        # calculate covariance matrix
        cov = np.cov(temp.transpose())
        # calculate eigen vector and values
        values, vectors = np.linalg.eig(cov)
        # sorting eigen values and vectors by highest eigen value
           
        # calculate % variance for every value
        temp_vals = values.tolist()
        percent_variance = [np.round((temp_vals[i]/sum(temp_vals))*100, 2) for i in range(len(temp_vals))]
        
        # taking the dot product of scaled data by eigenvetors transposed
        temp_pca = temp.dot(vectors.transpose())
        # making columnames
        col_names = [f"PC{i}" for i in range(temp_pca.shape[1])]
        # making a pandas dataframe
        temp_pca = pd.DataFrame(temp_pca, columns = col_names)
        # returning as a python dictionary that contains the percent variation and decomposed data.
        temp_final = {'percent_variance': percent_variance,
                     'pca': temp_pca}
        return temp_final       

In [137]:
# preparing data
iris = load_iris()
df = pd.DataFrame(iris.data, columns = iris.feature_names)
df['classes'] = iris.target
df.classes = np.where(df.classes ==  0, iris.target_names[0], df.classes)
df.classes = np.where(df.classes ==  1, iris.target_names[1], df.classes)
df.classes = np.where(df.classes ==  2, iris.target_names[2], df.classes)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),classes
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### Running my own pca function

In [142]:
kcs_pca_scaled = compute_PCA(df.iloc[:,:-1])
print(kcs_pca_scaled['percent_variance'])
print(kcs_pca_scaled['pca'].head())

[72.96244541329985, 22.850761786701778, 3.668921889282875, 0.5178709107154802]
       PC_0      PC_1      PC_2      PC_3
0 -0.233230 -0.863303  0.316056 -2.115802
1  0.074899  0.264668  0.203591 -2.175638
2 -0.184237 -0.109029  0.043582 -2.379393
3 -0.242289  0.163927 -0.004951 -2.360324
4 -0.383247 -1.043206  0.240091 -2.199654


#### Running sklearn pca function

In [163]:
pca_sklearn = PCA()
df_scaled = StandardScaler().fit_transform(df.iloc[:,:-1])
pca_sklearn.fit(df_scaled)
print(pca_sklearn.explained_variance_ratio_)
pca_sklearn_data = pd.DataFrame(pca_sklearn.transform(df_scaled), columns = ['PC0', 'PC1', 'PC2', 'PC3'])
print(pca_sklearn_data.head())

[0.72962445 0.22850762 0.03668922 0.00517871]
        PC0       PC1       PC2       PC3
0 -2.264703  0.480027 -0.127706 -0.024168
1 -2.080961 -0.674134 -0.234609 -0.103007
2 -2.364229 -0.341908  0.044201 -0.028377
3 -2.299384 -0.597395  0.091290  0.065956
4 -2.389842  0.646835  0.015738  0.035923


In [None]:
test

#### Running my own unscaled pca


In [165]:
kcs_pca_scaled = compute_PCA(df.iloc[:,:-1].values, scale_data=False)
print(kcs_pca_scaled['percent_variance'])
print(kcs_pca_scaled['pca'].head())

[92.46187232017272, 5.306648311706787, 1.7102609807929738, 0.5212183873275305]
       PC_0      PC_1      PC_2      PC_3
0 -1.206733 -2.213499  4.986587  3.006354
1 -0.950716 -1.831514  4.728567  2.896956
2 -1.096108 -2.020433  4.584283  2.785811
3 -1.182994 -1.819382  4.496526  2.851600
4 -1.308531 -2.278063  4.918257  2.978073


In [166]:
pca_sklearn = PCA()
df_scaled = df.iloc[:,:-1]
pca_sklearn.fit(df_scaled)
print(pca_sklearn.explained_variance_ratio_)
pca_sklearn_data = pd.DataFrame(pca_sklearn.transform(df_scaled), columns = ['PC0', 'PC1', 'PC2', 'PC3'])
print(pca_sklearn_data.head())

[0.92461872 0.05306648 0.01710261 0.00521218]
        PC0       PC1       PC2       PC3
0 -2.684126  0.319397 -0.027915 -0.002262
1 -2.714142 -0.177001 -0.210464 -0.099027
2 -2.888991 -0.144949  0.017900 -0.019968
3 -2.745343 -0.318299  0.031559  0.075576
4 -2.728717  0.326755  0.090079  0.061259
