In [None]:
# SOURCE: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

# This is a famous data set analyzed by a famous statistician, Sir Ronald Fisher
# Data: 150 measurements of the flowers of three different species of Iris:
# sepal length, sepal width, petal length, petal width.  Thus a 150x4 dimensional
# data set.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Location of the raw iris measurements (UCI Machine Learning Repository).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Read the CSV into a DataFrame, supplying the column names explicitly:
# four measurement columns followed by the species label column.
df = pd.read_csv(
    url,
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'target',
    ],
)

In [None]:
# Peek at the first four rows of the raw table.
df.iloc[:4]

In [None]:
# Pull the sepal-length column out as a Series, then show its size and
# its first five values.
sepal_lengths = df['sepal length']
(sepal_lengths.shape, sepal_lengths.iloc[:5])

In [None]:
# Mean and population standard deviation (ddof=0, which is what np.std uses)
# of the sepal-length column.
(sepal_lengths.mean(), sepal_lengths.std(ddof=0))

In [None]:
# Standardize the column by hand: subtract the mean, then divide by the
# population standard deviation (ddof=0).
normalized_sepal_lengths = (
    (sepal_lengths - sepal_lengths.mean()) / sepal_lengths.std(ddof=0)
)

# Show the size plus the first and last five standardized values.
(
    normalized_sepal_lengths.shape,
    normalized_sepal_lengths.iloc[:5],
    normalized_sepal_lengths.iloc[-5:],
)

In [None]:
# The columns of the data must be scaled to have
# mean 0 and variance 1.


from sklearn.preprocessing import StandardScaler

# Names of the four measurement columns, in the order used for the
# feature matrix.
features = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Feature matrix: the measurement columns as a plain NumPy array.
x_unscaled = df[features].to_numpy()

# Species labels, kept as a single-column 2-D array.
y = df[['target']].to_numpy()

# Rescale the feature columns with scikit-learn's StandardScaler.
x = StandardScaler().fit_transform(x_unscaled)

In [None]:
# Let's check the shapes

# Shapes of the scaled features, the unscaled features and the labels, in that order.
tuple(arr.shape for arr in (x, x_unscaled, y))

In [None]:
# Does it check out?

# Print the first four standardized sepal lengths from the scaler output next
# to the hand-normalized values. Each print is its own statement so the cell
# does not echo a tuple of the prints' None return values.
print(x[0:4, 0])
print("----")
print(normalized_sepal_lengths[0:4])

# Confirm numerically that the whole column agrees, not just the first rows.
np.allclose(x[:, 0], normalized_sepal_lengths.to_numpy())

In [None]:
# Let's do PCA, projecting the data onto 
# the first two principal components

from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardized features and project the data.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

# Wrap the projected coordinates in a DataFrame with one named column per
# component.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Again let's "look into" the data we are making:

# Shape of the projected data and its first four rows.
(principalComponents.shape, principalComponents[:4])

In [None]:
# First five rows of the projected data (head() defaults to five rows).
principalDf.head()

In [None]:
# We need labels for the data:
# Place the species label column alongside the two principal-component columns.
finalDf = pd.concat((principalDf, df.loc[:, ['target']]), axis='columns')
finalDf.iloc[:4]

In [None]:
# And now we can visualize the projected data

# One square figure holding a single set of axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# Draw each species as its own scatter collection in its own colour.
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows that belong to the current species.
    indicesToKeep = finalDf['target'] == target
    ax.scatter(
        finalDf.loc[indicesToKeep, 'principal component 1'],
        finalDf.loc[indicesToKeep, 'principal component 2'],
        c=color,
        s=50,
    )

# The legend labels are passed positionally, in the same order the species
# were drawn above.
ax.legend(targets)
ax.grid()


In [None]:
# Let's see if we can classify the first data element by hand

# The first row of the raw table, shown as a one-row DataFrame.
df.head(1)

In [None]:
# But we need its normalized form:

# First row of the standardized feature matrix.
v = x[0, :]
v

In [None]:
# Now for the fun part: here is the projection matrix:

# components_ is the attribute of the fitted PCA object that holds the
# principal axes; the PCA above was created with n_components=2.
# NOTE(review): presumably one row per component and one column per feature
# (2 x 4 here) -- confirm against the displayed output.
P = pca.components_
P

In [None]:
# And there is the projection of the first data row:
# Is it in the right place on the graph?

# Multiply the projection matrix by the standardized first row.
# NOTE(review): PCA presumably also subtracts its fitted mean before
# projecting; that mean should be ~0 here because x is standardized --
# compare the result with principalComponents[0] to confirm.
P.dot(v)

In [None]:
# Now let's try to classify an unknown flower!
# We will need a function which takes a vector of
# features as input and which yields the normalized
# vector of features as output.

# So we need the means and standard deviations of the columns.
# xu is a shorter alias for x_unscaled: plain assignment binds a second name
# to the same array object, no copy is made.
xu = x_unscaled
xu.shape

In [None]:
# Per-column means of the unscaled features (axis=0 reduces over the rows).
means = xu.mean(axis=0)
means

In [None]:
# Per-column population standard deviations (NumPy's default ddof=0) of the
# unscaled features.
stdevs = xu.std(axis=0)
stdevs

In [None]:
# A sample flower to classify: four measurements, given in the same order as
# the `features` column list (sepal length, sepal width, petal length, petal width).
flower = np.asarray([5.0, 3.4, 1.3, 0.2], dtype=float)

In [None]:
def normalize(f):
    """Standardize a feature vector with the notebook's column statistics.

    Subtracts the module-level ``means`` from ``f`` and divides the result by
    the module-level ``stdevs``, element by element.

    Parameters
    ----------
    f : array-like
        Raw feature values, one entry per feature column.

    Returns
    -------
    The centered and rescaled feature values.
    """
    centered = f - means
    return centered / stdevs

In [None]:
# Standardize the sample flower with the column means and standard deviations
# computed from the unscaled data.
normalized_flower = normalize(flower)
normalized_flower

In [None]:
def classify2(f):
    """Project a raw feature vector onto the principal components.

    The vector is first standardized with ``normalize`` and then multiplied
    by the module-level projection matrix ``P``.

    Parameters
    ----------
    f : array-like
        Raw feature values, one entry per feature column.

    Returns
    -------
    The product of ``P`` with the standardized vector (one entry per row of ``P``).
    """
    standardized = normalize(f)
    return P.dot(standardized)

In [None]:
# Project the sample flower with the projection matrix P.
classify2(flower)

In [None]:
def classify(f):
    """Return the first principal-component coordinate of a raw feature vector.

    Despite the name, no species label is produced: the vector is standardized
    with ``normalize``, multiplied by the module-level projection matrix ``P``,
    and only the first entry of the result is returned.

    Parameters
    ----------
    f : array-like
        Raw feature values, one entry per feature column.

    Returns
    -------
    The first entry of the projected vector.
    """
    projected = np.dot(P, normalize(f))
    return projected[0]

In [None]:
# Only the first entry of the sample flower's projection.
classify(flower)

In [None]:
# Let's grow or shrink the flower and then try to classify it:

# Same measurements as the sample flower, each divided by 0.8
# (i.e. every measurement scaled up by a factor of 1.25).
flower2 = np.divide(np.array([5.0, 3.4, 1.3, 0.2]), 0.8)
classify(flower2)

In [None]:
# Problem 1: devise a way to take a flower measurement as input and produce
# one of the following strings as output:
# iris-setosa, iris-versicolor, iris-virginica, unclassified
#
# Problem 2: add to the above: a numerical measure of confidence in
# the classification.