# Import statements

This cell is built up incrementally: it does not contain every import from the start.
We come back to edit it each time we identify a new package that we need:

- First, we need `pandas` to load the data.
- Then, we need `StandardScaler` to standardise the data.
- Then, we need `PCA` to perform the analysis.
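- Then, we need `pyplot` to visualise the PCA projection.
- Then, we need `numpy` to build the component numbers and the axis ticks.
- Finally, we need `umap` to compute the UMAP embedding.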

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import umap

Define the path to the file in a separate cell.
Separate this global variable from the downstream code that uses it.

In [None]:
# Path to the Iris data file, resolved relative to the current working directory.
iris_file_path = "iris.data"

Load the data naively.
Demonstrate that the first row is incorrectly taken as column names.

In [None]:
# Naive load: with the default arguments pandas uses the first line of the
# file as the header, so the first observation ends up as the column names.
iris_data = pd.read_csv(filepath_or_buffer=iris_file_path)
iris_data.head(n=5)

Demonstrate that the file contains no column names at all: with `header=None` the first row is kept as data and pandas assigns default integer column labels.

In [None]:
# Reload the data set, telling pandas that the file has no header row so that
# the first line is kept as an observation.
iris_data = pd.read_csv(
    iris_file_path,
    header=None,
)
iris_data.head(n=5)

Manually provide column names.

In [None]:
# Final load: supply the column names explicitly (four measurements followed
# by the class label), which also stops pandas from consuming the first row.
iris_data = pd.read_csv(
    iris_file_path,
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'target',
    ],
)
iris_data.head()

# Split measurements and labels

Demonstrate the use and behaviour of the `loc` property.

In [None]:
# Evaluate the bare `loc` property to show the indexer object it returns;
# data is only selected once that object is subscripted.
iris_data.loc

In [None]:
# Select every row (`:`) and the 'target' column; the column is given as a
# list of labels, so the result is displayed as a one-column table.
iris_data.loc[:, ['target']].head()

In [None]:
# Extract the class labels as a NumPy array (one column, one row per
# observation) and preview the first four entries.
iris_class_values = (
    iris_data
    .loc[:, ['target']]
    .values
)
iris_class_values[:4]

In [None]:
# Names of the four measurement columns, i.e. every column except 'target'.
iris_features_names = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
# Keep only the measurements, still as a DataFrame, and preview them.
iris_features = iris_data.loc[:, iris_features_names]
iris_features.head(n=5)

In [None]:
# Same selection, this time converted to a plain NumPy array; preview the
# first four rows with all their columns.
iris_features = (
    iris_data
    .loc[:, iris_features_names]
    .values
)
iris_features[:4]

In [None]:
# Standardise the measurements column by column with a freshly created
# StandardScaler, then preview the first four rows of the result.
iris_standardised = StandardScaler().fit_transform(
    iris_features,
)
iris_standardised[:4]

In [None]:
# Wrap the standardised array in a throw-away DataFrame purely for display,
# so the preview carries the feature names as column headers.
pd.DataFrame(
    data=iris_standardised,
    columns=iris_features_names,
).head(n=5)

# Principal components analysis (PCA)

In [None]:
# Create the PCA estimator with 4 components, i.e. as many components as
# there are measurement columns; it is fitted in a later cell.
pca_iris = PCA(n_components=4)
# Display the (not yet fitted) estimator and its settings.
pca_iris

In [None]:
# Fit the PCA on the standardised measurements and project the same data
# onto the principal components; preview the first four projected rows.
principalComponents = pca_iris.fit_transform(
    iris_standardised,
)
principalComponents[:4]

In [None]:
# Put the projected data into a DataFrame with one named column per
# principal component.
pca_dataframe = pd.DataFrame(
    data=principalComponents,
    columns=['PC 1', 'PC 2', 'PC 3', 'PC 4'],
)
pca_dataframe.head(n=5)

In [None]:
# Unlabelled scatter plot of the observations projected onto the first two
# principal components.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 Component PCA', fontsize=20)
ax.set_xlabel('PC 1', fontsize=15)
ax.set_ylabel('PC 2', fontsize=15)
ax.scatter(pca_dataframe['PC 1'], pca_dataframe['PC 2'])
ax.grid()

In [None]:
# Attach the class label to every projected observation by placing the
# 'target' column next to the principal component columns.
pca_dataframe_labelled = pd.concat(
    [
        pca_dataframe,
        iris_data.loc[:, ['target']],
    ],
    axis=1,
)
pca_dataframe_labelled.head()

In [None]:
# Scatter plot of the first two principal components, coloured by class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 Component PCA', fontsize=20)
ax.set_xlabel('PC 1', fontsize=15)
ax.set_ylabel('PC 2', fontsize=15)

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
# One scatter call per class, in the order of `targets`, so that the
# positional legend labels passed below line up with the plotted series.
for species, colour in zip(targets, colors):
    species_mask = pca_dataframe_labelled['target'] == species
    ax.scatter(
        pca_dataframe_labelled.loc[species_mask, 'PC 1'],
        pca_dataframe_labelled.loc[species_mask, 'PC 2'],
        c=colour,
        s=50,
    )
ax.legend(targets)
ax.grid()


In [None]:
# Show the fraction of the total variance explained by each fitted component.
pca_iris.explained_variance_ratio_

In [None]:
# Add up the per-component ratios; since all 4 components were kept, the
# total is expected to be 1 up to floating-point rounding.
sum(pca_iris.explained_variance_ratio_)

In [None]:
# Tabulate the fraction of variance explained by each component of the
# fitted PCA estimator (`pca_iris`), one row per component.
pca_variance_dataframe = pd.DataFrame(
    data=pca_iris.explained_variance_ratio_,
    columns=["Explained variance ratio"],
)
pca_variance_dataframe

In [None]:
# Rebuild the table from the fitted PCA estimator (`pca_iris`) with an
# explicit 1-based component number column next to the variance ratios.
explained_variance_ratio = pca_iris.explained_variance_ratio_
pca_variance_dataframe = pd.DataFrame(
    data={
        "Explained variance ratio": explained_variance_ratio,
        "Principal component": np.arange(1, len(explained_variance_ratio) + 1),
    })
pca_variance_dataframe

In [None]:
# Bar chart of the explained variance ratio, one bar per principal component.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal component', fontsize=15)
ax.set_ylabel('Explained variance ratio', fontsize=15)
ax.set_title('PCA explained variance ratio', fontsize=20)

component_numbers = pca_variance_dataframe.loc[:, "Principal component"]
ax.bar(x=component_numbers,
       height=pca_variance_dataframe.loc[:, "Explained variance ratio"])

# Ticks are set on the Axes directly so the result does not depend on which
# axes pyplot currently considers active.
ax.set_xticks(np.arange(component_numbers.min(), component_numbers.max() + 1, 1))  # one tick per component
ax.set_yticks(np.arange(0, 1, 0.1))  # ticks every 0.1, from 0 up to 0.9
ax.grid()

# Uniform Manifold Approximation and Projection (UMAP)

In [None]:
# Create a UMAP reducer with its default settings; it is fitted in a later cell.
# NOTE(review): no random seed is passed, so the embedding presumably differs
# between runs -- confirm whether reproducibility matters here.
umap_reducer = umap.UMAP()
# Display the reducer and its settings.
umap_reducer

In [None]:
# Fit the UMAP reducer and embed the observations in one step.
# NOTE(review): the input is the table of all four principal components, not
# `iris_standardised` -- confirm that this is intended.
umap_iris_embedding = umap_reducer.fit_transform(pca_dataframe)
# Check the dimensions of the embedding; a later cell names two columns.
umap_iris_embedding.shape

In [None]:
# Put the UMAP embedding into a DataFrame with one named column per
# embedding dimension.
umap_iris_dataframe = pd.DataFrame(
    data=umap_iris_embedding,
    columns=['UMAP 1', 'UMAP 2'],
)
umap_iris_dataframe.head(n=5)

In [None]:
# Attach the class label to every embedded observation by placing the
# 'target' column next to the UMAP coordinate columns.
umap_dataframe_labelled = pd.concat(
    [
        umap_iris_dataframe,
        iris_data.loc[:, ['target']],
    ],
    axis=1,
)
umap_dataframe_labelled.head()

In [None]:
# Scatter plot of the two UMAP coordinates, coloured by class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 Component UMAP', fontsize=20)
ax.set_xlabel('UMAP 1', fontsize=15)
ax.set_ylabel('UMAP 2', fontsize=15)

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
# One scatter call per class, in the order of `targets`, so that the
# positional legend labels passed below line up with the plotted series.
for species, colour in zip(targets, colors):
    species_mask = umap_dataframe_labelled['target'] == species
    ax.scatter(
        umap_dataframe_labelled.loc[species_mask, 'UMAP 1'],
        umap_dataframe_labelled.loc[species_mask, 'UMAP 2'],
        c=colour,
        s=50,
    )
ax.legend(targets)
ax.grid()
