**Exercise 10: Principal Component Analysis**

*CPSC 381/581: Machine Learning*

*Yale University*

*Instructor: Alex Wong*


**Prerequisites**:

1. Enable Google Colaboratory as an app on your Google Drive account

2. Create a new Google Colab notebook, this will also create a "Colab Notebooks" directory under "MyDrive" i.e.
```
/content/drive/MyDrive/Colab Notebooks
```

3. Create the following directory structure in your Google Drive
```
/content/drive/MyDrive/Colab Notebooks/CPSC 381-581: Machine Learning/Exercises
```

4. Move the 10_exercise_pca.ipynb into
```
/content/drive/MyDrive/Colab Notebooks/CPSC 381-581: Machine Learning/Exercises
```
so that its absolute path is
```
/content/drive/MyDrive/Colab Notebooks/CPSC 381-581: Machine Learning/Exercises/10_exercise_pca.ipynb
```

In this exercise, we will use PCA for dimensionality reduction as a means of visualizing high-dimensional data. Then we will measure the reconstruction loss as we vary the number of principal components. Finally, we will use PCA as a feature extractor and show that we can compress the data for a downstream classification task.


**Submission**:

1. Implement all TODOs in the code blocks below.

2. Report your reconstruction loss for training and validation sets and your classification scores for training and validation sets.

```
Report reconstruction loss and classification scores here.

```

3. List any collaborators.

```
Collaborators: Doe, Jane (Please write names in <Last Name, First Name> format)

Collaboration details: Discussed ... implementation details with Jane Doe.
```

Import packages

In [None]:
import numpy as np
import sklearn.datasets as skdata
import sklearn.metrics as skmetrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import warnings

# Silence all warnings so the notebook output stays readable
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator so that the shuffled
# train/validation splits below are reproducible across runs.
# NOTE: this must be a call; assigning `np.random.seed = 1` would replace the
# seeding function with an integer and leave the generator unseeded.
np.random.seed(1)

Load datasets

In [None]:
# Human-readable names, used in printed messages and figure titles
dataset_names = ['iris', 'wine']

# Load the datasets (same order as dataset_names so they can be zipped together)
datasets = [skdata.load_iris(), skdata.load_wine()]

# One matplotlib color per class label when plotting the projected points
colors = ['tab:blue', 'tab:green', 'tab:red']

Perform PCA on datasets and visualize

In [None]:
# Zip up all dataset options
dataset_options = zip(
    datasets,
    dataset_names)

for dataset, dataset_name in dataset_options:

    X = dataset.data
    y = dataset.target
    names = dataset.target_names

    # Number of features in the original data, reported in the figure title
    n_dim = X.shape[-1]

    # Instantiate PCA with 2 components (dimensions)
    pca = PCA(n_components=2)

    # Fit PCA to data
    pca.fit(X)

    # Use PCA to project (transform) all data points to lower dimensions
    Z = pca.transform(X)

    # Create figure
    fig = plt.figure()

    # Create super title describing the dataset and the projection
    fig.suptitle(
        'Visualization of {} dataset projected from {} dimensions to a 2-dimensional subspace'.format(
            dataset_name,
            n_dim))

    # Instantiate axis for subplot of a 1 x 1 figure
    ax = fig.add_subplot(1, 1, 1)

    # Iterate through each class and plot them into the figure as scatter plot with different colors
    for label, color, name in zip(np.sort(np.unique(y)), colors, names):

        # Select from projected points the ones belonging to current class
        idx = np.where(y == label)[0]
        Z_label = Z[idx, :]

        # Plot the first principal component against the second for this class
        ax.scatter(
            Z_label[:, 0],
            Z_label[:, 1],
            color=color,
            label=name,
            alpha=0.5)

    # Turn on legend and set loc to best
    ax.legend(loc='best')

    # Render this dataset's figure before moving on to the next dataset
    plt.show()


Test generalization of the learned subspace through reconstruction loss

In [None]:
# Number of dimensions of subspace
dataset_components_list = [
    # For iris dataset
    range(1, 5),
    # For wine dataset
    range(1, 14)
]

# Zip up all dataset options
dataset_options = zip(
    datasets,
    dataset_components_list,
    dataset_names)

for dataset, dataset_components, dataset_name in dataset_options:

    X = dataset.data
    y = dataset.target

    # Shuffle the dataset based on sample indices
    shuffled_indices = np.random.permutation(X.shape[0])

    # Choose the first 80% as training set and the next 20% as validation
    train_split_idx = int(0.80 * X.shape[0])

    train_indices = shuffled_indices[0:train_split_idx]
    val_indices = shuffled_indices[train_split_idx:]

    # Select the examples from X and y to construct our training and validation sets
    X_train, y_train = X[train_indices, :], y[train_indices]
    X_val, y_val = X[val_indices, :], y[val_indices]

    # Define empty lists to hold scores for training and validation
    mse_scores_train = []
    mse_scores_val = []

    for components in dataset_components:

        print('***** Fitting PCA with {} components on {} dataset *****'.format(components, dataset_name))

        # Instantiate PCA with specified components (dimensions)
        pca = PCA(n_components=components)

        # Fit PCA to training data only, so the validation set stays unseen
        pca.fit(X_train)

        # Project the training data, reconstruct them, and measure loss
        Z_train = pca.transform(X_train)
        X_hat_train = pca.inverse_transform(Z_train)

        mse_score_train = skmetrics.mean_squared_error(X_train, X_hat_train)
        print('Training set mean squared error: {:.4f}'.format(mse_score_train))

        # Project the validation data, reconstruct them, and measure loss
        Z_val = pca.transform(X_val)
        X_hat_val = pca.inverse_transform(Z_val)

        mse_score_val = skmetrics.mean_squared_error(X_val, X_hat_val)
        print('Validation set mean squared error: {:.4f}'.format(mse_score_val))

        # Append training and validation scores to lists of training and validation scores
        mse_scores_train.append(mse_score_train)
        mse_scores_val.append(mse_score_val)

    # Materialize the component counts once to use as the shared x-axis
    n_components_axis = list(dataset_components)

    # Create figure with figsize=(5, 5)
    fig = plt.figure(figsize=(5, 5))

    # Instantiate axis for subplot of a 1 x 1 figure
    ax = fig.add_subplot(1, 1, 1)

    # Plot the number of components on the x-axis and training mse scores on the y-axis
    ax.plot(n_components_axis, mse_scores_train, color='blue', label='Training')

    # Plot the number of components on the x-axis and validation mse scores on the y-axis
    ax.plot(n_components_axis, mse_scores_val, color='red', label='Validation')

    # Set title
    ax.set_title('Reconstruction Error on {} dataset'.format(dataset_name))

    # Set xlabel to '# of components'
    ax.set_xlabel('# of components')

    # Set ylabel to 'MSE'
    ax.set_ylabel('MSE')

    # Set legend with loc='upper right'
    ax.legend(loc='upper right')

    plt.show()
    print('')
    print('')

Use PCA as a feature extractor for logistic regression

In [None]:
# Number of dimensions of subspace
dataset_components_list = [
    # For iris dataset
    range(1, 5),
    # For wine dataset
    range(1, 14)
]

# Zip up all dataset options
dataset_options = zip(
    datasets,
    dataset_components_list,
    dataset_names)

for dataset, dataset_components, dataset_name in dataset_options:

    X = dataset.data
    y = dataset.target

    # Shuffle the dataset based on sample indices
    shuffled_indices = np.random.permutation(X.shape[0])

    # Choose the first 80% as training set and the next 20% as validation
    train_split_idx = int(0.80 * X.shape[0])

    train_indices = shuffled_indices[0:train_split_idx]
    val_indices = shuffled_indices[train_split_idx:]

    # Select the examples from X and y to construct our training and validation sets
    X_train, y_train = X[train_indices, :], y[train_indices]
    X_val, y_val = X[val_indices, :], y[val_indices]

    # Define empty lists to hold scores for training and validation
    scores_train = []
    scores_val = []

    for components in dataset_components:

        print('***** Results of Logistic Regression using PCA with {} components on {} dataset *****'.format(components, dataset_name))

        # Instantiate PCA with specified components (dimensions)
        pca = PCA(n_components=components)

        # Fit PCA to training data only, so the validation set stays unseen
        pca.fit(X_train)

        # Project the training data
        Z_train = pca.transform(X_train)

        # Instantiate LogisticRegression with tol=1e-4
        logistic = LogisticRegression(tol=1e-4)

        # Train model using projected training data
        logistic.fit(Z_train, y_train)

        # Score model using mean accuracy on training set
        predictions_train = logistic.predict(Z_train)
        score_train = skmetrics.accuracy_score(y_train, predictions_train)
        print('Training set mean accuracy: {:.4f}'.format(score_train))

        # Project the validation data with the PCA fitted on the training set
        Z_val = pca.transform(X_val)

        # Score model using mean accuracy on validation set
        predictions_val = logistic.predict(Z_val)
        score_val = skmetrics.accuracy_score(y_val, predictions_val)
        print('Validation set mean accuracy: {:.4f}'.format(score_val))

        # Append training and validation scores to lists of training and validation scores
        scores_train.append(score_train)
        scores_val.append(score_val)

    # Materialize the component counts once to use as the shared x-axis
    n_components_axis = list(dataset_components)

    # Create figure with figsize=(5, 5)
    fig = plt.figure(figsize=(5, 5))

    # Instantiate axis for subplot of a 1 x 1 figure
    ax = fig.add_subplot(1, 1, 1)

    # Plot the number of components on the x-axis and training scores on the y-axis
    ax.plot(n_components_axis, scores_train, color='blue', label='Training')

    # Plot the number of components on the x-axis and validation scores on the y-axis
    ax.plot(n_components_axis, scores_val, color='red', label='Validation')

    # Set title
    ax.set_title('Logistic Regression using PCA on {} dataset'.format(dataset_name))

    # Set xlabel to '# of components'
    ax.set_xlabel('# of components')

    # Set ylabel to 'Scores'
    ax.set_ylabel('Scores')

    # Set legend with loc='upper right'
    ax.legend(loc='upper right')

    plt.show()
    print('')