# Comparing PCA Dimensionality Reduction to Random Projection

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as skl
import numpy as np

import seaborn as sns
sns.set(font_scale=2)

%matplotlib inline

In [None]:
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.model_selection import train_test_split

In the
[Dimensionality Reduction Notebook](./Dimensionality\ Reduction.ipynb),
we were motivated to reduce the length of our data vectors because we had more data dimensions than we had observations, leading to
[overfitting](./Cross\ Validation\ Example.ipynb).

The first choice for reducing dimensions is PCA, as described in detail in the
[Dimensionality Reduction Notebook](./Dimensionality\ Reduction.ipynb).

Though PCA has a rigorous justification in terms of information-maximizing transforms, the assumptions that would guarantee that PCA was the right choice don't hold in our (and most real-life) case.

This notebook compares PCA to what you might think would be an overly-simple model:
[random projection](https://en.wikipedia.org/wiki/Random_projection).
In random projection, instead of carefully selecting the vectors onto which we project our data,
we select them at random according to some distribution.
The most common choice is a Gaussian distribution.

Below, we run
[cross-validation](./Cross\ Validation\ Example.ipynb)
to determine the performance of randomly-chosen projections of
size ranging from one dimension to as many dimensions as we have data points.

Perhaps surprisingly, there doesn't seem to be any difference in performance on the test set between
random projections and projections onto the principal components!
Furthermore, we need our components to retain roughly 99% of the variance before test performance peaks.
This would seem to indicate that the directions of large variance in our input data
are not the directions of variability that are useful for predicting our targets.

In [None]:
def runCV(num_splits,transforms,X,y,model):
    """Score a model on repeated random train/test splits for each transform.

    For every transform, ``num_splits`` independent 80/20 splits of
    ``(X, y)`` are drawn with ``train_test_split``. Both halves are passed
    through ``transform.transform`` (the transforms are used as given and
    are not refit here), ``model`` is fit on the transformed training half,
    and ``model.score`` is recorded on both halves.

    Parameters
    ----------
    num_splits : number of random splits drawn per transform.
    transforms : sized iterable of objects exposing ``.transform``.
    X, y : inputs and targets, forwarded to ``train_test_split``.
    model : object exposing ``.fit(X, y)`` and ``.score(X, y)``; it is
        refit in place on every split.

    Returns
    -------
    (train_scores, test_scores) : two arrays of shape
        ``(num_splits, len(transforms))``; rows are splits, columns are
        transforms.
    """
    score_shape = (num_splits, len(transforms))
    train_scores = np.zeros(score_shape)
    test_scores = np.zeros(score_shape)

    for col, projector in enumerate(transforms):
        for row in range(num_splits):
            # A fresh random 80/20 partition for every (transform, split) pair.
            X_fit, X_held, y_fit, y_held = train_test_split(X, y, test_size=0.2)

            Z_fit = projector.transform(X_fit)
            Z_held = projector.transform(X_held)

            model.fit(Z_fit, y_fit)

            train_scores[row, col] = model.score(Z_fit, y_fit)
            test_scores[row, col] = model.score(Z_held, y_held)

    return train_scores, test_scores

def transformsFromSchedule(to_keep_schedule,X,transform_type=""):
    """Fit one dimensionality-reduction transform per entry of a schedule.

    Parameters
    ----------
    to_keep_schedule : iterable of ``n_components`` values, one per transform.
    X : data every transform is fit on.
    transform_type : ``"PCA"`` selects ``PCA``; ``"random"`` selects
        ``random_projection.GaussianRandomProjection``.

    Returns
    -------
    list of fitted transforms, in the same order as ``to_keep_schedule``.

    Raises
    ------
    ValueError : if ``transform_type`` is neither ``"PCA"`` nor ``"random"``
        (this includes the default empty string).
    """
    if transform_type == "PCA":
        make_transform = PCA
    elif transform_type == "random":
        make_transform = random_projection.GaussianRandomProjection
    else:
        raise ValueError("unknown transform "+transform_type)

    return [make_transform(n_components=n_kept).fit(X)
            for n_kept in to_keep_schedule]

def plotCV(schedule,train_scores,test_scores,transform_type):
    """Draw mean train and test scores with error bars on the current axes.

    Parameters
    ----------
    schedule : x-values, one per column of the score arrays.
    train_scores, test_scores : score arrays; the mean and the sample
        standard deviation (``ddof=1``) are taken along axis 0.
    transform_type : string prefix for the legend labels
        (``<transform_type>-Train`` and ``<transform_type>-Test``).

    The test curve is drawn dashed; the train curve uses the default
    line style.
    """
    score_sets = (train_scores, test_scores)

    # Compute every statistic before drawing anything.
    centers = [np.mean(scores, axis=0) for scores in score_sets]
    spreads = [np.std(scores, axis=0, ddof=1) for scores in score_sets]

    curve_styles = (
        ('-Train', {}),
        ('-Test', {'linestyle': '--'}),
    )

    for center, spread, (suffix, extra_style) in zip(centers, spreads, curve_styles):
        plt.errorbar(schedule, center,
                     yerr=spread,
                     linewidth=4, alpha=0.75,
                     label=transform_type + suffix,
                     **extra_style)

def makePlot(schedule,train_scores,test_scores,transform_type="PCA"):
    """Create a 12x4 figure of train/test scores against retained dimensions.

    The x-axis is logarithmic, the y-axis is limited to [0, 1], and the
    curves themselves are drawn by ``plotCV``.

    Parameters
    ----------
    schedule : x-values (numbers of retained dimensions).
    train_scores, test_scores : score arrays forwarded to ``plotCV``.
    transform_type : label used in the legend entries and the title.
    """
    plt.figure(figsize=(12,4))
    ax = plt.subplot(111)

    # Matplotlib 3.3 renamed the log-scale keyword ``nonposx`` to
    # ``nonpositive`` and later dropped the old spelling. Try the current
    # name first and fall back for installs that reject it.
    try:
        ax.set_xscale("log", nonpositive='clip')
    except (TypeError, ValueError):
        ax.set_xscale("log", nonposx='clip')

    plotCV(schedule,train_scores,test_scores,transform_type)

    plt.ylim([0,1])

    plt.xlabel("Retained Dimensions")
    plt.ylabel("$R^2$")
    plt.legend()
    plt.title("Train vs. Test Scores for " +transform_type+ "-DR")
    
def getBest(test_scores,to_keep_schedule):
    """Print the schedule entry with the highest mean test score.

    Parameters
    ----------
    test_scores : score array; the mean is taken along axis 0, giving one
        value per schedule entry.
    to_keep_schedule : sequence indexed by the position of the best mean.

    Ties are resolved by ``np.argmax``, i.e. the first maximum wins.
    Nothing is returned; the result is only printed.
    """
    mean_test_scores = np.mean(test_scores,axis=0)

    best_score_num_dimensions = to_keep_schedule[np.argmax(mean_test_scores)]
    print("the best number of dimensions to keep is: "+ str(best_score_num_dimensions))

In [None]:
def produceCVPlot(to_keep_schedule,num_splits,
                  X,y,
                  model=None,
                 transform_type="PCA"):
    """Fit the transforms, cross-validate, plot, and report the best size.

    Parameters
    ----------
    to_keep_schedule : numbers of dimensions to retain, one transform each.
    num_splits : number of random train/test splits per transform.
    X, y : inputs and targets.
    model : object exposing ``.fit`` and ``.score``. When omitted, a new
        ``LinearRegression`` is created for this call.
    transform_type : forwarded to ``transformsFromSchedule``
        (``"PCA"`` or ``"random"``).

    Returns
    -------
    (train_scores, test_scores) as produced by ``runCV``.
    """
    if model is None:
        # Build the default estimator per call instead of once at
        # definition time, so calls never share a fitted model. The
        # explicit import avoids relying on ``sklearn.linear_model``
        # having been loaded as a side effect of other imports.
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()

    transforms = transformsFromSchedule(to_keep_schedule,X,transform_type=transform_type)

    train_scores, test_scores = runCV(num_splits,transforms,X,y,model)

    makePlot(to_keep_schedule,train_scores,test_scores,transform_type=transform_type)
    getBest(test_scores,to_keep_schedule)

    return train_scores,test_scores

In [None]:
# Load the training table from disk (path is relative to the notebook).
train = pd.read_csv(filepath_or_buffer='../data/training.csv')

# Show the first five rows as the cell output.
train.head(n=5)

In [None]:
# Input features: every column whose name starts with 'm'. The text after
# the leading 'm' is parsed as a float and kept as the wavenumber
# (presumably these are spectral measurement columns -- confirm against
# the dataset description).
data_columns = [column for column in train.columns if column.startswith('m')]
wavenumbers = [float(column.lstrip('m')) for column in data_columns]

# Regression targets.
output_columns = ["Ca","P","pH","SOC","Sand"]

# ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and removed in
# 1.0; ``.values`` gives the same NumPy array on old and new releases.
X = train[data_columns].values
y = train[output_columns].values

In [None]:
# Numbers of retained dimensions to evaluate: 1 through 9, then coarser
# steps up to 1157.
to_keep_schedule = list(range(1, 10)) + [10, 20, 30, 50, 100, 200, 1157]
num_splits = 20

# Cross-validate the PCA projections; the trailing ';' suppresses the
# cell's text output.
PCA_train_scores, PCA_test_scores = produceCVPlot(
    to_keep_schedule, num_splits, X, y, transform_type="PCA");

In [None]:
# Same schedule as the PCA run: 1 through 9, then coarser steps up to 1157.
to_keep_schedule = list(range(1, 10)) + [10, 20, 30, 50, 100, 200, 1157]
num_splits = 20

# Cross-validate the Gaussian random projections; the trailing ';'
# suppresses the cell's text output.
random_train_scores, random_test_scores = produceCVPlot(
    to_keep_schedule, num_splits, X, y, transform_type="random");

## Looking Closer

There appears to be a substantial difference between the performance of PCA and Gaussian-Random dimensionality reduction only at low numbers of retained dimensions. The cell below examines this more closely.

In [None]:
# Restrict both experiments to the first `keep_up_to` schedule entries.
keep_up_to = 10
sub_schedule = to_keep_schedule[0:keep_up_to]

plt.figure(figsize=(12, 4))
ax = plt.subplot(1, 1, 1)

# PCA curves, then random-projection curves, on the same axes.
plotCV(schedule=sub_schedule,
       train_scores=PCA_train_scores[:, 0:keep_up_to],
       test_scores=PCA_test_scores[:, 0:keep_up_to],
       transform_type="PCA")

plotCV(schedule=sub_schedule,
       train_scores=random_train_scores[:, 0:keep_up_to],
       test_scores=random_test_scores[:, 0:keep_up_to],
       transform_type="random")

plt.ylim([0, 1])
plt.legend()
# The trailing ';' suppresses the cell's text output.
plt.title("Train and Test Scores for PCA and Random DR");