In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as skl
import numpy as np

import seaborn as sns
sns.set(font_scale=2)

%matplotlib inline

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

## Load and Format Data

In [None]:
# Location of the training table, relative to the notebook's working directory
training_csv = '../data/training.csv'
train = pd.read_csv(training_csv)

# Display the first rows as a quick look at the loaded columns
train.head()

In [None]:
# Feature columns are the ones whose name starts with 'm'; the remainder of
# each name is parsed as a float and kept in `wavenumbers`.
data_columns = [column for column in train.columns if column.startswith('m')]
wavenumbers = [float(column.lstrip('m')) for column in data_columns]

# Target columns that the models below predict jointly
output_columns = ["Ca","P","pH","SOC","Sand"]

# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0.
# `.values` yields the same NumPy array and works on old and new pandas alike.
X = train[data_columns].values
y = train[output_columns].values

## Cross-Validating Degree for a Fixed PCA Reduction

#### Define Cross-Validation Across Degree, Fixed PCA Reduction

In [None]:
def polynomialCV(degrees,numSplits,n_PCA_components,test_size=0.2,random_state=None):
    """Score PCA -> polynomial-features -> linear-regression pipelines over random splits.

    For every degree in `degrees`, the module-level arrays `X` and `y` are
    split `numSplits` times into train and held-out parts, a fresh pipeline
    is fitted on the train part, and the pipeline's `score` is recorded on
    both parts.

    Parameters
    ----------
    degrees : sequence of int
        Polynomial degrees to evaluate.
    numSplits : int
        Number of random train/test splits per degree.
    n_PCA_components : int
        Number of principal components kept before the polynomial expansion.
    test_size : float, optional
        Fraction of the data held out in each split (default 0.2, the value
        that used to be hard-coded).
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness for the splits and for PCA. With the default
        `None` the global NumPy random state is used, exactly as before;
        pass an int to make the returned scores reproducible.

    Returns
    -------
    train_scores, test_scores : numpy.ndarray
        Two arrays of shape `(numSplits, len(degrees))`; entry `[s, d]` is the
        score of split `s` for `degrees[d]`.
    """
    # Function-scope import so this cell does not depend on editing the
    # notebook's import cells.
    from sklearn.utils import check_random_state

    # One generator shared by every split: successive splits differ from each
    # other, yet the whole sequence is repeatable when a seed is supplied.
    rng = check_random_state(random_state)

    numDegrees = len(degrees)

    test_scores = np.zeros((numSplits,numDegrees))
    train_scores = np.zeros_like(test_scores)

    for degreeIdx,degree in enumerate(degrees):
        for splitIdx in range(numSplits):
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=rng)
            # PCA may select a randomized solver, so it draws from the same generator.
            # PolynomialFeatures already emits a constant column by default,
            # hence no separate intercept in the regression.
            poly_model = Pipeline([ ('pca', PCA(n_components=n_PCA_components,
                                                random_state=rng)),
                           ('poly', PolynomialFeatures(degree=degree)),
                           ('linear', LinearRegression(fit_intercept=False))])
            poly_model = poly_model.fit(X_train,y_train)
            train_scores[splitIdx,degreeIdx] = poly_model.score(X_train,y_train)
            test_scores[splitIdx,degreeIdx] = poly_model.score(X_test,y_test)

    return train_scores,test_scores

In [None]:
# Settings for the degree sweep: random splits per degree, number of
# principal components to keep, and the polynomial degrees to compare
numSplits, n_PCA_components = 20, 10
degrees = list(range(1, 6))

train_scores, test_scores = polynomialCV(
    degrees, numSplits, n_PCA_components=n_PCA_components
)

In [None]:
# Collapse the split axis (axis 0), leaving one value per polynomial degree
mean_train, mean_test = (scores.mean(axis=0) for scores in (train_scores, test_scores))

# ddof=1 gives the sample standard deviation across splits
sd_train, sd_test = (scores.std(axis=0, ddof=1) for scores in (train_scores, test_scores))

In [None]:
# Mean score per polynomial degree; error bars are one sample standard
# deviation across the random splits. Dashed = train, solid = held-out.
fig, ax = plt.subplots(figsize=(12,6))
ax.errorbar(degrees,mean_train,yerr=sd_train,
            label="train",linewidth=4,
            color='hotpink',linestyle='--')
ax.errorbar(degrees,mean_test,yerr=sd_test,
            label="test",linewidth=4,
            color='hotpink')

# The y-axis is clipped to [0,1]; any mean score outside that range is not drawn.
ax.set_ylim([0,1])
# Degrees are integers, so place ticks exactly on the evaluated values.
ax.set_xticks(degrees)
ax.set_xlabel("Polynomial Degree")
ax.set_ylabel("$R^2$ Score")
ax.legend()
fig.suptitle("Cross-Validating Polynomial Degree with " +
             str(n_PCA_components) +
             " PCs Retained",
             fontsize='xx-large',fontweight='bold');

## Cross-Validating Quadratic Model Across PCA Reduction Levels

### Define Cross-Validation Across PCA Reduction for Quadratic

In [None]:
def quadraticCV(numSplits,PCA_schedule):
    """Run `polynomialCV` for degrees 1 and 2 at every PCA size in `PCA_schedule`.

    Parameters
    ----------
    numSplits : int
        Number of random splits passed through to `polynomialCV`.
    PCA_schedule : sequence of int
        Numbers of principal components to evaluate, in order.

    Returns
    -------
    train_scores, test_scores : numpy.ndarray
        Arrays of shape `(numSplits, 2, len(PCA_schedule))`. The middle axis
        indexes the model (0 = degree 1, 1 = degree 2); the last axis follows
        the order of `PCA_schedule`.
    """
    model_degrees = [1, 2]
    score_shape = (numSplits, len(model_degrees), len(PCA_schedule))

    train_scores = np.zeros(score_shape)
    test_scores = np.zeros(score_shape)

    for slot, n_components in enumerate(PCA_schedule):
        # Each call returns two (numSplits, 2) arrays for this PCA size
        split_train, split_test = polynomialCV(model_degrees, numSplits, n_components)
        train_scores[:, :, slot] = split_train
        test_scores[:, :, slot] = split_test

    return train_scores, test_scores

In [None]:
# Settings for the PCA sweep: the numbers of principal components to try
# and how many random splits to score at each of them
PCA_schedule = [2,5,10,20,50]
numSplits = 20

train_scores, test_scores = quadraticCV(numSplits=numSplits, PCA_schedule=PCA_schedule)

In [None]:
# Reduce over the split axis (axis 0). The results are indexed as
# [model, PCA setting], with model 0 = degree 1 and model 1 = degree 2.
mean_train = train_scores.mean(axis=0)
mean_test = test_scores.mean(axis=0)

# Sample standard deviation across splits (ddof=1)
sd_train = train_scores.std(axis=0, ddof=1)
sd_test = test_scores.std(axis=0, ddof=1)

In [None]:
# Mean score versus number of retained principal components; error bars are
# one sample standard deviation across splits. Dashed = train, solid = held-out.
# Same figure size as the degree-sweep plot so the two figures are comparable.
fig, ax = plt.subplots(figsize=(12,6))

# Row 0 of the summary arrays is the degree-1 (linear) model and row 1 the
# degree-2 (quadratic) model.
for row, (model_name, color) in enumerate([("linear", 'chartreuse'),
                                           ("quadratic", 'indigo')]):
    ax.errorbar(PCA_schedule,mean_test[row,:],yerr=sd_test[row,:],
                label=model_name + "_test",color=color,linewidth=4)
    ax.errorbar(PCA_schedule,mean_train[row,:],yerr=sd_train[row,:],
                label=model_name + "_train",color=color,linewidth=4,
                linestyle='--')

# The y-axis is clipped to [0,1]; any mean score outside that range is not drawn.
ax.set_ylim([0,1])
ax.set_xticks(PCA_schedule)
ax.set_xlabel("Number of PCs Retained")
ax.set_ylabel("$R^2$ Score")
ax.legend()
fig.suptitle("Cross-Validating Linear and Quadratic Models Across PCA Reduction",
             fontsize='xx-large',fontweight='bold');