In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as skl
import numpy as np

import seaborn as sns
# Apply seaborn's plot styling with font_scale=2 so figure text is drawn larger.
sns.set(font_scale=2)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import sklearn.decomposition
import sklearn.ensemble
from sklearn.model_selection import train_test_split

In [None]:
# Read the training table from CSV (path is relative to the working directory).
train = pd.read_csv(filepath_or_buffer='../data/training.csv')

# Display the first five rows as a quick look at the loaded columns.
train.head(n=5)

In [None]:
# Feature columns are those whose name starts with 'm'; the remainder of each
# name is parsed as a float and kept in `wavenumbers`.
data_columns = [column for column in train.columns if column.startswith('m')]
wavenumbers = [float(column.lstrip('m')) for column in data_columns]

# Target columns used as the multi-output regression response.
output_columns = ["Ca","P","pH","SOC","Sand"]

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the supported
# way to obtain the underlying ndarray.
X = train[data_columns].to_numpy()
y = train[output_columns].to_numpy()

In [None]:
# Random forest regressor fitted on a PCA-compressed copy of X.
# oob_score=True makes the fitted model expose `oob_score_`.
#
# `min_impurity_split` was removed in scikit-learn 1.0 (passing it raises a
# TypeError); `min_impurity_decrease` is the documented replacement.
# NOTE(review): the two thresholds are not numerically equivalent (node
# impurity vs. weighted impurity decrease) - confirm 1e-5 is still the
# intended value.
RFR_model = sklearn.ensemble.RandomForestRegressor(
    min_impurity_decrease=1e-5,
    n_estimators=1000,
    min_samples_split=2,
    max_depth=20,
    min_samples_leaf=0.001,
    max_features=0.33,
    n_jobs=-1,
    oob_score=True,
)

# Project X onto 100 principal components before fitting the forest.
transformed_X = sklearn.decomposition.PCA(n_components=100).fit_transform(X)
RFR_model.fit(transformed_X,y)

In [None]:
# Display the out-of-bag score of the forest fitted above (available because
# the model was constructed with oob_score=True).
RFR_model.oob_score_

### Cross-Validation

In [None]:
# Sweep one hyperparameter over `schedule`, recording for every value the
# out-of-bag score and the score on the data the forest was fitted to.
n_estimators = 1000

# Label for the swept hyperparameter; the plotting cell uses it for the axis
# label and title.
hyperparameter = 'Depth'

# Active schedule (max_depth). The commented alternatives are schedules for
# other hyperparameters; using one also requires changing which keyword
# receives `value` in the constructor call below.
#schedule = [1,2,3,5,10,20,30,50,None] #depth

schedule = [1,2,3,5,10,20,]#30,50,None] #depth

#schedule = ['sqrt','auto',0.16,0.33,0.5,0.67] #max_features

#schedule = [2,5,10,20,50,100] #min_samples_split

#schedule = [1e-7,1e-6,1e-5,1e-4,1e-2,1e-1] #min_impurity_decrease (formerly min_impurity_split)

oob_scores = np.zeros(len(schedule))
inb_scores = np.zeros_like(oob_scores)

# The PCA projection does not depend on the swept value, so it is computed
# once; every schedule entry is then scored on the same compressed features.
transformed_X = sklearn.decomposition.PCA(n_components=100,whiten=False).fit_transform(X)

for schedule_idx, value in enumerate(schedule):
    # `min_impurity_split` was removed in scikit-learn 1.0 (passing it raises
    # a TypeError); `min_impurity_decrease` is the documented replacement.
    # NOTE(review): the two thresholds are not numerically equivalent -
    # confirm 1e-5 is still the intended value.
    RFR_model = sklearn.ensemble.RandomForestRegressor(
        min_impurity_decrease=1e-5,
        n_estimators=n_estimators,
        min_samples_split=2,
        max_depth=value,
        min_samples_leaf=0.001,
        max_features=0.33,
        n_jobs=-1,
        oob_score=True,
    )

    RFR_model.fit(transformed_X,y)
    oob_scores[schedule_idx] = RFR_model.oob_score_
    inb_scores[schedule_idx] = RFR_model.score(transformed_X,y)

In [None]:
# Plot the in-bag ('train') and out-of-bag ('test') scores against the swept
# hyperparameter values on a logarithmic x axis.
plot_indices = schedule
#plot_indices = list(range(len(schedule))) # for max_features only

plt.semilogx(plot_indices,inb_scores,linewidth=4,label='train')
plt.semilogx(plot_indices,oob_scores,linewidth=4,label='test')

# Horizontal reference line at 0.68, labelled as the best linear model's score.
plt.semilogx(plot_indices,[0.68]*len(plot_indices),linewidth=4,color='k',label='Best Linear Model')

#plt.xticks(plot_indices,[str(val) for val in schedule]);
plt.xlabel(hyperparameter); plt.ylabel(r'$R^2$ OOB'); plt.legend()
# A space after "Across" keeps the hyperparameter name from fusing with the
# preceding word in the rendered title.
plt.title("Performance Across " + hyperparameter + " for Compressed Data, "+ str(n_estimators)+ " Trees");