In [1]:
import numpy as np
from load_dipa_water_nacl_training_set import load_training_set

### Load in a training set

_For details on how to construct your own training set, consult load_dipa_water_nacl_training_set.py as a template._

In [2]:
# Load the three mixture data sets, passing None for each of the four file paths.
# NOTE(review): presumably None makes the loader fall back to a default path -- confirm
# in load_dipa_water_nacl_training_set.py.
# NOTE(review): the saved output of this cell is
#   ValueError: could not convert string to float: 'Wavelength (nm)'
# which looks like a header row being parsed as data inside the loader -- verify
# before relying on the cells below.
water_dipa_nacl, water_dipa, water_nacl = load_training_set(filepaths=[None] * 4)

ValueError: could not convert string to float: 'Wavelength (nm)'

In [None]:
water_dipa_nacl.chem_properties #  show the `chem_properties` attribute of the water-DIPA-NaCl data set as the cell output.

In [None]:
# Bare expression: the notebook shows the data set's repr as the cell output.
water_dipa_nacl

Do a test-train split. This can be done randomly or systematically. Here, we do it systematically (non-randomly): the training set contains only the two-component DIPA-water and water-NaCl mixtures, and the three-component mixtures are held out for testing.

In [None]:
# Training pool: the two binary data sets (water-DIPA and water-NaCl) combined with `+`.
mix_train = water_dipa + water_nacl

# Test pool: ternary samples only (no two-component data). Every component is
# restricted to the range [10 ** -5, 1] so that only samples containing all
# three components are kept.
mix_test = water_dipa_nacl.filter(
    {chem: [10 ** -5, 1] for chem in ('water', 'nacl', 'dipa')}
)
# Print-outs from the lines above are currently expected: the code checks for
# samples that exist in both data sets.

In [None]:
from sklearn.metrics import mean_absolute_error

# --- Spectral range ---------------------------------------------------------
# Global bounds on the spectrum.
# NOTE(review): presumably wavelengths in nm -- confirm against the loader.
lbounds = [800, 2500]

# Number of windows to split the spectral range into. Each entry is tried in turn:
# with nwindows = [1, 10] the code would first train on the entire spectral range,
# then split the range into 10 smaller ranges and try them sequentially.
nwindows = [10]

# --- Scoring ----------------------------------------------------------------
sc = 'neg_mean_absolute_error'  # scoring string handed to the grid search
metric = mean_absolute_error    # metric function used for evaluation
metric_label = 'MAE'            # label for plotting and printouts

# --- Test-train split -------------------------------------------------------
tts_size = 0.25    # fraction of samples used for testing in the test-train split
random_state = 42  # fixed seed so the split is replicable

Create your search plan. We import `GridSearchCV`, an exhaustive search method that tries every combination of the parameters it is given and keeps the best-scoring fit.

We will use Ridge regression, a linear regression model with L2 regularization.

The only parameter we ask the program to search over is `alpha`, the regularization strength. We also tell it to score each candidate with 5-fold cross-validation.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Exhaustive search over Ridge's `alpha`, scored with `sc` under 5-fold cross-validation.
# The grid is 14 log-spaced values between 1e-7 and 1e7.
# NOTE(review): 14 points across 14 decades gives non-integer exponents; if one
# value per decade (including alpha = 1) was intended, the count should be 15 -- confirm.
ridge_search_plan = GridSearchCV(
    estimator=Ridge(),
    param_grid={'alpha': np.logspace(-7, 7, 14)},
    scoring=sc,
    cv=5,
)

Create a list of search plans. Each is typically a different kind of machine learning model.

In [None]:
models = [ridge_search_plan] # list of search plans to try; only the Ridge plan is included for now, but more can be appended.

Perform the search.

In [None]:
from mixture_composition_regression.cross_validation import cv_on_model_and_wavelength

# Run the search over every search plan in `models` and every window count in
# `nwindows`, targeting the 'water' component.
# NOTE(review): the meaning of the four returned values is defined by
# cv_on_model_and_wavelength -- consult its docstring before using them.
viable_models, best_model, y, X = cv_on_model_and_wavelength(
    mix_train,
    nwindows,
    models,
    # what to predict and what to evaluate on
    target_chem='water',
    test_data=mix_test,
    # spectral range
    l_bounds=lbounds,
    # test-train split
    tts_test_size=tts_size,
    tts_random_state=random_state,
    # scoring
    metric=metric,
    metric_label=metric_label,
    tolerance=5e-3,
    # plotting
    plot_comparison=True,
    plot_comparison_savefile='./plots/axes_train',
)

In [None]:
# Print the fourth value returned by cv_on_model_and_wavelength.
# NOTE(review): presumably the feature matrix used for the fit -- confirm.
print(X)