Note: when using MKL set OMP_NUM_THREADS=1.

In [1]:
import glob
import itertools
import logging
from operator import attrgetter
import os
import pprint
import sys
import time

import matplotlib.pyplot as plt
%matplotlib inline
import scipy.misc

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist

import sklearn.utils

import mrfitty
from mrfitty.base import (
    AdaptiveEnergyRangeBuilder,
    InterpolatedSpectrumSet,
    ReferenceSpectrum,
)
from mrfitty.best_subset_selection import BestSubsetSelectionFitTask
from mrfitty.linear_model import NonNegativeLinearRegression

logging.basicConfig(level=logging.WARN)

In [2]:
src_path, _ = os.path.split(mrfitty.__path__[0])
sample_data_dir_path = os.path.join(src_path, 'example', 'arsenic')
print('sample data is installed at "{}"'.format(sample_data_dir_path))
os.path.exists(sample_data_dir_path)

sample data is installed at "/home/jklynch/host/project/mr-fitty/src/example/arsenic"


True

Read all sample reference and unknown spectra.

In [3]:
sample_data_reference_glob = os.path.join(sample_data_dir_path, 'reference/*als_cal*.e')
print('sample data reference glob: {}'.format(sample_data_reference_glob))
sample_data_unknown_glob = os.path.join(sample_data_dir_path, 'unknown/*.e')
print('sample data unknown glob: {}'.format(sample_data_unknown_glob))

sample data reference glob: /home/jklynch/host/project/mr-fitty/src/example/arsenic/reference/*als_cal*.e
sample data unknown glob: /home/jklynch/host/project/mr-fitty/src/example/arsenic/unknown/*.e


In [4]:
sample_data_reference_set, _ = list(ReferenceSpectrum.read_all([sample_data_reference_glob]))
sample_data_reference_list = list(sample_data_reference_set)
print('sample data reference file count: {}'.format(len(sample_data_reference_list)))
sample_data_unknown_set, _ = list(ReferenceSpectrum.read_all([sample_data_unknown_glob]))
sample_data_unknown_list = list(sample_data_unknown_set)
print('sample data unknown file count: {}'.format(len(sample_data_unknown_list)))

sample data reference file count: 20
sample data unknown file count: 16


What are the maximum and minimum reference energies?

In [5]:
reference_min_energy = np.max([r.data_df.index.values[0] for r in sample_data_reference_list])
reference_max_energy = np.min([r.data_df.index.values[-1] for r in sample_data_reference_list])
print('reference minimum energy: {:5.2f}'.format(reference_min_energy))
print('reference maximum energy: {:5.2f}'.format(reference_max_energy))

reference minimum energy: 11830.00
reference maximum energy: 12097.14


What are the maximum and minimum unknown spectrum energies?

In [6]:
min_energy = np.max([r.data_df.index.values[0] for r in sample_data_unknown_list])
max_energy = np.min([r.data_df.index.values[-1] for r in sample_data_unknown_list])
print('minimum energy: {:5.2f}'.format(min_energy))
print('maximum energy: {:5.2f}'.format(max_energy))

minimum energy: 11777.73
maximum energy: 12171.83


Plot one fit with training and testing points marked.

In [12]:
unknown_0 = sample_data_unknown_list[0]

fitter = BestSubsetSelectionFitTask(
    ls=NonNegativeLinearRegression,
    reference_spectrum_list=sample_data_reference_list[0:3],
    unknown_spectrum_list=(unknown_0, ),
    energy_range_builder=AdaptiveEnergyRangeBuilder(),
    best_fits_plot_limit=0,
    component_count_range=(1,3+1)
)

fit_0, _ = fitter.fit(unknown_spectrum=sample_data_unknown_list[0])

for i, (predicted_b, train_index, test_index) in enumerate(fitter.fit_and_predict(fit_0)):
    #train_test_ndx = np.random.choice([0, 1], unknown_0.data_df.shape[0], p=[0.2, 0.8])
    #predicted_b, train_index, test_index = fitter.fit_and_predict(fit_0)
    print(unknown_0.data_df.shape[0])
    train_test_ndx = np.zeros_like(unknown_0.data_df.shape[0])
    print(train_test_ndx)
    train_test_ndx[test_index] = 1
    f, ax = plt.subplots()
    ax.scatter(unknown_0.data_df.index, unknown_0.data_df.norm, c=train_test_ndx, marker='.')
    plt.show()

224
0


IndexError: too many indices for array