In [6]:
import pickle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import griddata

In [7]:
def read_range_files(filenames:list):
    ranges = []
    
    for f in filenames:
        with open(f, 'r') as file:
            ranges.append(file.readlines())
        
    ranges = [float(i.strip()) for row in ranges for i in row]
    ranges = [[ranges[idx], ranges[idx+1]] for idx, i in enumerate(ranges) if idx % 2 == 0]
    
    return ranges


def read_predictor_files(filenames:list):
    
    predictors = []
    for f in predictor_files:
        with open(f, 'rb') as file:
            predictors.append(pickle.load(file))
    return predictors

def predict_on_test_csvs(fpath, bestmodel_container, regressand, target, sample_name=None, print_sample=False, printres=False):
    new_data = pd.read_csv(fpath, 
#                        names=['wavenumber', 'absorbance'], 
                       header=0, # it was reading in the first row as data and causing problems. So I just had it read the column namese from the first row
                       dtype='float')  
    
    # renamed for less typing, but you can absolutely get rid of these column names and just rename to your preference
    new_data = new_data.rename(columns={new_data.columns[0]:'x', new_data.columns[1]:'y'}) 

    # because if you look higher in the code, we are currently regressing on the derivative of the data
    # so I calculated the derivative here
    if regressand == 'a':
        new_data[regressand] = new_data['y']
    if regressand == 'da':
        new_data[regressand] = new_data['y'].diff() 
    elif regressand == 'd2a':
        new_data[regressand] = new_data['y'].diff(order=2)


    # get the wavelength window we care about and slice the data, only keeping that stuff
    window = bestmodel_container[1] 
    new_data = new_data.where(new_data['x'] > window[0]).where(new_data['x'] < window[1]).dropna()
    new_data_dy = np.array(new_data[regressand]).reshape(1,-1)

    predictor = bestmodel_container[0]
    prediction = predictor.predict(new_data_dy)
    
    # discard unwanted nested lists
    for i in prediction.shape:
        prediction = prediction[0] 
        
    if print_sample is True:
        print('Sample: {}'.format(sample_name))
        
    if printres:
        print("predicted composition {} {:1.3f}".format(target, prediction))
    return prediction


def regrid_ir_spectrum(data: pd.DataFrame, xgrid: np.array, drop_original=True):
    # assuming the data has only absorption data in a column called 'a' and wavenumbers in a column called 'x'

    # do the regridding and add the new x axis as a column
    data['a_interp'] = griddata(data['x'], data['a'], xgrid)
    data['x_interp'] = xgrid

    # store the old data with the tag 'original' and remove th 'interp' tag from the new data 
    data = data.rename(columns = {'a':'a_original','x':'x_original'})
    data = data.rename(columns = {'a_interp': 'a', 'x_interp': 'x'})

    # do you want the old data? I don't recommend keeping it for this purpose, but I gave you the option.
    if drop_original: 
        data = data.drop(['a_original', 'x_original'])
    else:
        pass

    return data

In [8]:
predictor_files = ['cellulose_predictor.pkl', 'hemi_predictor.pkl', 'lignin_predictor.pkl']
predictor_files = ['./trained_models/' + p for p in predictor_files]
predictor_metadata_files = [p.split('.pkl')[0] + '_meta.txt' for p in predictor_files] 

ranges = read_range_files(predictor_metadata_files)
predictors = read_predictor_files(predictor_files)
containers = [[j,i] for i, j in zip(ranges, predictors)]
c_container = containers[0]
h_container = containers[1]
l_container = containers[2]

In [10]:
temp_paths = [j for j in os.listdir() if ('-' in j and ('.csv' in j or '.CSV' in j))]
sample_names = [j.split('.')[0] for j in temp_paths]
for f, n in zip(temp_paths, sample_names):
    temp_paths = regrid_ir_spectrum(temp_paths, x, True)

NameError: name 'x' is not defined

In [5]:
fpaths = [i for i in os.listdir() if ('-' in i and ('.csv' in i or '.CSV' in i))]
c_list, h_list, l_list = [], [], []
sample_names = [i.split('.')[0] for i in fpaths]
regressand = 'da'
for f, n in zip(fpaths, sample_names):
    l = predict_on_test_csvs(f, l_container, regressand, 'lignin', sample_name=n, printres=True, print_sample=True)
    l_list.append(l)
    c = predict_on_test_csvs(f, c_container, regressand, 'cellulose',sample_name=n, printres=True)
    c_list.append(c)
    h = predict_on_test_csvs(f, h_container, regressand, 'hemicellulose',sample_name=n, printres=True)
    h_list.append(h)
    # h_list.append(1 - c - l)


NameError: name 'os' is not defined