# Example of loading and processing raw EEM data

* Required meta data columns in CSV/excel
* Loading EEMs
* Process them, including blank subtraction
* Example with different metadata and no blank subtraction or Raman normalization
* Create function to show processing steps
    - How to store the order this was conducted in?


In [1]:
import pyeem

pyeem version 0.1


In [2]:
# for development - remove when example notebook is complete
def reload_pyeem():
    """Reload pyeem.data_process and then pyeem so edits to the package source
    are picked up without restarting the kernel (development aid only).
    """
    from importlib import reload

    # Submodule first, then the top-level package.
    for module in (pyeem.data_process, pyeem):
        reload(module)


reload_pyeem()

pyeem version 0.1


### Step 1 - generate a pandas dataframe with the required meta data

In [3]:
# Use a forward slash in the path: "\D" is not a valid string escape sequence, and a
# backslash separator only works on Windows. Forward slashes work on every platform.
meta_data = pyeem.load_eem_meta_data('example_data/Description_Example.xlsx')

- The columns 'File_Name' and 'Folder' are required; they point to the data files containing the EEMs to be loaded.
- 'Blank' and 'Raman_Area' are required if the `pyeem.blank_subtract` and `pyeem.raman_normalize` functions will be used.
- All other columns are optional, as many columns as needed to describe the data may be included.

In [4]:
# Display the meta data table loaded above (a bare last expression is rendered as the cell output).
meta_data

Unnamed: 0,File_Name,Folder,Blank,Raman_Area,Desc,Type
0,ZEF0119_Cig_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Cigarette 5 µg/mL,Sample
1,ZEF0132_Diesel_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Diesel 5 µg/mL,Sample
2,ZEF00134_Wood_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Wood Smoke 5 µg/mL,Sample
3,20180227_BCycHex0p5sec,20180227,20180227_BCycHex0p5sec,1146.3,2018-01-27 Cyclohexane Blank,Blank


### Step 2 - Initialize an h5 file and save the meta data in it.

In [5]:
# Create the h5 database and store the meta data in it. With overwrite=True an existing
# EEM_data.h5 is replaced (the cell output reports "overwriting EEM_data.h5").
pyeem.init_h5_database("EEM_data.h5", meta_data, overwrite=True)

overwriting EEM_data.h5


In [6]:
# Load the EEM data files found under example_data/ into the database. Per the cell output,
# this saves the datasets raw_eems, eems, raw_ex, ex, raw_em and em.
pyeem.load_eems("EEM_data.h5", 'example_data/')

EEM data collection complete, final data shape (Sample x Ex x Em): (4, 250, 151)
Dataset saved: raw_eems ... Shape =  (4, 250, 151)
Dataset saved: eems ... Shape =  (4, 250, 151)
Dataset saved: raw_ex ... Shape =  (4, 151)
Dataset saved: ex ... Shape =  (4, 151)
Dataset saved: raw_em ... Shape =  (4, 250)
Dataset saved: em ... Shape =  (4, 250)


In [7]:
# Blank subtraction step. Per the cell output, this saves 'blanks_subtracted' and
# overwrites the working 'eems' dataset with the result.
pyeem.blank_subtract("EEM_data.h5", 'example_data/')

Dataset saved: blanks_subtracted ... Shape =  (4, 250, 151)
Updating dataset: eems
Dataset saved: eems ... Shape =  (4, 250, 151)


In [8]:
# Use exactly the same file name (including letter case) as the database created above;
# on case-sensitive filesystems 'EEM_Data.h5' would refer to a different file.
pyeem.apply_cleanscan("EEM_data.h5")

Removing Scatter


100%|█████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.25s/it]


Updating dataset: scatter_removed
Dataset saved: scatter_removed ... Shape =  (4, 250, 151)
Dataset saved: excised_values ... Shape =  (4, 250, 151)
Updating dataset: eems
Dataset saved: eems ... Shape =  (4, 250, 151)


In [None]:
import numpy as np
import h5py
from tqdm import tqdm
from pyeem import cleanscan

def update_eem_database(database_name, data_dict):
    """Write each array in ``data_dict`` to the h5 file, replacing any dataset
    that already exists under the same key.

    Called after each data processing step to add or update EEM data.

    Args:
        database_name (str): filename and relative path for h5 database
        data_dict (dict): maps dataset key -> np.array of data to be saved
    Returns:
        None
    """
    with h5py.File(database_name, 'a') as h5_file:
        for name, values in data_dict.items():
            # Drop a pre-existing dataset so it can be re-created; a missing
            # key simply means there is nothing to replace.
            try:
                del h5_file[name]
            except KeyError:
                pass
            else:
                print('Updating dataset:', name)

            # The dataset is created from the shape only (no dtype is passed,
            # so h5py's default dataset dtype applies) and then filled.
            target = h5_file.create_dataset(name, values.shape, compression="gzip")
            target[:] = values
            print("Dataset saved:", name, "... Shape = ", values.shape)

def apply_cleanscan(database_name, tol='Default', coeff='Default'):
    """Apply the scatter removal function 'cleanscan' to all EEMs in the dataset.

    Args:
        database_name (str): filename for hdf5 database
        tol: passed unchanged to cleanscan (see pyeem.cleanscan documentation)
        coeff: passed unchanged to cleanscan (see pyeem.cleanscan documentation)

    Returns:
        None - scatter removal results are stored in the h5 database under the
        key 'scatter_removed'; the intermediate results showing what values
        were removed and replaced by interpolation are saved as
        'excised_values'. The 'eems' dataset is overwritten with the
        scatter-removed data.

    Raises:
        OSError: if the database file cannot be opened.
        KeyError: if the 'eems', 'ex' or 'em' datasets are missing.
        Exception: if a 'scatter_removed' dataset already exists, i.e. the
            function has already run on this database.
    """
    # Open read-only so that a missing file is reported instead of being
    # silently created, and so that a failed run leaves the database untouched
    # (no placeholder dataset that would make a later call look "already run").
    try:
        with h5py.File(database_name, 'r') as f:
            already_run = 'scatter_removed' in f
            if not already_run:
                eems = f['eems'][:]
                ex = f['ex'][:]
                em = f['em'][:]
    except OSError as err:
        raise OSError(database_name + ' not found - please run pyeem.init_h5_database first') from err
    except KeyError as err:
        raise KeyError('eem data not found - please run pyeem.load_eems first') from err

    if already_run:
        raise Exception('Cleanscan function has already run on this dataset')

    # initialize storage for final and intermediate results
    scatter_removed = np.zeros(eems.shape)
    excised_values = np.zeros(eems.shape)
    print('Removing Scatter')
    # The first axis of eems indexes the individual EEMs; ex/em are indexed the same way.
    for i in tqdm(range(eems.shape[0])):
        scatter_removed[i, :, :], excised_values[i, :, :], _ = cleanscan(
            ex[i], em[i], eems[i], tol, coeff)

    # update the database; 'eems' always holds the most recent processing result
    update_eem_database(database_name, {'scatter_removed': scatter_removed,
                                        'excised_values': excised_values,
                                        'eems': scatter_removed})




In [None]:
import h5py
# List the name of every top-level entry stored in the database.
# The file name must match the one used when the database was created (including
# letter case), otherwise the open fails on case-sensitive filesystems.
with h5py.File("EEM_data.h5", 'r') as f:
    for key in f.keys():
        print(key.ljust(20,' '))

In [None]:
# step 4 - use the data processing functions for all operations
# TO DO - create some sort of log for this process...

In [None]:
# finally - display contour plots of the processing steps 
# (for now just code this in the notebook, maybe add a function later)