# Example of loading and processing raw EEM data

* Required meta data columns in CSV/excel
* Loading EEMs
* Process them, including blank subtraction
* Example with different metadata and no blank subtraction or Raman normalization
* Create function to show processing steps
    - How to store the order this was conducted in?


In [1]:
import pyeem

pyeem version 0.1


In [2]:
# for development - remove when example notebook is complete
def reload_pyeem():
    """Reload pyeem.data_process and then pyeem so source edits take effect in the running kernel."""
    from importlib import reload

    # The submodule is reloaded before the package itself.
    for module in (pyeem.data_process, pyeem):
        reload(module)
reload_pyeem()

pyeem version 0.1


### Step 1 - generate a pandas dataframe with the required meta data

In [3]:
# Forward slash keeps the path portable and avoids the invalid '\D' escape sequence in the literal.
meta_data = pyeem.load_eem_meta_data('example_data/Description_Example.xlsx')

- The columns 'File_Name' and 'Folder' are required; they point to the data files containing the EEMs to be loaded.
- 'Blank' and 'Raman_Area' are required if `pyeem.blank_subtract` and `pyeem.raman_normalize` functions will be used.
- All other columns are optional, as many columns as needed to describe the data may be included.

In [4]:
meta_data

Unnamed: 0,File_Name,Folder,Blank,Raman_Area,Desc,Type
0,ZEF0119_Cig_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Cigarette 5 µg/mL,Sample
1,ZEF0132_Diesel_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Diesel 5 µg/mL,Sample
2,ZEF00134_Wood_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Wood Smoke 5 µg/mL,Sample
3,20180227_BCycHex0p5sec,20180227,20180227_BCycHex0p5sec,1146.3,2018-01-27 Cyclohexane Blank,Blank


### Step 2 - Initialize an h5 file and save the meta data in it.

In [None]:
# Create the h5 database file and store the meta data table in it.
# NOTE(review): overwrite=True presumably replaces an existing "EEM_data.h5" - confirm against pyeem.
pyeem.init_h5_database("EEM_data.h5", meta_data, overwrite=True)

In [6]:
# Add the EEM data stored under 'example_data/' to the h5 database.
# The database must already exist: the OSError in this cell's output is what appears when it does not.
pyeem.load_eems("EEM_data.h5", 'example_data/')

OSError: EEM_data.h5 not found - please run pyeem.init_h5_database first

In [None]:
# pyeem.blank_subtract("EEM_data.h5", 'example_data/')

In [None]:
# pyeem.apply_cleanscan('EEM_Data.h5')

In [None]:
import numpy as np
import h5py
from tqdm import tqdm
from pyeem import cleanscan

def update_eem_database(database_name, data_dict):
    """Write every array in `data_dict` to the h5 file, replacing any dataset of the same name.

    Intended to be called as each data processing step completes, so the h5
    file always holds the most recent version of each dataset.

    Args:
        database_name (str): filename and relative path for h5 database
        data_dict (dict): maps dataset name -> np.array of data to be saved
    Returns:
        None

    NOTE(review): no dtype is passed to create_dataset, so h5py's default
    dtype applies rather than the array's own dtype - confirm this is intended.
    """
    with h5py.File(database_name, 'a') as h5_file:
        for name, array in data_dict.items():
            # Remove a previous dataset of the same name; a missing one is not an error.
            try:
                del h5_file[name]
            except KeyError:
                pass
            else:
                print('Updating dataset:', name)
            stored = h5_file.create_dataset(name, array.shape, compression="gzip")
            stored[:] = array
            print("Dataset saved:", name, "... Shape = ", array.shape)

def load_eems(database_name, data_dir):
    """Add EEM spectra to the h5 file created with `init_h5_database`.

    EEM data files must be tab delimited .dat files (standard export format from the Horiba Aqualog).
    The first row is excitation wavelengths and the first column is emission wavelengths.
    Each file is read from `<data_dir>/<Folder>/<File_Name>.dat`, where 'Folder' and
    'File_Name' come from the 'meta' table stored in the h5 file.

    Args:
        database_name (str): filename and relative path for hdf5 file for saving EEMs
        data_dir (str): relative path to where EEM data is stored (a trailing separator is optional)

    Returns:
        None - the EEMs are saved to the h5 file as datasets 'raw_eems' and 'eems'
        for processing with pyeem functions

    Raises:
        OSError: if the meta data cannot be read from `database_name`
        Exception: if the h5 file already contains a 'raw_eems' dataset
    """
    import os
    from pandas import read_hdf

    try:
        # read the meta data table once; it names the file and folder of every EEM
        meta = read_hdf(database_name, 'meta')
    except OSError as err:
        raise OSError(database_name + ' not found - please run pyeem.init_h5_database first') from err
    file_names = np.array(meta['File_Name'])
    folders = np.array(meta['Folder'])

    # refuse to run twice: dataset 'raw_eems' should not exist yet
    # (membership test avoids reading the whole dataset just to see if it is there)
    with h5py.File(database_name, 'r') as f:
        if 'raw_eems' in f:
            raise Exception('Load eems function has already run on this dataset')

    eem_list = []
    for folder, file_name in zip(folders, file_names):
        # os.path.join works whether or not data_dir ends with a separator
        eem_file = os.path.join(str(data_dir), str(folder), str(file_name) + '.dat')
        # first row of EEM file is excitation wavelengths, first column is emission wavelengths
        eem_list.append(np.genfromtxt(eem_file, delimiter='\t'))
    # convert data to a single np array for saving
    eem_list = np.array(eem_list)

    # rows follow the emission axis and columns the excitation axis (see file layout above)
    print('EEM data collection complete, final data shape (Sample x Em+1 x Ex+1):', eem_list.shape)

    # save data into the h5 file; 'eems' starts out as a copy of the raw data
    update_eem_database(database_name, {'raw_eems': eem_list,
                                        'eems': eem_list})


# Use the same file name (including case) as the pyeem.init_h5_database call,
# so this also works on case-sensitive file systems.
load_eems("EEM_data.h5", 'example_data/')

In [None]:
# load_eems returns nothing, so read the stored raw EEMs back from the database.
with h5py.File("EEM_data.h5", 'r') as f:
    eems_list = f['raw_eems'][:]

i = 1  # index of the sample to inspect
# first row holds the excitation wavelengths, first column the emission wavelengths
excitation  = eems_list[i,0,1:]
emission = eems_list[i,1:,0]
eem_data = eems_list[i,1:,1:]

In [None]:
# size of the selected sample's EEM with the wavelength header row and column removed
eem_data.shape

In [None]:
import h5py

# Print the name of every top-level entry in the database file.
# File name matches the case used in the pyeem.init_h5_database call.
# Draft of a fuller listing (shape and dtype per key), kept for reference:
#     print('The following data is available in', raw_data_name+'.hdf5')
#     print('-' * 60)
#     print("Key:".ljust(20,' ')) #,"Shape:".ljust(18,' '),"Type:".ljust(10,' '))
#     print('-' * 60)
#     ... str(data.shape).ljust(18,' '), str(data.dtype).ljust(10,' ')
with h5py.File("EEM_data.h5", 'r') as f:
    for key in f.keys():
        print(key.ljust(20,' '))

In [None]:
# step 4 - use the data processing functions for all operations
# TO DO - create some sort of log for this process...

In [None]:
# finally - display contour plots of the processing steps 
# (for now just code this in the notebook, maybe add a function later)