# Example of loading and processing raw EEM data

* Required meta data columns in CSV/excel
* Loading EEMs
* Process them, including blank subtraction
* Example with different metadata and no blank subtraction or Raman normalization
* Create function to show porcessing steps
    - How to store the order this was conducted in?


In [1]:
import sys
sys.path.append('..')
import pyeem

In [2]:
# step 1 - generate a pandas dataframe with the required meta data

# the return of this function should be a pandas dataframe that will be saved to H5 with the EEM datasets
# I'll make a function for importing it form the spreadsheet template I'm using,
# but also make it work with any pandas data frame

def load_eem_meta_data(excel_file = "EEMs.xls"):
    """Read EEM meta data into a pandas dataframe from excel template provided in the pyeem examples folder:
    
    Args:
        excel_file (str): relative path and file name of meta data excel file
    
    Returns:
        meta_data (pandas DataFrame): meta data in a pandas data frame
    """
    import pandas as pd
    meta_data = pd.read_excel(excel_file, sheet_name='Sample', skiprows=1)
    meta_data = meta_data.drop(columns='Index')
    return meta_data

meta_data = load_eem_meta_data('example_data\Description_Example.xlsx')

In [3]:
meta_data

Unnamed: 0,File_Name,Folder,Blank,Raman_Area,Desc,Type
0,ZEF0119_Cig_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Cigarette 5 µg/mL,Sample
1,ZEF0132_Diesel_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Diesel 5 µg/mL,Sample
2,ZEF00134_Wood_5ugmL,20180227,20180227_BCycHex0p5sec,1146.3,Wood Smoke 5 µg/mL,Sample
3,20180227_BCycHex0p5sec,20180227,20180227_BCycHex0p5sec,1146.3,2018-01-27 Cyclohexane Blank,Blank


In [4]:
def initialize_hdf5_database(database_name = "EEM_data.h5", df = meta_data):
    """Description:
    
    Args:
        database_name (str): filename for hdf5 database
        df (pandas DataFrame): DataFrame containing eem meta data from 'pyeem.load_eem_meta_data' 
        function or created manually - see examples for required columns 
    Returns:
        no retun - data is saved as h5 and may be loaded using 'pyeem.load_eem_data'
    """
    import numpy as np
    from pandas import HDFStore
    
    # create an hdf5 file to store EEM meta data
    hdf = HDFStore(database_name)
    hdf.put('meta', df, format='table', data_columns=True)
    hdf.close()
    return

initialize_hdf5_database(database_name = "EEM_data.h5", df = meta_data)

In [5]:
from pandas import read_hdf
df2 = read_hdf('EEM_data.h5', 'meta')['File_Name']

In [6]:
import numpy as np
df2 = np.array(df2)
df2

array(['ZEF0119_Cig_5ugmL', 'ZEF0132_Diesel_5ugmL', 'ZEF00134_Wood_5ugmL',
       '20180227_BCycHex0p5sec'], dtype=object)

In [8]:
# step 3 - load raw EEMs into H5 (should be almost the same as smoogle)

def meta_data_saver(database_name, data_dir):
    """Add eem spectra to the H5 file created with `initialize_hdf5_database`
    Args:
        database_name (str): filename for hdf5 database (including relative path to file)
        data_directory (str): relative path to where data is stored
         
    Returns:
        no retun - data is saved as h5 and may be loaded using 'pyeem.load_eem_data'
    
    """
    import h5py
    import numpy as np
    from pandas import read_hdf
    
    #load EEM file names from the saved metadata as np.array
    file_names = np.array(read_hdf(database_name, 'meta')['File_Name'])
    folders = np.array(read_hdf(database_name, 'meta')['Folder'])
                          
    #initialize lists to store data
    eem_list = []
    names_list = []
    excitation_list = []
    emission_list = []
    
    for i, (folder, file) in enumerate(zip(folders, file_names)):
        eem_file = str(data_dir) + str(folder) + '/' + str(file) + '.dat'
        # first row of EEM file is excitation wavelengths, skip when reading in file
        eem = np.genfromtxt(eem_file, delimiter = '\t', skip_header=1)
        # emisson wavelengths stored in first column, store then remove
        emission = eem[:,0]
        eem = eem[:,1:]
        # load the excitaion wavelenths 
        excitation = np.genfromtxt(eem_file, delimiter = '\t', skip_header=0)[0,1:]
        eem_list.append(eem)
        excitation_list.append(excitation)
        emission_list.append(emission)
    # convert data to np arrays for saving
    eem_list = np.array(eem_list)
    excitation_list = np.array(excitation_list)
    emission_list = np.array(emission_list)
    print('Data collection complete, final data shape (Sample x Ex x Em):',eem_list.shape)
    print(excitation_list.shape, emission_list.shape)    

    # save data into the h5 file
    with h5py.File(database_name, "w") as f:
        dset = f.create_dataset("Raw Data", eem_list.shape, compression="gzip")
        dset[:] = eem_list
        dset2 = f.create_dataset("Excitation", excitation_list.shape, compression="gzip")
        dset2[:] = excitation_list
        dset3 = f.create_dataset("Emission", emission_list.shape, compression="gzip")
        dset3[:] = emission_list

    return
                          
meta_data_saver("EEM_data.h5", 'example_data/')

Data collection complete, final data shape (Sample x Ex x Em): (4, 250, 151)
(4, 151) (4, 250)


In [29]:
import h5py
with h5py.File("EEM_Data.h5", 'r') as f:
    
#     print('The following data is available in', raw_data_name+'.hdf5')
#     print('-' * 60)
#     print("Key:".ljust(20,' ')) #,"Shape:".ljust(18,' '),"Type:".ljust(10,' '))
#     print('-' * 60)
    for key in f.keys():
        data = f[key]
        print(key.ljust(20,' '))#, str(data.shape).ljust(18,' '), str(data.dtype).ljust(10,' '))

Emission            
Excitation          
Raw Data            


In [5]:
# step 4 - use the data processing fucntions for all operations
# TO DO - create some sort of log for this porcess...

In [6]:
# finally - display contour plots of the processing steps 
# (for now just code this in the notebook, maybe add a finction later)