## Preprocess all datasets, combine them, and save the results to files

In [1]:
# Imports: polars plus the xpectrass FTIR processing and dataset-loading helpers
import polars as pl
from xpectrass_v002 import FTIRdataprocessing
from xpectrass_v002 import load_all_datasets, get_data_info
# Load all datasets (a mapping keyed by study name) and the per-dataset file info.
dataset = load_all_datasets()
info = get_data_info()

# Column passed as `label_column` to every FTIRdataprocessing instance below.
LABEL_COLUMN = "type"

# Wavenumber windows (cm-1) passed to FTIRdataprocessing as `flat_windows`.
# NOTE(review): presumably signal-free reference windows — confirm against the library docs.
FLAT_WINDOWS = [(1880, 1900), (2400, 2700)]

# Pre-processing methods shared by every dataset that is processed in this notebook.
DENOISING_METHOD = 'wavelet'
BASELINE_CORRECTION_METHOD = 'aspls'

# Define regions (wavenumber ranges in cm-1); commented-out tuples are disabled alternatives.
EXCLUDE_REGIONS = [
    (0, 679),       # Exclude everything below 680 cm-1 (CO₂ bending mode, 670 cm-1)
    # (1350, 1450),   # Exclude H2O bend region
    # (1250, 1900),  # Exclude H2O bend region
    # (2300, 2400),   # Exclude CO2 stretch region, 2350 cm-1
    (3001, 5000)    # Exclude everything above 3000 cm-1 (includes the O–H stretch region)
    ]
INTERPOLATE_REGIONS = [
    (1250, 2700)    # Covers both the H2O bend and CO2 stretch ranges listed (disabled) above
    ]

# NOTE(review): "zero" presumably fills the interpolate regions with zeros — confirm with the library.
INTERPOLATE_METHOD = "zero"

NORMALIZATION_METHOD = "spectral_moments"

# Show the dataset file info as the cell output.
info

UMAP not installed. Run: pip install umap-learn
SHAP not installed. Run: pip install shap


{'jung_2018': {'exists': True,
  'path': '/Users/julhashkazi/Documents/PythonScripts/FTIR/scripts/xpectrass_app/xpectrass_v002/data/jung_2018.csv.xz',
  'filename': 'jung_2018.csv.xz',
  'size_mb': 1.5899505615234375},
 'kedzierski_2019': {'exists': True,
  'path': '/Users/julhashkazi/Documents/PythonScripts/FTIR/scripts/xpectrass_app/xpectrass_v002/data/kedzierski_2019.csv.xz',
  'filename': 'kedzierski_2019.csv.xz',
  'size_mb': 7.486408233642578},
 'kedzierski_2019_u': {'exists': True,
  'path': '/Users/julhashkazi/Documents/PythonScripts/FTIR/scripts/xpectrass_app/xpectrass_v002/data/kedzierski_2019_u.csv.xz',
  'filename': 'kedzierski_2019_u.csv.xz',
  'size_mb': 10.197105407714844},
 'frond_2021': {'exists': True,
  'path': '/Users/julhashkazi/Documents/PythonScripts/FTIR/scripts/xpectrass_app/xpectrass_v002/data/frond_2021.csv.xz',
  'filename': 'frond_2021.csv.xz',
  'size_mb': 1.553314208984375},
 'villegas_camacho_2024_c4': {'exists': True,
  'path': '/Users/julhashkazi/Docum

In [2]:
# Bind each study's frame to its own variable; a missing key raises KeyError.
(
    jung_2018,
    kedzierski_2019,
    kedzierski_2019_u,
    frond_2021,
    villegas_camacho_2024_c4,
    villegas_camacho_2024_c8,
) = (
    dataset[study_key]
    for study_key in (
        'jung_2018',
        'kedzierski_2019',
        'kedzierski_2019_u',
        'frond_2021',
        'villegas_camacho_2024_c4',
        'villegas_camacho_2024_c8',
    )
)

# Report the (rows, columns) shape of every raw frame, one line per study.
for shape_label, raw_frame in (
    ('jung_2018: ', jung_2018),
    ('kedzierski_2019: ', kedzierski_2019),
    ('kedzierski_2019_u: ', kedzierski_2019_u),
    ('frond_2021: ', frond_2021),
    ('villegas_camacho_2024_c4: ', villegas_camacho_2024_c4),
    ('villegas_camacho_2024_c8: ', villegas_camacho_2024_c8),
):
    print(shape_label, raw_frame.shape)

jung_2018:  (800, 3556)
kedzierski_2019:  (970, 1767)
kedzierski_2019_u:  (4064, 1768)
frond_2021:  (380, 1874)
villegas_camacho_2024_c4:  (3000, 3741)
villegas_camacho_2024_c8:  (3000, 1874)


In [3]:
# Processor for the jung_2018 spectra, configured with the shared region settings.
fdp1 = FTIRdataprocessing(
    df=jung_2018,
    label_column=LABEL_COLUMN,
    exclude_regions=EXCLUDE_REGIONS,
    interpolate_regions=INTERPOLATE_REGIONS,
    flat_windows=FLAT_WINDOWS,
)

# Pre-process with the configured denoising, baseline-correction,
# interpolation and normalization methods (plots disabled).
jung_2018_corr = fdp1._get_normalized_data(
    denoising_method=DENOISING_METHOD,
    baseline_correction_method=BASELINE_CORRECTION_METHOD,
    interpolate_method=INTERPOLATE_METHOD,
    normalization_method=NORMALIZATION_METHOD,
    plot=False,
)

# First- and second-order derivatives of the normalized spectra, computed
# in that order with identical settings.
jung_2018_deriv1, jung_2018_deriv2 = (
    fdp1.derivatives(
        data=jung_2018_corr,
        order=deriv_order,
        window_length=15,
        polyorder=3,
        delta=1.0,
        plot=False,
        save_plot=False,
        save_path=None,
    )
    for deriv_order in (1, 2)
)

Auto-detected: Transmittance → Converting to Absorbance


Denoising (wavelet): 100%|██████████| 800/800 [00:00<00:00, 12104.51it/s]
Baseline correction (aspls): 100%|██████████| 800/800 [00:18<00:00, 42.56it/s]
Processing Regions: 100%|██████████| 800/800 [00:00<00:00, 10489.04it/s]
Normalization (spectral_moments): 100%|██████████| 800/800 [00:00<00:00, 78855.12it/s]


Computing 1st derivative for 800 samples...
Computing 2nd derivative for 800 samples...


In [4]:
# Processor for the kedzierski_2019 spectra, configured with the shared region settings.
fdp2 = FTIRdataprocessing(
    df=kedzierski_2019,
    label_column=LABEL_COLUMN,
    exclude_regions=EXCLUDE_REGIONS,
    interpolate_regions=INTERPOLATE_REGIONS,
    flat_windows=FLAT_WINDOWS,
)

# No denoising/baseline/normalization step is run for this dataset; the frame
# is copied unchanged so later in-place edits leave `kedzierski_2019` intact.
# NOTE(review): presumably this dataset is already pre-processed — confirm.
kedzierski_2019_corr = kedzierski_2019.copy()

# First- and second-order derivatives, computed in that order with identical settings.
kedzierski_2019_deriv1, kedzierski_2019_deriv2 = (
    fdp2.derivatives(
        data=kedzierski_2019_corr,
        order=deriv_order,
        window_length=15,
        polyorder=3,
        delta=1.0,
        plot=False,
        save_plot=False,
        save_path=None,
    )
    for deriv_order in (1, 2)
)

Computing 1st derivative for 970 samples...
Computing 2nd derivative for 970 samples...


In [5]:
# Processor for the kedzierski_2019_u spectra. It is built from
# `kedzierski_2019_u` (not `kedzierski_2019`) so the processor's own frame
# matches the data passed to `derivatives` below.
fdp3 = FTIRdataprocessing(df = kedzierski_2019_u,
            label_column = LABEL_COLUMN,
            exclude_regions = EXCLUDE_REGIONS,
            interpolate_regions = INTERPOLATE_REGIONS,
            flat_windows = FLAT_WINDOWS
            )

# No denoising/baseline/normalization step is run for this dataset; the frame
# is copied unchanged so later in-place edits leave `kedzierski_2019_u` intact.
# NOTE(review): presumably this dataset is already pre-processed — confirm.
kedzierski_2019_u_corr = kedzierski_2019_u.copy()

# First-order derivative of every spectrum.
kedzierski_2019_u_deriv1 = fdp3.derivatives(
        data = kedzierski_2019_u_corr,
        order = 1,
        window_length = 15,
        polyorder = 3,
        delta = 1.0,
        plot = False,
        save_plot = False,
        save_path = None,
    )

# Second-order derivative with the same settings.
kedzierski_2019_u_deriv2 = fdp3.derivatives(
        data = kedzierski_2019_u_corr,
        order = 2,
        window_length = 15,
        polyorder = 3,
        delta = 1.0,
        plot = False,
        save_plot = False,
        save_path = None,
    )

Computing 1st derivative for 4064 samples...
Computing 2nd derivative for 4064 samples...


In [6]:
# Processor for the frond_2021 spectra, configured with the shared region settings.
fdp4 = FTIRdataprocessing(
    df=frond_2021,
    label_column=LABEL_COLUMN,
    exclude_regions=EXCLUDE_REGIONS,
    interpolate_regions=INTERPOLATE_REGIONS,
    flat_windows=FLAT_WINDOWS,
)

# Pre-process with the configured denoising, baseline-correction,
# interpolation and normalization methods (plots disabled).
frond_2021_corr = fdp4._get_normalized_data(
    denoising_method=DENOISING_METHOD,
    baseline_correction_method=BASELINE_CORRECTION_METHOD,
    interpolate_method=INTERPOLATE_METHOD,
    normalization_method=NORMALIZATION_METHOD,
    plot=False,
)

# First- and second-order derivatives of the normalized spectra, computed
# in that order with identical settings.
frond_2021_deriv1, frond_2021_deriv2 = (
    fdp4.derivatives(
        data=frond_2021_corr,
        order=deriv_order,
        window_length=15,
        polyorder=3,
        delta=1.0,
        plot=False,
        save_plot=False,
        save_path=None,
    )
    for deriv_order in (1, 2)
)

Found 0 negative and 54720 zero transmittance values. These are physically invalid and will be clipped to 0.01% for conversion. This indicates data quality issues in the input.


Auto-detected: Transmittance → Converting to Absorbance


Denoising (wavelet): 100%|██████████| 380/380 [00:00<00:00, 10997.28it/s]
Baseline correction (aspls): 100%|██████████| 380/380 [00:02<00:00, 161.39it/s]
Processing Regions: 100%|██████████| 380/380 [00:00<00:00, 15938.36it/s]
Normalization (spectral_moments): 100%|██████████| 380/380 [00:00<00:00, 88931.79it/s]

Computing 1st derivative for 380 samples...
Computing 2nd derivative for 380 samples...





In [7]:
# Processor for the villegas_camacho_2024_c4 spectra, configured with the shared region settings.
fdp5 = FTIRdataprocessing(
    df=villegas_camacho_2024_c4,
    label_column=LABEL_COLUMN,
    exclude_regions=EXCLUDE_REGIONS,
    interpolate_regions=INTERPOLATE_REGIONS,
    flat_windows=FLAT_WINDOWS,
)

# Pre-process with the configured denoising, baseline-correction,
# interpolation and normalization methods (plots disabled).
villegas_camacho_2024_c4_corr = fdp5._get_normalized_data(
    denoising_method=DENOISING_METHOD,
    baseline_correction_method=BASELINE_CORRECTION_METHOD,
    interpolate_method=INTERPOLATE_METHOD,
    normalization_method=NORMALIZATION_METHOD,
    plot=False,
)

# First- and second-order derivatives of the normalized spectra, computed
# in that order with identical settings.
villegas_camacho_2024_c4_deriv1, villegas_camacho_2024_c4_deriv2 = (
    fdp5.derivatives(
        data=villegas_camacho_2024_c4_corr,
        order=deriv_order,
        window_length=15,
        polyorder=3,
        delta=1.0,
        plot=False,
        save_plot=False,
        save_path=None,
    )
    for deriv_order in (1, 2)
)

Auto-detected: Transmittance → Converting to Absorbance


Denoising (wavelet): 100%|██████████| 3000/3000 [00:00<00:00, 10779.11it/s]
Baseline correction (aspls): 100%|██████████| 3000/3000 [01:17<00:00, 38.50it/s]
Processing Regions: 100%|██████████| 3000/3000 [00:00<00:00, 9121.03it/s]
Normalization (spectral_moments): 100%|██████████| 3000/3000 [00:00<00:00, 72327.23it/s]


Computing 1st derivative for 3000 samples...
Computing 2nd derivative for 3000 samples...


In [8]:
# Processor for the villegas_camacho_2024_c8 spectra, configured with the shared region settings.
fdp6 = FTIRdataprocessing(
    df=villegas_camacho_2024_c8,
    label_column=LABEL_COLUMN,
    exclude_regions=EXCLUDE_REGIONS,
    interpolate_regions=INTERPOLATE_REGIONS,
    flat_windows=FLAT_WINDOWS,
)

# Pre-process with the configured denoising, baseline-correction,
# interpolation and normalization methods (plots disabled).
villegas_camacho_2024_c8_corr = fdp6._get_normalized_data(
    denoising_method=DENOISING_METHOD,
    baseline_correction_method=BASELINE_CORRECTION_METHOD,
    interpolate_method=INTERPOLATE_METHOD,
    normalization_method=NORMALIZATION_METHOD,
    plot=False,
)

# First- and second-order derivatives of the normalized spectra, computed
# in that order with identical settings.
villegas_camacho_2024_c8_deriv1, villegas_camacho_2024_c8_deriv2 = (
    fdp6.derivatives(
        data=villegas_camacho_2024_c8_corr,
        order=deriv_order,
        window_length=15,
        polyorder=3,
        delta=1.0,
        plot=False,
        save_plot=False,
        save_path=None,
    )
    for deriv_order in (1, 2)
)

Auto-detected: Transmittance → Converting to Absorbance


Denoising (wavelet): 100%|██████████| 3000/3000 [00:00<00:00, 13857.77it/s]
Baseline correction (aspls): 100%|██████████| 3000/3000 [00:40<00:00, 74.48it/s]
Processing Regions: 100%|██████████| 3000/3000 [00:00<00:00, 16037.81it/s]
Normalization (spectral_moments): 100%|██████████| 3000/3000 [00:00<00:00, 96608.82it/s]


Computing 1st derivative for 3000 samples...
Computing 2nd derivative for 3000 samples...


In [9]:
# Remove the 'study' column from every corrected frame, in place.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
for _corr_frame in (
    jung_2018_corr,
    kedzierski_2019_corr,
    kedzierski_2019_u_corr,
    frond_2021_corr,
    villegas_camacho_2024_c4_corr,
    villegas_camacho_2024_c8_corr,
):
    _corr_frame.drop(columns=['study'], inplace=True, errors='ignore')

In [10]:
# Report the (rows, columns) shape of every corrected frame, one line per study.
for shape_label, corr_frame in (
    ('jung_2018: ', jung_2018_corr),
    ('kedzierski_2019: ', kedzierski_2019_corr),
    ('kedzierski_2019_u: ', kedzierski_2019_u_corr),
    ('frond_2021: ', frond_2021_corr),
    ('villegas_camacho_2024_c4: ', villegas_camacho_2024_c4_corr),
    ('villegas_camacho_2024_c8: ', villegas_camacho_2024_c8_corr),
):
    print(shape_label, corr_frame.shape)

jung_2018:  (800, 2325)
kedzierski_2019:  (970, 1766)
kedzierski_2019_u:  (4064, 1767)
frond_2021:  (380, 1208)
villegas_camacho_2024_c4:  (3000, 2412)
villegas_camacho_2024_c8:  (3000, 1208)


In [11]:
# Remove the 'study' column from every first-derivative frame, in place.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
for _deriv1_frame in (
    jung_2018_deriv1,
    kedzierski_2019_deriv1,
    kedzierski_2019_u_deriv1,
    frond_2021_deriv1,
    villegas_camacho_2024_c4_deriv1,
    villegas_camacho_2024_c8_deriv1,
):
    _deriv1_frame.drop(columns=['study'], inplace=True, errors='ignore')

In [12]:
# Report the (rows, columns) shape of every first-derivative frame, one line per study.
for shape_label, deriv1_frame in (
    ('jung_2018: ', jung_2018_deriv1),
    ('kedzierski_2019: ', kedzierski_2019_deriv1),
    ('kedzierski_2019_u: ', kedzierski_2019_u_deriv1),
    ('frond_2021: ', frond_2021_deriv1),
    ('villegas_camacho_2024_c4: ', villegas_camacho_2024_c4_deriv1),
    ('villegas_camacho_2024_c8: ', villegas_camacho_2024_c8_deriv1),
):
    print(shape_label, deriv1_frame.shape)

jung_2018:  (800, 2325)
kedzierski_2019:  (970, 1766)
kedzierski_2019_u:  (4064, 1767)
frond_2021:  (380, 1208)
villegas_camacho_2024_c4:  (3000, 2412)
villegas_camacho_2024_c8:  (3000, 1208)


In [13]:
# Remove the 'study' column from every second-derivative frame, in place.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
for _deriv2_frame in (
    jung_2018_deriv2,
    kedzierski_2019_deriv2,
    kedzierski_2019_u_deriv2,
    frond_2021_deriv2,
    villegas_camacho_2024_c4_deriv2,
    villegas_camacho_2024_c8_deriv2,
):
    _deriv2_frame.drop(columns=['study'], inplace=True, errors='ignore')

In [14]:
# Report the (rows, columns) shape of every second-derivative frame, one line per study.
for shape_label, deriv2_frame in (
    ('jung_2018: ', jung_2018_deriv2),
    ('kedzierski_2019: ', kedzierski_2019_deriv2),
    ('kedzierski_2019_u: ', kedzierski_2019_u_deriv2),
    ('frond_2021: ', frond_2021_deriv2),
    ('villegas_camacho_2024_c4: ', villegas_camacho_2024_c4_deriv2),
    ('villegas_camacho_2024_c8: ', villegas_camacho_2024_c8_deriv2),
):
    print(shape_label, deriv2_frame.shape)

jung_2018:  (800, 2325)
kedzierski_2019:  (970, 1766)
kedzierski_2019_u:  (4064, 1767)
frond_2021:  (380, 1208)
villegas_camacho_2024_c4:  (3000, 2412)
villegas_camacho_2024_c8:  (3000, 1208)


In [15]:
# NOTE(review): consider moving this import up to the notebook's import cell.
from xpectrass_v002 import combine_datasets

# Resample the six corrected frames onto one 680-3000 cm-1 grid (2 cm-1 step,
# descending, PCHIP) and combine them; `study_names` follows the order of `datasets`.
# NOTE(review): `add_study_column` receives a list of column names while
# `exclude_columns` is None — confirm against the combine_datasets signature
# that these two arguments are not swapped.
combined_norm_data, _ = combine_datasets(
    datasets = [jung_2018_corr, kedzierski_2019_corr, kedzierski_2019_u_corr,
                frond_2021_corr, villegas_camacho_2024_c4_corr,
                villegas_camacho_2024_c8_corr],
    wn_min = 680,
    wn_max = 3000,
    resolution = 2.0,
    descending = True,
    method = "pchip",
    label_column = LABEL_COLUMN,  # same "type" column used by every processor above
    exclude_columns = None,
    add_study_column = ['sample_id', 'environmental', 'resolution'],
    study_names = ['jung_2018', 'kedzierski_2019', 'kedzierski_2019_u',
                'frond_2021', 'villegas_camacho_2024_c4',
                'villegas_camacho_2024_c8'],
    show_progress = True,
    n_jobs = 12,
    data_mode="normalized"
)

# Persist as xz-compressed CSV without the row index. `index=False` is the
# documented boolean; the previous `index=None` only worked because None is falsy.
combined_norm_data.to_csv('combined_norm_data.csv.xz', compression='xz', index=False)




DATASET COVERAGE ANALYSIS
Target grid: 680.0 - 3000.0 cm⁻¹ (2320.0 cm⁻¹ range)
Grid mode: intersection
----------------------------------------------------------------------
  jung_2018: 800 samples, range 680.0-3000.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019: 970 samples, range 599.8-3996.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019_u: 4064 samples, range 599.8-3997.9 cm⁻¹, coverage: ✓ FULL
  frond_2021: 380 samples, range 680.8-3000.8 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c4: 3000 samples, range 679.8-3000.7 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c8: 3000 samples, range 680.7-3000.7 cm⁻¹, coverage: ✓ FULL
----------------------------------------------------------------------
Total: 12214 samples, 12214 with full coverage (100.0%)



Resampling (pchip):   0%|          | 0/800 [00:00<?, ?it/s]

UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
UMAP not installed. Run: pip install umap-learn
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP not installed. Run: pip install shap
SHAP



Resampling (pchip):   0%|          | 0/970 [00:00<?, ?it/s]



Resampling (pchip):   0%|          | 0/4064 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/380 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

In [16]:
# Resample the six first-derivative frames onto one 680-3000 cm-1 grid (2 cm-1
# step, descending, PCHIP) and combine them; `study_names` follows the order of `datasets`.
# NOTE(review): `data_mode="normalized"` is passed although the inputs are
# derivative spectra — confirm this is the intended mode.
# NOTE(review): `add_study_column` receives a list of column names while
# `exclude_columns` is None — confirm the two arguments are not swapped.
combined_deriv1_data, _ = combine_datasets(
    datasets = [jung_2018_deriv1, kedzierski_2019_deriv1, kedzierski_2019_u_deriv1,
                frond_2021_deriv1, villegas_camacho_2024_c4_deriv1,
                villegas_camacho_2024_c8_deriv1],
    wn_min = 680,
    wn_max = 3000,
    resolution = 2.0,
    descending = True,
    method = "pchip",
    label_column = LABEL_COLUMN,  # same "type" column used by every processor above
    exclude_columns = None,
    add_study_column = ['sample_id', 'environmental', 'resolution'],
    study_names = ['jung_2018', 'kedzierski_2019', 'kedzierski_2019_u',
                'frond_2021', 'villegas_camacho_2024_c4',
                'villegas_camacho_2024_c8'],
    show_progress = True,
    n_jobs = 12,
    data_mode="normalized"
)

# Persist as xz-compressed CSV without the row index. `index=False` is the
# documented boolean; the previous `index=None` only worked because None is falsy.
combined_deriv1_data.to_csv('combined_deriv1_data.csv.xz', compression='xz', index=False)




DATASET COVERAGE ANALYSIS
Target grid: 680.0 - 3000.0 cm⁻¹ (2320.0 cm⁻¹ range)
Grid mode: intersection
----------------------------------------------------------------------
  jung_2018: 800 samples, range 680.0-3000.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019: 970 samples, range 599.8-3996.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019_u: 4064 samples, range 599.8-3997.9 cm⁻¹, coverage: ✓ FULL
  frond_2021: 380 samples, range 680.8-3000.8 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c4: 3000 samples, range 679.8-3000.7 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c8: 3000 samples, range 680.7-3000.7 cm⁻¹, coverage: ✓ FULL
----------------------------------------------------------------------
Total: 12214 samples, 12214 with full coverage (100.0%)



Resampling (pchip):   0%|          | 0/800 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/970 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/4064 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/380 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

In [17]:
# Resample the six second-derivative frames onto one 680-3000 cm-1 grid (2 cm-1
# step, descending, PCHIP) and combine them; `study_names` follows the order of `datasets`.
# NOTE(review): `data_mode="normalized"` is passed although the inputs are
# derivative spectra — confirm this is the intended mode.
# NOTE(review): `add_study_column` receives a list of column names while
# `exclude_columns` is None — confirm the two arguments are not swapped.
combined_deriv2_data, _ = combine_datasets(
    datasets = [jung_2018_deriv2, kedzierski_2019_deriv2, kedzierski_2019_u_deriv2,
                frond_2021_deriv2, villegas_camacho_2024_c4_deriv2,
                villegas_camacho_2024_c8_deriv2],
    wn_min = 680,
    wn_max = 3000,
    resolution = 2.0,
    descending = True,
    method = "pchip",
    label_column = LABEL_COLUMN,  # same "type" column used by every processor above
    exclude_columns = None,
    add_study_column = ['sample_id', 'environmental', 'resolution'],
    study_names = ['jung_2018', 'kedzierski_2019', 'kedzierski_2019_u',
                'frond_2021', 'villegas_camacho_2024_c4',
                'villegas_camacho_2024_c8'],
    show_progress = True,
    n_jobs = 12,
    data_mode="normalized"
)

# Persist as xz-compressed CSV without the row index. `index=False` is the
# documented boolean; the previous `index=None` only worked because None is falsy.
combined_deriv2_data.to_csv('combined_deriv2_data.csv.xz', compression='xz', index=False)




DATASET COVERAGE ANALYSIS
Target grid: 680.0 - 3000.0 cm⁻¹ (2320.0 cm⁻¹ range)
Grid mode: intersection
----------------------------------------------------------------------
  jung_2018: 800 samples, range 680.0-3000.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019: 970 samples, range 599.8-3996.0 cm⁻¹, coverage: ✓ FULL
  kedzierski_2019_u: 4064 samples, range 599.8-3997.9 cm⁻¹, coverage: ✓ FULL
  frond_2021: 380 samples, range 680.8-3000.8 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c4: 3000 samples, range 679.8-3000.7 cm⁻¹, coverage: ✓ FULL
  villegas_camacho_2024_c8: 3000 samples, range 680.7-3000.7 cm⁻¹, coverage: ✓ FULL
----------------------------------------------------------------------
Total: 12214 samples, 12214 with full coverage (100.0%)



Resampling (pchip):   0%|          | 0/800 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/970 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/4064 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/380 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

Resampling (pchip):   0%|          | 0/3000 [00:00<?, ?it/s]

In [18]:
# Processor built on the combined, resampled frame with the shared region settings.
fdp = FTIRdataprocessing(df = combined_norm_data,
            label_column = LABEL_COLUMN,
            exclude_regions = EXCLUDE_REGIONS,
            interpolate_regions = INTERPOLATE_REGIONS,
            flat_windows = FLAT_WINDOWS
            )

# First-order derivative of the combined normalized spectra.
# NOTE(review): `delta = 1.0` is used although the combined grid was built with
# `resolution = 2.0` — confirm whether `delta` should match the grid spacing.
combined_norm_deriv1_data = fdp.derivatives(
        data = combined_norm_data,
        order = 1,
        window_length = 15,
        polyorder = 3,
        delta = 1.0,
        plot = False,
        save_plot = False,
        save_path = None,
    )

# Second-order derivative with the same settings.
combined_norm_deriv2_data = fdp.derivatives(
        data = combined_norm_data,
        order = 2,
        window_length = 15,
        polyorder = 3,
        delta = 1.0,
        plot = False,
        save_plot = False,
        save_path = None,
    )

# Persist both frames as xz-compressed CSV without the row index. `index=False`
# is the documented boolean; the previous `index=None` only worked because None is falsy.
combined_norm_deriv1_data.to_csv('combined_norm_deriv1_data.csv.xz', compression='xz', index=False)
combined_norm_deriv2_data.to_csv('combined_norm_deriv2_data.csv.xz', compression='xz', index=False)

Computing 1st derivative for 12214 samples...
Computing 2nd derivative for 12214 samples...
