# **XAI4Spectra**

# **Loading data**

In [103]:
# Load the soil X-ray fluorescence (XRF) spectral dataset and assemble the modelling table

import pandas as pd

data_complete = pd.read_csv(
    'https://raw.githubusercontent.com/joseviniciusr/XAI4Spectra/refs/heads/main/Toledo22.csv',
    sep=';',
)

# Target variable first (e.g., exCa = exchangeable calcium), then the columns labelled '1' to '15'
data = pd.concat([data_complete['exCa'], data_complete.loc[:, '1':'15']], axis=1)

## **PLSR fitting**

**Regression case**

In [104]:
# Kennard-Stone split of the table: 25 % of the samples go to the prediction set,
# the remainder to the calibration set
import kennard_stone as ks

datacal, datapred = ks.train_test_split(data, test_size=0.25)

# Column 0 holds the target; every remaining column is a predictor.
# Indices are reset so that X and y of each set share a clean 0..n-1 index.
Xcal, ycal = (
    part.reset_index(drop=True)
    for part in (datacal.iloc[:, 1:], datacal.iloc[:, 0])
)
Xpred, ypred = (
    part.reset_index(drop=True)
    for part in (datapred.iloc[:, 1:], datapred.iloc[:, 0])
)

2025-10-29 10:54:42,755 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.

2025-10-29 10:54:42,759 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.



In [105]:
def pls_optimized(Xcal, ycal, LVmax, Xpred=None, ypred=None, aim='regression', cv=10,
                  positive_class='eut'):

    """
    ## PLS optimized
    Fit a PLS regression (PLSR) or PLS-DA model for every number of latent variables (LVs)
    from 1 to `LVmax` and report performance metrics for calibration, cross-validation and,
    when a prediction set is supplied, prediction.

    **Parameters**:
    - **Xcal** : pd.DataFrame
        Calibration dataset features.
    - **ycal** : pd.Series or np.ndarray
        Calibration dataset target variable (numeric for regression, class labels for
        classification).
    - **LVmax** : int
        Maximum number of latent variables to consider.
    - **Xpred** : pd.DataFrame, optional
        Prediction dataset features. Default is None.
    - **ypred** : pd.Series or np.ndarray, optional
        Prediction dataset target variable. Default is None.
        Prediction metrics are only computed when BOTH `Xpred` and `ypred` are given.
    - **aim** : str, optional
        Type of analysis: 'regression' for PLS regression or 'classification' for PLS-DA.
        Default is 'regression'.
    - **cv** : int, optional
        Number of cross-validation folds (forwarded to `cross_val_predict`). Default is 10
    - **positive_class** : label, optional
        Only used when `aim='classification'`: the label encoded as 1; every other label is
        encoded as 0. Default is 'eut'.

    **Returns**:
    - **df_results** : pd.DataFrame
        One row per number of latent variables.
        Regression columns: LVs, R2 Cal, RMSEC, R2 CV, RMSECV, RPD CV, RPIQ CV, Bias_CV,
        tbias_CV, R2 Pred, RMSEP, RPD Pred, RPIQ Pred, Bias_Pred, tbias_Pred.
        Classification columns: LVs, Accuracy/Sensitivity/Specificity/CM for Cal, CV and Pred.
        All "Pred" entries are None when no prediction set is supplied.
    - **calres** : pd.DataFrame
        Column 'Ref' (reference values) followed by one 'LV_k' column per model holding the
        calibration predictions. For classification these are the continuous PLS responses
        (before the 0.5 decision threshold).
    - **predres** : pd.DataFrame or None
        Same layout as `calres` for the prediction set; None when no prediction set is supplied.

    **Raises**:
    - **ValueError** if `aim` is neither 'regression' nor 'classification'.
    """

    # Validate first so an unsupported `aim` fails with a clear message before anything is fitted
    # (previously it surfaced as an UnboundLocalError at the return statement).
    if aim not in ('regression', 'classification'):
        raise ValueError(f"aim must be 'regression' or 'classification', got {aim!r}")

    # Importing necessary libraries
    import numpy as np
    import pandas as pd
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import cross_val_predict

    has_pred = Xpred is not None and ypred is not None

    # Work on plain 1D arrays: this makes pd.Series and np.ndarray targets behave the same and
    # prevents index alignment from injecting NaNs into the 'Ref' columns.
    ycal_ref = np.asarray(ycal).ravel()
    ypred_ref = np.asarray(ypred).ravel() if has_pred else None

    if aim == 'regression':
        ycal_fit = ycal_ref.astype(float)
        ypred_fit = ypred_ref.astype(float) if has_pred else None
    else:
        # PLS-DA: regress on a 0/1 dummy response (1 = positive class)
        ycal_fit = np.where(ycal_ref == positive_class, 1, 0)
        ypred_fit = np.where(ypred_ref == positive_class, 1, 0) if has_pred else None

    # Dataframes for storing predicted values (calibration and prediction)
    calres = pd.DataFrame(index=range(len(ycal_ref)))
    predres = pd.DataFrame(index=range(len(ypred_ref))) if has_pred else None

    results = []
    # Loop for each number of latent variables from 1 to LVmax
    for n_comp in range(1, LVmax + 1):
        model = PLSRegression(n_components=n_comp, scale=False)

        # Fit on the full calibration set; np.ravel copes with both (n,) and (n, 1) outputs,
        # which differ between scikit-learn versions.
        model.fit(Xcal, ycal_fit)
        y_cal = np.ravel(model.predict(Xcal))
        calres[f'LV_{n_comp}'] = y_cal

        # Cross-validated predictions (cross_val_predict clones the estimator, so the model
        # fitted above is left untouched)
        y_cv = np.ravel(cross_val_predict(model, Xcal, ycal_fit, cv=cv))

        y_pred = None
        if has_pred:
            y_pred = np.ravel(model.predict(Xpred))
            predres[f'LV_{n_comp}'] = y_pred

        if aim == 'regression':
            cal = _pls_regression_stats(ycal_fit, y_cal)
            xval = _pls_regression_stats(ycal_fit, y_cv)
            # Every prediction metric is None when no prediction set is supplied
            pred = _pls_regression_stats(ypred_fit, y_pred) if has_pred else dict.fromkeys(cal)
            results.append({
                'LVs': n_comp,
                'R2 Cal': cal['r2'],
                'RMSEC': cal['rmse'],
                'R2 CV': xval['r2'],
                'RMSECV': xval['rmse'],
                'RPD CV': xval['rpd'],
                'RPIQ CV': xval['rpiq'],
                'Bias_CV': xval['bias'],
                'tbias_CV': xval['tbias'],
                'R2 Pred': pred['r2'],
                'RMSEP': pred['rmse'],
                'RPD Pred': pred['rpd'],
                'RPIQ Pred': pred['rpiq'],
                'Bias_Pred': pred['bias'],
                'tbias_Pred': pred['tbias']
            })
        else:
            cal = _pls_binary_stats(ycal_fit, y_cal)
            xval = _pls_binary_stats(ycal_fit, y_cv)
            pred = _pls_binary_stats(ypred_fit, y_pred) if has_pred else dict.fromkeys(cal)
            results.append({
                'LVs': n_comp,
                'Accuracy Cal': cal['accuracy'],
                'Sensitivity Cal': cal['sensitivity'],
                'Specificity Cal': cal['specificity'],
                'CM Cal': cal['cm'],
                'Accuracy CV': xval['accuracy'],
                'Sensitivity CV': xval['sensitivity'],
                'Specificity CV': xval['specificity'],
                'CM CV': xval['cm'],
                'Accuracy Pred': pred['accuracy'],
                'Sensitivity Pred': pred['sensitivity'],
                'Specificity Pred': pred['specificity'],
                'CM Pred': pred['cm']
            })

    # Convert the results list to a DataFrame
    df_results = pd.DataFrame(results)
    calres.insert(0, 'Ref', ycal_ref)
    if predres is not None:
        predres.insert(0, 'Ref', ypred_ref)

    return df_results, calres, predres


def _pls_regression_stats(y_true, y_hat):
    """Return r2, rmse, rpd, rpiq, bias and tbias for reference vs. predicted values (1D float arrays)."""
    import numpy as np
    from scipy.stats import iqr
    from sklearn.metrics import mean_squared_error, r2_score

    n_obs = y_true.shape[0]
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    residuals = y_true - y_hat
    bias = residuals.mean()
    # Standard deviation of the bias-corrected residuals (n - 1 in the denominator)
    sdv = np.sqrt(np.sum((residuals - bias) ** 2) / (n_obs - 1))
    return {
        'r2': r2_score(y_true, y_hat),
        'rmse': rmse,
        # ddof=1 reproduces pandas' Series.std() and keeps ndarray input consistent with it
        'rpd': np.std(y_true, ddof=1) / rmse,
        'rpiq': iqr(y_true, rng=(25, 75)) / rmse,
        'bias': bias,
        'tbias': abs(bias) * (np.sqrt(n_obs) / sdv),
    }


def _pls_binary_stats(y_true, y_score):
    """Threshold continuous PLS-DA responses at 0.5 and return accuracy, sensitivity, specificity and the confusion matrix."""
    import numpy as np
    from sklearn.metrics import accuracy_score, confusion_matrix

    y_class = np.where(y_score >= 0.5, 1, 0)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from y_true and y_class
    cm = confusion_matrix(y_true, y_class, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return {
        'accuracy': accuracy_score(y_true, y_class),
        # A rate with an empty denominator is undefined, hence NaN rather than 0
        'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else np.nan,
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else np.nan,
        'cm': cm,
    }

In [106]:
# preprocessings
import preprocessings as prepr  # preprocessing methods for XRF data
import numpy as np

# Fit the preprocessing on the calibration set only (presumably Poisson scaling followed by
# mean-centring, given mc=True — confirm in preprocessings.poisson) ...
Xcal_prep, mean_cal, mean_cal_poisson = prepr.poisson(Xcal, mc=True)

# ... then apply the calibration statistics to the prediction set: divide by sqrt(mean_cal)
# and subtract mean_cal_poisson
Xpred_prep = Xpred.div(np.sqrt(mean_cal)).sub(mean_cal_poisson)

In [107]:
# PLSR with 1 to 5 latent variables, evaluated by 10-fold cross-validation and on the prediction set.
# NOTE(review): the raw Xcal/Xpred are passed here; the Xcal_prep/Xpred_prep matrices computed in
# the preprocessing cell are not used — confirm this is intended.
plsr_results = pls_optimized(Xcal=Xcal, ycal=ycal, LVmax=5, Xpred=Xpred, ypred=ypred, aim='regression', cv=10)

# first element of the returned tuple: metrics table, one row per number of latent variables
plsr_results[0]

Unnamed: 0,LVs,R2 Cal,RMSEC,R2 CV,RMSECV,RPD CV,RPIQ CV,Bias_CV,tbias_CV,R2 Pred,RMSEP,RPD Pred,RPIQ Pred,Bias_Pred,tbias_Pred
0,1,0.098131,1.705001,0.072118,1.729415,1.041415,1.413773,0.002246,0.016322,-0.490434,1.749357,0.826951,1.131844,-0.899618,4.323922
1,2,0.330874,1.468612,0.165737,1.639851,1.098294,1.490989,-0.021344,0.16362,-0.310195,1.640174,0.882,1.207189,-0.74376,3.668875
2,3,0.584046,1.157912,0.306264,1.495375,1.204406,1.635041,0.01567,0.131726,0.121042,1.343403,1.076842,1.473869,-0.72804,4.650021
3,4,0.747444,0.902259,0.642244,1.073858,1.677167,2.276838,-0.021054,0.246495,0.657386,0.838735,1.72478,2.360699,-0.322048,2.998697
4,5,0.763778,0.872595,0.641812,1.074504,1.676157,2.275468,-0.025522,0.298643,0.692705,0.794328,1.821205,2.492675,-0.190588,1.782263


**Classification case**

In [108]:
# Add a 'Class' column derived from 'BSP': eutrophic ('eut', higher fertility) when BSP > 50.00,
# otherwise dystrophic ('dist')
data_complete['Class'] = np.where(data_complete['BSP'].gt(50.00), 'eut', 'dist')

# One table per class, each re-indexed from 0
data_eut = data_complete.loc[data_complete['Class'].eq('eut')].reset_index(drop=True)
data_dist = data_complete.loc[data_complete['Class'].eq('dist')].reset_index(drop=True)

In [109]:
# Kennard-Stone split carried out separately inside each class (30 % of each class goes to the
# prediction set), so both classes are represented in calibration and prediction
import kennard_stone as ks

# class eutrophic
Xeut_cal, Xeut_pred = (
    part.reset_index(drop=True)
    for part in ks.train_test_split(data_eut.loc[:, '1':'15'], test_size=0.30)
)

# class dystrophic
Xdist_cal, Xdist_pred = (
    part.reset_index(drop=True)
    for part in ks.train_test_split(data_dist.loc[:, '1':'15'], test_size=0.30)
)

# Stack both classes (eutrophic rows first) with a fresh 0..n-1 index
Xcal = pd.concat([Xeut_cal, Xdist_cal], axis=0, ignore_index=True)
Xpred = pd.concat([Xeut_pred, Xdist_pred], axis=0, ignore_index=True)

# Class labels, written in the same row order as the stacked matrices
ycal = pd.Series(['eut'] * len(Xeut_cal) + ['dist'] * len(Xdist_cal))
ypred = pd.Series(['eut'] * len(Xeut_pred) + ['dist'] * len(Xdist_pred))

2025-10-29 10:54:50,397 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.

2025-10-29 10:54:50,399 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.

2025-10-29 10:54:50,417 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.

2025-10-29 10:54:50,421 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.



In [110]:
# preprocessings
import preprocessings as prepr  # preprocessing methods for XRF data
import numpy as np

# Fit the preprocessing on the calibration set only (presumably Poisson scaling followed by
# mean-centring, given mc=True — confirm in preprocessings.poisson) ...
Xcal_prep, mean_cal, mean_cal_poisson = prepr.poisson(Xcal, mc=True)

# ... then apply the calibration statistics to the prediction set: divide by sqrt(mean_cal)
# and subtract mean_cal_poisson
Xpred_prep = Xpred.div(np.sqrt(mean_cal)).sub(mean_cal_poisson)

In [114]:
# PLS-DA with 1 to 4 latent variables, evaluated by 10-fold cross-validation and on the prediction set.
# NOTE(review): the raw Xcal/Xpred are passed here; the Xcal_prep/Xpred_prep matrices computed in
# the preprocessing cell are not used — confirm this is intended.
plsda_results = pls_optimized(Xcal=Xcal, ycal=ycal, LVmax=4, Xpred=Xpred, ypred=ypred, aim='classification', cv=10)

# first element of the returned tuple: metrics table, one row per number of latent variables
plsda_results[0]

Unnamed: 0,LVs,Accuracy Cal,Sensitivity Cal,Specificity Cal,CM Cal,Accuracy CV,Sensitivity CV,Specificity CV,CM CV,Accuracy Pred,Sensitivity Pred,Specificity Pred,CM Pred
0,1,0.648649,0.649351,0.647887,"[[46, 25], [27, 50]]",0.425676,0.493506,0.352113,"[[25, 46], [39, 38]]",0.53125,0.606061,0.451613,"[[14, 17], [13, 20]]"
1,2,0.668919,0.727273,0.605634,"[[43, 28], [21, 56]]",0.5,0.61039,0.380282,"[[27, 44], [30, 47]]",0.546875,0.515152,0.580645,"[[18, 13], [16, 17]]"
2,3,0.682432,0.74026,0.619718,"[[44, 27], [20, 57]]",0.601351,0.675325,0.521127,"[[37, 34], [25, 52]]",0.5625,0.606061,0.516129,"[[16, 15], [13, 20]]"
3,4,0.871622,0.818182,0.929577,"[[66, 5], [14, 63]]",0.75,0.74026,0.760563,"[[54, 17], [20, 57]]",0.875,0.787879,0.967742,"[[30, 1], [7, 26]]"
