# **XAI4Spectra**

## **Loading data**

In [42]:
# loading a soil spectral dataset based on X-ray fluorescence (XRF)

import pandas as pd

# Fetch the complete Toledo22 table (semicolon-separated) from the project repository
data_complete = pd.read_csv(
    'https://raw.githubusercontent.com/joseviniciusr/XAI4Spectra/refs/heads/main/Toledo22.csv',
    sep=';',
)

# Keep the columns labelled '1' through '15' (label-based slice, both ends included) ...
data = data_complete.loc[:, '1':'15']
# ... and place the target variable exCa (exchangeable calcium) in the first column
data.insert(loc=0, column='exCa', value=data_complete['exCa'])

## **PLSR fitting**

In [43]:
# splitting the data into calibration and prediction sets using the Kennard-Stone algorithm
import kennard_stone as ks
# 75 % of the samples go to calibration, 25 % to prediction
datacal, datapred = ks.train_test_split(data, test_size=0.25)

# Column 0 holds the target; every remaining column is a predictor.
# Indices are reset so each subset is numbered from 0.
Xcal, Xpred = (subset.iloc[:, 1:].reset_index(drop=True) for subset in (datacal, datapred))
ycal, ypred = (subset.iloc[:, 0].reset_index(drop=True) for subset in (datacal, datapred))

2025-10-29 09:13:10,416 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.

2025-10-29 09:13:10,421 - kennard_stone.utils._pairwise:109[INFO] - Calculating pairwise distances using scikit-learn.





In [44]:
def pls_optimized(Xcal, ycal, LVmax, Xpred=None, ypred=None, cv=10):

    """
    Fit PLS regression models with 1..LVmax latent variables and collect
    calibration, cross-validation and (optionally) prediction metrics.

    Parameters:
    Xcal : array-like, shape (n_samples, n_features)
        Calibration predictor variables.
    ycal : pandas Series or NumPy array, shape (n_samples,)
        Calibration response variable. Its own ``.std()`` method is used for
        RPD, i.e. sample SD (ddof=1) for a Series, population SD (ddof=0) for
        a NumPy array.
    LVmax : int
        Maximum number of latent variables to consider.
    Xpred : array-like, shape (m_samples, n_features), optional
        Prediction predictor variables.
    ypred : pandas Series or NumPy array, shape (m_samples,), optional
        Prediction response variable. The prediction set is only evaluated
        when both Xpred and ypred are given.
    cv : int or cross-validation splitter, default 10
        Passed unchanged to ``cross_val_predict``.

    Returns:
    df_results : pandas DataFrame
        One row per number of latent variables with the columns
        LVs, R2 Cal, RMSEC, R2 CV, RMSECV, RPD CV, RPIQ CV, Bias_CV, tbias_CV,
        R2 Pred, RMSEP, RPD Pred, RPIQ Pred, Bias_Pred, tbias_Pred.
        All prediction columns are None when no prediction set is provided.
    calres : pandas DataFrame
        Reference values ('Ref') and fitted calibration values ('LV_k').
    predres : pandas DataFrame or None
        Reference values ('Ref') and predicted values ('LV_k') for the
        prediction set; None when no prediction set is provided.
    """

    # Imports are local so the function stays self-contained
    import numpy as np
    import pandas as pd
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import mean_squared_error, r2_score
    from scipy.stats import iqr

    def _bias_and_tbias(y_true, y_hat):
        """Return (bias, t-bias) of the residuals y_true - y_hat."""
        # Plain arrays make the subtraction positional (no index alignment)
        residuals = np.asarray(y_true, dtype=float) - np.asarray(y_hat, dtype=float)
        n_obs = residuals.shape[0]
        bias = residuals.mean()
        # Standard deviation of the bias-corrected residuals (n - 1 denominator)
        sdv = np.sqrt(np.sum((residuals - bias) ** 2) / (n_obs - 1))
        return bias, abs(bias) * (np.sqrt(n_obs) / sdv)

    has_pred = Xpred is not None and ypred is not None
    results = []

    # Dataframes for storing predicted values (calibration and prediction)
    calres = pd.DataFrame(index=range(len(ycal)))
    predres = pd.DataFrame(index=range(len(ypred))) if has_pred else None

    # Loop for each number of latent variables from 1 to LVmax
    for n_comp in range(1, LVmax + 1):
        pls = PLSRegression(n_components=n_comp, scale=False)

        # Fit on the full calibration set and store the fitted values
        pls.fit(Xcal, ycal)
        y_cal = np.ravel(pls.predict(Xcal))  # ravel: predict may return (n, 1)
        calres[f'LV_{n_comp}'] = y_cal

        # Out-of-fold predictions; cross_val_predict fits clones, so `pls`
        # remains the model fitted on the whole calibration set
        y_cv = np.ravel(cross_val_predict(pls, Xcal, ycal, cv=cv))

        # Calibration metrics
        r2_cal = r2_score(ycal, y_cal)
        rmse_cal = np.sqrt(mean_squared_error(ycal, y_cal))

        # Cross-validation metrics
        r2_cv = r2_score(ycal, y_cv)
        rmsecv = np.sqrt(mean_squared_error(ycal, y_cv))
        rpd_cv = ycal.std() / rmsecv
        rpiq_cv = iqr(ycal, rng=(25, 75)) / rmsecv
        bias_cv, tbias_cv = _bias_and_tbias(ycal, y_cv)

        # Every prediction metric defaults to None so the result row can always
        # be built, even when no prediction set is supplied
        r2_pred = rmsep = rpd_pred = rpiq_pred = bias_pred = tbias_pred = None
        if has_pred:
            y_pred = np.ravel(pls.predict(Xpred))
            predres[f'LV_{n_comp}'] = y_pred

            r2_pred = r2_score(ypred, y_pred)
            rmsep = np.sqrt(mean_squared_error(ypred, y_pred))
            rpd_pred = ypred.std() / rmsep
            rpiq_pred = iqr(ypred, rng=(25, 75)) / rmsep
            bias_pred, tbias_pred = _bias_and_tbias(ypred, y_pred)

        # Store all metrics in the results list
        results.append({
            'LVs': n_comp,
            'R2 Cal': r2_cal,
            'RMSEC': rmse_cal,
            'R2 CV': r2_cv,
            'RMSECV': rmsecv,
            'RPD CV': rpd_cv,
            'RPIQ CV': rpiq_cv,
            'Bias_CV': bias_cv,
            'tbias_CV': tbias_cv,
            'R2 Pred': r2_pred,
            'RMSEP': rmsep,
            'RPD Pred': rpd_pred,
            'RPIQ Pred': rpiq_pred,
            'Bias_Pred': bias_pred,
            'tbias_Pred': tbias_pred
        })

    # Convert the results list to a DataFrame
    df_results = pd.DataFrame(results)

    # Insert the reference values as plain arrays: inserting a Series would be
    # aligned on its index and yield NaN/misplaced rows if it was not reset
    calres.insert(0, 'Ref', np.asarray(ycal))
    if predres is not None:
        predres.insert(0, 'Ref', np.asarray(ypred))

    return df_results, calres, predres

In [45]:
# preprocessings
import preprocessings as prepr # preprocessing methods for XRF data
import numpy as np 

# Poisson preprocessing derived from the calibration spectra only
# (presumably mc=True also mean-centres the scaled data — confirm in `preprocessings`)
Xcal_prep, mean_cal, mean_cal_poisson = prepr.poisson(Xcal, mc=True)

# Reuse the calibration statistics on the prediction spectra:
# divide by sqrt(mean_cal) first, then subtract mean_cal_poisson
Xpred_prep = Xpred / np.sqrt(mean_cal)
Xpred_prep = Xpred_prep - mean_cal_poisson

In [47]:
# Fitting PLS models with 1 to 5 latent variables and evaluating them on the prediction set.
# NOTE(review): the raw Xcal/Xpred are passed here, not Xcal_prep/Xpred_prep — confirm this is intended.
pls_results = pls_optimized(
    Xcal,
    ycal,
    LVmax=5,
    Xpred=Xpred,
    ypred=ypred,
)
pls_results[0]  # first returned element: the metrics table

Unnamed: 0,LVs,R2 Cal,RMSEC,R2 CV,RMSECV,RPD CV,RPIQ CV,Bias_CV,tbias_CV,R2 Pred,RMSEP,RPD Pred,RPIQ Pred,Bias_Pred,tbias_Pred
0,1,0.098131,1.705001,0.072118,1.729415,1.041415,1.413773,0.002246,0.016322,-0.490434,1.749357,0.826951,1.131844,-0.899618,4.323922
1,2,0.330874,1.468612,0.165737,1.639851,1.098294,1.490989,-0.021344,0.16362,-0.310195,1.640174,0.882,1.207189,-0.74376,3.668875
2,3,0.584046,1.157912,0.306264,1.495375,1.204406,1.635041,0.01567,0.131726,0.121042,1.343403,1.076842,1.473869,-0.72804,4.650021
3,4,0.747444,0.902259,0.642244,1.073858,1.677167,2.276838,-0.021054,0.246495,0.657386,0.838735,1.72478,2.360699,-0.322048,2.998697
4,5,0.763778,0.872595,0.641812,1.074504,1.676157,2.275468,-0.025522,0.298643,0.692705,0.794328,1.821205,2.492675,-0.190588,1.782263
