# Introduction

This notebook preprocesses the original dataset with each of the following techniques: <br>
1. Baseline correction <br>
2. SG Smoothing <br>
3. Moving Average Smoothing <br>
4. Multiplicative Scatter Correction (MSC) <br>
5. Extended Multiplicative Scatter Correction (EMSC) <br>
6. Standard Normal Variate (SNV) <br>
7. Standardization <br>
8. Min-max Normalization <br>


This way, a total of 9 datasets (the original plus the 8 preprocessed variants) will be ready for model training, and the best-performing one can be used for optimization.

In [None]:
## Loading Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler

In [None]:
## Loading Dataset

In [None]:
# Read the original dataset; the first spreadsheet column becomes the row index.
df=pd.read_excel('Dataset O.xlsx', index_col=0)
# Display the first rows as a quick check of the loaded table.
df.head()

Unnamed: 0,499.24539,501.23394,501.2999,503.28818,503.35441,505.34242,505.40892,507.39666,507.46343,509.45089,...,3989.32917,3989.85414,3991.38341,3991.90864,3993.43765,3993.96315,3995.49188,3996.01766,3997.54612,Target
0,0.285685,0.28567,0.299095,0.31252,0.322775,0.33303,0.32597,0.31891,0.30636,0.29381,...,0.00614,0.00605,0.00596,0.00593,0.0059,0.00595,0.006,0.006115,0.00623,0.0
1,0.31023,0.30089,0.31042,0.31995,0.326085,0.33222,0.3225,0.31278,0.30191,0.29104,...,0.00665,0.006585,0.00652,0.006555,0.00659,0.00665,0.00671,0.0067,0.00669,0.0
2,0.353515,0.34502,0.34466,0.3443,0.343595,0.34289,0.340165,0.33744,0.333335,0.32923,...,0.00153,0.001465,0.0014,0.001395,0.00139,0.001445,0.0015,0.001565,0.00163,0.0
3,0.244705,0.22002,0.21549,0.21096,0.228045,0.24513,0.26015,0.27517,0.28105,0.28693,...,0.00188,0.001765,0.00165,0.00159,0.00153,0.001585,0.00164,0.001755,0.00187,0.0
4,0.385975,0.39838,0.3849,0.37142,0.358035,0.34465,0.32886,0.31307,0.302525,0.29198,...,0.00014,0.000185,0.00023,0.00023,0.00023,0.00018,0.00013,9e-05,5e-05,0.0


### P1: Baseline correction

It involves removing or reducing a background signal that can obscure the true peaks or features of interest. This process improves the signal-to-noise ratio, making it easier to identify and quantify these features.


In [None]:
# Separate the spectral columns from the 'Target' label column.
intensity_df = df.drop(columns=['Target'])
# One float row per sample. A non-numeric cell raises ValueError here, so no
# second "coerce" pass is needed afterwards.
intensity = intensity_df.to_numpy(dtype=float, copy=True)
# The column headers, parsed as floats, give the x-axis position of every
# intensity column.
feature = intensity_df.columns.astype(float).to_numpy(copy=True)

In [None]:
def baseline_correction(intensity, feature, degree=2):
    """Subtract a fitted polynomial baseline from every spectrum.

    For each row of ``intensity`` a polynomial of order ``degree`` is
    least-squares fitted against ``feature`` and the fitted curve is
    subtracted from that row.

    Parameters
    ----------
    intensity : 2-D array with one spectrum per row.
    feature : 1-D array of x-axis positions, one per column of ``intensity``.
    degree : order of the baseline polynomial (default 2).

    Returns
    -------
    Array with the shape and dtype of ``intensity`` holding the residuals.
    """
    residuals = np.zeros_like(intensity)

    for row_idx, spectrum in enumerate(intensity):
        poly = np.polyfit(feature, spectrum, degree)
        residuals[row_idx, :] = spectrum - np.polyval(poly, feature)

    return residuals

In [None]:
# Baseline-correct every spectrum and rebuild a labelled table.
new_intensity = baseline_correction(intensity, feature)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P1 workbook.
df_p.to_excel('Dataset P1.xlsx')

### P2: SG Smoothing

Savitzky–Golay (SG) filtering, based on local least-squares fitting of the data by polynomials, is a popular method for smoothing data and calculations of derivatives of noisy data.

In [None]:
def sg_smoothing(intensity, window_size=7, poly_order=3):
    """Smooth every spectrum with a Savitzky-Golay filter.

    Parameters
    ----------
    intensity : 2-D array with one spectrum per row.
    window_size : filter window length passed to ``savgol_filter`` (default 7).
    poly_order : order of the local polynomial fit (default 3).

    Returns
    -------
    Array with the shape and dtype of ``intensity`` holding the filtered rows.
    """
    filtered = np.zeros_like(intensity)

    # Iterating a 2-D array yields row views, so writing into ``out_row``
    # fills ``filtered`` in place.
    for out_row, spectrum in zip(filtered, intensity):
        out_row[:] = savgol_filter(spectrum, window_size, poly_order)

    return filtered

In [None]:
# Savitzky-Golay smooth every spectrum and rebuild a labelled table.
new_intensity = sg_smoothing(intensity)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P2 workbook.
df_p.to_excel('Dataset P2.xlsx')

### P3: Moving Average Smoothing

Moving average smoothing is a technique used to reduce noise and highlight underlying trends in sequential data (such as time series or, here, spectra) by replacing each point with the average of the points inside a fixed-size window around it.

In [None]:
def moving_average_smoothing(intensity, window_size=5):
    """Smooth every spectrum with a centred moving average.

    Each output point is the mean of the input points that fall inside the
    window around it. Near the two ends the window hangs over the edge of
    the spectrum; only the real samples are averaged there, so the ends are
    not dragged toward zero by implicit zero padding.

    Parameters
    ----------
    intensity : 2-D array-like with one spectrum per row.
    window_size : number of points in the averaging window (default 5).

    Returns
    -------
    Float array with the shape of ``intensity`` holding the smoothed rows.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or larger than the number of
        points per spectrum.
    """
    intensity = np.asarray(intensity, dtype=float)
    n_points = intensity.shape[1]
    if not 1 <= window_size <= n_points:
        raise ValueError(
            f"window_size must be between 1 and {n_points}, got {window_size}"
        )

    kernel = np.ones(window_size)
    # Number of real samples under the window at each position: equal to
    # window_size in the interior, smaller at the edges.
    counts = np.convolve(np.ones(n_points), kernel, mode='same')

    smoothed_data = np.zeros_like(intensity)
    for i in range(intensity.shape[0]):
        window_sums = np.convolve(intensity[i, :], kernel, mode='same')
        smoothed_data[i, :] = window_sums / counts

    return smoothed_data

In [None]:
# Moving-average smooth every spectrum and rebuild a labelled table.
new_intensity = moving_average_smoothing(intensity)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P3 workbook.
df_p.to_excel('Dataset P3.xlsx')

### P4: MSC

In the context of spectral analysis, "MSC" stands for Multiplicative Scatter Correction. MSC is a spectral preprocessing technique used to reduce variations in spectral data caused by factors like particle size and measurement conditions, especially in near-infrared (NIR) spectroscopy.

In [None]:
def msc_correction(intensity):
    """Apply Multiplicative Scatter Correction (MSC) to every spectrum.

    The mean of all spectra is used as the reference. Each spectrum is
    regressed on that reference with a straight line,
    ``spectrum ~= intercept + slope * reference``, and then corrected as
    ``(spectrum - intercept) / slope``. This removes the additive offset and
    the multiplicative scaling of each spectrum relative to the reference.

    Parameters
    ----------
    intensity : 2-D array-like with one spectrum per row.

    Returns
    -------
    Float array with the shape of ``intensity`` holding the corrected rows.

    Notes
    -----
    A spectrum whose fitted slope is exactly zero is divided by zero, so its
    corrected row is not finite.
    """
    intensity = np.asarray(intensity, dtype=float)
    mean_spectrum = np.mean(intensity, axis=0)
    msc_corrected = np.zeros_like(intensity)

    for i in range(intensity.shape[0]):
        spectrum = intensity[i, :]
        # First-order least-squares fit of this spectrum against the reference.
        slope, intercept = np.polyfit(mean_spectrum, spectrum, 1)
        msc_corrected[i, :] = (spectrum - intercept) / slope

    return msc_corrected

In [None]:
# Apply msc_correction to every spectrum and rebuild a labelled table.
new_intensity = msc_correction(intensity)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P4 workbook.
df_p.to_excel('Dataset P4.xlsx')

### P5: EMSC

In spectral analysis, Extended Multiplicative Signal Correction (EMSC) is a preprocessing technique used to isolate and remove various multiplicative effects, particularly those caused by physical phenomena like light scattering, from spectral data.

In [None]:
def emsc_correction(intensity, poly_order = 2, feature_axis=None):
    """Remove a smooth multiplicative trend from every spectrum.

    For each row, a polynomial of order ``poly_order`` is fitted to the
    logarithm of the spectrum along the x-axis. The spectrum is divided by
    the exponential of that fit and then rescaled to a mean of 1.

    NOTE(review): this is a log-polynomial detrend, not the reference-spectrum
    regression usually called EMSC; confirm that this is the intended method.

    Parameters
    ----------
    intensity : 2-D array with one spectrum per row.
    poly_order : order of the polynomial fitted in log space (default 2).
    feature_axis : optional 1-D array of x-axis positions, one per column of
        ``intensity``. When omitted, the module-level ``feature`` array is
        used, which is what this function always did before the parameter
        existed.

    Returns
    -------
    Array with the shape and dtype of ``intensity`` holding the corrected rows.
    """
    if feature_axis is None:
        # Backward-compatible default: the notebook-level x-axis.
        x_axis = feature
    else:
        x_axis = np.asarray(feature_axis, dtype=float)

    # Floor applied before the logarithm so zero or negative intensities do
    # not produce -inf / NaN.
    eps = 1e-10
    emsc_corrected = np.zeros_like(intensity)

    for i in range(intensity.shape[0]):
        spectrum = intensity[i, :]
        log_spectrum = np.log(np.maximum(spectrum, eps))
        coeffs = np.polyfit(x_axis, log_spectrum, poly_order)
        trend = np.exp(np.polyval(coeffs, x_axis))
        corrected_spectrum = spectrum / trend
        emsc_corrected[i, :] = corrected_spectrum / np.mean(corrected_spectrum)

    return emsc_corrected

In [None]:
# Apply emsc_correction to every spectrum and rebuild a labelled table.
new_intensity = emsc_correction(intensity)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P5 workbook.
df_p.to_excel('Dataset P5.xlsx')

### P6: SNV

In spectral analysis, Standard Normal Variate (SNV) is a normalization method that corrects spectra for baseline variations and scatter effects. It transforms spectral data so that each spectrum has a mean of 0 and a standard deviation of 1, making intensities comparable across different spectra.

In [None]:
def snv_correction(intensity):
    """Apply Standard Normal Variate (SNV) scaling to every spectrum.

    Each spectrum (row) is centred on its own mean and divided by its own
    standard deviation, so every row of the result has mean 0 and standard
    deviation 1.

    Parameters
    ----------
    intensity : 2-D array-like with one spectrum per row.

    Returns
    -------
    Float array with the shape of ``intensity`` holding the scaled rows.
    A perfectly flat spectrum (zero standard deviation) is returned as all
    zeros instead of NaN.
    """
    intensity = np.asarray(intensity, dtype=float)

    # Statistics are taken along axis=1 (within a spectrum), not across
    # samples; keepdims lets them broadcast back over the columns.
    row_mean = intensity.mean(axis=1, keepdims=True)
    row_std = intensity.std(axis=1, keepdims=True)

    # Avoid 0/0 for constant spectra: their centred values are already zero.
    safe_std = np.where(row_std == 0, 1.0, row_std)

    return (intensity - row_mean) / safe_std

In [None]:
# Apply snv_correction to the spectra and rebuild a labelled table.
new_intensity = snv_correction(intensity)
# Reuse df's index so the 'Target' assignment lines up row-for-row even when
# the source index is not 0..n-1 (a fresh RangeIndex would be aligned by label
# and could silently produce NaN targets).
df_p = pd.DataFrame(new_intensity, columns=feature, index=df.index)
df_p['Target'] = df['Target']

In [None]:
# Write the current df_p table to the P6 workbook.
df_p.to_excel('Dataset P6.xlsx')

### P7: Standardization

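Spectra standardization is a process that aims to make spectral data from different instruments or measurements comparable by reducing variations due to instrument differences, data processing methods, or environmental factors. It involves correcting spectral features, like baseline drift and noise, to align spectra from different sources, allowing for more reliable and consistent analysis and comparison. In this notebook, standardization specifically means z-score scaling with scikit-learn's `StandardScaler`: each feature column (and, separately, the target column) is rescaled to zero mean and unit variance.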

In [None]:
# Work on a deep copy so the scaling below does not modify the original df.
df_p=df.copy(deep=True)

In [None]:
# Features: every column except the last one.
X=df_p.iloc[:, :-1]
# Target: the last column, taken as a NumPy array.
Y=df_p.iloc[:, -1].values

In [None]:
# Display the shape of the target vector (1-D, one entry per sample).
Y.shape

(179,)

In [None]:
# Two independent StandardScaler instances: one fitted on the feature
# columns, the other on the target column.
scaler_features = StandardScaler()
scaler_target = StandardScaler()

In [None]:
# Fit each scaler on its own data and compute the standardised values.
scaled_features = scaler_features.fit_transform(X)
# StandardScaler expects 2-D input, so the target is passed as a single column.
scaled_target = scaler_target.fit_transform(Y.reshape(-1, 1))

# Write the standardised values back into df_p in place.
df_p.iloc[:, :-1] = scaled_features
df_p.iloc[:, -1] = scaled_target.ravel()

In [None]:
# Write the current df_p table to the P7 workbook.
df_p.to_excel('Dataset P7.xlsx')

### P8: Min-max Normalization

Min-max normalization, a form of feature scaling, rescales the data to a fixed range, typically 0 to 1. This is useful for comparing spectra with varying intensity scales and for machine learning algorithms that perform better with standardized inputs. Here a single global minimum and maximum, taken over all spectral columns, is used, so every intensity value is rescaled with the same offset and factor.

In [None]:
# Work on a deep copy so the scaling below does not modify the original df.
df_p=df.copy(deep=True)

In [None]:
# Global min-max scaling: one minimum and one maximum taken over all feature
# columns (the last column, the target, is left unchanged).
spectra = df_p.iloc[:, :-1]
min_val = spectra.min().min()
max_val = spectra.max().max()
value_range = max_val - min_val
df_p.iloc[:, :-1] = (spectra - min_val) / value_range

In [None]:
# Write the current df_p table to the P8 workbook.
df_p.to_excel('Dataset P8.xlsx')