In [1]:
# Introduction

In [2]:
## Loading Libraries

In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler

In [4]:
## Loading Dataset

In [5]:
# Read 'Dataset O.xlsx'; index_col=0 makes the first sheet column the row index.
df=pd.read_excel('Dataset O.xlsx', index_col=0)
# Preview the first rows as the cell output to sanity-check the load.
df.head()

Unnamed: 0,499.24539,501.23394,501.2999,503.28818,503.35441,505.34242,505.40892,507.39666,507.46343,509.45089,...,3989.32917,3989.85414,3991.38341,3991.90864,3993.43765,3993.96315,3995.49188,3996.01766,3997.54612,Target
0,0.285685,0.28567,0.299095,0.31252,0.322775,0.33303,0.32597,0.31891,0.30636,0.29381,...,0.00614,0.00605,0.00596,0.00593,0.0059,0.00595,0.006,0.006115,0.00623,0.0
1,0.31023,0.30089,0.31042,0.31995,0.326085,0.33222,0.3225,0.31278,0.30191,0.29104,...,0.00665,0.006585,0.00652,0.006555,0.00659,0.00665,0.00671,0.0067,0.00669,0.0
2,0.353515,0.34502,0.34466,0.3443,0.343595,0.34289,0.340165,0.33744,0.333335,0.32923,...,0.00153,0.001465,0.0014,0.001395,0.00139,0.001445,0.0015,0.001565,0.00163,0.0
3,0.244705,0.22002,0.21549,0.21096,0.228045,0.24513,0.26015,0.27517,0.28105,0.28693,...,0.00188,0.001765,0.00165,0.00159,0.00153,0.001585,0.00164,0.001755,0.00187,0.0
4,0.385975,0.39838,0.3849,0.37142,0.358035,0.34465,0.32886,0.31307,0.302525,0.29198,...,0.00014,0.000185,0.00023,0.00023,0.00023,0.00018,0.00013,9e-05,5e-05,0.0


In [6]:
## Preprocessing

In [7]:
# Separate the spectral columns from the label column (drop returns a new frame).
intensity_df = df.drop(columns=['Target'])
# A single strict float conversion is sufficient. The former second pass
# (pd.to_numeric(..., errors='coerce') over the already-float array) could never
# coerce anything, because non-numeric cells already raise here.
# copy=True keeps the array independent of the DataFrame's internal buffer.
intensity = intensity_df.to_numpy(dtype=float, copy=True)
# The column headers double as the x-axis positions used by the polynomial fits below.
feature = intensity_df.columns.astype(float).to_numpy(copy=True)

In [8]:
### P1: Baseline correction

In [9]:
def baseline_correction(intensity, feature, degree=2):
    """Subtract a least-squares polynomial baseline from every spectrum.

    Each row of ``intensity`` is fitted independently with a polynomial of
    order ``degree`` in ``feature`` and the fitted curve is subtracted from
    that row.

    Parameters
    ----------
    intensity : array-like, shape (n_spectra, n_points)
        One spectrum per row.
    feature : array-like, shape (n_points,)
        x-axis positions matching the columns of ``intensity``.
    degree : int, default 2
        Order of the baseline polynomial.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``intensity`` holding the residuals.
    """
    # Work in float so integer input is not silently truncated when the
    # (generally fractional) residuals are written into the output array.
    intensity = np.asarray(intensity, dtype=float)
    feature = np.asarray(feature, dtype=float)

    baseline_corrected = np.empty_like(intensity)

    for i, spectrum in enumerate(intensity):
        coeffs = np.polyfit(feature, spectrum, degree)
        baseline_corrected[i] = spectrum - np.polyval(coeffs, feature)

    return baseline_corrected

In [10]:
# P1: polynomial baseline removal applied to the raw spectra.
new_intensity = baseline_correction(intensity, feature)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [11]:
# Write the baseline-corrected frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P1.xlsx')

In [12]:
### P2: SG Smoothing

In [13]:
def sg_smoothing(intensity, window_size=7, poly_order=3):
    
    smoothed_data = np.zeros_like(intensity)

    for i in range(intensity.shape[0]):
        smoothed_data[i, :] = savgol_filter(intensity[i, :], window_size, poly_order)

    return smoothed_data

In [14]:
# P2: Savitzky-Golay smoothing applied to the raw spectra (default window/order).
new_intensity = sg_smoothing(intensity)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [15]:
# Write the Savitzky-Golay-smoothed frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P2.xlsx')

In [16]:
### P3: Moving Average Smoothing

In [17]:
def moving_average_smoothing(intensity, window_size=5):
    """Smooth every spectrum with a centred moving average.

    Near the two ends of a spectrum the window hangs over the edge; the mean
    is then taken over the samples that actually exist rather than over
    zero-padding, so edge values are not pulled toward zero.

    Parameters
    ----------
    intensity : array-like, shape (n_spectra, n_points)
        One spectrum per row.
    window_size : int, default 5
        Number of points in the averaging window (1 <= window_size <= n_points).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``intensity``.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or larger than the spectrum length.
    """
    # Float working copy: integer input must not truncate the averages.
    intensity = np.asarray(intensity, dtype=float)
    if intensity.size == 0:
        return intensity.copy()

    n_points = intensity.shape[1]
    if not 1 <= window_size <= n_points:
        raise ValueError(
            f"window_size must be between 1 and {n_points}, got {window_size}"
        )

    kernel = np.ones(window_size)
    # Number of real samples covered by the window at each position; equals
    # window_size in the interior and shrinks toward the edges.
    counts = np.convolve(np.ones(n_points), kernel, mode='same')

    smoothed_data = np.empty_like(intensity)
    for i, spectrum in enumerate(intensity):
        smoothed_data[i] = np.convolve(spectrum, kernel, mode='same') / counts

    return smoothed_data

In [18]:
# P3: moving-average smoothing applied to the raw spectra (default window).
new_intensity = moving_average_smoothing(intensity)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [19]:
# Write the moving-average-smoothed frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P3.xlsx')

In [20]:
### P4: MSC

In [21]:
def msc_correction(intensity, reference=None):
    """Multiplicative scatter correction (MSC).

    Every spectrum is regressed on a reference spectrum,
    ``spectrum ~ offset + slope * reference``, and then corrected as
    ``(spectrum - offset) / slope``. Simply dividing by the mean spectrum
    removes neither the additive nor the multiplicative term.

    Parameters
    ----------
    intensity : array-like, shape (n_spectra, n_points)
        One spectrum per row.
    reference : array-like, shape (n_points,), optional
        Reference spectrum. Defaults to the column-wise mean of ``intensity``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``intensity``.

    Raises
    ------
    ValueError
        If the reference has the wrong length or is constant (the regression
        slope is then undefined).

    Notes
    -----
    A spectrum whose fitted slope is exactly zero yields inf/nan in its row.
    """
    intensity = np.asarray(intensity, dtype=float)

    if reference is None:
        reference = intensity.mean(axis=0)
    else:
        reference = np.asarray(reference, dtype=float)
        if reference.shape != (intensity.shape[1],):
            raise ValueError("reference must have one value per spectral point")

    ref_mean = reference.mean()
    ref_centered = reference - ref_mean
    ref_ss = ref_centered @ ref_centered
    if ref_ss == 0:
        raise ValueError("reference spectrum is constant; MSC slope is undefined")

    # Closed-form simple linear regression of each row on the reference,
    # evaluated for all rows at once.
    row_means = intensity.mean(axis=1, keepdims=True)
    slopes = ((intensity - row_means) @ ref_centered)[:, np.newaxis] / ref_ss
    offsets = row_means - slopes * ref_mean

    return (intensity - offsets) / slopes

In [22]:
# P4: scatter correction applied to the raw spectra.
new_intensity = msc_correction(intensity)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [23]:
# Write the msc_correction output frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P4.xlsx')

In [24]:
### P5: EMSC

In [25]:
def emsc_correction(intensity, poly_order = 2, feature_values=None):
    """Divide out a log-domain polynomial trend from every spectrum.

    For each row, a polynomial of order ``poly_order`` is fitted to the
    logarithm of the spectrum (values below 1e-10 are clamped first so the
    logarithm is defined), the spectrum is divided by the exponential of that
    fit, and the result is rescaled to unit mean.

    NOTE(review): no reference spectrum enters this model, so it is a
    simplified variant of what is usually called EMSC. Confirm this is the
    intended preprocessing.

    Parameters
    ----------
    intensity : array-like, shape (n_spectra, n_points)
        One spectrum per row.
    poly_order : int, default 2
        Order of the polynomial fitted in the log domain.
    feature_values : array-like, shape (n_points,), optional
        x-axis positions for the fit. When omitted, the notebook-level
        ``feature`` array is used, which keeps existing calls working.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``intensity``.
    """
    intensity = np.asarray(intensity, dtype=float)

    if feature_values is None:
        # Backward-compatible fallback to the notebook global.
        feature_values = feature
    x_axis = np.asarray(feature_values, dtype=float)

    eps = 1e-10  # floor that keeps np.log finite for zero/negative intensities
    emsc_corrected = np.empty_like(intensity)

    for i, spectrum in enumerate(intensity):
        log_spectrum = np.log(np.maximum(spectrum, eps))
        coeffs = np.polyfit(x_axis, log_spectrum, poly_order)
        corrected_spectrum = spectrum / np.exp(np.polyval(coeffs, x_axis))
        emsc_corrected[i] = corrected_spectrum / np.mean(corrected_spectrum)

    return emsc_corrected

In [26]:
# P5: log-polynomial scatter correction applied to the raw spectra.
new_intensity = emsc_correction(intensity)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [27]:
# Write the emsc_correction output frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P5.xlsx')

In [28]:
### P6: SNV

In [29]:
def snv_correction(intensity):
    """Standard normal variate (SNV) transform.

    Each spectrum (row) is centred on its own mean and divided by its own
    standard deviation. The statistics are taken along axis 1; taking them
    along axis 0 would scale each column across samples instead, which is
    ordinary feature standardisation rather than SNV.

    Parameters
    ----------
    intensity : array-like, shape (n_spectra, n_points)
        One spectrum per row.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``intensity``; every row has zero
        mean and unit (population, ddof=0) standard deviation.

    Notes
    -----
    A perfectly flat spectrum has zero standard deviation and produces nan
    in its row.
    """
    intensity = np.asarray(intensity, dtype=float)

    # keepdims retains a trailing length-1 axis so the per-row statistics
    # broadcast against the (n_spectra, n_points) array.
    row_mean = intensity.mean(axis=1, keepdims=True)
    row_std = intensity.std(axis=1, keepdims=True)

    return (intensity - row_mean) / row_std

In [30]:
# P6: SNV transform applied to the raw spectra.
new_intensity = snv_correction(intensity)
# Reuse df's own index: the Target assignment below aligns on index labels, so a
# default RangeIndex would produce NaN targets whenever df's index is not 0..n-1.
df_p = pd.DataFrame(new_intensity, index=df.index, columns=feature)
df_p['Target'] = df['Target']

In [31]:
# Write the snv_correction output frame (spectral columns plus Target) to disk.
df_p.to_excel('Dataset P6.xlsx')

In [32]:
### P7: Standardization

In [33]:
# Start P7 from an independent deep copy so the loaded frame `df` is not modified.
df_p=df.copy(deep=True)

In [34]:
# Every column except the last is treated as a feature ...
X=df_p.iloc[:, :-1]
# ... and the last column is pulled out as a 1-D NumPy array.
Y=df_p.iloc[:, -1].values

In [35]:
# Display the target array's shape (one entry per row of df_p).
Y.shape

(179,)

In [36]:
# Separate scaler instances so feature and target statistics are kept apart.
scaler_features = StandardScaler()
scaler_target = StandardScaler()

In [37]:
# Overwrite the feature columns with their per-column standardized values.
# NOTE(review): the scaler is fitted on every row; if a train/test split is made
# later from this file, confirm that fitting on all rows is intended.
df_p.iloc[:, :-1] = scaler_features.fit_transform(X)
# StandardScaler expects 2-D input, hence the reshape to a single column and the
# flatten back to 1-D for assignment.
# NOTE(review): this also rescales the last (Target) column; confirm Target is a
# continuous quantity and not a class label.
df_p.iloc[:, -1] = scaler_target.fit_transform(Y.reshape(-1, 1)).flatten()

In [38]:
# Write the standardized frame (scaled features and scaled last column) to disk.
df_p.to_excel('Dataset P7.xlsx')

In [39]:
### P8: Min-max Normalization

In [40]:
# Start P8 from an independent deep copy so the loaded frame `df` is not modified.
df_p=df.copy(deep=True)

In [41]:
# Global extrema over all feature columns together (every column but the last),
# i.e. one shared min/max rather than one pair per column.
min_val = df_p.iloc[:, :-1].min().min()
max_val = df_p.iloc[:, :-1].max().max()
# Rescale the feature block with those shared extrema; the last column is untouched.
df_p.iloc[:, :-1] = (df_p.iloc[:, :-1] - min_val) / (max_val - min_val)

In [42]:
# Write the min-max-scaled frame (scaled features plus the unchanged last column) to disk.
df_p.to_excel('Dataset P8.xlsx')

In [43]:
## Conclusion