In [1]:
import spectrum
import pike as pk
import pike_parallel as parall
import csv
import pandas as pd
from glob import glob
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import PairwiseKernel
import random
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pymzml
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.metrics import accuracy_score


In [2]:
def varianceStabilizer(spectrumObject):
    '''Pre-processing function for manipulating intensities.
    Commonly performed to stabilize their variance
    Parameters
    ----------
    SpectrumObject
    
    Returns
    -------
    SpectrumObject with stabilized variance.
    '''
    stabilizer = spectrum.VarStabilizer()
    return stabilizer(spectrumObject)

def baselineRemove( spectrumObject):
    '''Pre-processing function for baseline correction (also referred to as background removal).
    Parameters
    ----------
    SpectrumObject
    
    Returns
    -------
    SpectrumObject.
    '''
    remover = spectrum.BaselineCorrecter(method="SNIP")
    return remover(spectrumObject)

def intensityCalibration( spectrumObject):
    '''Pre-processing function for normalizing the intensity of a spectrum.
    Commonly referred to as total ion current (TIC) calibration.

    Parameters
    ----------
    SpectrumObject
    
    Returns
    -------
    SpectrumObject.
    '''

    calibration = spectrum.Normalizer()
    return calibration(spectrumObject)

def smoother( spectrumObject):
    '''Pre-processing function for smoothing. Uses Savitzky-Golay filter.

    Parameters
    ----------
    SpectrumObject

    Returns
    -------
    SpectrumObject
    '''
    smoother =  spectrum.Smoother()
    return smoother(spectrumObject)

def trimming(spectrumObject):
    '''Pre-processing function for trimming ends of a spectrum.
    This can be used to remove inaccurate measurements.

    Parameters
    ----------
    SpectrumObject
    '''

    trimmer = spectrum.Trimmer()
    return trimmer(spectrumObject)


def binner(spectrumObject):
    '''Pre-processing function for binning spectra in equal-width bins.

    Parameters
    ----------
    spectrumObject
    '''
    binner = spectrum.Binner()
    return binner(spectrumObject)
spectra = spectrum.SpectrumObject()

In [22]:
# Datos del Gregorio sin extracción de proteínas
semanas = ['Semana 1', 'Semana 2', 'Semana 3']
clases = ['RT023', 'RT027', 'RT078', 'RT106', 'RT165', 'RT181']
medios = ['Medio Ch', 'Medio Br', 'Medio Cl', 'Medio Sc', 'GU']

dic = {}
Y_train = []
baseline = []

for medio in medios:
    for semana in semanas:
        samples = []
        for clase in clases:
            
            ruta = f'Z:/bacteria_id/C_diff/Reproducibilidad/ClostiRepro/ClostriRepro/Reproducibilidad No extracción/{medio}/{semana}/{clase}' 
            
            if os.path.exists(ruta):
                for f in os.listdir(ruta):
                    ruta_f = os.path.join(ruta, f)
                    
                    if medio == 'Medio Sc'and semana == 'Semana 1':
                        # Si el archivo es un .mzml
                        if 'mzml' in ruta_f:
                            run = pymzml.run.Reader(ruta_f)
                            
                            spectro = [r for r in run]
                            s = spectrum.SpectrumObject(mz=spectro[0].mz, intensity=spectro[0].i)
                            baseline.append(s)
                            Y_train.append(clase)
                        else: 
                            carpetas = [subf for subf in os.listdir(ruta_f)]
                            if carpetas:
                                ruta_f = os.path.join(ruta_f, carpetas[0])
                                # Buscar archivos 'fid' y 'acqu' en las subcarpetas
                                fid_files = glob(os.path.join(ruta_f, '*', '1SLin', 'fid'))
                                acqu_files = glob(os.path.join(ruta_f, '*', '1SLin', 'acqu'))

                                ruta_fid = fid_files[0]
                                ruta_acqu = acqu_files[0]
                                s = spectra.from_bruker(ruta_acqu, ruta_fid)
                                baseline.append(s)
                                Y_train.append(clase)       
                    
                    else:
                        # Si el archivo es un .mzml
                        if 'mzml' in ruta_f:
                            run = pymzml.run.Reader(ruta_f)
                            
                            spectro = [r for r in run]
                            s = spectrum.SpectrumObject(mz=spectro[0].mz, intensity=spectro[0].i)
                            samples.append(s)
                                    
                                    
                        else: 
                            carpetas = [subf for subf in os.listdir(ruta_f)]
                            
                            ruta_f = os.path.join(ruta, f, carpetas[0])
                            fid_files = glob(os.path.join(ruta_f, '*', '1SLin', 'fid'))
                            acqu_files = glob(os.path.join(ruta_f, '*', '1SLin', 'acqu'))

                            ruta_fid = fid_files[0]
                            ruta_acqu = acqu_files[0]
                            s = spectra.from_bruker(ruta_acqu, ruta_fid)
                            samples.append(s) 
                            

            dic[f'{semana}_{medio}'] = samples


In [23]:
# Datos del Gregorio con extracción de proteínas

clases = ['RT023', 'RT027', 'RT078', 'RT106', 'RT165', 'RT181']
medios = ['Chx', 'Brx', 'Clx', 'Scx', 'GU', 'Chx_24h']

dic_extra = {}


for medio in medios:
        samples = []
        for clase in clases:
            
            ruta = f'Z:/bacteria_id/C_diff/Reproducibilidad/ClostiRepro/ClostriRepro/Reproducibilidad Extracción/{medio}/{clase}' 
            
            if os.path.exists(ruta):
                for f in os.listdir(ruta):
                    ruta_f = os.path.join(ruta, f)
                    
                        
                    if 'mzml' in ruta_f:
                        run = pymzml.run.Reader(ruta_f)
                        
                        spectro = [r for r in run]
                        s = spectrum.SpectrumObject(mz=spectro[0].mz, intensity=spectro[0].i)
                        samples.append(s)
                                
                                
                    else: 
                        carpetas = [subf for subf in os.listdir(ruta_f)]
                        
                        ruta_f = os.path.join(ruta, f, carpetas[0])
                        fid_files = glob(os.path.join(ruta_f, '*', '1SLin', 'fid'))
                        acqu_files = glob(os.path.join(ruta_f, '*', '1SLin', 'acqu'))

                        ruta_fid = fid_files[0]
                        ruta_acqu = acqu_files[0]
                        s = spectra.from_bruker(ruta_acqu, ruta_fid)
                        samples.append(s) 
                            

            dic_extra[f'{medio}'] = samples


In [5]:
def stepOne(intensidades, porcentaje):
    '''
    Selecciona los picos de forma aleatoria de acuerdo a un porcentaje de picos.

    Args:
        intensidades: intensidades del espectro
        porcentaje: porcentaje a seleccionar
    '''
    posiciones_no_cero = [index for index, valor in enumerate(intensidades) if valor != 0]
    n_seleccionar = int(len(posiciones_no_cero) * porcentaje)
    posiciones_seleccionadas = random.sample(posiciones_no_cero, n_seleccionar)
    
    return posiciones_seleccionadas

In [81]:
#preprocesado test samples
specTest_mz = []
specTest_intensity = []
#test = dic['Semana 3_Medio Sc']
test = dic_extra['Scx']
for i, s in enumerate(test):
    
    
    # Preprocessing: Varaince stab.,smoother, baseline removal, intensity caibration (TIC), trimming
    sO_s1 = varianceStabilizer(s)
    sO_s1 = smoother(sO_s1)
    sO_s1 = baselineRemove(sO_s1)
    sO_s1 = intensityCalibration(sO_s1)
    sO_s1 = trimming(sO_s1)
    # Binning
    sO_s1 = binner(sO_s1)

    specTest_mz.append(np.array(sO_s1.mz))
    specTest_intensity.append(np.array(sO_s1.intensity))


specTest_mz = np.array(specTest_mz)
specTest_intensity = np.array(specTest_intensity)



Método DA

In [25]:
def data_augmentation(n_increase):
    #preprocesado baseline
    spec1C_mz = []
    spec1C_intensity = []

    for i, s in enumerate(baseline):
        
        # Preprocessing: Varaince stab.,smoother, baseline removal, intensity caibration (TIC), trimming
        sO_s1 = varianceStabilizer(s)
        sO_s1 = smoother(sO_s1)
        sO_s1 = baselineRemove(sO_s1)
        sO_s1 = intensityCalibration(sO_s1)
        sO_s1 = trimming(sO_s1)
        # Binning
        sO_s1 = binner(sO_s1)

        spec1C_mz.append(np.array(sO_s1.mz))
        spec1C_intensity.append(np.array(sO_s1.intensity))


    spec1C_mz = np.array(spec1C_mz)
    spec1C_intensity = np.array(spec1C_intensity)
    augmented_spec1C_intensity = []
    augmented_spec1C_mz = []
    augmented_labels = []
    n_shift = spec1C_mz.shape[1]
    n_50 = int(n_shift * 0.50)
    n_30 = int(n_shift * 0.30)
    n_15 = int(n_shift * 0.15)
    n_5 = int(n_shift * 0.05)
    # Permutar aleatoriamente los índices para seleccionarlos
    indices = np.random.permutation(n_shift)
    indices_50 = indices[:n_50]
    indices_30 = indices[n_50:n_50 + n_30]
    indices_15 = indices[n_50 + n_30:n_50 + n_30 + n_15]
    indices_5 = indices[n_50 + n_30 + n_15:] 

    for mz, intensity, label in zip(spec1C_mz, spec1C_intensity, Y_train):
        
        int_modificados = []
        mz_modificados = []
        Y_modificados = []


        shifted_mz  = mz.copy()
        # Desplazar el 30% en ±3 Da
        desplazamiento_20 = np.random.uniform(-3, 3, size=n_30)
        shifted_mz[indices_30] += desplazamiento_20

        # Desplazar el 15% en ±60 Da
        desplazamiento_10_1 = np.random.uniform(-6, 6, size=n_15)
        shifted_mz[indices_15] += desplazamiento_10_1

        # Desplazar el último 5% en ±9 Da
        desplazamiento_10_2 = np.random.uniform(-9, 9, size=n_5)
        shifted_mz[indices_5] += desplazamiento_10_2

        interpolated = np.interp(mz, shifted_mz, intensity, left=0, right=0)

        for _ in range(n_increase):
            int_modificados.append(interpolated)
            mz_modificados.append(shifted_mz)
            Y_modificados.append(label)

        augmented_spec1C_intensity.append(intensity)
        augmented_spec1C_intensity.extend(int_modificados)
        augmented_spec1C_mz.append(mz)
        augmented_spec1C_mz.extend(mz_modificados)
        augmented_labels.append(label)
        augmented_labels.extend(Y_modificados)

    spec1C_mz = np.array(augmented_spec1C_mz)
    spec1C_intensity = np.array(augmented_spec1C_intensity)
    labels_1C = np.array(augmented_labels).reshape(-1, 1)
    return spec1C_mz, spec1C_intensity, labels_1C

# RF

In [26]:
n_estimators = [10, 50, 100, 200, 300]

## baseline

In [27]:

param_grid = {'n_estimators': n_estimators}
loo = LeaveOneOut()
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=loo, n_jobs=-1)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)

    grid_search.fit(spec1C_intensity, labels_1C)
    accuracies.append(grid_search.best_score_)
    
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


The accuracy is 100.00 ± 0.00


  return fit_method(estimator, *args, **kwargs)


## Test

In [82]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    rf_model.fit(spec1C_intensity, labels_1C.ravel())
    pred = rf_model.predict(specTest_intensity)
    accuracies.append(accuracy_score(pred, Y_train))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 86.00 ± 1.86


# SVM + RBF

In [29]:
gamma = np.logspace(-3, 3, 10)
C = np.logspace(-3, 3, 10)

In [68]:
k = PairwiseKernel(metric='rbf')
param_grid = {'estimator__C': C, 'estimator__gamma': gamma}
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed'))
loo = LeaveOneOut()
grid_search = GridSearchCV(estimator=svm_1vsallClos, param_grid=param_grid, cv=loo, n_jobs=-1)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X=spec1C_intensity)
    grid_search.fit(K_train, labels_1C)
    accuracies.append(grid_search.best_score_)
    
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 98.44 ± 0.89


## Test

In [79]:
specTest_intensity[49] = 0

In [80]:
np.where(np.isnan(specTest_intensity))

(array([], dtype=int64), array([], dtype=int64))

In [83]:
k = PairwiseKernel(metric='rbf')
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed', C=grid_search.best_params_['estimator__C'], gamma=grid_search.best_params_['estimator__gamma']))
accuracies = []

for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X=spec1C_intensity)
    svm_1vsallClos.fit(K_train, labels_1C)
    K_test = k( X=spec1C_intensity, Y=specTest_intensity).T
    pred = svm_1vsallClos.predict(K_test)
    accuracies.append(accuracy_score(pred, Y_train))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 61.50 ± 1.89


# SVM + Linear

In [116]:
gamma = np.logspace(-3, 3, 10)
C = np.logspace(-3, 3, 10)

In [32]:
k = PairwiseKernel(metric='linear')
param_grid = {'estimator__C': C}
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed'))
loo = LeaveOneOut()
grid_search = GridSearchCV(estimator=svm_1vsallClos, param_grid=param_grid, cv=loo, n_jobs=-1)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X=spec1C_intensity)
    grid_search.fit(K_train, labels_1C)
    accuracies.append(grid_search.best_score_)
    
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 93.78 ± 1.49


## Test

In [84]:
k = PairwiseKernel(metric='linear')
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed', C=grid_search.best_params_['estimator__C']))
accuracies = []

for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X=spec1C_intensity)
    svm_1vsallClos.fit(K_train, labels_1C)
    K_test = k( X=spec1C_intensity, Y=specTest_intensity).T
    pred = svm_1vsallClos.predict(K_test)
    accuracies.append(accuracy_score(pred, Y_train))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 56.83 ± 4.68


# SVM + PIKE

In [34]:
k = parall.PIKE(t=1)
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed'))
loo = LeaveOneOut()
grid_search = GridSearchCV(estimator=svm_1vsallClos, param_grid=param_grid, cv=loo, n_jobs=-1)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X_i=spec1C_intensity, X_mz=spec1C_mz)
    grid_search.fit(K_train, labels_1C)
    accuracies.append(grid_search.best_score_)
    
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 79.33 ± 2.57


## Test

In [85]:
k = parall.PIKE(t=1)
svm_1vsallClos = OneVsRestClassifier(SVC(kernel='precomputed', C=grid_search.best_params_['estimator__C']))
accuracies = []

for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    K_train = k( X_i=spec1C_intensity, X_mz=spec1C_mz)
    svm_1vsallClos.fit(K_train, labels_1C)
    K_test = k( X_i=spec1C_intensity, X_mz=spec1C_mz, Y_i=specTest_intensity, Y_mz=specTest_mz).T
    pred = svm_1vsallClos.predict(K_test)
    accuracies.append(accuracy_score(pred, Y_train))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))

The accuracy is 41.67 ± 1.83


# KNN

In [36]:
n_neighbors = [3, 5, 7, 9, 11]

In [42]:
loo = LeaveOneOut()
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': n_neighbors  
}
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=loo, n_jobs=-1)

for _ in range(10):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)

    grid_search.fit(spec1C_intensity, labels_1C.ravel())
    accuracies.append(grid_search.best_score_)
    
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))


The accuracy is 77.56 ± 15.93


## Test

In [86]:
knn = KNeighborsClassifier(n_neighbors=3)
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    knn.fit(spec1C_intensity, labels_1C.ravel())
    pred = knn.predict(specTest_intensity)
    accuracies.append(accuracy_score(pred, Y_train))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))


The accuracy is 44.17 ± 1.54


# LGBM

In [39]:
Y_train = np.array(Y_train)
Y_train[Y_train=='RT023']=0
Y_train[Y_train=='RT027']=1
Y_train[Y_train=='RT078']=2
Y_train[Y_train=='RT106']=3
Y_train[Y_train=='RT165']=4
Y_train[Y_train=='RT181']=5

In [40]:
param = {'num_leaves': 31, 'objective': 'multiclass', 'num_class': 6}
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    train_data = lgb.Dataset(spec1C_intensity, label=labels_1C)
    bst = lgb.train(param, train_data)
    ypred = bst.predict(spec1C_intensity, num_iteration=bst.best_iteration)
    y_pred = np.argmax(ypred, axis=1)
    Y_train_int = labels_1C.astype(int)
    accuracies.append(accuracy_score(Y_train_int, y_pred))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324966
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324839
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325364
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325287
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324939
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324983
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325083
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325491
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325549
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325253
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
The accuracy is 100.00 ± 0.00


## Test

In [87]:
param = {'num_leaves': 31, 'objective': 'multiclass', 'num_class': 6}
accuracies = []
for _ in range(100):
    spec1C_mz, spec1C_intensity, labels_1C = data_augmentation(2)
    train_data = lgb.Dataset(spec1C_intensity, label=labels_1C)
    bst = lgb.train(param, train_data)
    ypred = bst.predict(specTest_intensity, num_iteration=bst.best_iteration)
    y_pred = np.argmax(ypred, axis=1)
    Y_train_int = Y_train.astype(int)
    accuracies.append(accuracy_score(Y_train_int, y_pred))
mean_accuracy = 100 * np.mean(accuracies)
std_accuracy = 100 * np.sqrt(np.var(accuracies))
print("The accuracy is %2.2f ± %2.2f" % (mean_accuracy, std_accuracy))



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325036
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038945 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325496
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325201
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324978
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324934
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325721
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325118
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325089
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325133
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325034
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324857
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325110
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325228
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325266
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325307
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325161
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325250
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325088
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325318
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324849
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6000
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
The accuracy is 61.00 ± 7.10
