In [1]:
import pandas as pd

from libs.preprocessing import MetaboPreprocer

import warnings
# Install a blanket 'ignore' filter: every warning raised after this line is suppressed,
# presumably to keep the cell outputs below uncluttered.
# NOTE(review): this also hides deprecation/runtime warnings coming from pandas or the
# preprocessing library — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Input requirements for the dataset:
#   - Apart from the "Name" and "Label" columns, the dataset must not contain non-numeric values.
#   - The "Name" and "Label" columns must both be present (please refer to the README.md).
#   - Blank samples must be labelled exactly 'blank' (not "Blank", "B", 'b', ...).
#   - Only .xlsx or .csv files are accepted by this analysis process.

# Load the raw negative-mode table for a quick visual check of its layout.
original_dataset = pd.read_csv(filepath_or_buffer='./data/raw_dataset_chickpea_neg.csv')
original_dataset.head(n=3)

Unnamed: 0,Name,Label,12.444_391.28538,10.965_407.28041,0.965_96.96959,12.42_201.07107,0.957_215.03285,0.834_112.98564,0.568_174.9561,0.901_201.0711,...,0.869_570.77863,8.866_546.83368,0.858_546.84453,9.061_617.79091,11.741_464.98163,13.753_293.00558,10.635_490.99722,12.388_653.00673,13.324_361.02914,6.743_614.78273
0,Area: 250405_chickpea_blan_Neg-1.raw (F1),blank,70242.5271,59094.04693,8135425.869,203364670.4,17671.90401,150678072.1,143423950.4,51620205.24,...,5809.181116,18741.18904,6185.528257,27440.8011,10672.63844,10577.03325,8718.823011,12921.06786,17589.632,11148.50964
1,Area: 250405_chickpea_blank_Neg-2.raw (F2),blank,41815.57105,58359.06506,7959795.611,203832364.8,18447.76988,166400662.5,139130878.9,52638067.57,...,5569.250441,17664.14681,5807.808732,24611.57355,14995.56991,10922.04118,15542.34703,11681.41289,18898.16855,17230.50128
2,Area: 250405_chickpea_blank_Neg-3.raw (F3),blank,59675.22922,57206.10278,8400420.33,207280491.9,18843.59144,156938200.8,149283140.7,54136856.43,...,6157.004817,14046.58235,6476.441746,22523.84836,11085.3784,12518.67344,47362.63732,14234.32919,22476.67342,12644.66897


In [3]:
# Feature filtering is controlled by two optional steps, `step_blank_substrateion` and `step_filter_rsd`.
# Both remove uninformative features and are strongly recommended for untargeted
# metabolomics datasets with a large number of features.
#
#   step_blank_substrateion : subtract the blank values from the original intensities, then
#                             remove features with a large number of negative values
#   step_filter_rsd         : remove features with a high percent RSD, based on the RSD of the QC samples
#
# RSD = standard deviation / mean
# QC  = quality control

preprocess_options = {
    'step_blank_substrateion': True,
    'step_filter_rsd': False,
    # Note: this is the string 'None', not the Python None object
    # (presumably it disables scaling — confirm against MetaboPreprocer).
    'scaling': 'None',
    'file_save': True,
}
preprocesser = MetaboPreprocer(**preprocess_options)

In [4]:
# Run the preprocessing pipeline on the negative-ionization-mode dataset.
# preprocessing() returns two frames:
#   df_scaled_neg = scaled and filtered dataset (filtering is applied only if
#                   step_blank_substrateion or step_filter_rsd is True)
#   df_ori_neg    = filtered-only dataset
# Mode-specific names are used so that a later preprocessing run that rebinds the
# generic df_scaled / df_ori names cannot discard these results.

df_scaled_neg, df_ori_neg = preprocesser.preprocessing(
    './data/raw_dataset_chickpea_neg.csv',
)

# Generic aliases kept for backward compatibility with code that uses the unsuffixed names.
df_scaled, df_ori = df_scaled_neg, df_ori_neg

df_ori_neg.head(3)

Original dataset shape (n_samples, n_features + 2) = (25, 825)
A total of 0 missing values were detected
Removal of featuers with more than 50.0% proportion of negative values in all groups
Before the substration process, dataset shape (n_samples, n_features + 2) = (25, 825).
After the substration process, dataset shape (n_samples, n_features + 2) = (20, 751).
['all'] features are selected and scaled
Processed dataset shape (n_samples, n_features + 2) = (20, 751)



Unnamed: 0,Name,Label,12.444_391.28538,10.965_407.28041,0.965_96.96959,12.42_201.07107,0.957_215.03285,0.834_112.98564,0.901_201.0711,0.871_201.07109,...,13.554_675.02819,0.87_748.73755,0.869_570.77863,0.858_546.84453,9.061_617.79091,11.741_464.98163,13.753_293.00558,10.635_490.99722,12.388_653.00673,13.324_361.02914
0,Area: 250405_chickpea_S1_Neg.raw (F12),Control,316902000.0,3329367.0,244405600.0,0.0,209892.8,19270771.68,7108334.752,9526020.16,...,8344.766285,35119.91954,35277.975115,7232.217846,18995.214198,0.0,7192.291418,4853.383468,0.0,0.0
1,Area: 250405_chickpea_S2_Neg.raw (F13),Control,142443100.0,149176400.0,246112700.0,91634629.96,474105.7,8712366.58,0.0,0.0,...,25070.655945,53953.27053,37245.730555,6076.761836,280.022888,17344.592528,611.273568,33.866608,0.0,3226.030272
2,Area: 250405_chickpea_S3_Neg.raw (F14),Control,210645600.0,59524660.0,184954900.0,41314425.86,1128180.0,0.0,5111265.522,7528950.93,...,0.0,25422.48528,43222.125165,9813.716576,28220.232318,0.0,24261.079968,9338.296508,443.923804,0.0


In [5]:
# Run the preprocessing pipeline on the positive-ionization-mode dataset.
# preprocessing() returns two frames:
#   df_scaled_pos = scaled and filtered dataset (filtering is applied only if
#                   step_blank_substrateion or step_filter_rsd is True)
#   df_ori_pos    = filtered-only dataset
# Mode-specific names are used so these results stay distinguishable from any other
# preprocessing run that binds the generic df_scaled / df_ori names.

df_scaled_pos, df_ori_pos = preprocesser.preprocessing(
    './data/raw_dataset_chickpea_pos.csv',
)

# Generic aliases kept for backward compatibility with code that uses the unsuffixed names.
df_scaled, df_ori = df_scaled_pos, df_ori_pos

df_ori_pos.head(3)

Original dataset shape (n_samples, n_features + 2) = (25, 1296)
A total of 0 missing values were detected
Removal of featuers with more than 50.0% proportion of negative values in all groups
Before the substration process, dataset shape (n_samples, n_features + 2) = (25, 1296).
After the substration process, dataset shape (n_samples, n_features + 2) = (20, 1191).
['all'] features are selected and scaled
Processed dataset shape (n_samples, n_features + 2) = (20, 1191)



Unnamed: 0,Name,Label,0.935_203.05254,2.511_100.07558,1.697_132.10191,1.691_86.09633,1.011_118.08618,12.457_236.64026,1.134_72.08078,7.669_382.18887,...,0.868_384.86674,0.858_686.81739,0.887_442.80416,0.8_88.99012,0.886_416.80191,0.871_520.84225,0.878_510.81346,0.871_588.80784,0.826_390.91463,17.643_641.06816
0,Area: 250405_chickpea_S1_Pos.raw (F12),Control,6832765.0,90607980.0,291545900.0,279046500.0,583728200.0,771588800.0,288119500.0,674255200.0,...,125788.83123,165419.807578,203272.510072,0.0,143428.241748,177773.956064,170538.83242,188281.053306,9356.543706,0.0
1,Area: 250405_chickpea_S2_Pos.raw (F13),Control,1926549.0,314490200.0,162502800.0,162174500.0,309695000.0,327483400.0,183775000.0,718387800.0,...,85959.50553,134765.569278,31856.637942,0.0,68417.221748,125566.307564,114677.29322,121327.947906,0.0,0.0
2,Area: 250405_chickpea_S3_Pos.raw (F14),Control,5503422.0,421804100.0,45976290.0,80366760.0,589261500.0,466629000.0,81417260.0,703585500.0,...,212753.17193,168358.441178,165699.733372,0.0,110963.545448,137074.881064,108932.39862,152158.519306,0.0,0.0


In [None]:
# Due to slight errors in peak integration, the values for features 14.913_295.22793 and 13.355_315.25417 were manually corrected.
# The output files were saved separately for each ionization mode, labelled 'pos' and 'neg', respectively.