In [1]:
import pandas as pd

from libs.preprocessing import MetaboPreprocer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Input requirements (see README.md for the expected layout):
#   - The dataset must contain a "Name" column and a "Label" column.
#   - Every column except "Name" and "Label" must hold numeric values only.
#   - Blank samples must be labelled exactly 'blank' (not "Blank", "B", 'b', ...).
#   - Only .xlsx and .csv files are supported by this analysis process.

original_dataset = pd.read_excel(
    './data/example_dataset.xlsx',
    sheet_name='Pos_transpose',
)
original_dataset.head(3)

Unnamed: 0,Name,Label,0.5_201.8884,0.5_230.8912,0.51_170.9287,0.51_182.9633,0.51_214.9182,0.51_344.8764,0.52_125.9871,0.52_140.918,...,10.41_796.5461,10.41_804.5518,10.42_350.2677,10.44_269.1618,10.44_401.342,10.47_281.0519,10.47_415.0373,10.47_503.1079,10.47_536.166,10.47_564.1974
0,A1,blank,0.0,0.0,0.0,0.0,0.0,0.0,74.93914,0.0,...,0.0,396.837,0.0,0.0,0.0,0.0,0.0,0.0,628.3788,0.0
1,A2,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1246.032,434.3936,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A3,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,309.299,0.0,688.155,0.0,0.0,0.0,0.0,0.0,1177.744,0.0


In [3]:
# Configure the preprocessor.
#
# Two optional data-filtering steps remove uninformative features. They are strongly
# recommended for untargeted metabolomics datasets with a large number of features:
#   step_blank_substrateion : subtract the blank values from the original intensities and
#                             remove features with a large proportion of negative values
#                             (keyword spelled exactly as it is passed in the call below)
#   step_filter_rsd         : remove features with a high percent RSD, based on the RSD of
#                             the QC samples
#
# RSD = relative standard deviation = standard deviation / mean
# QC  = quality control

preprocesser = MetaboPreprocer(
    scaling='pareto',
    qc_label='Mix',  # presumably the Label value that marks the QC samples -- confirm in README.md
    step_blank_substrateion=True,
    step_filter_rsd=False,
    file_save=True,  # presumably writes the processed data to disk -- confirm in libs.preprocessing
)

In [4]:
# preprocessing() hands back two datasets:
#   df_scaled : filtered (when step_blank_substrateion or step_filter_rsd is True) and scaled
#   df_ori    : filtered only, not scaled

(df_scaled, df_ori) = preprocesser.preprocessing(
    './data/example_dataset.xlsx', sheet_name='Pos_transpose'
)

df_scaled.head(3)

Original dataset shape (n_samples, n_features + 2) = (134, 3011)
A total of 0 missing values were detected
25 features have all zero values. These featuers are removed
Removal of featuers with more than 50.0% proportion of negative values in all groups
Before the substration process, dataset shape (n_samples, n_features + 2) = (134, 2986).
After the substration process, dataset shape (n_samples, n_features + 2) = (107, 1015).
['all'] features are selected and scaled
Processed dataset shape (n_samples, n_features + 2) = (107, 1015)



Unnamed: 0,Name,Label,0.5_201.8884,0.5_230.8912,0.51_170.9287,0.51_182.9633,0.51_214.9182,0.51_344.8764,0.52_125.9871,0.52_140.918,...,10.26_617.4752,10.33_617.4756,10.34_393.2981,10.4_756.5543,10.41_617.4755,10.41_618.4799,10.41_796.5461,10.41_804.5518,10.42_350.2677,10.44_401.342
0,A28,Control,25.402731,76.430501,1.402912,19.718165,15.70668,0.693475,64.080131,4.914039,...,-23.362062,-24.231029,-15.65081,-12.832867,-6.764024,-10.588823,-38.061279,-18.356419,-1.677112,-130.1531
1,A29,Control,23.74861,25.736515,-20.099203,94.010693,5.542898,-20.212123,52.941544,-18.575459,...,19.806922,-10.039561,-15.65081,-12.832867,8.068956,-3.34479,-31.940344,235.631566,5.977682,-130.1531
2,A30,Control,20.258134,-14.31336,12.35333,85.492759,112.876886,19.538083,46.254057,5.92226,...,81.507408,18.560967,-9.891919,-12.832867,1.522044,-10.588823,-38.061279,532.952331,-13.333307,-130.1531


In [5]:
# Scaling can be restricted to selected groups by passing them through the label_list option.
# NOTE: this call rebinds df_scaled and df_ori, replacing the results of the previous cell.

(df_scaled, df_ori) = preprocesser.preprocessing(
    './data/example_dataset.xlsx',
    sheet_name='Pos_transpose',
    label_list=['Control', 'Severe', 'Moderate', 'Mild'],
)

df_scaled.head(3)

Original dataset shape (n_samples, n_features + 2) = (134, 3011)
A total of 0 missing values were detected
25 features have all zero values. These featuers are removed
Removal of featuers with more than 50.0% proportion of negative values in all groups
Before the substration process, dataset shape (n_samples, n_features + 2) = (134, 2986).
After the substration process, dataset shape (n_samples, n_features + 2) = (107, 1015).
['Control', 'Severe', 'Moderate', 'Mild'] features are selected and scaled
Processed dataset shape (n_samples, n_features + 2) = (80, 1015)



Unnamed: 0,Name,Label,0.5_201.8884,0.5_230.8912,0.51_170.9287,0.51_182.9633,0.51_214.9182,0.51_344.8764,0.52_125.9871,0.52_140.918,...,10.26_617.4752,10.33_617.4756,10.34_393.2981,10.4_756.5543,10.41_617.4755,10.41_618.4799,10.41_796.5461,10.41_804.5518,10.42_350.2677,10.44_401.342
0,A28,Control,28.562293,79.076344,7.157489,16.259481,47.526689,11.108131,64.328621,6.589934,...,-20.975425,-23.940256,-16.661077,-15.946146,-5.093396,-8.955816,-45.211807,-22.849793,2.580376,-142.83197
1,A29,Control,26.913466,28.311775,-14.08936,86.584976,37.502832,-9.764019,53.391129,-16.97549,...,19.470389,-10.591632,-16.661077,-15.946146,8.897729,-2.074044,-39.496798,214.686365,11.115489,-142.83197
2,A30,Control,23.434162,-11.793862,17.977907,78.521878,143.35915,29.922588,46.824376,7.601415,...,77.278707,16.310285,-9.729683,-15.946146,2.722391,-8.955816,-45.211807,492.748462,-10.416306,-142.83197
