In [1]:
import pandas as pd

from libs.statical_analysis import AdjustedTtestSelection
from libs.statical_analysis import OneWayAnova
from libs.utils import get_mean_dataset

# Install a catch-all "ignore" filter: with no category or module given, every
# warning raised after this point is suppressed for the whole session.
# NOTE(review): this also hides deprecation and numerical warnings coming from
# pandas or the analysis helpers - narrow the filter if those matter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Choosing the analysis class:
#   * comparing two groups           -> AdjustedTtestSelection
#   * comparing more than two groups -> OneWayAnova
#
# AdjustedTtestSelection uses the Benjamini-Hochberg procedure (adjusted t-test).
# OneWayAnova includes a Bonferroni post hoc test.


# Example 1: two-group analysis

feature_selection = AdjustedTtestSelection(fc_threshold=2.0,
                                           p_value_threshold=0.05,
                                           fdr_threshold=0.05,
                                           is_save_file=True)

# Load the dataset (using an already-filtered dataset is recommended).
df_ori_total = pd.read_csv('./data/dataset_filtered.csv')

# Keep only the rows whose Label is one of the two groups being compared,
# then renumber the index from zero.
df_ori_twogroup = (
    df_ori_total
    .loc[df_ori_total['Label'].isin(['Control', 'Severe'])]
    .reset_index(drop=True)
)

In [3]:
# feature_selection() also takes an optional "scaled_dataset" argument.
# According to the author's note it is only stored together with the final
# saved file, so it is omitted here.

df_static, df_selection = feature_selection.feature_selection(
    ori_dataset=df_ori_twogroup,
)
df_static.head()

The mean of 9.03_802.5958 is zero in any one group...
The mean of 9.16_802.5959 is zero in any one group...
The mean of 9.49_800.5799 is zero in any one group...
The mean of 9.53_453.3352 is zero in any one group...
feature_selection.xlsx file is saved


Unnamed: 0,feature,Control_mean,Severe_mean,Fold_change,Control_shaprio,Severe_shaprio,p_value,FDR_values
0,0.5_201.8884,509.5367,501.570145,1.015883,0.081802,0.024895,0.954683,0.975142
1,0.5_230.8912,1257.74302,1573.254175,0.799453,0.148225,0.766734,0.170866,0.269187
2,0.51_170.9287,7681.9544,7553.8976,1.016952,0.711665,0.018975,0.770309,0.842682
3,0.51_182.9633,13656.8316,11289.30005,1.209715,0.35584,0.193073,0.008052,0.039025
4,0.51_214.9182,333859.59,323926.155,1.030666,0.952468,0.004269,0.554841,0.663582


In [4]:
# Example 2: one-way ANOVA
# Use this when there are more than two labels (e.g. Control, Severe, Mild, ...).

# Reload the full dataset and list the distinct labels it contains.
df_ori_total = pd.read_csv('./data/dataset_filtered.csv')
print(df_ori_total['Label'].unique())

['Control' 'Severe' 'Moderate' 'Mild']


In [5]:
# Set up the one-way ANOVA; with is_save_file=True the run reports that
# oneway_analysis.xlsx is saved.
anova = OneWayAnova(is_save_file=True, p_value=0.05)

# Run on the full (all-label) dataset. The preview shows the overall one-way
# p-value followed by one column per pair of labels.
df_anova = anova.processing(df_ori_total)
df_anova.head()

oneway_analysis.xlsx file is saved


Unnamed: 0,one-way_pvalue,Control_Severe,Control_Moderate,Control_Mild,Severe_Moderate,Severe_Mild,Moderate_Mild
0.5_201.8884,0.74525,1.0,1.0,1.0,1.0,1.0,1.0
0.5_230.8912,0.092301,1.0,1.0,1.0,1.0,0.059695,0.320858
0.51_170.9287,0.541336,1.0,1.0,1.0,1.0,1.0,1.0
0.51_182.9633,0.042624,0.04831,1.0,1.0,0.232479,0.172111,1.0
0.51_214.9182,0.530117,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# To inspect the average intensity of each feature per label, call "get_mean_dataset".

df_mean = get_mean_dataset(dataset=df_ori_total,
                           label_list=['Control', 'Severe', 'Moderate', 'Mild'])

df_mean.head()

Unnamed: 0,Control,Severe,Moderate,Mild
0.5_201.8884,509.5367,501.570145,596.372305,463.5199
0.5_230.8912,1257.74302,1573.254175,1434.037685,1078.90875
0.51_170.9287,7681.9544,7553.8976,8133.84525,7623.29675
0.51_182.9633,13656.8316,11289.30005,13294.8078,13329.41165
0.51_214.9182,333859.59,323926.155,347818.69,333904.115
