# Fibroscan

In [1]:
import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

df = pd.read_csv("data/fibroscan_predict_df.csv")
df.loc[:, "target"] = df.kPa_fib.map(lambda x: 1 if x >= 17 else 0)
df.loc[:, "FIB4"] = df.apply(lambda x: (x["age"] * x["AST"]) / (x['PLT'] * np.sqrt(x['ALT']) + 1e-8), axis=1)
df.loc[:, "APRI"] = df.apply(lambda x: (x["AST"] / 34) * 100 / (x['PLT'] + 1e-8), axis=1)
df = df.drop(["ID", "kPa_fib"], axis=1)

kf = StratifiedKFold(n_splits=5, random_state=42)

clf = setup(data=df, target='target', 
            session_id=42,
            fold_strategy=kf,
            normalize=True,
            fix_imbalance=True,
            remove_outliers = True)
add_metric('auprc', 'AUPRC', average_precision_score, target='pred_proba')

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(328, 12)"
5,Missing Values,True
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                             AUPRC
Display Name                                                     AUPRC
Score Function       <function average_precision_score at 0x7f19fb6...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: auprc, dtype: object

In [18]:
from pycaret.utils import check_metric

FIB4_auroc_results = []
FIB4_auprc_results = []

APRI_auroc_results = []
APRI_auprc_results = []

X = df.loc[:, ["v1", "v2", "v3", "v4", "v5", "AST", "ALT", "age", "PLT", "FIB4", "APRI"]]
y = df.loc[:, "target"]
for train_index, test_index in kf.split(X, y):
    train_df, test_df = df.loc[train_index], df.loc[test_index]
    test_df = test_df.fillna(0)
    
    clf = setup(data=df, target='target', 
            session_id=42,
            fold=5,
            normalize=True,
            fix_imbalance=True,
            remove_outliers = True)
    add_metric('auprc', 'AUPRC', average_precision_score, target='pred_proba')
    lr = create_model('lr')
    tuned_lr = tune_model(
        lr, optimize='AUC', n_iter=100, 
        search_library='optuna', choose_better=True)
    tuned_lr = finalize_model(tuned_lr)
    
    predict_model(tuned_lr, data=test_df)
    
    FIB4_auroc_score = roc_auc_score(test_df.target, test_df.FIB4).round(4)
    FIB4_auroc_results.append(FIB4_auroc_score)
    
    FIB4_auprc_score = average_precision_score(test_df.target, test_df.FIB4).round(4)
    FIB4_auprc_results.append(FIB4_auprc_score)
    
    APRI_auroc_score = roc_auc_score(test_df.target, test_df.APRI).round(4)
    APRI_auroc_results.append(APRI_auroc_score)
    
    APRI_auprc_score = average_precision_score(test_df.target, test_df.APRI).round(4)
    APRI_auprc_results.append(APRI_auprc_score)
    
results_df = pd.DataFrame({"Measure": ["FIB4", "FIB4", "FIB4", "FIB4", "FIB4", 
                                "APRI", "APRI", "APRI", "APRI", "APRI"], 
                           "AUROC": FIB4_auroc_results + APRI_auroc_results,
                           "AUPRC": FIB4_auprc_results + APRI_auprc_results
                          })

results_df

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8636,0.9538,0.6,0.4286,0.5,0.4236,0.4316,0.7611
1,0.8864,0.9487,0.8,0.5,0.6154,0.5528,0.5739,0.6644
2,0.7907,0.7949,0.25,0.1429,0.1818,0.0719,0.0757,0.2411
3,0.814,0.8974,0.75,0.3,0.4286,0.341,0.3923,0.8
4,0.907,0.9167,0.75,0.5,0.6,0.5497,0.5642,0.6708
Mean,0.8523,0.9023,0.63,0.3743,0.4652,0.3878,0.4075,0.6275
Std,0.0437,0.0576,0.2015,0.1368,0.1572,0.177,0.1807,0.2001


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
0,Logistic Regression,0.9077,0.9704,1.0,0.5385,0.7,0.6512,0.6948,0.7952


Unnamed: 0,Measure,AUROC,AUPRC
0,FIB4,0.7191,0.3492
1,FIB4,0.8378,0.3943
2,FIB4,0.7337,0.4716
3,FIB4,0.936,0.5804
4,FIB4,0.9261,0.6203
5,APRI,0.7676,0.3257
6,APRI,0.7215,0.2504
7,APRI,0.7579,0.542
8,APRI,0.8768,0.5569
9,APRI,0.9433,0.6763


In [14]:
results_df.groupby("Measure").mean().round(2)

Unnamed: 0_level_0,AUROC,AUPRC
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1
APRI,0.81,0.47
FIB4,0.83,0.48


In [15]:
results_df.groupby("Measure").std().round(2)

Unnamed: 0_level_0,AUROC,AUPRC
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1
APRI,0.09,0.18
FIB4,0.1,0.12


In [12]:
lr = create_model('lr')
tuned_lr = tune_model(
    lr, optimize='AUC', n_iter=100, 
    search_library='optuna', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8636,0.9538,0.6,0.4286,0.5,0.4236,0.4316,0.7611
1,0.8864,0.9487,0.8,0.5,0.6154,0.5528,0.5739,0.6644
2,0.7907,0.7949,0.25,0.1429,0.1818,0.0719,0.0757,0.2327
3,0.7907,0.9295,0.75,0.2727,0.4,0.3052,0.3627,0.8167
4,0.907,0.9167,0.75,0.5,0.6,0.5497,0.5642,0.6708
Mean,0.8477,0.9087,0.63,0.3688,0.4594,0.3807,0.4016,0.6292
Std,0.0485,0.0585,0.2015,0.1402,0.159,0.1795,0.1815,0.2063


In [5]:
dt = create_model('dt')
tuned_dt = tune_model(
    dt, optimize='AUC', n_iter=100, 
    search_library='optuna', choose_better=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.7955,0.8949,1.0,0.3571,0.5263,0.431,0.5241,0.4048
1,0.8864,0.9359,0.6,0.5,0.5455,0.4811,0.4837,0.5
2,0.7674,0.75,0.75,0.25,0.375,0.2736,0.3362,0.2108
3,0.9302,0.9455,0.75,0.6,0.6667,0.6282,0.6331,0.5269
4,0.9302,0.8494,0.75,0.6,0.6667,0.6282,0.6331,0.4733
Mean,0.8619,0.8751,0.77,0.4614,0.556,0.4885,0.5221,0.4231
Std,0.0682,0.0712,0.1288,0.1382,0.1079,0.1331,0.1102,0.1137


In [16]:
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(
    lightgbm, optimize='AUC', n_iter=100, 
    search_library='optuna', choose_better=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8864,0.9385,0.8,0.5,0.6154,0.5528,0.5739,0.7
1,0.8864,0.9538,0.6,0.5,0.5455,0.4811,0.4837,0.7611
2,0.8605,0.8654,0.75,0.375,0.5,0.4292,0.4641,0.4896
3,0.8837,0.9551,0.75,0.4286,0.5455,0.4844,0.5094,0.7875
4,0.907,0.9359,0.75,0.5,0.6,0.5497,0.5642,0.8214
Mean,0.8848,0.9297,0.73,0.4607,0.5613,0.4995,0.5191,0.7119
Std,0.0148,0.0331,0.0678,0.051,0.0417,0.0466,0.0434,0.1181


In [16]:
best_model = finalize_model(lightgbm)
save_model(best_model, 'weights/fibroscan_lightgbm_classification')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='target',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20,

# MRE

In [18]:
import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

df = pd.read_csv("data/mre_predict_df.csv")
df.loc[:, "target"] = df.kPa_mre.map(lambda x: 1 if x >= 6 else 0)
df.loc[:, "FIB4"] = df.apply(lambda x: (x["age"] * x["AST"]) / (x['PLT'] * np.sqrt(x['ALT']) + 1e-8), axis=1)
df.loc[:, "APRI"] = df.apply(lambda x: (x["AST"] / 34) * 100 / (x['PLT'] + 1e-8), axis=1)
df = df.drop(["ID", "kPa_mre"], axis=1)

kf = StratifiedKFold(n_splits=5, random_state=42)

clf = setup(data=df, target='target', 
            session_id=42,
            fold_strategy=kf,
            normalize=True,
            fix_imbalance=True,
            remove_outliers = True)
add_metric('auprc', 'AUPRC', average_precision_score, target='pred_proba')

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(713, 12)"
5,Missing Values,True
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


Name                                                             AUPRC
Display Name                                                     AUPRC
Score Function       <function average_precision_score at 0x7f4fd30...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: auprc, dtype: object

In [19]:
FIB4_auroc_results = []
FIB4_auprc_results = []

APRI_auroc_results = []
APRI_auprc_results = []

X = df.loc[:, ["v1", "v2", "v3", "v4", "v5", "AST", "ALT", "age", "PLT", "FIB4", "APRI"]]
y = df.loc[:, "target"]
for train_index, test_index in kf.split(X, y):
    train_df, test_df = df.loc[train_index], df.loc[test_index]
    test_df = test_df.fillna(0)
    
    FIB4_auroc_score = roc_auc_score(test_df.target, test_df.FIB4).round(4)
    FIB4_auroc_results.append(FIB4_auroc_score)
    
    FIB4_auprc_score = average_precision_score(test_df.target, test_df.FIB4).round(4)
    FIB4_auprc_results.append(FIB4_auprc_score)
    
    APRI_auroc_score = roc_auc_score(test_df.target, test_df.APRI).round(4)
    APRI_auroc_results.append(APRI_auroc_score)
    
    APRI_auprc_score = average_precision_score(test_df.target, test_df.APRI).round(4)
    APRI_auprc_results.append(APRI_auprc_score)
    
results_df = pd.DataFrame({"Measure": ["FIB4", "FIB4", "FIB4", "FIB4", "FIB4", 
                                "APRI", "APRI", "APRI", "APRI", "APRI"], 
                           "AUROC": FIB4_auroc_results + APRI_auroc_results,
                           "AUPRC": FIB4_auprc_results + APRI_auprc_results
                          })

results_df

Unnamed: 0,Measure,AUROC,AUPRC
0,FIB4,0.8574,0.4847
1,FIB4,0.8962,0.3804
2,FIB4,0.9534,0.6006
3,FIB4,0.8855,0.3755
4,FIB4,0.792,0.2665
5,APRI,0.8367,0.4639
6,APRI,0.8872,0.2835
7,APRI,0.8887,0.384
8,APRI,0.8622,0.3206
9,APRI,0.7836,0.1896


In [20]:
results_df.groupby("Measure").mean().round(2)

Unnamed: 0_level_0,AUROC,AUPRC
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1
APRI,0.85,0.33
FIB4,0.88,0.42


In [21]:
results_df.groupby("Measure").std().round(2)

Unnamed: 0_level_0,AUROC,AUPRC
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1
APRI,0.04,0.1
FIB4,0.06,0.13


In [22]:
lr = create_model('lr')
tuned_lr = tune_model(
    lr, optimize='AUC', n_iter=1000, 
    search_library='optuna', choose_better=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8526,0.9167,1.0,0.1765,0.3,0.2603,0.3868,0.2125
1,0.7684,0.9121,0.75,0.125,0.2143,0.1532,0.24,0.5219
2,0.8,0.9286,1.0,0.1739,0.2963,0.2419,0.3709,0.5371
3,0.8211,0.6978,0.5,0.1176,0.1905,0.1313,0.1756,0.1645
4,0.7979,0.7473,0.6667,0.1,0.1739,0.1254,0.2014,0.0844
Mean,0.808,0.8405,0.7833,0.1386,0.235,0.1824,0.2749,0.3041
Std,0.0279,0.0977,0.1944,0.031,0.0532,0.0572,0.0874,0.1886


In [23]:
dt = create_model('dt')
tuned_dt = tune_model(
    dt, optimize='AUC', n_iter=1000, 
    search_library='optuna', choose_better=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8947,0.9457,1.0,0.2308,0.375,0.3412,0.4535,0.2308
1,0.7579,0.7541,0.75,0.12,0.2069,0.1448,0.2318,0.1005
2,0.8526,0.9231,1.0,0.2222,0.3636,0.3165,0.4336,0.2222
3,0.8526,0.9231,1.0,0.2222,0.3636,0.3165,0.4336,0.2222
4,0.8404,0.7674,0.6667,0.125,0.2105,0.1657,0.2398,0.0947
Mean,0.8397,0.8627,0.8833,0.184,0.3039,0.257,0.3585,0.1741
Std,0.0448,0.0837,0.1453,0.0504,0.0779,0.0838,0.1004,0.0625


In [24]:
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(
    lightgbm, optimize='AUC', n_iter=1000, 
    search_library='optuna', choose_better=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8947,0.9946,1.0,0.2308,0.375,0.3412,0.4535,0.75
1,0.8947,0.8942,0.5,0.2,0.2857,0.24,0.2697,0.2827
2,0.8526,0.9313,1.0,0.2222,0.3636,0.3165,0.4336,0.2679
3,0.8947,0.908,0.25,0.125,0.1667,0.1171,0.1252,0.2167
4,0.8404,0.8993,0.6667,0.125,0.2105,0.1657,0.2398,0.1639
Mean,0.8755,0.9255,0.6833,0.1806,0.2803,0.2361,0.3044,0.3362
Std,0.0239,0.0368,0.2906,0.0465,0.0821,0.0856,0.1236,0.2111


In [13]:
best_model = finalize_model(lightgbm)
save_model(best_model, 'weights/mre_lightgbm_classification')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='target',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20,