Data Modeling
===

Predicting the targets based on patient and biomarker data.

In [10]:
import json
from pathlib import Path

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.compose
import sklearn.ensemble
import sklearn.metrics
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tqdm import tqdm

import bcs.data_loader
import bcs.modeling

In [2]:
# Locate the sibling data/ and figures/ directories relative to the working directory.
# The data directory must already exist; the figures directory is created on demand.
project_root = Path.cwd() / ".."
data_dir = (project_root / "data").resolve()
assert data_dir.exists()
figures_dir = (project_root / "figures").resolve()
figures_dir.mkdir(exist_ok=True)

In [3]:
# Build the project's DataLoader, pointing it at the resolved data directory.
dl = bcs.data_loader.DataLoader(data_dir)

In [4]:
# Fetch the patient, biomarker and target dataframes through the loader's getters.
# NOTE(review): pdf and bdf are later joined via right_index=True, so they are
# presumably indexed by patient id and biomarker id respectively — confirm in DataLoader.
pdf = dl.get_patient_dataframe()
bdf = dl.get_biomarker_dataframe()
tdf = dl.get_target_dataframe()

In [5]:
# Attach patient and biomarker columns to every target row.
# Left joins keep each target row even when no match exists. validate="many_to_one"
# makes pandas raise a MergeError if an id is duplicated in the right-hand index,
# which would otherwise silently duplicate target rows in the modeling frame.
mdf = pd.merge(tdf, pdf, how="left", left_on="patient_id", right_index=True, validate="many_to_one")
mdf = pd.merge(mdf, bdf, how="left", left_on="biomarker_id", right_index=True, validate="many_to_one")
mdf.shape

(1734, 15174)

In [6]:
# config = bcs.modeling.ModelingConfig()
# model_evaluator = bcs.modeling.ModelEvaluator(config, mdf)

In [9]:
# Build one modeling configuration per biomarker-feature action. The experiment name
# is "gbm_BM" followed by the first letter of the action ("k" for keep, "e" for exclude).
configs = []
for biomarker_feature_action in ("keep", "exclude"):
    experiment_name = "gbm_BM" + biomarker_feature_action[0]
    config = bcs.modeling.ModelingConfig(
        experiment_name=experiment_name,
        biomarker_feature_action=biomarker_feature_action,
    )
    configs += [config]
len(configs)

2

In [11]:
# Run train_and_evaluate() once per configuration and pool all returned metric
# records into a single flat list; tqdm reports progress over the configurations.
metrics_list = []
progress = tqdm(configs)
for config in progress:
    model_evaluator = bcs.modeling.ModelEvaluator(config, mdf)
    model_metrics_list = model_evaluator.train_and_evaluate()
    metrics_list += model_metrics_list
len(metrics_list)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [06:15<00:00, 187.83s/it]


12

In [12]:
# Collect the metric records from all configurations into one table and display it.
metrics = pd.DataFrame(metrics_list)
metrics

Unnamed: 0,experiment_name,institution_name,n,n_pos,n_pos_pred,acc,f1_pos,roc_auc
0,gbm_BMk,"BioLab, Inc.",696,155,106,0.889368,0.704981,0.946741
1,gbm_BMk,Goodfellow Research Institute,123,34,30,0.886179,0.78125,0.896563
2,gbm_BMk,Johnson & Bloom Hospitals,56,10,5,0.910714,0.666667,0.952174
3,gbm_BMk,Montague Hospital,330,78,67,0.918182,0.813793,0.949532
4,gbm_BMk,Saint Penelope Medical Center,231,48,34,0.922078,0.780488,0.945128
5,gbm_BMk,University Hospital System,298,56,42,0.90604,0.714286,0.918167
6,gbm_BMe,"BioLab, Inc.",696,155,82,0.728448,0.202532,0.590007
7,gbm_BMe,Goodfellow Research Institute,123,34,50,0.707317,0.571429,0.744878
8,gbm_BMe,Johnson & Bloom Hospitals,56,10,7,0.803571,0.352941,0.478261
9,gbm_BMe,Montague Hospital,330,78,18,0.739394,0.104167,0.603454


In [10]:
# Select the configuration explicitly instead of relying on the `config` loop variable
# left over from an earlier cell. configs[-1] is the last configuration that was built,
# i.e. the one the leaked loop variable pointed at.
config = configs[-1]

# Candidate features: biomarker ("BM"), status and demographics columns.
feature_prefixes = ("BM", "status_", "demographics_")
feature_columns = [col for col in mdf.columns if col.startswith(feature_prefixes)]
print(f"{len(feature_columns)} feature columns before removal")

# Remove manually-excluded columns (a no-op when the exclusion list is empty).
excluded_columns = set(config.excluded_columns)
feature_columns = [col for col in feature_columns if col not in excluded_columns]

# Optionally drop every biomarker column, depending on the configuration.
if config.biomarker_feature_action == "exclude":
    feature_columns = [col for col in feature_columns if not col.startswith("BM")]
print(f"{len(feature_columns)} feature columns after removal")

15169 feature columns before removal
8 feature columns after removal


In [75]:
# Independent working copy holding only the selected features, the label and the
# institution used for grouping; later dtype changes must not touch mdf.
model_columns = [*feature_columns, "target_label", "institution_name"]
sdf = mdf[model_columns].copy()

In [76]:
# Cast the non-numeric feature columns to pandas categoricals.
# "category" is included in the selection so that re-running this cell (after the
# columns have already been converted) still finds the same columns instead of
# resetting categorical_columns to an empty list.
categorical_columns = list(sdf[feature_columns].select_dtypes(include=["object", "category"]).columns)

for categorical_column in categorical_columns:
    if categorical_column == "status_disease_sub_type":  # config.is_disease_sub_type_ordered
        # Ordered categorical whose order is the sorted order of the observed values.
        # Missing values are dropped first: sorting NaN together with strings raises
        # a TypeError, and NaN stays NaN after the cast anyway.
        observed_values = sorted(sdf[categorical_column].dropna().unique())
        ordered_cat = CategoricalDtype(observed_values, ordered=True)
        sdf[categorical_column] = sdf[categorical_column].astype(ordered_cat)
    else:
        sdf[categorical_column] = sdf[categorical_column].astype("category")
categorical_columns

['demographics_gender',
 'status_disease_sub_type',
 'status_smoking_status',
 'demographics_race']

In [44]:
# Column transformer that one-hot encodes every category-dtype column (dense output,
# unseen categories ignored) and passes all remaining columns through untouched.
one_hot_step = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
one_hot_selector = sklearn.compose.make_column_selector(dtype_include="category")
one_hot_encoder = sklearn.compose.make_column_transformer(
    (one_hot_step, one_hot_selector),
    remainder="passthrough",
)

In [48]:
# Compare the encoded width with the raw width. The encoder selects columns by
# "category" dtype, and only sdf has had its categorical features cast to that dtype;
# in mdf they are still plain object columns, which the selector would not pick up.
one_hot_encoder.fit_transform(sdf[feature_columns]).shape, sdf[feature_columns].shape

((1734, 236), (1734, 9))

In [77]:
# Ordinal-encode the category-dtype columns for gradient boosting with native
# categorical support.
# see: https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html
ordinal_step = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
ordinal_selector = sklearn.compose.make_column_selector(dtype_include="category")
ordinal_encoder = sklearn.compose.make_column_transformer(
    (ordinal_step, ordinal_selector),
    # non-categorical columns are kept as they are after the transformation
    remainder="passthrough",
    # output columns keep their original names (no transformer prefix)
    verbose_feature_names_out=False,
)

In [78]:
# Histogram-based gradient boosting classifier, told by name which columns are
# categorical. The estimator is only constructed here; it is not fitted in this cell.
clf = sklearn.ensemble.HistGradientBoostingClassifier(categorical_features=categorical_columns)

In [79]:
# Fit the ordinal encoder on the full feature frame, then show the output column
# order: encoded categorical columns first, passthrough columns after.
features_frame = sdf[feature_columns]
X = ordinal_encoder.fit_transform(features_frame)
ordinal_encoder.get_feature_names_out()

array(['demographics_gender', 'status_disease_sub_type',
       'status_smoking_status', 'demographics_race', 'demographics_age',
       'status_comorbidity_index', 'status_cohort_qualifier',
       'status_months_since_diagnosis'], dtype=object)

In [80]:
# Re-wrap the encoded array as a DataFrame carrying the original row index and the
# encoder's output column names, then inspect the resulting dtypes.
encoded_column_names = ordinal_encoder.get_feature_names_out()
X = pd.DataFrame(data=X, index=sdf.index, columns=encoded_column_names)
X.dtypes

demographics_gender              object
status_disease_sub_type          object
status_smoking_status            object
demographics_race                object
demographics_age                 object
status_comorbidity_index         object
status_cohort_qualifier          object
status_months_since_diagnosis    object
dtype: object

In [114]:
# Leave-one-institution-out evaluation: each institution is held out once as the
# validation fold while the classifier is trained on the rows of all the others.
metrics_list = []
for iname, valid_df in sdf.groupby("institution_name"):
    # A fresh encoder per fold, fitted on the training rows only. Categories that
    # were not seen during fitting are encoded as NaN (unknown_value=np.nan).
    ordinal_encoder = sklearn.compose.make_column_transformer(
        (
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
            sklearn.compose.make_column_selector(dtype_include="category"),
        ),
        remainder="passthrough",  # keep non-categorical columns after transformation
        verbose_feature_names_out=False,  # keep original feature names
    )
    train_df = sdf[~sdf.index.isin(valid_df.index)]
    # Fails if index labels are shared between the held-out fold and the other rows.
    assert len(train_df) + len(valid_df) == len(sdf)
    X_train = ordinal_encoder.fit_transform(train_df[feature_columns])
    X_valid = ordinal_encoder.transform(valid_df[feature_columns])

    # Restore row index and column names so categorical_features can be given by name.
    feature_names = ordinal_encoder.get_feature_names_out()
    X_train = pd.DataFrame(X_train, index=train_df.index, columns=feature_names)
    X_valid = pd.DataFrame(X_valid, index=valid_df.index, columns=feature_names)

    clf = sklearn.ensemble.HistGradientBoostingClassifier(
        categorical_features=categorical_columns,
    )
    clf.fit(X_train, train_df.target_label)

    y_pred = clf.predict(X_valid)
    y_score = clf.predict_proba(X_valid)[:, 1]  # probability of the second class
    y_true = valid_df.target_label

    # ROC AUC is undefined when the held-out fold contains a single class;
    # record NaN for that fold instead of aborting the whole loop.
    if y_true.nunique() > 1:
        roc_auc = sklearn.metrics.roc_auc_score(y_true, y_score)
    else:
        roc_auc = np.nan

    # Named fold_metrics so the `metrics` DataFrame built in an earlier cell is not
    # overwritten by a plain dict.
    fold_metrics = {
        "institution_name": iname,
        "n": len(y_true),
        "n_pos": y_true.sum(),
        "n_pos_pred": y_pred.sum(),
        "acc": (y_pred == y_true).sum() / len(y_true),
        "f1_pos": sklearn.metrics.f1_score(y_true, y_pred),
        "roc_auc": roc_auc,
    }
    metrics_list.append(fold_metrics)
len(metrics_list)

6

In [115]:
# Tabulate the per-institution metric records gathered by the hold-out loop.
metrics_df = pd.DataFrame(metrics_list)
metrics_df

Unnamed: 0,institution_name,n,n_pos,n_pos_pred,acc,f1_pos,roc_auc
0,"BioLab, Inc.",696,155,77,0.744253,0.232759,0.595063
1,Goodfellow Research Institute,123,34,48,0.723577,0.585366,0.751157
2,Johnson & Bloom Hospitals,56,10,6,0.785714,0.25,0.504348
3,Montague Hospital,330,78,20,0.745455,0.142857,0.621108
4,Saint Penelope Medical Center,231,48,18,0.78355,0.242424,0.637523
5,University Hospital System,298,56,37,0.755034,0.215054,0.556117


In [116]:
# Unweighted (per-institution) average of every metric column; each institution
# counts equally regardless of its number of rows.
per_institution = metrics_df.set_index("institution_name")
per_institution.mean(axis=0)

n             289.000000
n_pos          63.500000
n_pos_pred     34.333333
acc             0.756264
f1_pos          0.278077
roc_auc         0.610886
dtype: float64