# Build a binary classifier to predict disease status from a methylation biomarker

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind

# Data
- `mock.tsv` contains methylation data from 150 individuals as follows:
- A_xx, B_xx: methylation data from two orthogonal methods, e.g. A_0 is methylation measurement by method A at CpG site index 0. 
- Label: `1` (disease), `0` (no disease). There are 50 individuals with label 1 and 100 individuals with label 0.
- Cohort: There are 3 cohort groups (`Aug`, `Jul`, `Mar`), each with 50 individuals
- Cohort: Data from the same cohort come from the same experimental batch. All data are from the same biological population, so any difference between cohorts is due not to internal factors (e.g. health condition) but to external factors (e.g. different experimental conditions or seasonal effects).

In [3]:
# 1) Load your data
# The table is tab-separated. The relative path is resolved against the
# notebook's working directory, so adjust it if the file lives elsewhere.
df = pd.read_csv('../data/mock.tsv', sep='\t')   # or wherever your file lives
df

FileNotFoundError: [Errno 2] No such file or directory: '../data/mock.tsv'

In [None]:
# Summary statistics for the numeric columns of the table.
df.describe()

In [None]:
# Distinct cohort names, in order of first appearance.
df['Cohort'].unique()

In [None]:
# Number of samples in each cohort, in order of first appearance.
cohort_column = df['Cohort']
for cohort in cohort_column.unique():
    n_samples = cohort_column.eq(cohort).sum()
    print(f"Num of samples with Cohort == {cohort}: {n_samples}")

# Class balance of the binary label.
label_column = df['Label']
print(f"Num of sample with Label == 0 is {label_column.eq(0).sum()}")
print(f"Num of sample with Label == 1 is {label_column.eq(1).sum()}")


# Compare signal level between label 1 and label 0 for each feature
* A_47 looks clearly different between the two groups (label 0 vs. label 1)

In [None]:
# 1) Identify features & labels
# The column prefix distinguishes the two measurement methods (A_xx vs. B_xx).
features_A = [c for c in df.columns if c.startswith('A_')]
features_B = [c for c in df.columns if c.startswith('B_')]
labels     = sorted(df['Label'].unique())

# 2) Plot setup
width = 0.35
colors = {0: "lightsteelblue", 1: "salmon"}

# Filter the rows once per label instead of once per (label, feature) pair.
rows_by_label = {label: df[df['Label'] == label] for label in labels}

fig, axes = plt.subplots(2, 1, figsize=(20, 12), sharex=False)

for ax, features, title in zip(axes,
                               [features_A, features_B],
                               ['A Features', 'B Features']):
    x = np.arange(len(features))
    for idx, label in enumerate(labels):
        # Drop missing values per feature, as the per-feature t-tests do;
        # NaNs would otherwise propagate into the box statistics.
        data = [rows_by_label[label][feat].dropna() for feat in features]
        # Centre the group of boxes on each tick; for two labels this gives
        # offsets of -width/2 and +width/2.
        positions = x + (idx - (len(labels) - 1) / 2) * width
        bp = ax.boxplot(data, positions=positions, widths=width, patch_artist=True)
        for box in bp['boxes']:
            box.set_facecolor(colors[label])
        # Whiskers and caps follow the label colour; medians stay black so
        # they remain visible against the filled box.
        for component in ('whiskers', 'caps', 'medians'):
            for art in bp[component]:
                art.set_color(colors[label] if component != 'medians' else 'black')
    ax.set_title(f'{title} by Label')
    # boxplot() moves the ticks to the box positions; put them back on the
    # feature index so each tick sits between the boxes of one feature.
    ax.set_xticks(x)
    ax.set_xticklabels(features, rotation=90)
    ax.set_ylabel('Methylation')

# build one shared legend on the first subplot
patches = [
    mpatches.Patch(color=colors[label_value], label=f'Label {label_value}')
    for label_value in labels
]
axes[0].legend(handles=patches, loc='upper right', title='Group')

plt.tight_layout()
plt.show()

# Welch's t-test p-value between the two groups (label 0 vs. label 1) per feature
* A_47 is the only feature that differs significantly between label 0 and label 1, with p ≈ 1 × 10⁻¹⁸

In [None]:
# Per-feature Welch's t-test (label 0 vs. label 1), plotted as -log10(p).
fig, axs = plt.subplots(2, 1, figsize=(20, 10), sharex=False)

# Significance threshold (p = 0.01) drawn on both panels.
sig_line = -np.log10(0.01)
# Smallest positive normal float: keeps -log10(p) finite if a p-value
# underflows to exactly 0.
min_p = np.finfo(float).tiny

for ax, features, title in zip(
    axs,
    [features_A, features_B],
    ['A Features: –log₁₀(p)', 'B Features: –log₁₀(p)']
):
    # compute p-values
    pvals = []
    for feat in features:
        g0 = df.loc[df['Label'] == labels[0], feat].dropna()
        g1 = df.loc[df['Label'] == labels[1], feat].dropna()
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        _, p = ttest_ind(g0, g1, equal_var=False)
        pvals.append(p)
    pvals = np.asarray(pvals, dtype=float)
    # NaN p-values (e.g. from an empty group) stay NaN and are simply not drawn.
    minus_log_p = -np.log10(np.clip(pvals, min_p, 1.0))

    # scatter plot
    x = np.arange(len(features))
    ax.scatter(x, minus_log_p, color='black', s=25, zorder=5)

    # formatting
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(features, rotation=90, fontsize=8)
    # Keep "(p-value)" outside the math span so the label does not depend on
    # mathtext support for \text.
    ax.set_ylabel(r'$-\log_{10}$(p-value)')

    # optional: mark significance threshold at p=0.01
    # Use only finite values for the axis limit so NaNs or an empty feature
    # list cannot break set_ylim.
    finite = minus_log_p[np.isfinite(minus_log_p)]
    y_top = max(finite.max() if finite.size else 0.0, sig_line)
    ax.set_ylim(0, y_top * 1.1)  # add a little headroom
    ax.axhline(sig_line, color='red', linestyle='--', linewidth=1)
    ax.text(0, sig_line * 1.02, 'p = 0.01', color='red', va='bottom')

plt.tight_layout()
plt.show()


# A_47 is a feature that differentiates label 0 from label 1

In [None]:
# Column name of the candidate biomarker used in the plots and models below.
select_feature = 'A_47'

In [None]:
def plot_feature_vs_label(
    df,
    feature: str,
    bins: int = 20,
    colors: tuple = ('red', 'blue'),
    figsize: tuple = (12, 5)
):
    """
    Compare the distribution of `feature` between Label=0 and Label=1.

    Draws two panels side by side:
      1) the feature values of each class on its own row (y = label), with a
         small random vertical jitter so overlapping points stay visible
      2) overlapping histograms of the two classes on shared bin edges
    and annotates the histogram panel with the Welch's t-test p-value.

    Parameters:
    -----------
    df      : DataFrame with a 'Label' column (values 0 / 1) and `feature`.
    feature : name of the numeric column to compare.
    bins    : bin specification passed to numpy.histogram_bin_edges; the
              resulting edges are shared by both histograms.
    colors  : (color for Label=0, color for Label=1).
    figsize : figure size passed to plt.subplots.

    Returns:
    --------
    fig, (ax_scatter, ax_hist)
    """
    # Split out the two classes; missing values are ignored.
    x0 = df.loc[df['Label'] == 0, feature].dropna()
    x1 = df.loc[df['Label'] == 1, feature].dropna()

    # Welch's t-test (equal_var=False: no equal-variance assumption)
    _, pval = ttest_ind(x0, x1, equal_var=False)

    # Build the subplots
    fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=figsize)

    # Scatter (with vertical jitter around y=0 and y=1)
    jitter0 = np.random.uniform(-0.02, 0.02, size=len(x0))
    jitter1 = np.random.uniform(-0.02, 0.02, size=len(x1))
    ax_scatter.plot(x0, jitter0, '.', color=colors[0], alpha=0.6, label='Label=0')
    ax_scatter.plot(x1, 1 + jitter1, '.', color=colors[1], alpha=0.6, label='Label=1')
    ax_scatter.set_xlabel(feature)
    ax_scatter.set_ylabel('Label')
    ax_scatter.set_yticks([0, 1])
    ax_scatter.set_yticklabels(['0', '1'])
    ax_scatter.set_title(f'{feature} vs. Label (scatter)')
    ax_scatter.legend()

    # Overlapping histograms. Compute the bin edges once over the pooled
    # values: binning each class separately gives each its own edges and
    # widths, which makes the overlaid bar heights incomparable.
    pooled = np.concatenate([x0.to_numpy(), x1.to_numpy()])
    bin_edges = np.histogram_bin_edges(pooled, bins=bins)
    ax_hist.hist(x0, bins=bin_edges, alpha=0.6, color=colors[0], label='Label=0')
    ax_hist.hist(x1, bins=bin_edges, alpha=0.6, color=colors[1], label='Label=1')
    ax_hist.set_xlabel(feature)
    ax_hist.set_ylabel('Count')
    ax_hist.set_title(f'Distribution of {feature} by Label')
    ax_hist.legend()

    # Annotate p-value in the top-right corner (axes coordinates)
    ax_hist.text(
        0.95, 0.95, f"p = {pval:.2e}",
        transform=ax_hist.transAxes,
        ha='right', va='top',
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=0.5)
    )

    plt.tight_layout()
    return fig, (ax_scatter, ax_hist)


In [None]:
# Scatter + histogram comparison of the raw selected feature between labels.
fig, (ax1, ax2) = plot_feature_vs_label(df, select_feature)

# Logistic regression binary classifier with A_47
* Logistic regression can be used as a quick test. 
* Test AUC-ROC: 0.870

In [None]:
def train_and_plot_lr(
    df,
    feature: str,
    label: str = 'Label',
    test_size: float = 0.2,
    random_state: int = 42,
    solver: str = 'liblinear',
    figsize: tuple = (6, 4)
):
    """
    Fit a single-feature logistic regression and evaluate it on a hold-out split.

    The rows of `df` are split into train and test parts (stratified on the
    label column), a LogisticRegression is fitted on df[[feature]] of the
    training part, and the test AUC-ROC plus a classification report are
    printed. The test ROC curve is then plotted and shown.

    Returns:
        clf        : the fitted LogisticRegression
        X_test     : single-column feature DataFrame of the test split
        y_test     : labels of the test split
        y_proba    : predicted probability of class 1 for X_test
    """
    # Stratified hold-out split; random_state makes it reproducible.
    predictors = df[[feature]]
    target = df[label]
    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        stratify=target,
        random_state=random_state,
    )

    # Fit the model on the training part only.
    clf = LogisticRegression(solver=solver, random_state=random_state)
    clf.fit(X_train, y_train)

    # Hard predictions feed the report; class-1 probabilities feed the ROC.
    y_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    auc = roc_auc_score(y_test, y_proba)

    print(f"Test AUC-ROC: {auc:.3f}\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # ROC curve with the chance diagonal for reference.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    _, roc_ax = plt.subplots(figsize=figsize)
    roc_ax.plot(fpr, tpr, lw=2, label=f"{feature} (AUC = {auc:.2f})")
    roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title(f"ROC Curve for {feature}")
    roc_ax.legend(loc="lower right")
    roc_ax.grid(alpha=0.3)
    plt.show()

    return clf, X_test, y_test, y_proba

In [None]:
# Baseline model on the raw selected feature. Use the shared `select_feature`
# variable, as the plotting cells do, so the biomarker is named in one place.
clf, X_test, y_test, y_proba = train_and_plot_lr(df, feature=select_feature)

# There is a batch effect by cohort, which can be removed by subtracting a per-cohort offset

In [None]:
def plot_feature_by_cohort_label(
    df,
    feature: str,
    cohorts=None,
    labels=(0, 1),
    width=0.6,
    figsize=(10, 6),
    colors=('lightsteelblue', 'salmon')
):
    """
    Plot boxplots of `feature` broken out by `cohorts` and `labels`.

    One box is drawn per (cohort, label) pair. Boxes of the same cohort sit
    next to each other and cohorts are separated by one empty slot.

    Parameters:
    -----------
    df        : pandas.DataFrame
        Your data containing `feature`, 'Cohort', and 'Label' columns.
    feature   : str
        Name of the numeric column to plot.
    cohorts   : list-like, optional
        Sequence of cohort names. Defaults to sorted unique values in df['Cohort'].
    labels    : tuple of int, optional
        The label values to compare (default (0, 1)).
    width     : float, optional
        Width of each box.
    figsize   : tuple, optional
        Figure size.
    colors    : tuple of str, optional
        Face colors, one per label; cycled if there are more labels than
        colors. An empty sequence leaves the default face color.

    Returns:
    --------
    fig, ax : matplotlib Figure and Axes
    """
    if cohorts is None:
        cohorts = sorted(df['Cohort'].unique())

    # 1) Gather data and x-positions
    data, positions, xticks, label_slots = [], [], [], []
    step = len(labels) + 1  # one empty slot between consecutive cohorts
    for i, cohort in enumerate(cohorts):
        for j, lab in enumerate(labels):
            subset = df.loc[(df['Cohort'] == cohort) & (df['Label'] == lab), feature]
            data.append(subset.dropna())
            positions.append(i * step + j)
            xticks.append(f"{cohort}\nLabel {lab}")
            label_slots.append(j)

    # 2) Create boxplot
    fig, ax = plt.subplots(figsize=figsize)
    bp = ax.boxplot(data, positions=positions, widths=width, patch_artist=True)

    # 3) Color boxes by label index, so a label keeps its color in every
    #    cohort even when len(colors) != len(labels).
    if colors:
        for patch, slot in zip(bp['boxes'], label_slots):
            patch.set_facecolor(colors[slot % len(colors)])

    # 4) Ticks and styling
    ax.set_xticks(positions)
    ax.set_xticklabels(xticks, rotation=45, ha='right')
    ax.set_ylabel(feature)
    ax.set_title(f"{feature} by Cohort and Label")
    ax.axhline(0, color='gray', linewidth=0.8)  # horizontal reference line at y = 0

    plt.tight_layout()
    return fig, ax

In [None]:
# Raw selected feature, one box per (cohort, label) pair.
fig, ax = plot_feature_by_cohort_label(df, select_feature)
plt.show()


# Offset cohort effect 
* The trend between Label 0 and Label 1 is consistent across cohorts, but there is a batch effect between cohorts.
* The cohort effect is removed by subtracting each cohort's mean from its samples.

In [None]:
# Centre the selected feature within each cohort to remove the per-cohort offset.
# NOTE(review): the cohort means are computed on the full table, i.e. before
# train_and_plot_lr splits off its test set, so test rows contribute to them.
select_feature_adj = f"{select_feature}_cohort_adj"
feature_by_cohort = df.groupby('Cohort')[select_feature]
df[select_feature_adj] = feature_by_cohort.transform(lambda values: values - values.mean())

In [None]:
# Cohort-centred feature, one box per (cohort, label) pair.
fig, ax = plot_feature_by_cohort_label(df, select_feature_adj)
plt.show()

# Adjusted feature A_47 yields p-value ≈ 1 × 10⁻²⁵

In [None]:
# Scatter + histogram comparison of the cohort-centred feature between labels.
fig, (ax1, ax2) = plot_feature_vs_label(df, select_feature_adj)

# Logistic regression binary classifier with A_47_cohort_adj
* Logistic regression on the cohort-adjusted A_47 yields Test AUC-ROC: 0.980

In [None]:
# Model on the cohort-centred feature. Use the column name held in
# `select_feature_adj` rather than repeating the string literal.
clf, X_test, y_test, y_proba = train_and_plot_lr(df, feature=select_feature_adj)

# New Approach: Percentile Rank-Based Feature for Internal Adjustment
- In practice, using a logistic regression trained on a single raw feature (A_47) may not generalize well across batches.  
- For a new patient, we’d like to predict disease status from their feature set (A_xx, B_xx) using our trained classifier on A_47. But without cohort membership, we can’t compute a batch mean to normalize A_47, so batch effects remain uncorrected.  
- Instead, we can convert each feature to its within-sample rank (or percentile) across all A_xx values for that patient, i.e. within the same measurement method as A_47. If a batch effect shifts all of a patient's A_xx values by roughly the same amount, the relative rank of A_47 stays the same regardless of cohort. This rank-based feature is thus internally adjusted for batch effects and more robust.


In [None]:
# Rank every A_xx value within its own row (axis=1); pct=True returns the
# ranks in percentile form (0.0–1.0).
a_block_pct = df[features_A].rank(axis=1, pct=True)

# Keep A_47's within-row percentile as its own column, on a 0–100 scale.
df['A_47_rank_pct'] = 100 * a_block_pct['A_47']

In [None]:
# Scatter + histogram comparison of the rank-percentile feature between labels.
fig, (ax1, ax2) = plot_feature_vs_label(df, 'A_47_rank_pct')

In [None]:
# Model on the rank-percentile feature, which needs no cohort information.
clf, X_test, y_test, y_proba = train_and_plot_lr(df, feature='A_47_rank_pct')

# Conclusion

- Using the within-patient percentile rank of **A_47** as the sole feature in a logistic regression classifier provides a robust prediction of disease status without requiring cohort information.  
- The percentile ranks of **A_47** differ significantly between Label 0 and Label 1 groups (Welch’s t-test p ≈ 1 × 10⁻³⁵).  
- This method achieves a test AUC-ROC of **0.985**.