In [None]:
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Connection settings are taken from the given option file and group.
# NOTE(review): the option-file path is an absolute, machine-specific path.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization'
)

query = """
SELECT *
FROM knownPalliativeTreatments
"""

# Close the connection even when the query raises, so a failed run does not
# leave it open in the kernel.
try:
    df_treatments = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df_treatments.head()

In [None]:
# Keep rows where both time columns are present, then derive post-progression
# survival (PPS) as OS minus PFS. The PPS event flag is True only when both the
# progression flag and the survival flag are truthy.
df_treatments = df_treatments.dropna(
    subset=['observedPfsDays', 'observedOsFromTreatmentStartDays']
).assign(
    observedPpsDays=lambda frame: (
        frame['observedOsFromTreatmentStartDays'] - frame['observedPfsDays']
    ),
    observedPpsEvent=lambda frame: (
        frame['hadProgressionEvent'].astype(bool) & frame['hadSurvivalEvent'].astype(bool)
    ),
)

# Every row whose isMetachronous flag is not 1 goes into the synchronous subset.
df_treatments_synchronous = df_treatments.loc[df_treatments['isMetachronous'].ne(1)]

# Correlation between PFS and OS 

The aim is to determine whether PFS is a good surrogate for OS by evaluating the strength of their relationship.

We use Harrell’s C-index (concordance index) because it is specifically designed for survival analysis and accounts for censoring in time-to-event data. The C-index measures the ability of one variable (e.g., PFS) to predict the ranking of another variable (e.g., OS), taking into account the uncertainty introduced by censored observations.

The C-index works by comparing all possible pairs of patients:

- A pair is concordant if the patient with a longer observed PFS also has a longer observed OS.
- A pair is discordant if the patient with a longer observed PFS has a shorter observed OS.
- Pairs where censoring prevents a definitive comparison are excluded.

The C-index ranges from 0 to 1:
- A C-index of 1 indicates perfect concordance (i.e., PFS perfectly predicts OS rankings).
- A C-index of 0.5 suggests no predictive ability (similar to random guessing).
- A C-index below 0.5 indicates that PFS is inversely related to OS, which would be unexpected.

Using the C-index allows us to evaluate the strength of the relationship between PFS and OS while properly accounting for the censoring inherent in survival data, making it well-suited for assessing whether PFS can serve as a surrogate endpoint for OS in clinical studies.

Since there is no Python library that implements Harrell's C-index as such (https://statisticaloddsandends.wordpress.com/2019/10/26/what-is-harrells-c-index/), we define it by hand:


In [None]:
import numpy as np

def harrell_c_index(predictor_time, predictor_event, outcome_time, outcome_event):
    """Concordance between a predictor time and a possibly censored outcome time.

    All unordered pairs of subjects are examined. A pair is *comparable* when
    the subject with the strictly shorter outcome time had its outcome event
    observed; pairs with equal outcome times, and pairs whose shorter outcome
    time is censored, are skipped. A comparable pair is *concordant* when the
    subject with the shorter outcome time also has the strictly shorter
    predictor time, i.e. longer predictor time goes with longer outcome time.
    The same direction is applied whether or not the longer outcome time is
    censored. Pairs tied on predictor time stay in the denominator but are not
    counted as concordant.

    Parameters
    ----------
    predictor_time : array-like
        Predictor times, one per subject.
    predictor_event : array-like
        Accepted to keep the call signature stable; not used in the computation.
    outcome_time : array-like
        Outcome times, one per subject.
    outcome_event : array-like
        Truthy where the outcome event was observed, falsy where censored.

    Returns
    -------
    float
        ``concordant / comparable``, or ``nan`` when no pair is comparable.
    """
    predictor_time = np.asarray(predictor_time)
    outcome_time = np.asarray(outcome_time)
    outcome_event = np.asarray(outcome_event, dtype=bool)

    # Index arrays for every unordered pair (first position < second position).
    first, second = np.triu_indices(len(outcome_time), k=1)

    # The pair is usable only if the member with the shorter outcome time
    # actually had the event; otherwise the ordering of outcomes is unknown.
    first_fails_earlier = outcome_event[first] & (outcome_time[first] < outcome_time[second])
    second_fails_earlier = outcome_event[second] & (outcome_time[second] < outcome_time[first])

    comparable = np.sum(first_fails_earlier) + np.sum(second_fails_earlier)

    # Concordant: the earlier-failing member also has the shorter predictor time.
    concordant = (
        np.sum(first_fails_earlier & (predictor_time[first] < predictor_time[second]))
        + np.sum(second_fails_earlier & (predictor_time[second] < predictor_time[first]))
    )

    return concordant / comparable if comparable > 0 else np.nan


In [None]:
def median_iqr(series):
    """Format a Series' median and quartiles as ``"median [q1 - q3]"``.

    All three statistics come from ``Series.quantile``, so missing values are
    skipped consistently for the median and for both quartiles.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to summarise.

    Returns
    -------
    str
        Median, 25th and 75th percentile, each rendered with two decimals.
    """
    q1, med, q3 = series.quantile([0.25, 0.5, 0.75])
    return f"{med:.2f} [{q1:.2f} - {q3:.2f}]"
    
def calculate_cindex_and_statistics(df, grouping_column, group_labels):
    """Build a per-group table of C-indices and median/IQR summaries.

    For every label in ``group_labels`` the rows with
    ``df[grouping_column] == label`` are selected. Labels that match no row are
    reported with a printed message and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``grouping_column`` plus the columns read below:
        ``observedPfsDays``, ``hadProgressionEvent``,
        ``observedOsFromTreatmentStartDays``, ``hadSurvivalEvent``,
        ``observedPpsDays`` and ``observedPpsEvent``.
    grouping_column : str
        Column whose values define the groups.
    group_labels : iterable
        Group values to evaluate, in the order the rows should appear.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty group. The column set is the same even when no
        group produced a row, so callers can always index the result by name.
    """
    result_columns = [
        'Group',
        'N',
        'C-Index (PFS→OS)',
        'C-Index (PFS→PPS)',
        'PFS Median(IQR)',
        'OS Median(IQR)',
        'PPS Median(IQR)',
    ]

    results = []
    for group_label in group_labels:
        group_df = df[df[grouping_column] == group_label].reset_index(drop=True)
        if group_df.empty:
            print(f"Group {group_label} is empty.")
            continue

        progression_event = group_df['hadProgressionEvent'].astype(bool)

        cidx_pfs_os = harrell_c_index(
            predictor_time=group_df['observedPfsDays'],
            predictor_event=progression_event,
            outcome_time=group_df['observedOsFromTreatmentStartDays'],
            outcome_event=group_df['hadSurvivalEvent'].astype(bool)
        )

        cidx_pfs_pps = harrell_c_index(
            predictor_time=group_df['observedPfsDays'],
            predictor_event=progression_event,
            outcome_time=group_df['observedPpsDays'],
            outcome_event=group_df['observedPpsEvent'].astype(bool)
        )

        results.append({
            'Group': str(group_label),
            'N': len(group_df),
            'C-Index (PFS→OS)': cidx_pfs_os,
            'C-Index (PFS→PPS)': cidx_pfs_pps,
            'PFS Median(IQR)': median_iqr(group_df['observedPfsDays']),
            'OS Median(IQR)': median_iqr(group_df['observedOsFromTreatmentStartDays']),
            'PPS Median(IQR)': median_iqr(group_df['observedPpsDays'])
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=result_columns)

In [None]:
def plot_correlation_results(correlation_df, grouping_column, line=True, ax=None, title=None):
    """Draw grouped bars of the PFS→OS and PFS→PPS C-index per group.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Needs the columns ``'Group'``, ``'C-Index (PFS→OS)'`` and
        ``'C-Index (PFS→PPS)'``; rows are drawn in the order given.
    grouping_column : str
        Name used in the x-axis label and in the default title.
    line : bool, default True
        If True, additionally connect the values of each series with a line.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used and the
        figure is finalised (``tight_layout`` + ``show``) before returning.
        When supplied, layout and display are left entirely to the caller.
    title : str, optional
        Axes title; a default naming ``grouping_column`` is used when falsy.
    """
    # Remember whether the axes were picked here. Comparing against plt.gca()
    # afterwards would also match a caller-supplied axes that happens to be the
    # current one (e.g. the last subplot of a grid) and show the figure early.
    owns_axes = ax is None
    if owns_axes:
        ax = plt.gca()

    groups = correlation_df['Group']
    os_values = correlation_df['C-Index (PFS→OS)']
    pps_values = correlation_df['C-Index (PFS→PPS)']

    x = np.arange(len(groups))
    width = 0.35

    os_bars = ax.bar(x - width / 2, os_values, width, color='skyblue', label='C-Index (PFS→OS)')
    pps_bars = ax.bar(x + width / 2, pps_values, width, color='lightgreen', label='C-Index (PFS→PPS)')

    # Write each value just above its bar.
    for bars, values in ((os_bars, os_values), (pps_bars, pps_values)):
        for bar, value in zip(bars, values):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.02,
                f'{value:.2f}',
                ha='center', va='bottom', color='black'
            )

    if line:
        # Plot against the numeric bar positions so the lines share the bars'
        # x scale, and give each line its own legend entry.
        ax.plot(x, os_values, marker='o', color='blue', linestyle='-', label='C-Index (PFS→OS) (Line)')
        ax.plot(x, pps_values, marker='o', color='green', linestyle='-', label='C-Index (PFS→PPS) (Line)')

    ax.set_xticks(x)
    ax.set_xticklabels(groups, rotation=45, ha='right')

    ax.set_xlabel(f'{grouping_column} Groups')
    ax.set_ylabel('C-index')
    if not title:
        ax.set_title(f'PFS-OS and PFS-PPS C-index by {grouping_column}')
    else:
        ax.set_title(title)

    ax.legend()
    ax.tick_params(axis='x', rotation=45)

    if owns_axes:
        plt.tight_layout()
        plt.show()

## First-line treatments

### Subgroup Analysis

In [None]:
def calculate_and_plot(df, grouping_column, bins=None, line=True, ax=None, title=None):
    """Compute the per-group C-index table for one column and plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. When ``bins`` is given, a ``<grouping_column>_bins`` column
        holding the ``pd.cut`` intervals is written into ``df`` in place.
    grouping_column : str
        Column to group by, either directly or after binning.
    bins : optional
        Passed to ``pd.cut``; ``None`` (the default) groups by the raw values.
    line, ax, title :
        Forwarded to ``plot_correlation_results``.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``calculate_cindex_and_statistics``.
    """
    # `is not None` rather than truthiness, so array-like bins are accepted too.
    if bins is not None:
        group_column = grouping_column + '_bins'
        df[group_column] = pd.cut(df[grouping_column], bins=bins)
        group_labels = sorted(df[group_column].dropna().unique(), key=lambda interval: interval.left)
    else:
        group_column = grouping_column
        # Missing values cannot form a group (NaN == NaN is False) and make
        # sorted() order-dependent, so drop them exactly as the binned branch does.
        group_labels = sorted(df[grouping_column].dropna().unique())

    correlation_df = calculate_cindex_and_statistics(df, group_column, group_labels)

    plot_correlation_results(correlation_df, group_column, line=line, ax=ax, title=title)

    return correlation_df

### Metachronous patients

In [None]:
# One group per value of isMetachronous; bars only, no connecting line.
first_line_correlation_results = calculate_and_plot(
    df_treatments,
    'isMetachronous',
    line=False,
)
first_line_correlation_results

### Treatment type
Different treatment modalities, such as chemotherapy and targeted therapy, have distinct mechanisms of action. 

The goal here is to see whether the correlation between PFS and OS differs between these two treatment types. If one treatment type shows a stronger PFS-OS correlation, it may suggest that PFS is a more reliable surrogate for OS for that particular type of treatment.

In [None]:
def categorize_treatment_plan(df):
    """Add a ``treatment_category`` column derived from ``systemicTreatmentPlan``.

    Plans found in the chemotherapy list are labelled ``'Chemotherapy'``, plans
    found in the immunotherapy list ``'Immunotherapy'``, and everything else
    ``'Other'``. The column is written into ``df`` itself and the same frame is
    returned.
    """
    plans_by_category = {
        'Chemotherapy': [
            'FOLFOX', 'FOLFOX_B', 'CAPOX', 'CAPOX_B', 'FOLFIRI', 'FOLFIRI_B', 'FOLFOX_P',
            'FOLFOXIRI', 'FOLFOXIRI_B', 'CAPECITABINE', 'FLUOROURACIL', 'FOLFIRI_P',
            'IRINOTECAN', 'CAPECITABINE_BEVACIZUMAB', 'FLUOROURACIL_BEVACIZUMAB',
        ],
        'Immunotherapy': ['PEMBROLIZUMAB', 'NIVOLUMAB'],
    }

    # Start from the fallback label, then overwrite rows whose plan is listed.
    df['treatment_category'] = 'Other'
    for category, plans in plans_by_category.items():
        matches = df['systemicTreatmentPlan'].isin(plans)
        df.loc[matches, 'treatment_category'] = category

    return df

# categorize_treatment_plan writes into the frame it receives, so hand it copies.
df_treatments_synchronous = df_treatments_synchronous.copy().pipe(categorize_treatment_plan)
df_treatments = df_treatments.copy().pipe(categorize_treatment_plan)

In [None]:
# C-indices per treatment category on the synchronous subset (bars only).
calculate_and_plot(
    df_treatments_synchronous,
    'treatment_category',
    line=False,
)

### Censored vs Uncensored



In [None]:
# Split the synchronous subset by the progression event flag (0 vs 1).
censor_correlation_results_pfs = calculate_cindex_and_statistics(
    df=df_treatments_synchronous,
    grouping_column='hadProgressionEvent',
    group_labels=[0, 1],
)

censor_correlation_results_pfs

In [None]:
# Split the synchronous subset by the survival event flag (0 vs 1).
censor_correlation_results_os = calculate_cindex_and_statistics(
    df=df_treatments_synchronous,
    grouping_column='hadSurvivalEvent',
    group_labels=[0, 1],
)

censor_correlation_results_os

In [None]:
unique_treatments = df_treatments_synchronous['treatment_category'].unique()

# squeeze=False always returns a 2-D array of axes, so the loop below also
# works when there is only a single treatment category.
fig, axs = plt.subplots(1, len(unique_treatments), figsize=(18, 6), squeeze=False)

# One panel per treatment category, each split by the progression event flag.
for ax, treatment in zip(axs.ravel(), unique_treatments):
    df_treatments_specific = df_treatments_synchronous[df_treatments_synchronous['treatment_category'] == treatment]

    censor_treatment_correlation_results = calculate_and_plot(df_treatments_specific, 'hadProgressionEvent', line=False, ax=ax, title=f"PFS-OS Correlation for {treatment}")
    print(censor_treatment_correlation_results.head())

plt.tight_layout()
plt.show()

### Patient characteristics

Specific patient characteristics can affect both treatment tolerance and survival outcomes. We analyze the different groups to check whether the relationship between PFS and OS differs within them.

In [None]:
from scipy.stats import spearmanr

def check_trend(df_correlations):
    """Print Spearman trend tests of both C-indices across ordered groups.

    The rows of ``df_correlations`` are taken to be in ascending group order,
    and each row's position is used as its ordinal group rank. The ``'Group'``
    labels themselves are strings, so ranking them would order them
    lexicographically (``'(1000, 1100]'`` before ``'(200, 300]'``, ``'10'``
    before ``'2'``) and distort the trend.

    Parameters
    ----------
    df_correlations : pandas.DataFrame
        Needs the columns ``'C-Index (PFS→OS)'`` and ``'C-Index (PFS→PPS)'``.

    Returns
    -------
    None
        The correlation coefficients and p-values are printed.
    """
    group_order = np.arange(len(df_correlations))
    os_values = df_correlations['C-Index (PFS→OS)']
    pps_values = df_correlations['C-Index (PFS→PPS)']

    # Check trend for PFS→OS C-index
    os_spearman_corr, os_p_value = spearmanr(group_order, os_values)
    print(f"Trend for PFS→OS C-Index:")
    print(f"  Spearman's rank correlation: {os_spearman_corr:.4f}")
    print(f"  P-Value: {os_p_value:.4f}\n")

    # Check trend for PFS→PPS C-index
    pps_spearman_corr, pps_p_value = spearmanr(group_order, pps_values)
    print(f"Trend for PFS→PPS C-Index:")
    print(f"  Spearman's rank correlation: {pps_spearman_corr:.4f}")
    print(f"  P-Value: {pps_p_value:.4f}")

In [None]:
# `age_correlation_results` is first assigned by the 3x3 subgroup figure cell
# that follows, and it is displayed in the trend-significance section further
# down. Referencing it here, before it exists, raises NameError on a fresh
# kernel (Restart & Run All), so this cell intentionally displays nothing.

In [None]:
fig, axs = plt.subplots(3, 3, figsize=(18, 14))

# One panel per characteristic, filled row by row. Each entry is the column to
# group by plus the keyword options for calculate_and_plot.
panel_specs = [
    ('ageAtTreatmentPlanStart', {'bins': range(0, 110, 10)}),
    ('whoStatusPreTreatmentStart', {}),
    ('cci', {}),
    ('hasMsi', {'line': False}),
    ('hasBrafMutation', {'line': False}),
    ('hasBrafV600EMutation', {'line': False}),
    ('hasRasMutation', {'line': False}),
    ('hasKrasG12CMutation', {'line': False}),
    ('systemicTreatmentPlanDuration', {'bins': range(0, 1193, 100)}),
]

panel_results = [
    calculate_and_plot(df_treatments_synchronous, column, ax=panel_ax, **options)
    for (column, options), panel_ax in zip(panel_specs, axs.flat)
]

# Keep the individually named result tables that later cells refer to.
(
    age_correlation_results,
    who_status_correlation_results,
    cci_correlation_results,
    msi_correlation_results,
    braf_correlation_results,
    braf_v_correlation_results,
    ras_correlation_results,
    kras_correlation_results,
    treatment_duration_correlation_results,
) = panel_results

plt.tight_layout()
plt.show()

#### Check trend significance


In [None]:
# Leave the last row out of the trend test; per the original note it is the
# 90-100 age group with only 2 patients.
age_rows_for_trend = age_correlation_results.iloc[:-1]
check_trend(age_rows_for_trend)
age_correlation_results

In [None]:
# Print the trend test across WHO status groups, then display the table.
check_trend(who_status_correlation_results)
who_status_correlation_results

In [None]:
# Print the trend test across cci groups, then display the table.
check_trend(cci_correlation_results)
cci_correlation_results

In [None]:
# Print the trend test across treatment-duration bins, then display the table.
check_trend(treatment_duration_correlation_results)
treatment_duration_correlation_results

### Relation between PFS and OS

A regression analysis allows us to quantify how much OS increases for each additional unit of PFS (e.g., how many extra months of survival we expect for every month of PFS improvement).

We are using a simple linear regression model because it provides an intuitive way to understand the linear relationship between the two variables. The slope of the regression line tells us the expected gain in OS for each day or month of improvement in PFS. A higher slope would suggest that even modest improvements in PFS are associated with significant gains in OS.

The intercept gives the baseline OS when PFS is zero, which can offer insight into survival even in the absence of progression-free time.


In [None]:
from sklearn.linear_model import LinearRegression

def fit_and_plot_linear_regression(df, x_column, y_column, xlabel=None, ylabel=None, title=None):
    """Fit a one-variable linear regression, print its coefficients and plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    x_column : str
        Name of the explanatory column.
    y_column : str
        Name of the response column.
    xlabel, ylabel, title : str, optional
        Axis labels and title; the column names are used when omitted.

    Returns
    -------
    None
        The slope and intercept are printed and the figure is shown.
    """
    X = df[[x_column]]
    y = df[y_column]

    model = LinearRegression()
    model.fit(X, y)

    slope = model.coef_[0]
    intercept = model.intercept_

    # Name the actual response column: this function is also called with
    # responses other than OS, so a hard-coded "OS" label would be wrong.
    print(f"Slope ({y_column} gain per unit of {x_column}): {slope}")
    print(f"Intercept (Baseline {y_column}): {intercept}")

    y_pred = model.predict(X)

    plt.figure(figsize=(10, 6))
    plt.scatter(X, y, color='blue', label='Data Points')
    plt.plot(X, y_pred, color='red', label=f'Fitted Line (Slope: {slope:.2f}, Intercept: {intercept:.2f})')

    plt.xlabel(xlabel if xlabel else x_column)
    plt.ylabel(ylabel if ylabel else y_column)
    plt.title(title if title else f'{x_column} vs {y_column} with Fitted Line')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# Regress OS on PFS for the synchronous subset.
fit_and_plot_linear_regression(
    df_treatments_synchronous,
    'observedPfsDays',
    'observedOsFromTreatmentStartDays',
    xlabel='Observed PFS Days',
    ylabel='Observed OS Days',
    title='PFS vs OS with Fitted Line',
)

Looking at the plot, there appear to be some patients for whom the OS is shorter than the PFS. Let's check that:

In [None]:
# Check:
shorter_os_than_pfs = df_treatments_synchronous[df_treatments_synchronous['observedOsFromTreatmentStartDays'] < df_treatments_synchronous['observedPfsDays']]

num_patients = shorter_os_than_pfs.shape[0]

shorter_os_than_pfs[['observedOsFromTreatmentStartDays', 'observedPfsDays', 'observedOsFromTreatmentStartDays']]

## Correlation between PPS and OS

In this section, we investigate **Post-Progression Survival (PPS)**, defined as the survival time after disease progression. PPS is calculated as the difference between **Overall Survival (OS)** and **Progression-Free Survival (PFS)**. 

We focus on evaluating how **PPS correlates with OS** in first-line therapies. Understanding this relationship can help us assess whether PPS plays a significant role in the overall survival outcomes of patients undergoing first-line treatments.

In [None]:
# PPS = OS - PFS, stored as its own column on the synchronous subset.
df_treatments_synchronous['PPS'] = df_treatments_synchronous['observedOsFromTreatmentStartDays'].sub(
    df_treatments_synchronous['observedPfsDays']
)

# 2x2 Spearman matrix; the off-diagonal entry is the PPS-OS correlation.
pps_os_correlation = df_treatments_synchronous.loc[
    :, ['PPS', 'observedOsFromTreatmentStartDays']
].corr(method='spearman')

print(f"Spearman's rank correlation between PPS and OS in first-line therapies: {pps_os_correlation.iat[0, 1]}")

# Split at the median PPS: at or below the median is "short", above is "long".
short_pps_threshold = df_treatments_synchronous['PPS'].median()
short_pps_group = df_treatments_synchronous[df_treatments_synchronous['PPS'].le(short_pps_threshold)]
long_pps_group = df_treatments_synchronous[df_treatments_synchronous['PPS'].gt(short_pps_threshold)]

pfs_os_correlation_short_pps = short_pps_group.loc[
    :, ['observedPfsDays', 'observedOsFromTreatmentStartDays']
].corr(method='spearman')
pfs_os_correlation_long_pps = long_pps_group.loc[
    :, ['observedPfsDays', 'observedOsFromTreatmentStartDays']
].corr(method='spearman')

print(f"PFS-OS correlation for short PPS group: {pfs_os_correlation_short_pps.iat[0, 1]}")
print(f"PFS-OS correlation for long PPS group: {pfs_os_correlation_long_pps.iat[0, 1]}")

In [None]:
# Regress OS on PPS for the synchronous subset.
fit_and_plot_linear_regression(
    df_treatments_synchronous,
    'PPS',
    'observedOsFromTreatmentStartDays',
    xlabel='Observed PPS Days',
    ylabel='Observed OS Days',
    title='PPS vs OS with Fitted Line',
)

## Correlation between PFS and PPS
Investigating the relationship between PFS and PPS helps us understand whether the duration of progression-free time influences the length of survival post-progression. A strong positive correlation would indicate that therapies that extend PFS may also provide patients with more time post-progression.

This analysis offers insight into whether PFS and PPS are interdependent or represent largely independent phases influenced by different factors.

In [None]:
# Calculate Spearman correlation between PFS and PPS
pfs_pps_correlation = df_treatments_synchronous[['observedPfsDays', 'PPS']].corr(method='spearman')
correlation_value = pfs_pps_correlation.iat[0, 1]

print(f"Spearman's rank correlation between PFS and PPS: {correlation_value:.4f}")


In [None]:
# Plot PFS vs PPS with linear regression line
# Regress PPS on PFS for the synchronous subset.
fit_and_plot_linear_regression(
    df_treatments_synchronous,
    'observedPfsDays',
    'PPS',
    xlabel='Observed PFS Days',
    ylabel='Observed PPS Days',
    title='PFS vs PPS with Fitted Line',
)