In [None]:
import pymysql
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the full knownPalliativeTreatments table into a DataFrame.
# Credentials come from the MySQL option file, so nothing secret lives in the notebook.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis', 
    db = 'actin_personalization'
)

query = """  
SELECT * 
FROM knownPalliativeTreatments 
"""

# Close the connection even when the query fails, so a failed cell run
# does not leave an open session behind on the database server.
try:
    df_treatments = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df_treatments.head()

In [None]:
# Every analysis below needs both endpoints, so rows missing either one are dropped up front.
df_treatments = df_treatments.dropna(
    subset=['observedPfsDays', 'observedOsFromTumorIncidenceDays'],
)

# Correlation between PFS and OS 

The aim is to determine whether PFS is a good surrogate for OS by evaluating the strength of their relationship.

We use Spearman's rank correlation coefficient because it is well-suited for assessing the strength and direction of monotonic relationships, where one variable tends to increase or decrease as the other does, without assuming that the relationship is linear. Spearman's correlation works by ranking the values of each variable and calculating the correlation between the ranks, making it ideal for situations where:
- The relationship between PFS and OS may not be perfectly linear.
- The data may include outliers, as Spearman's method is robust to outliers and does not rely on the magnitude of the values, only their ranks.

A Spearman correlation close to 1 indicates a strong positive monotonic relationship, meaning that as PFS improves, OS tends to increase as well. This helps assess whether PFS can serve as a valid surrogate endpoint for OS in clinical studies.

In [None]:
def calculate_pfs_os_correlation(grouping_column, group_labels, df):
    """Compute the Spearman PFS-OS correlation separately for each requested group.

    Parameters
    ----------
    grouping_column : str
        Column of ``df`` whose values define the groups.
    group_labels : iterable
        Group values to evaluate, in the order they should appear in the result.
        Labels that match no rows are skipped.
    df : pandas.DataFrame
        Must contain ``grouping_column``, ``observedPfsDays`` and
        ``observedOsFromTumorIncidenceDays``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty group with columns ``Group`` (the label as a
        string) and ``Correlation``. Both columns are present even when no
        group matched, so callers can always index them. The correlation is
        NaN when it cannot be computed (e.g. a single row or constant values).
    """
    survival_columns = ['observedPfsDays', 'observedOsFromTumorIncidenceDays']
    correlation_results = []

    for group_label in group_labels:
        group = df[df[grouping_column] == group_label]
        if group.empty:
            continue
        # corr() returns the 2x2 matrix; the off-diagonal cell is the PFS-OS coefficient.
        correlation = group[survival_columns].corr(method='spearman').iat[0, 1]
        correlation_results.append({'Group': str(group_label), 'Correlation': correlation})

    # Explicit columns keep the schema stable when the list of results is empty.
    return pd.DataFrame(correlation_results, columns=['Group', 'Correlation'])

In [None]:
def plot_correlation_results(correlation_df, grouping_column, line=True, ax=None, title=None):
    """Draw per-group PFS-OS correlations as an annotated bar chart.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Needs the columns ``Group`` (x tick labels) and ``Correlation`` (bar heights).
    grouping_column : str
        Name used in the x-axis label and in the default title.
    line : bool, default True
        When true, overlay a line connecting the bar tops.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used and the
        figure is laid out and shown immediately. When supplied, the caller
        stays in charge of layout and of showing the figure.
    title : str, optional
        Overrides the default ``PFS-OS Correlation by <grouping_column>`` title.
    """
    # Decide up front who owns the figure. Comparing `ax` with plt.gca()
    # afterwards is not reliable: the last subplot created by plt.subplots()
    # IS the current axes, which would show the figure in the middle of a
    # multi-panel layout.
    owns_axes = ax is None
    if owns_axes:
        ax = plt.gca()

    groups = correlation_df['Group']
    correlations = correlation_df['Correlation']

    bars = ax.bar(groups, correlations, color='skyblue', label='Correlation (Bars)')

    # Display correlation values inside the bars
    for bar, correlation in zip(bars, correlations):
        if pd.isna(correlation):
            # No bar is drawn for an undefined correlation, so there is nothing to label.
            continue
        ax.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height() / 2, f'{correlation:.2f}', ha='center', va='center', color='black')

    if line:
        ax.plot(groups, correlations, marker='o', color='orange', linestyle='-', label='Correlation (Line)')

    ax.set_xlabel(f'{grouping_column} Groups')
    ax.set_ylabel('PFS-OS Correlation')
    ax.set_title(title if title else f'PFS-OS Correlation by {grouping_column}')
    ax.legend()
    ax.tick_params(axis='x', rotation=45)

    if owns_axes:
        plt.tight_layout()
        plt.show()

## First-line treatments

In [None]:
# Expose the treatment line number under a descriptive name and use it as the grouping column.
df_treatments['treatment_first_line'] = df_treatments['order']

# Only group value 1 is requested, so the result holds a single row.
first_line_correlation_results = calculate_pfs_os_correlation(
    grouping_column='treatment_first_line',
    group_labels=[1],
    df=df_treatments,
)
first_line_correlation_results

## Subgroup Analysis

In [None]:
def calculate_and_plot(df, grouping_column, bins=None, line=True, ax=None, title=None):
    """Compute the PFS-OS correlation per group of ``grouping_column`` and plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding ``grouping_column`` and the PFS/OS columns. When ``bins``
        is given, a ``<grouping_column>_bins`` column is added to this frame.
    grouping_column : str
        Column whose values (or binned values) define the groups.
    bins : sequence or int, optional
        Passed to ``pandas.cut`` to discretise a numeric column. ``None`` means
        the raw values are used as groups.
    line, ax, title
        Forwarded to ``plot_correlation_results``.

    Returns
    -------
    pandas.DataFrame
        The per-group correlation table produced by ``calculate_pfs_os_correlation``.
    """
    # `is not None` rather than truthiness: array-like bins have no single truth value.
    if bins is not None:
        group_column = grouping_column + '_bins'
        df[group_column] = pd.cut(df[grouping_column], bins=bins)
        group_labels = sorted(df[group_column].dropna().unique(), key=lambda interval: interval.left)
    else:
        group_column = grouping_column
        # Missing values never match an equality filter, and None cannot be
        # ordered against real values, so drop them before sorting.
        group_labels = sorted(df[grouping_column].dropna().unique())

    correlation_df = calculate_pfs_os_correlation(group_column, group_labels, df)

    plot_correlation_results(correlation_df, group_column, line=line, ax=ax, title=title)

    return correlation_df

### Treatment type
Different treatment modalities, such as chemotherapy and targeted therapy, have distinct mechanisms of action. 

The goal here is to see whether the correlation between PFS and OS differs between treatment types (chemotherapy, targeted therapy, immunotherapy, and all remaining plans grouped as "Other"). If one treatment type shows a stronger PFS-OS correlation, it may suggest that PFS is a more reliable surrogate for OS for that particular type of treatment.

In [None]:
def categorize_treatment_plan(df):
    """Label each row with a coarse treatment category derived from its systemic plan.

    Writes a ``treatment_category`` column onto ``df`` in place and returns the
    same frame. Plans that appear in none of the lists below (including missing
    plans) keep the default category ``'Other'``.
    """
    plans_by_category = {
        'Chemotherapy': [
            'FOLFOX', 'FOLFOX_B', 'CAPOX', 'CAPOX_B', 'FOLFIRI', 'FOLFIRI_B', 'FOLFOX_P',
            'FOLFOXIRI', 'FOLFOXIRI_B', 'CAPECITABINE', 'FLUOROURACIL', 'FOLFIRI_P', 'IRINOTECAN',
        ],
        'Targeted Therapy': ['CAPECITABINE_BEVACIZUMAB', 'FLUOROURACIL_BEVACIZUMAB'],
        'Immunotherapy': ['PEMBROLIZUMAB', 'NIVOLUMAB'],
    }

    plan = df['systemicTreatmentPlan']

    # Start from the fallback label, then overwrite rows whose plan is recognised.
    df['treatment_category'] = 'Other'
    for category, plans in plans_by_category.items():
        df.loc[plan.isin(plans), 'treatment_category'] = category

    return df

# Adds the treatment_category column used by the subgroup analyses below.
df_treatments = categorize_treatment_plan(df=df_treatments)

In [None]:
# Categories have no natural order, so the connecting line is switched off.
calculate_and_plot(
    df=df_treatments,
    grouping_column='treatment_category',
    line=False,
)

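### Censored vs Uncensored

Here we compare the PFS-OS correlation between treatments without a recorded progression event (`hadProgressionEvent = 0`, i.e. censored PFS) and treatments with a recorded progression event (`hadProgressionEvent = 1`), first overall and then per treatment category.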



In [None]:
# PFS-OS correlation split by the progression-event flag (groups 0 and 1).
censor_correlation_results = calculate_pfs_os_correlation(
    grouping_column='hadProgressionEvent',
    group_labels=[0, 1],
    df=df_treatments,
)
censor_correlation_results

In [None]:
# One panel per treatment category, each comparing censored vs uncensored PFS.
unique_treatments = df_treatments['treatment_category'].unique()

# squeeze=False always returns a 2-D array of axes; without it a single
# category yields a bare Axes object that cannot be iterated.
fig, axs = plt.subplots(1, len(unique_treatments), figsize=(18, 6), squeeze=False)
axs = axs.ravel()

for ax, treatment in zip(axs, unique_treatments):
    df_treatments_specific = df_treatments[df_treatments['treatment_category'] == treatment]
    
    calculate_and_plot(df_treatments_specific,'hadProgressionEvent', line=False, ax=ax, title=f"PFS-OS Correlation for {treatment}")

# Lay out this figure explicitly; plt.tight_layout() acts on whatever figure
# is current, which may be a fresh empty one if the panels were already shown.
fig.tight_layout()
plt.show()


### Patient characteristics

Specific patient characteristics can affect both treatment tolerance and survival outcomes. We analyze the different groups to check whether the relationship between PFS and OS differs within them.

In [None]:
# 3x3 grid: one panel per patient or treatment characteristic.
# Ordered variables keep the connecting line; yes/no markers switch it off.
fig, axs = plt.subplots(3, 3, figsize=(18, 14))

age_correlation_results = calculate_and_plot(df_treatments, 'ageAtDiagnosis', bins=range(0, 110, 10), ax=axs[0, 0])
who_status_correlation_results = calculate_and_plot(df_treatments, 'whoStatusPreTreatmentStart', ax=axs[0, 1])
cci_correlation_results = calculate_and_plot(df_treatments, 'cci', ax=axs[0, 2])
msi_correlation_results = calculate_and_plot(df_treatments, 'hasMsi', line=False, ax=axs[1, 0])
braf_correlation_results = calculate_and_plot(df_treatments, 'hasBrafMutation', line=False, ax=axs[1, 1])
braf_v_correlation_results = calculate_and_plot(df_treatments, 'hasBrafV600EMutation', line=False, ax=axs[1, 2])
ras_correlation_results = calculate_and_plot(df_treatments, 'hasRasMutation', line=False, ax=axs[2, 0])
kras_correlation_results = calculate_and_plot(df_treatments, 'hasKrasG12CMutation', line=False, ax=axs[2, 1])
# Bin edges run 0, 100, ..., 1200. A stop of 1193 would end the edges at 1100
# and silently drop every duration above 1100 days.
# NOTE(review): assumes the longest duration is below 1200 days - confirm against the data.
treatment_duration_correlation_results = calculate_and_plot(df_treatments, 'systemicTreatmentPlanDuration', bins=range(0, 1201, 100), ax=axs[2, 2])

# Lay out this figure explicitly; plt.tight_layout() acts on whatever figure
# is current, which may be a fresh empty one if the panels were already shown.
fig.tight_layout()
plt.show()

### Relation between PFS and OS

A regression analysis allows us to quantify how much OS increases for each additional unit of PFS (e.g., how many extra months of survival we expect for every month of PFS improvement).

We are using a simple linear regression model because it provides an intuitive way to understand the linear relationship between the two variables. The slope of the regression line tells us the expected gain in OS for each day or month of improvement in PFS. A higher slope would suggest that even modest improvements in PFS are associated with significant gains in OS.

The intercept gives the baseline OS when PFS is zero, which can offer insight into survival even in the absence of progression-free time.


In [None]:
from sklearn.linear_model import LinearRegression

def fit_and_plot_linear_regression(df, x_column, y_column, xlabel=None, ylabel=None, title=None):
    """Fit ``y_column ~ x_column`` by ordinary least squares and plot the result.

    Prints the fitted slope and intercept, then shows a scatter plot of the
    data with the fitted line drawn over it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of both columns.
    x_column, y_column : str
        Predictor and response column names.
    xlabel, ylabel, title : str, optional
        Plot labels; the column names (and a generated title) are used when omitted.
    """
    # The regressor expects a 2-D predictor, hence the double brackets.
    features = df[[x_column]]
    target = df[y_column]

    regressor = LinearRegression()
    regressor.fit(features, target)

    slope, intercept = regressor.coef_[0], regressor.intercept_

    print(f"Slope (OS gain per unit of {x_column}): {slope}")
    print(f"Intercept (Baseline OS): {intercept}")

    fitted_values = regressor.predict(features)

    plt.figure(figsize=(10, 6))
    plt.scatter(features, target, color='blue', label='Data Points')
    plt.plot(features, fitted_values, color='red', label=f'Fitted Line (Slope: {slope:.2f}, Intercept: {intercept:.2f})')

    plt.xlabel(xlabel or x_column)
    plt.ylabel(ylabel or y_column)
    plt.title(title or f'{x_column} vs {y_column} with Fitted Line')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# Regress OS on PFS (both in days) over all retained treatments.
fit_and_plot_linear_regression(
    df_treatments,
    'observedPfsDays',
    'observedOsFromTumorIncidenceDays',
    xlabel='Observed PFS Days',
    ylabel='Observed OS Days',
    title='PFS vs OS with Fitted Line',
)

## Correlation between PPS and OS

In this section, we investigate **Post-Progression Survival (PPS)**, defined as the survival time after disease progression. PPS is calculated as the difference between **Overall Survival (OS)** and **Progression-Free Survival (PFS)**. 

We focus on evaluating how **PPS correlates with OS** in first-line therapies. Understanding this relationship can help us assess whether PPS plays a significant role in the overall survival outcomes of patients undergoing first-line treatments.

In [None]:
# Post-progression survival: the part of OS that is not progression-free time.
# The column is added to the full frame because later cells read it from there.
df_treatments['PPS'] = df_treatments['observedOsFromTumorIncidenceDays'] - df_treatments['observedPfsDays']

# The figures below are reported as first-line results, so restrict to
# first-line rows (order == 1, the same definition used in the first-line section).
df_first_line = df_treatments[df_treatments['order'] == 1]

pps_os_correlation = df_first_line[['PPS', 'observedOsFromTumorIncidenceDays']].corr(method='spearman')

print(f"Spearman's rank correlation between PPS and OS in first-line therapies: {pps_os_correlation.iat[0, 1]}")

# Define short PPS based on median
short_pps_threshold = df_first_line['PPS'].median()
short_pps_group = df_first_line[df_first_line['PPS'] <= short_pps_threshold]
long_pps_group = df_first_line[df_first_line['PPS'] > short_pps_threshold]

survival_columns = ['observedPfsDays', 'observedOsFromTumorIncidenceDays']
pfs_os_correlation_short_pps = short_pps_group[survival_columns].corr(method='spearman')
pfs_os_correlation_long_pps = long_pps_group[survival_columns].corr(method='spearman')

print(f"PFS-OS correlation for short PPS group: {pfs_os_correlation_short_pps.iat[0, 1]}")
print(f"PFS-OS correlation for long PPS group: {pfs_os_correlation_long_pps.iat[0, 1]}")

In [None]:
# Regress OS on PPS (both in days), using the PPS column computed above.
fit_and_plot_linear_regression(
    df_treatments,
    'PPS',
    'observedOsFromTumorIncidenceDays',
    xlabel='Observed PPS Days',
    ylabel='Observed OS Days',
    title='PPS vs OS with Fitted Line',
)