In [None]:
import pymysql
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the full knownPalliativeTreatments table into a DataFrame.
# Credentials come from the option file below, so none are embedded in the notebook.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization'  # `db` is only a deprecated alias of `database` in PyMySQL
)

query = """  
SELECT * 
FROM knownPalliativeTreatments 
"""

# Close the connection even when the query fails, so a failed read does not
# leave an open session behind in the kernel.
try:
    df_treatments = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df_treatments.head()

In [None]:
# Keep only rows where both outcome columns used in the analyses below are present.
required_outcome_columns = ['observedPfsDays', 'observedOsFromTumorIncidenceDays']
df_treatments = df_treatments.dropna(subset=required_outcome_columns)

# Correlation between PFS and OS 

The aim is to determine whether PFS is a good surrogate for OS by evaluating the strength of their relationship.

We use Pearson's correlation coefficient here because it measures the linear relationship between two continuous variables (PFS and OS). A Pearson correlation close to 1 indicates a strong positive relationship, meaning that improvements in PFS are associated with increases in OS.

In [None]:
def calculate_pfs_os_correlation(grouping_column, group_labels, df):
    """Compute the Pearson correlation between PFS and OS for each requested group.

    Parameters
    ----------
    grouping_column : str
        Name of the column in ``df`` whose values define the groups.
    group_labels : iterable
        Values of ``grouping_column`` to evaluate, in the order they should
        appear in the result. Labels that match no rows are skipped.
    df : pandas.DataFrame
        Must contain ``grouping_column``, ``observedPfsDays`` and
        ``observedOsFromTumorIncidenceDays``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty group with columns ``Group`` (the label as a
        string) and ``Correlation``. The two columns are always present, even
        when no group matched, so callers can index them unconditionally.
        ``Correlation`` is NaN when pandas cannot compute a coefficient
        (for example a group with a single row).
    """
    outcome_columns = ['observedPfsDays', 'observedOsFromTumorIncidenceDays']
    correlation_results = []

    for group_label in group_labels:
        group = df[df[grouping_column] == group_label]
        if group.empty:
            continue

        correlation_matrix = group[outcome_columns].corr(method='pearson')
        # Only read the off-diagonal cell when the matrix really is 2x2; a
        # reduced matrix (e.g. non-numeric columns dropped by older pandas
        # versions) yields None instead of an IndexError.
        correlation = correlation_matrix.iat[0, 1] if correlation_matrix.shape == (2, 2) else None
        correlation_results.append({'Group': str(group_label), 'Correlation': correlation})

    # Passing the column names explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(correlation_results, columns=['Group', 'Correlation'])

In [None]:
def plot_correlation_results(correlation_df, grouping_column, line=True, ax=None):
    """Draw the per-group PFS-OS correlations as a labelled bar chart.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Must contain the columns ``Group`` (x tick labels) and ``Correlation``
        (bar heights).
    grouping_column : str
        Name used in the x-axis label and the title.
    line : bool, default True
        When true, overlay a line with markers connecting the correlations.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used and the
        figure is laid out and shown before returning. When supplied, the
        caller keeps ownership of the figure and is responsible for layout and
        display.
    """
    # Decide ownership up front. Comparing `ax` with plt.gca() afterwards is not
    # reliable: a caller-supplied axes can also be the current one (e.g. the
    # last subplot created by plt.subplots), which would show the caller's
    # figure before the caller has finished with it.
    owns_axes = ax is None
    if owns_axes:
        ax = plt.gca()

    groups = correlation_df['Group']
    correlations = correlation_df['Correlation']

    bars = ax.bar(groups, correlations, color='skyblue', label='Correlation (Bars)')

    # Display correlation values inside the bars
    for bar, correlation in zip(bars, correlations):
        if pd.isna(correlation):
            # No coefficient for this group: there is no finite position or
            # value to annotate, so leave the slot unlabelled.
            continue
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.0, height / 2, f'{correlation:.2f}', ha='center', va='center', color='black')

    if line:
        ax.plot(groups, correlations, marker='o', color='orange', linestyle='-', label='Correlation (Line)')

    ax.set_xlabel(f'{grouping_column} Groups')
    ax.set_ylabel('PFS-OS Correlation')
    ax.set_title(f'PFS-OS Correlation by {grouping_column}')
    ax.legend()
    ax.tick_params(axis='x', rotation=45)

    if owns_axes:
        plt.tight_layout()
        plt.show()

## First-line treatments

In [None]:
# Mirror the `order` column under a descriptive name.
# NOTE(review): presumably `order` is the treatment-line number, so 1 means first line — confirm.
df_treatments['treatment_first_line'] = df_treatments['order']

# Correlation restricted to the rows whose label equals 1.
first_line_correlation_results = calculate_pfs_os_correlation('treatment_first_line', [1], df_treatments)

# End the cell with the result so this section actually shows its finding.
first_line_correlation_results

## Subgroup Analysis

In [None]:
def calculate_and_plot(df, grouping_column, bins=None, line=True, ax=None):
    """Compute PFS-OS correlations per group of ``grouping_column`` and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. When ``bins`` is used, a column named
        ``grouping_column + '_bins'`` is added to (or overwritten in) this
        frame in place.
    grouping_column : str
        Column that defines the groups.
    bins : int or sequence, optional
        Passed to ``pandas.cut`` to discretise ``grouping_column``. When
        omitted (or empty), the distinct values of the column are used as
        groups directly.
    line : bool, default True
        Forwarded to ``plot_correlation_results``.
    ax : matplotlib.axes.Axes, optional
        Forwarded to ``plot_correlation_results``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``calculate_pfs_os_correlation``.
    """
    # A plain truth test (`if bins:`) raises on NumPy arrays, so check the
    # length where one exists and fall back to truthiness otherwise (ints).
    if bins is None:
        use_bins = False
    elif hasattr(bins, '__len__'):
        use_bins = len(bins) > 0
    else:
        use_bins = bool(bins)

    if use_bins:
        group_column = grouping_column + '_bins'
        df[group_column] = pd.cut(df[grouping_column], bins=bins)
        group_labels = sorted(df[group_column].dropna().unique(), key=lambda interval: interval.left)
    else:
        group_column = grouping_column
        # Drop missing labels before sorting, as the binned branch does: NaN
        # breaks the sort order and None cannot be compared with other values.
        # Missing labels never match any row in the equality filter anyway.
        group_labels = sorted(df[grouping_column].dropna().unique())

    correlation_df = calculate_pfs_os_correlation(group_column, group_labels, df)

    plot_correlation_results(correlation_df, group_column, line=line, ax=ax)

    return correlation_df

### Treatment type
Different treatment modalities, such as chemotherapy and targeted therapy, have distinct mechanisms of action. 

The goal here is to see whether the correlation between PFS and OS differs between treatment categories (chemotherapy, targeted therapy, immunotherapy, and other). If one category shows a stronger PFS-OS correlation, it may suggest that PFS is a more reliable surrogate for OS for that particular type of treatment.

In [None]:
def categorize_treatment_plan(df):
    """Add a ``treatment_category`` column derived from ``systemicTreatmentPlan``.

    Every row starts as 'Other' and is relabelled when its plan appears in one
    of the lists below. The frame is modified in place and also returned.

    NOTE(review): plans with a ``_B`` / ``_P`` suffix are listed under
    chemotherapy; if those suffixes denote an added targeted agent, the
    grouping may need revisiting — confirm with the data owners.
    """
    plans_by_category = {
        'Chemotherapy': [
            'FOLFOX', 'FOLFOX_B', 'CAPOX', 'CAPOX_B', 'FOLFIRI', 'FOLFIRI_B', 'FOLFOX_P',
            'FOLFOXIRI', 'FOLFOXIRI_B', 'CAPECITABINE', 'FLUOROURACIL', 'FOLFIRI_P', 'IRINOTECAN',
        ],
        'Targeted Therapy': ['CAPECITABINE_BEVACIZUMAB', 'FLUOROURACIL_BEVACIZUMAB'],
        'Immunotherapy': ['PEMBROLIZUMAB', 'NIVOLUMAB'],
    }

    # Anything not listed above keeps this default label.
    df['treatment_category'] = 'Other'

    plan_column = df['systemicTreatmentPlan']
    for category_name, plan_names in plans_by_category.items():
        df.loc[plan_column.isin(plan_names), 'treatment_category'] = category_name

    return df

# Label every treatment row with its category (the helper also edits the frame in place).
df_treatments = df_treatments.pipe(categorize_treatment_plan)

In [None]:
# Bars only: the categories have no natural order, so no connecting line is drawn.
treatment_category_correlation_results = calculate_and_plot(
    df_treatments,
    'treatment_category',
    line=False,
)
treatment_category_correlation_results

### Patient characteristics

Specific patient characteristics can affect both treatment tolerance and survival outcomes. We analyze the different groups to check whether the relationship between PFS and OS differs within them.

In [None]:
# One panel per patient characteristic, laid out on a 3x3 grid.
fig, axs = plt.subplots(3, 3, figsize=(18, 14))

# Continuous or ordinal characteristics: bars plus a connecting line.
age_correlation_results = calculate_and_plot(df_treatments, 'ageAtDiagnosis', bins=range(0, 110, 10), ax=axs[0, 0])
who_status_correlation_results = calculate_and_plot(df_treatments, 'whoStatusPreTreatmentStart', ax=axs[0, 1])
cci_correlation_results = calculate_and_plot(df_treatments, 'cci', ax=axs[0, 2])

# Flag-like characteristics: bars only.
msi_correlation_results = calculate_and_plot(df_treatments, 'hasMsi', line=False, ax=axs[1, 0])
braf_correlation_results = calculate_and_plot(df_treatments, 'hasBrafMutation', line=False, ax=axs[1, 1])
braf_v_correlation_results = calculate_and_plot(df_treatments, 'hasBrafV600EMutation', line=False, ax=axs[1, 2])
ras_correlation_results = calculate_and_plot(df_treatments, 'hasRasMutation', line=False, ax=axs[2, 0])
kras_correlation_results = calculate_and_plot(df_treatments, 'hasKrasG12CMutation', line=False, ax=axs[2, 1])

# The previous bins, range(0, 1193, 100), stop at the edge 1100, so pd.cut left
# every duration above 1100 unbinned and those rows were silently excluded.
# Extending the edges to 1200 keeps them in a final (1100, 1200] bin.
# NOTE(review): presumably 1193 was chosen as just above the longest observed
# duration — confirm, and widen further if longer durations exist.
treatment_duration_correlation_results = calculate_and_plot(df_treatments, 'systemicTreatmentPlanDuration', bins=range(0, 1201, 100), ax=axs[2, 2])

# Lay out this figure explicitly rather than whichever figure pyplot treats as current.
fig.tight_layout()
plt.show()

### Relation between PFS and OS

A regression analysis allows us to quantify how much OS increases for each additional unit of PFS (e.g., how many extra days of survival we expect for every additional day of PFS; because both variables are measured in days here, the slope reads the same way for months).

We are using a simple linear regression model because it provides an intuitive way to understand the linear relationship between the two variables. The slope of the regression line tells us the expected gain in OS for each day or month of improvement in PFS. A higher slope would suggest that even modest improvements in PFS are associated with significant gains in OS.

The intercept gives the baseline OS when PFS is zero, which can offer insight into survival even in the absence of progression-free time.


In [None]:
from sklearn.linear_model import LinearRegression

# Single-feature design matrix (PFS) and target vector (OS).
X = df_treatments[['observedPfsDays']]
y = df_treatments['observedOsFromTumorIncidenceDays']

# Ordinary least squares fit of OS on PFS; fit() returns the estimator itself.
model = LinearRegression().fit(X, y)

# coef_ holds one entry per feature, so the slope is its first element.
pfs_os_slope, pfs_os_intercept = model.coef_, model.intercept_

print(f"Slope (OS gain per unit of PFS): {pfs_os_slope[0]}")
print(f"Intercept (Baseline OS): {pfs_os_intercept}")

# Fitted OS value for every observed PFS value.
y_pred = model.predict(X)

# Scatter of the observations with the fitted line on top.
regression_fig, regression_ax = plt.subplots(figsize=(10, 6))
regression_ax.scatter(X['observedPfsDays'], y, color='blue', label='Data Points')
regression_ax.plot(
    X['observedPfsDays'],
    y_pred,
    color='red',
    label=f'Fitted Line (Slope: {pfs_os_slope[0]:.2f}, Intercept: {pfs_os_intercept:.2f})',
)
regression_ax.set_xlabel('Observed PFS Days')
regression_ax.set_ylabel('Observed OS Days')
regression_ax.set_title('PFS vs OS with Fitted Line')
regression_ax.legend()
regression_fig.tight_layout()
plt.show()