In [None]:
import pymysql
import pandas as pd

In [None]:
# Connection settings are read from the 'RAnalysis' group of the MySQL option file.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization'  # `db=` is a deprecated alias of `database=`
)

# TODO: confirm that knownPalliativeTreatments is the right table for this check.
# It is the data used for the analysis, so the PFS-OS correlation should
# presumably be confirmed on this same data.
query = """  
SELECT * 
FROM knownPalliativeTreatments 
"""

# Close the connection even if the query fails, so a failed run leaks nothing.
try:
    df_treatments = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df_treatments.head()

In [None]:
# Keep only rows where both survival endpoints are present; a row missing
# either PFS or OS cannot contribute to a PFS-OS correlation.
df_treatments = df_treatments.dropna(
    subset=['observedPfsDays', 'observedOsFromTumorIncidenceDays'],
    how='any',
)

# Correlation between PFS and OS 

The aim is to determine whether PFS is a good surrogate for OS by evaluating the strength of their relationship.

We use Pearson's correlation coefficient here because it measures the linear relationship between two continuous variables (PFS and OS). A Pearson correlation close to 1 indicates a strong positive relationship, meaning that improvements in PFS are associated with increases in OS.

In [None]:
def calculate_pfs_os_correlation(grouping_column, group_labels, df):
    """Print the Pearson correlation between PFS and OS for each requested group.

    Args:
        grouping_column: Name of the column in ``df`` that defines the groups.
        group_labels: Iterable of values of ``grouping_column`` to report on.
            Labels that match no rows are skipped without any output.
        df: DataFrame containing ``grouping_column``, ``observedPfsDays`` and
            ``observedOsFromTumorIncidenceDays``.

    Returns:
        None. One line is printed per non-empty group; the value is
        "Not available" when the correlation cannot be computed.
    """
    survival_columns = ['observedPfsDays', 'observedOsFromTumorIncidenceDays']
    for group_label in group_labels:
        group = df[df[grouping_column] == group_label]
        if group.empty:
            continue
        # .corr() on two columns always yields a 2x2 frame, so it is never
        # empty; a group with too little data (e.g. a single row) shows up as
        # NaN in the off-diagonal cell instead. Test for NaN, not emptiness.
        pfs_os_correlation = group[survival_columns].corr(method='pearson').iat[0, 1]
        print(f"PFS-OS correlation ({grouping_column}: {group_label}):",
              "Not available" if pd.isna(pfs_os_correlation) else pfs_os_correlation)

## First-line treatments

In [None]:
df_treatments['treatment_order'] = df_treatments['order']

# Subset of first-line records (treatment_order == 1) used by the subgroup
# analyses and the regression further down. Without this definition those
# cells fail with NameError on a fresh kernel. .copy() gives an independent
# frame so that columns added later do not write into a slice of df_treatments.
first_line_treatments = df_treatments[df_treatments['treatment_order'] == 1].copy()

calculate_pfs_os_correlation('treatment_order', [1], df_treatments)

## Subgroup Analysis

### Treatment type
Different treatment modalities, such as chemotherapy and targeted therapy, have distinct mechanisms of action. 

The goal here is to see whether the correlation between PFS and OS differs between treatment categories (chemotherapy, targeted therapy, immunotherapy, and all remaining plans grouped as "Other"). If one category shows a stronger PFS-OS correlation, it may suggest that PFS is a more reliable surrogate for OS for that particular type of treatment.

In [None]:
def categorize_treatment_plan(df):
    """Label each row with a treatment category derived from its systemic plan.

    A 'treatment_category' column is written onto ``df`` itself: rows whose
    'systemicTreatmentPlan' appears in one of the lists below get that
    category, every other row (including missing plans) gets 'Other'.

    Args:
        df: DataFrame with a 'systemicTreatmentPlan' column.

    Returns:
        The same DataFrame object that was passed in, now carrying the
        'treatment_category' column.
    """
    plans_by_category = {
        'Chemotherapy': [
            'FOLFOX', 'FOLFOX_B', 'CAPOX', 'CAPOX_B', 'FOLFIRI', 'FOLFIRI_B', 'FOLFOX_P',
            'FOLFOXIRI', 'FOLFOXIRI_B', 'CAPECITABINE', 'FLUOROURACIL', 'FOLFIRI_P', 'IRINOTECAN',
        ],
        'Targeted Therapy': ['CAPECITABINE_BEVACIZUMAB', 'FLUOROURACIL_BEVACIZUMAB'],
        'Immunotherapy': ['PEMBROLIZUMAB', 'NIVOLUMAB'],
    }

    # Start from the fallback label, then overwrite the rows that match a list.
    df['treatment_category'] = 'Other'
    for category, plans in plans_by_category.items():
        in_category = df['systemicTreatmentPlan'].isin(plans)
        df.loc[in_category, 'treatment_category'] = category

    return df

# Tag every first-line record with its treatment category. The function adds the
# 'treatment_category' column to the frame it receives and returns that same frame.
first_line_treatments = categorize_treatment_plan(first_line_treatments)

In [None]:
# Report the PFS-OS correlation once for every treatment category that occurs
# in the first-line data.
treatment_labels = first_line_treatments['treatment_category'].unique()
calculate_pfs_os_correlation(
    grouping_column='treatment_category',
    group_labels=treatment_labels,
    df=first_line_treatments,
)

### Patient characteristics

Specific patient characteristics can affect both treatment tolerance and survival outcomes. We analyze the different groups to check whether the relationship between PFS and OS differs within them.

In [None]:
# Ten-year age bands: (0, 10], (10, 20], ..., (90, 100].
first_line_treatments['age_bins'] = pd.cut(first_line_treatments['ageAtDiagnosis'], bins=range(0, 110, 10))

# pd.cut yields NaN for a missing age or one outside the bins. NaN cannot be
# ordered against Interval objects, so drop it before sorting the labels.
age_bin_labels = sorted(first_line_treatments['age_bins'].dropna().unique())
calculate_pfs_os_correlation('age_bins', age_bin_labels, first_line_treatments)

In [None]:
# Drop missing WHO statuses before sorting: None cannot be ordered against the
# other labels, NaN makes the sort order unpredictable, and a missing label
# never matches an `==` filter, so it could not produce a group anyway.
who_status_labels = sorted(first_line_treatments['whoStatusPreTreatmentStart'].dropna().unique())
calculate_pfs_os_correlation('whoStatusPreTreatmentStart', who_status_labels, first_line_treatments)

In [None]:
# Drop missing RAS statuses before sorting: None cannot be ordered against the
# other labels, NaN makes the sort order unpredictable, and a missing label
# never matches an `==` filter, so it could not produce a group anyway.
ras_status_labels = sorted(first_line_treatments['hasRasMutation'].dropna().unique())
calculate_pfs_os_correlation('hasRasMutation', ras_status_labels, first_line_treatments)

### Relation between PFS and OS

A regression analysis allows us to quantify how much OS increases for each additional unit of PFS (e.g., how many extra months of survival we expect for every month of PFS improvement).

We are using a simple linear regression model because it provides an intuitive way to understand the linear relationship between the two variables. The slope of the regression line tells us the expected gain in OS for each day or month of improvement in PFS. A higher slope would suggest that even modest improvements in PFS are associated with significant gains in OS.

The intercept gives the baseline OS when PFS is zero, which can offer insight into survival even in the absence of progression-free time. Because both columns are recorded in days, the intercept is expressed in days.


In [None]:
from sklearn.linear_model import LinearRegression

# Predictor is selected with a list so it stays a one-column DataFrame;
# the target is a plain Series.
X = first_line_treatments[['observedPfsDays']]  # PFS
y = first_line_treatments['observedOsFromTumorIncidenceDays']  # OS

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# coef_ is an array with one entry per predictor column (a single one here).
pfs_os_slope = model.coef_
pfs_os_intercept = model.intercept_

print(f"Slope (OS gain per unit of PFS): {pfs_os_slope[0]}")
print(f"Intercept (Baseline OS): {pfs_os_intercept}")