# Analysis of Treatment Effects on Survival

This analysis investigates the association between different treatment regimens and survival outcomes.


In [None]:
import pymysql
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test

# Set seaborn's global plot style to "whitegrid" for the figures that follow.
sns.set(style="whitegrid")

In [None]:
# Load the knownPalliativeTreatedReference table into a DataFrame.
# Connection settings are read from the [RAnalysis] group of the option file.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization_v2',  # `db` is a deprecated alias of `database`
)

query = "SELECT * FROM knownPalliativeTreatedReference"

try:
    df = pd.read_sql(query, db_connection)
finally:
    # Release the connection even when the query fails.
    db_connection.close()

df.head()

## Summaries and Grouping of Rare Treatments

The initial step is to generate summary statistics for selected clinical variables, stratified by treatment regimen. In addition to reporting the mean, standard deviation, median, minimum, and maximum values, the number of subjects per treatment group is also determined. To enhance the reliability/readability of subsequent analyses and visualizations, treatment groups with a low subject count (fewer than 100) are aggregated into a single "Other" category. This consolidation reduces the potential impact of sparse data on statistical estimates and improves the clarity of graphical representations.


In [None]:
def summarize_treatments(data, treatment_col, characteristics, min_count=100):
    """
    Build a per-treatment summary table and a copy of the data in which
    infrequent treatments are pooled.

    data: DataFrame with one row per patient.
    treatment_col: name of the column holding the treatment label.
    characteristics: list of columns to summarize per treatment.
    min_count: treatments with fewer rows than this are relabelled 'Other'.

    Returns (summary_table, data_grouped):
      summary_table - mean/std/median/min/max of every characteristic per
                      treatment, plus a "count" column with the group size.
      data_grouped  - copy of `data` with rare treatments set to "Other";
                      `data` itself is not modified.
    """
    aggregations = ['mean', 'std', 'median', 'min', 'max']
    per_treatment_stats = data.groupby(treatment_col)[characteristics].agg(aggregations)

    patients_per_treatment = data[treatment_col].value_counts().rename("count")

    # Statistics and group sizes side by side, aligned on the treatment label.
    summary_table = pd.concat([per_treatment_stats, patients_per_treatment], axis=1)

    # Labels of the treatments that fall below the size threshold.
    below_threshold = patients_per_treatment < min_count
    rare_labels = patients_per_treatment[below_threshold].index

    data_grouped = data.copy()
    is_rare_row = data_grouped[treatment_col].isin(rare_labels)
    data_grouped.loc[is_rare_row, treatment_col] = "Other"

    return summary_table, data_grouped

# Clinical characteristics summarized per treatment.
# TODO: add 'cci' once it is available in the view.
CHARACTERISTICS = ['whoAssessmentAtMetastaticDiagnosis', 'ageAtDiagnosis']

# df_grouped is df with treatments of fewer than 100 patients relabelled "Other".
summary_table, df_grouped = summarize_treatments(
    data=df,
    treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
    characteristics=CHARACTERISTICS,
    min_count=100,
)

print("### Summary statistics by treatment (with patient counts):")
summary_table


In [None]:
def plot_characteristics_boxplot(data, treatment_col, characteristics):
    """
    Draw one boxplot figure per characteristic, with one box per treatment.

    data: DataFrame containing treatment_col and every listed characteristic.
    treatment_col: column used for the x-axis categories.
    characteristics: iterable of column names; each gets its own figure.
    """
    for characteristic in characteristics:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=data, x=treatment_col, y=characteristic, ax=ax)
        ax.set_title(f"Distribution of {characteristic} by {treatment_col}")
        ax.set_xlabel("Treatment")
        ax.set_ylabel(characteristic)
        # Rotate the tick labels of the current axes.
        plt.xticks(rotation=45)
        fig.tight_layout()
        plt.show()

# Use the grouped data so rare treatments share a single "Other" box.
plot_characteristics_boxplot(
    data=df_grouped,
    treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
    characteristics=CHARACTERISTICS,
)

## Kaplan–Meier Survival Curves

Kaplan–Meier survival analysis is utilized to estimate the survival functions for different treatment groups over time. This non-parametric method provides a visual representation of the probability of survival at successive time points. Separate plots are generated for overall survival and progression-free survival (when applicable), enabling a direct visual comparison of the time-to-event outcomes across the treatment regimens. Such visualizations are critical for identifying potential differences in survival patterns that may warrant further investigation.


In [None]:
def plot_km_survival(data, treatment_col, time_col, event_col, treatments,
                     title="Kaplan-Meier Survival", xlabel=None):
    """
    Plot Kaplan-Meier survival curves for a list of treatments on one figure.

    data: DataFrame containing treatment_col, time_col and event_col.
    treatment_col: column holding the treatment label of each row.
    time_col: column holding the duration until event or censoring.
    event_col: column flagging whether the event was observed.
    treatments: iterable of treatment labels to draw. A label with no rows
        that have both time and event present is skipped.
    title: figure title.
    xlabel: x-axis label. Defaults to time_col, so the axis always names the
        time origin actually plotted (a fixed "from treatment start" label
        is wrong for durations measured from another starting point).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    kmf = KaplanMeierFitter()
    plotted_any = False

    for trt in treatments:
        in_group = data[treatment_col] == trt
        df_surv = data.loc[in_group, [time_col, event_col]].dropna()
        if df_surv.empty:
            continue

        kmf.fit(df_surv[time_col], df_surv[event_col], label=str(trt))
        # Draw on the explicit axes rather than relying on pyplot's
        # "current axes" state, so all curves land on this figure.
        kmf.plot_survival_function(ax=ax, ci_show=False)
        plotted_any = True

    ax.set_title(title)
    ax.set_xlabel(time_col if xlabel is None else xlabel)
    ax.set_ylabel("Survival Probability")
    if plotted_any:
        # A legend without any curve only triggers a matplotlib warning.
        ax.legend(title='Treatment')
    fig.tight_layout()
    plt.show()

In [None]:
# Plot one curve per treatment label present in df_grouped; narrow this
# array to compare only a subset of treatments.
treatments_to_plot = df_grouped['firstSystemicTreatmentAfterMetastaticDiagnosis'].unique()  # adjust as needed
# Overall survival: duration and event flag come from the
# survivalDaysSinceMetastaticDiagnosis / hadSurvivalEvent columns.
plot_km_survival(
    data=df_grouped,
    treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
    time_col="survivalDaysSinceMetastaticDiagnosis",
    event_col="hadSurvivalEvent",
    treatments=treatments_to_plot,
    title="Kaplan-Meier Overall Survival by Treatment"
)

In [None]:
# Progression data are optional: draw the PFS curves only when both the
# duration and the event column are present.
if {"daysBetweenTreatmentStartAndProgression", "hadProgressionEvent"}.issubset(df_grouped.columns):
    plot_km_survival(
        data=df_grouped,
        treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
        time_col="daysBetweenTreatmentStartAndProgression",
        event_col="hadProgressionEvent",
        treatments=treatments_to_plot,
        title="Kaplan-Meier Progression-Free Survival by Treatment",
    )

## Cox Proportional Hazards Model

A Cox proportional hazards model is fitted to evaluate the independent effect of treatment regimens on survival outcomes while adjusting for relevant confounding variables: currently age at diagnosis and performance status (WHO assessment at metastatic diagnosis); the comorbidity index will be added once it is available in the source view. This multivariable regression model produces hazard ratios that quantify the relative hazard of the event of interest associated with each treatment group, compared to a reference category (the treatment level dropped when the treatment column is dummy-encoded). The model summary includes coefficients, standard errors, and statistical significance, providing insights into the strength and direction of the associations under investigation.


In [None]:
def fit_cox_model(data, duration_col, event_col, covariates, treatment_col, drop_first=True):
    """
    Fit a lifelines Cox proportional hazards model with treatment as a
    dummy-encoded categorical predictor.

    data: DataFrame containing duration_col, event_col, treatment_col and
        every column named in covariates.
    duration_col: column with the follow-up duration.
    event_col: column flagging whether the event was observed.
    covariates: list of additional predictor columns.
    treatment_col: categorical column expanded into dummy variables.
    drop_first: passed to pandas.get_dummies; when True the first treatment
        level gets no dummy column.

    Rows with a missing value in any of the used columns are dropped before
    fitting. Returns the fitted CoxPHFitter.
    """
    model_columns = [duration_col, event_col] + covariates + [treatment_col]
    complete_cases = data[model_columns].dropna().copy()

    # One indicator column per treatment level.
    design_matrix = pd.get_dummies(
        complete_cases,
        columns=[treatment_col],
        drop_first=drop_first,
    )

    model = CoxPHFitter()
    model.fit(design_matrix, duration_col=duration_col, event_col=event_col)
    return model

# Adjustment covariates for the Cox model. "cci" is left out for now
# (see the TODO on CHARACTERISTICS: it still has to be added to the view).
covariates = ["ageAtDiagnosis", "whoAssessmentAtMetastaticDiagnosis"]#, "cci"]

# Overall-survival model. With drop_first=True the first treatment level gets
# no dummy column, so the other treatment coefficients are relative to it.
cph = fit_cox_model(
    data=df_grouped,
    duration_col="survivalDaysSinceMetastaticDiagnosis",
    event_col="hadSurvivalEvent",
    covariates=covariates,
    treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis", 
    drop_first=True
)

# Print the fitted model's summary table.
cph.print_summary()

## Log-Rank Test

The log-rank test is applied as a non-parametric method to compare the survival distributions between two selected treatment groups. This test evaluates whether the differences observed in the Kaplan–Meier curves are statistically significant, offering additional evidence to support conclusions about the differential impact of the treatment regimens on survival outcomes.


In [None]:
def compare_two_treatments_logrank(data, treatment_col, treatmentA, treatmentB, time_col, event_col):
    """
    Run a log-rank test between two treatment groups and print the result.

    data: DataFrame containing treatment_col, time_col and event_col.
    treatment_col: column holding the treatment label of each row.
    treatmentA, treatmentB: the two treatment labels to compare.
    time_col: column holding the duration until event or censoring.
    event_col: column flagging whether the event was observed.

    Rows missing either the time or the event value are ignored.
    Prints the test summary; returns None.

    Raises:
        ValueError: if either treatment has no usable rows (for example the
            label does not occur in treatment_col, or all of its time/event
            values are missing). A log-rank test needs data in both groups.
    """
    def _complete_cases(treatment):
        # Rows of one treatment that have both a time and an event value.
        in_group = data[treatment_col] == treatment
        return data.loc[in_group, [time_col, event_col]].dropna()

    dfA = _complete_cases(treatmentA)
    dfB = _complete_cases(treatmentB)

    empty_groups = [
        str(label)
        for label, group in ((treatmentA, dfA), (treatmentB, dfB))
        if group.empty
    ]
    if empty_groups:
        raise ValueError(
            f"No complete ({time_col}, {event_col}) observations for "
            f"treatment(s): {', '.join(empty_groups)}"
        )

    results = logrank_test(
        dfA[time_col],
        dfB[time_col],
        event_observed_A=dfA[event_col],
        event_observed_B=dfB[event_col]
    )
    print(f"Log-rank test between {treatmentA} and {treatmentB}:")
    print(results.summary)
    print("-"*50)


In [None]:
# Overall survival: CAPOX versus CAPOX_B.
# NOTE(review): both labels must still occur in df_grouped; treatments with
# fewer than 100 patients were relabelled "Other" when df_grouped was built.
compare_two_treatments_logrank(
    data=df_grouped,
    treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
    treatmentA="CAPOX",
    treatmentB="CAPOX_B",
    time_col="survivalDaysSinceMetastaticDiagnosis",
    event_col="hadSurvivalEvent"
)

In [None]:

# Progression-free survival: CAPOX versus CAPOX_B.
# The progression columns are treated as optional by the PFS Kaplan-Meier
# cell, so apply the same guard here instead of failing with a KeyError.
if 'daysBetweenTreatmentStartAndProgression' in df_grouped.columns and 'hadProgressionEvent' in df_grouped.columns:
    compare_two_treatments_logrank(
        data=df_grouped,
        treatment_col="firstSystemicTreatmentAfterMetastaticDiagnosis",
        treatmentA="CAPOX",
        treatmentB="CAPOX_B",
        time_col="daysBetweenTreatmentStartAndProgression",
        event_col="hadProgressionEvent"
    )

## Correlation Analysis

Correlation analysis is performed on a set of continuous variables, including demographic and clinical outcome measures, to identify potential linear associations among them. A heatmap is generated to visually represent the correlation matrix, which helps to reveal underlying relationships that may influence the multivariate modeling.


In [None]:
# Continuous variables to correlate.
# TODO: add 'cci' once it is available in the view.
numeric_cols = ['ageAtDiagnosis',
                'survivalDaysSinceMetastaticDiagnosis',
                'daysBetweenTreatmentStartAndProgression']

# The progression column is treated as optional by the PFS Kaplan-Meier cell;
# keep only the columns that are present so a missing one does not raise.
available_numeric_cols = [col for col in numeric_cols if col in df.columns]

corr_df = df[available_numeric_cols].corr()
plt.figure(figsize=(8,6))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix of Selected Numeric Features")
plt.tight_layout()
plt.show()
