In [None]:
import pandas as pd 
import pymysql
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter
import seaborn as sns

In [None]:
# Load the full palliativeReference table into a DataFrame.
# Connection parameters come from the 'RAnalysis' group of the MySQL option
# file, so no credentials are stored in the notebook.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization_v2'  # 'db' is a deprecated alias of 'database'
)

query = 'SELECT * FROM palliativeReference'

# Close the connection even when the query fails, so a failed run does not
# leave a dangling connection behind in the kernel.
try:
    df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df.head()

In [None]:
# Number of rows per value of the systemic-treatment column
# (value_counts() leaves out missing values by default).
df['systemicTreatmentsAfterMetastaticDiagnosis'].value_counts()

In [None]:
from lifelines import KaplanMeierFitter, CoxPHFitter
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.lines import Line2D

# Columns used by the Cox model: duration, event flag and covariates.
# Rows missing any of them are excluded from the overall-survival analysis.
cox_columns = [
    'survivalDaysSinceMetastaticDiagnosis',
    'hadSurvivalEvent',
    'whoAssessmentAtMetastaticDiagnosis',
    'ageAtMetastaticDiagnosis',
    'diagnosisYear',
    'systemicTreatmentsAfterMetastaticDiagnosis'
]
df_os = df.dropna(subset=cox_columns)

treated_df = df_os[df_os['systemicTreatmentsAfterMetastaticDiagnosis'] == 1]
untreated_df = df_os[df_os['systemicTreatmentsAfterMetastaticDiagnosis'] == 0]

cph_df = df_os[cox_columns]
cph = CoxPHFitter()
cph.fit(cph_df, duration_col='survivalDaysSinceMetastaticDiagnosis', event_col='hadSurvivalEvent')

# The Cox curves are drawn in the order of `values=[0, 1]`, so the first color
# belongs to the untreated curve (0) and the second to the treated curve (1).
cox_untreated_color, cox_treated_color = 'purple', 'mediumpurple'
cox_colors = [cox_untreated_color, cox_treated_color]

fig, axs = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Cox Proportional Hazards Model Only
cph.plot_partial_effects_on_outcome(
    covariates='systemicTreatmentsAfterMetastaticDiagnosis',
    values=[0, 1],
    ax=axs[0],
    color=cox_colors,
    marker=None
)
axs[0].set_title('Cox Proportional Hazards')
axs[0].set_xlabel('Time (days)')
axs[0].set_ylabel('Overall Survival Probability')
axs[0].legend(['Cox Untreated', 'Cox Treated'])

# Plot 2: Kaplan-Meier and Cox Proportional Hazards Combined
kmf = KaplanMeierFitter()

# The same fitter is refit per group; each curve is plotted right after its fit.
kmf.fit(treated_df['survivalDaysSinceMetastaticDiagnosis'], event_observed=treated_df['hadSurvivalEvent'], label='KM Treated')
kmf.plot_survival_function(ax=axs[1], color='forestgreen')

kmf.fit(untreated_df['survivalDaysSinceMetastaticDiagnosis'], event_observed=untreated_df['hadSurvivalEvent'], label='KM Untreated')
kmf.plot_survival_function(ax=axs[1], color='limegreen')

cph.plot_partial_effects_on_outcome(
    covariates='systemicTreatmentsAfterMetastaticDiagnosis',
    values=[0, 1],
    ax=axs[1],
    color=cox_colors,
    marker=None
)

axs[1].set_title('Kaplan-Meier vs Cox Proportional Hazards')
axs[1].set_xlabel('Time (days)')
axs[1].set_ylabel('Overall Survival Probability')

# Hand-built legend: the labels must match the colors actually used above,
# i.e. the Cox treated/untreated entries follow cox_colors, not list position.
custom_lines = [
    Line2D([0], [0], color='forestgreen', lw=2, label='KM Treated'),
    Line2D([0], [0], color='limegreen', lw=2, label='KM Untreated'),
    Line2D([0], [0], color=cox_treated_color, lw=2, label='Cox Treated'),
    Line2D([0], [0], color=cox_untreated_color, lw=2, label='Cox Untreated')
]
axs[1].legend(handles=custom_lines)

plt.tight_layout()
plt.show()


In [None]:
def plot_cox_model_by_who_status(subgroup_df, who_status, ax):
    """Fit a Cox model on one WHO-status subgroup and draw it on ``ax``.

    The model uses treatment and age as covariates and plots the partial
    effect of treatment (0 vs 1). The share of each refrainment reason among
    the untreated rows is written onto the axes as text.

    Parameters:
        subgroup_df: DataFrame with the survival, event, treatment, age and
            'reasonRefrainmentFromTreatment' columns.
        who_status: WHO status of the subgroup; must be convertible to int.
        ax: matplotlib Axes to draw on.

    Returns:
        DataFrame with columns 'reason', 'percentage' and 'who_status'.
    """
    duration_col = 'survivalDaysSinceMetastaticDiagnosis'
    event_col = 'hadSurvivalEvent'
    treatment_col = 'systemicTreatmentsAfterMetastaticDiagnosis'
    reason_col = 'reasonRefrainmentFromTreatment'

    model = CoxPHFitter()
    model.fit(
        subgroup_df[[duration_col, event_col, treatment_col, 'ageAtMetastaticDiagnosis']],
        duration_col=duration_col,
        event_col=event_col,
    )
    model.plot_partial_effects_on_outcome(covariates=treatment_col, values=[0, 1], cmap='coolwarm', ax=ax)

    # Missing reasons are counted under the placeholder 'NONE'.
    no_treatment = subgroup_df[subgroup_df[treatment_col] == 0]
    reasons = no_treatment[reason_col].fillna('NONE')
    shares = (reasons.value_counts() / len(no_treatment)) * 100

    # One line per reason, abbreviated to its first four characters.
    reason_text = ''.join(
        f"{reason[:4]}: {percent:.1f}%\n" for reason, percent in shares.items()
    )

    reason_percents = shares.reset_index()
    reason_percents.columns = ['reason', 'percentage']
    reason_percents['who_status'] = int(who_status)

    ax.text(
        0.75, 0.7, reason_text,
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='center',
        horizontalalignment='left'
    )

    ax.set_title(f'WHO Status {who_status}')
    ax.set_xlabel('Time (days)')
    ax.set_ylabel('Survival Probability')
    ax.legend(['Untreated', 'Treated'])

    return reason_percents

# One panel per WHO status, in ascending order. unique() returns values in
# order of appearance, which would make the panel order depend on row order.
who_statuses = sorted(df_os['whoAssessmentAtMetastaticDiagnosis'].unique())

# Size the grid from the data so more than six statuses cannot overflow it.
n_cols = 3
n_rows = max(1, -(-len(who_statuses) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.flatten()

# Collected per-status reason percentages, consumed by the next cell.
reason_percents_list = []

for panel_ax, who_status in zip(axes, who_statuses):
    subgroup_df = df_os[df_os['whoAssessmentAtMetastaticDiagnosis'] == who_status]

    if not subgroup_df.empty:
        reason_percents = plot_cox_model_by_who_status(subgroup_df, who_status, panel_ax)
        reason_percents_list.append(reason_percents)
    else:
        print(f"No data for WHO status {who_status}")

# Remove the grid cells that have no WHO status assigned.
for unused_ax in axes[len(who_statuses):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Short display labels for the raw refrainment-reason codes.
# 'NONE' is the placeholder used for rows without a recorded reason.
reason_label_mapping = {
    'NONE': 'Unknown',
    'EXPECTED_FAST_PROGRESSION_OR_HIGH_TUMOR_LOAD': 'Expected Fast Progression',
    'WISH_OR_REFUSAL_FROM_PATIENT_OR_FAMILY': 'Patient/Family Refusal',
    'COMORBIDITY_AND_OR_PERFORMANCE_OR_FUNCTIONAL_STATUS_OR_PRESENCE_OTHER_TUMOR': 'Comorbidity/Performance Status',
    'LIMITED_TUMOR_LOAD_OR_FEW_COMPLAINTS': 'Limited Tumor Load'
}

# Stack the per-status tables, then reshape to one row per WHO status and one
# column per reason (0 where a reason does not occur for a status).
who_reason_percentages = pd.concat(reason_percents_list, ignore_index=True)
who_reason_pivot = (
    who_reason_percentages
    .pivot(index='who_status', columns='reason', values='percentage')
    .fillna(0)
    .reset_index()
    .sort_values('who_status')
    .rename(columns=reason_label_mapping)
)

plt.figure(figsize=(10, 6))

simplified_reasons = list(reason_label_mapping.values())

# Only reasons that have a mapped label and occur in the data are drawn.
for reason_label in simplified_reasons:
    if reason_label not in who_reason_pivot.columns:
        continue
    plt.plot(
        who_reason_pivot['who_status'],
        who_reason_pivot[reason_label],
        marker='o',
        label=reason_label,
    )

plt.xlabel('WHO Performance Status')
plt.ylabel('Percentage of Untreated Patients (%)')
plt.title('Trends in Reasons for Not Receiving Treatment Across WHO Statuses')
plt.legend(title='Reason for Refrainment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(who_reason_pivot['who_status'])
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
def plot_cox_model_by_age_group(subgroup_df, age_group, ax):
    """Fit a Cox model on one age-group subgroup and draw it on ``ax``.

    The model uses treatment and age as covariates and plots the partial
    effect of treatment (0 vs 1). The share of each refrainment reason among
    the untreated rows is written onto the axes as text.

    Parameters:
        subgroup_df: DataFrame with the survival, event, treatment, age and
            'reasonRefrainmentFromTreatment' columns.
        age_group: Label of the age group; used in the title and stored in
            the returned table.
        ax: matplotlib Axes to draw on.

    Returns:
        DataFrame with columns 'reason', 'percentage' and 'age_group'.
    """
    duration_col = 'survivalDaysSinceMetastaticDiagnosis'
    event_col = 'hadSurvivalEvent'
    treatment_col = 'systemicTreatmentsAfterMetastaticDiagnosis'
    reason_col = 'reasonRefrainmentFromTreatment'

    model = CoxPHFitter()
    model.fit(
        subgroup_df[[duration_col, event_col, treatment_col, 'ageAtMetastaticDiagnosis']],
        duration_col=duration_col,
        event_col=event_col,
    )
    model.plot_partial_effects_on_outcome(covariates=treatment_col, values=[0, 1], cmap='coolwarm', ax=ax)

    # Missing reasons are counted under the placeholder 'NONE'.
    no_treatment = subgroup_df[subgroup_df[treatment_col] == 0]
    reasons = no_treatment[reason_col].fillna('NONE')
    shares = (reasons.value_counts() / len(no_treatment)) * 100

    # One line per reason, abbreviated to its first four characters.
    reason_text = ''.join(
        f"{reason[:4]}: {percent:.1f}%\n" for reason, percent in shares.items()
    )

    reason_percents = shares.reset_index()
    reason_percents.columns = ['reason', 'percentage']
    reason_percents['age_group'] = age_group

    ax.text(
        0.75, 0.6, reason_text,
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='center',
        horizontalalignment='left'
    )

    ax.set_title(f'Age Group {age_group}')
    ax.set_xlabel('Time (days)')
    ax.set_ylabel('Survival Probability')
    ax.legend(['Untreated', 'Treated'])

    return reason_percents

# Ten-year age bins starting at the youngest age in the cohort.
# The minimum is cast to int because range() rejects floats, and the age
# column is float whenever the source table contains missing ages.
min_age = int(df_os['ageAtMetastaticDiagnosis'].min())
bins = list(range(min_age, 110, 10))
labels = [f"{i}-{i+9}" for i in bins[:-1]]

# assign() builds a new frame instead of writing a column into a frame that
# was itself derived from `df`, which avoids pandas' chained-assignment warning.
# right=False makes each bin include its lower edge and exclude its upper edge.
df_os = df_os.assign(
    age_group=pd.cut(df_os['ageAtMetastaticDiagnosis'], bins=bins, labels=labels, right=False)
)

# Age groups that actually occur, kept in bin order.
present_groups = set(df_os['age_group'].dropna().unique())
age_groups = [label for label in labels if label in present_groups]

# At least one row so plt.subplots does not fail when no age group is present.
n_rows, n_cols = max(1, (len(age_groups) + 2) // 3), 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 12), squeeze=False)
axes = axes.flatten()

# Collected per-age-group reason percentages, consumed by the next cell.
reason_percents_list = []

for panel_ax, age_group in zip(axes, age_groups):
    subgroup_df = df_os[df_os['age_group'] == age_group]
    subgroup_df = subgroup_df.dropna(subset=['survivalDaysSinceMetastaticDiagnosis', 'hadSurvivalEvent', 'systemicTreatmentsAfterMetastaticDiagnosis', 'ageAtMetastaticDiagnosis'])

    if not subgroup_df.empty:
        reason_percents = plot_cox_model_by_age_group(subgroup_df, age_group, panel_ax)
        reason_percents_list.append(reason_percents)
    else:
        print(f"No data for age group {age_group}")

# Remove the grid cells that have no age group assigned.
for unused_ax in axes[len(age_groups):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Stack the per-age-group tables, then reshape to one row per age group and
# one column per reason (0 where a reason does not occur for a group).
age_reason_percentages = pd.concat(reason_percents_list, ignore_index=True)
age_reason_pivot = (
    age_reason_percentages
    .pivot(index='age_group', columns='reason', values='percentage')
    .fillna(0)
    .reset_index()
)

# An ordered categorical makes the sort follow the bin order in `labels`
# rather than the alphabetical order of the label strings.
age_reason_pivot['age_group'] = pd.Categorical(
    age_reason_pivot['age_group'],
    categories=labels,
    ordered=True
)
age_reason_pivot = (
    age_reason_pivot
    .sort_values('age_group')
    .rename(columns=reason_label_mapping)
)

plt.figure(figsize=(10, 6))

simplified_reasons = list(reason_label_mapping.values())

# Only reasons that have a mapped label and occur in the data are drawn.
for reason_label in simplified_reasons:
    if reason_label not in age_reason_pivot.columns:
        continue
    plt.plot(
        age_reason_pivot['age_group'],
        age_reason_pivot[reason_label],
        marker='o',
        label=reason_label,
    )

plt.xlabel('Age')
plt.ylabel('Percentage of Untreated Patients (%)')
plt.title('Trends in Reasons for Not Receiving Treatment Across Ages')
plt.legend(title='Reason for Refrainment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(age_reason_pivot['age_group'])
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Baseline characteristics compared between treated and untreated patients.
baseline_columns = ['ageAtMetastaticDiagnosis', 'whoAssessmentAtMetastaticDiagnosis', 'charlsonComorbidityIndex']

# Median-impute missing baseline values. This overwrites the columns in `df`,
# so later cells see the imputed values.
for column in baseline_columns:
    df[column] = df[column].fillna(df[column].median())

treatment_flag = df['systemicTreatmentsAfterMetastaticDiagnosis']
treated = df[treatment_flag == 1]
untreated = df[treatment_flag == 0]

print("Treated patients - Baseline characteristics:")
print(treated[baseline_columns].describe())

print("\nUntreated patients - Baseline characteristics:")
print(untreated[baseline_columns].describe())

In [None]:
plt.figure(figsize=(18, 6))

# One panel per baseline characteristic: (column, number of bins, title, x label).
histogram_panels = [
    ('ageAtMetastaticDiagnosis', 20, 'Age Distribution by Treatment Status', 'Age at Diagnosis'),
    ('whoAssessmentAtMetastaticDiagnosis', 5, 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    ('charlsonComorbidityIndex', 10, 'charlsonComorbidityIndex Distribution by Treatment Status', 'Charlson Comorbidity Index'),
]

for position, (column, n_bins, title, x_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, position)
    # Treated and untreated are overlaid; each histogram is normalised to
    # percent within its own group.
    sns.histplot(treated[column], color='blue', label='Treated', bins=n_bins, stat='percent')
    sns.histplot(untreated[column], color='orange', label='Untreated', bins=n_bins, stat='percent')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Percentage')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Readable group labels for the x axis; values other than 0/1 are left as-is.
df['treatment_label'] = df['systemicTreatmentsAfterMetastaticDiagnosis'].replace({0: 'No Treatment', 1: 'Treatment'})

plt.figure(figsize=(18, 6))

# One boxplot panel per baseline characteristic: (column, title, y label).
boxplot_panels = [
    ('ageAtMetastaticDiagnosis', 'Age by Treatment Status', 'Age at Diagnosis'),
    ('whoAssessmentAtMetastaticDiagnosis', 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    ('charlsonComorbidityIndex', 'charlsonComorbidityIndex by Treatment Status', 'Charlson Comorbidity Index'),
]

for position, (column, title, y_label) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(x='treatment_label', y=column, data=df)
    plt.title(title)
    plt.xlabel('systemicTreatmentsAfterMetastaticDiagnosis')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()