In [None]:
import pandas as pd 
import pymysql
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
import seaborn as sns

In [None]:
# Cohort definition: first episodes (`order` = 1) whose distant metastases were
# detected 'AT_START', that have no matching row in `surgery` (LEFT JOIN plus
# `s.id IS NULL` acts as an anti-join), whose pre-/post-surgery systemic
# chemotherapy and targeted-therapy flags are all 0 or NULL, and whose systemic
# treatment plan is not 'OTHER' (NULL plans are kept).
query = '''
SELECT
    d.hadSurvivalEvent,
    d.ageAtDiagnosis,
    d.cci,
    d.observedOverallSurvivalFromTumorIncidenceDays,
    e.systemicTreatmentPlan,
    e.hasReceivedTumorDirectedTreatment,
    e.observedPfsDays,
    e.whoStatusPreTreatmentStart,
    e.hadProgressionEvent,
    e.reasonRefrainmentFromTumorDirectedTreatment
    
FROM
    episode e
JOIN
    diagnosis d ON e.diagnosisId = d.id

LEFT JOIN
    surgery s ON e.id = s.episodeId 

WHERE
    (e.hasHadPreSurgerySystemicChemotherapy = 0 OR e.hasHadPreSurgerySystemicChemotherapy IS NULL)
    AND (e.hasHadPostSurgerySystemicChemotherapy = 0 OR e.hasHadPostSurgerySystemicChemotherapy IS NULL)
    AND (e.hasHadPreSurgerySystemicTargetedTherapy = 0 OR e.hasHadPreSurgerySystemicTargetedTherapy IS NULL)
    AND (e.hasHadPostSurgerySystemicTargetedTherapy = 0 OR e.hasHadPostSurgerySystemicTargetedTherapy IS NULL)
 
    AND s.id IS NULL

    AND e.distantMetastasesDetectionStatus = 'AT_START'
    AND (e.systemicTreatmentPlan != 'OTHER' OR e.systemicTreatmentPlan IS NULL)

    AND e.`order` = 1
'''

# NOTE(review): connection settings are read from an absolute, machine-specific
# option file; consider making this path configurable.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    db='actin_personalization'
)

# Close the connection even when the query raises, so a failed run does not
# leave an open database session behind in the kernel.
try:
    df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df.head()

In [None]:
# Short analysis aliases for the raw query columns (progression-free survival,
# overall survival and the treatment indicator).
df['pfs_time'] = df['observedPfsDays']
# Event flag is 1 only where a progression event equals 1; every other value
# (0, missing, anything else) becomes 0.
df['pfs_event'] = df['hadProgressionEvent'].eq(1).astype('int64')

column_aliases = {
    'os_time': 'observedOverallSurvivalFromTumorIncidenceDays',
    'os_event': 'hadSurvivalEvent',
    'treatment': 'hasReceivedTumorDirectedTreatment',
}
for alias, source_column in column_aliases.items():
    df[alias] = df[source_column]

# Size of each treatment arm (rows with a missing indicator are not counted).
df['treatment'].value_counts()

In [None]:
# Kaplan-Meier overall-survival curves for treated vs untreated patients,
# restricted to rows where both the survival time and the event flag are known.
df_os = df.dropna(subset=['os_time', 'os_event'])

is_treated = df_os['treatment'] == 1
is_untreated = df_os['treatment'] == 0
treated_df = df_os[is_treated]
untreated_df = df_os[is_untreated]

kmf = KaplanMeierFitter()

# Both arms are drawn on the same explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
for arm_label, arm_df in (('Treated', treated_df), ('Untreated', untreated_df)):
    kmf.fit(arm_df['os_time'], event_observed=arm_df['os_event'], label=arm_label)
    kmf.plot_survival_function(ax=ax)

ax.set_title('Overall Survival for Treated vs Untreated Patients')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Overall Survival Fraction')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
import pandas as pd

# One Kaplan-Meier panel per WHO performance status, comparing overall survival
# of treated vs untreated patients. Each panel is annotated with the share of
# untreated patients per refrainment reason (reason names cut to 4 characters),
# and one reason table per status is collected in `reason_percents_list` for
# the trend plot that follows.
df_os = df.dropna(subset=['whoStatusPreTreatmentStart', 'os_time', 'os_event'])
# Sorted so panels run in ascending WHO status instead of first-seen row order.
who_status_values = sorted(df_os['whoStatusPreTreatmentStart'].unique())

# Size the grid from the data; a fixed 2 x 3 grid raises as soon as a seventh
# distinct status is present.
cols = 3
rows = max(1, (len(who_status_values) + cols - 1) // cols)
plt.figure(figsize=(20, rows * 5))
reason_percents_list = []

for i, who_status in enumerate(who_status_values):
    ax = plt.subplot(rows, cols, i + 1)

    group_data = df_os[df_os['whoStatusPreTreatmentStart'] == who_status]
    treated_df = group_data[group_data['treatment'] == 1]
    untreated_df = group_data[group_data['treatment'] == 0]

    # NOTE(review): these percentages are relative to ALL patients in the WHO
    # group (treated + untreated), whereas the in-panel annotation below is
    # relative to untreated patients only. The downstream trend plot labels
    # its axis "Percentage of Untreated Patients" - confirm which denominator
    # is intended.
    reason_counts = group_data['reasonRefrainmentFromTumorDirectedTreatment'].value_counts()
    reason_percents = (reason_counts / len(group_data)) * 100
    reason_percents = reason_percents.reset_index()
    reason_percents.columns = ['reason', 'percentage']
    reason_percents['who_status'] = int(who_status)
    reason_percents_list.append(reason_percents)

    kmf = KaplanMeierFitter()
    for arm_label, arm_df in (('Treated', treated_df), ('Untreated', untreated_df)):
        if arm_df.empty:
            # Nothing to fit for this arm in this WHO group.
            continue
        kmf.fit(
            arm_df['observedOverallSurvivalFromTumorIncidenceDays'],
            event_observed=arm_df['os_event'],
            label=arm_label
        )
        # Draw on this panel's axes explicitly rather than relying on
        # pyplot's "current axes" state.
        kmf.plot_survival_function(ax=ax)

    # Reset for every panel: previously the text was only assigned when the
    # group had untreated patients, so a panel without them either raised a
    # NameError (first panel) or silently showed the previous panel's text.
    reason_text = ''
    if not untreated_df.empty:
        untreated_reason_counts = untreated_df['reasonRefrainmentFromTumorDirectedTreatment'].value_counts()
        percentages = (untreated_reason_counts / len(untreated_df)) * 100
        reason_text = ''.join(
            f"{reason[:4]}: {percent:.1f}%\n" for reason, percent in percentages.items()
        )

    ax.text(
        0.75, 0.7, reason_text,
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='center',
        horizontalalignment='left'
    )

    ax.set_title(f'Overall Survival for WHO Status {who_status}')
    ax.set_xlabel('Time (days)')
    ax.set_ylabel('Overall Survival Fraction')
    ax.legend()

plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()


In [None]:
   
# Stack the per-WHO-status reason tables and reshape them to wide form:
# one row per WHO status, one column per refrainment reason (0 where a reason
# did not occur for that status), ordered by WHO status.
who_reason_percentages = pd.concat(reason_percents_list, ignore_index=True)
who_reason_pivot = (
    who_reason_percentages
    .pivot(index='who_status', columns='reason', values='percentage')
    .fillna(0)
    .reset_index()
    .sort_values('who_status')
)

# Human-readable legend labels for the raw reason codes.
reason_label_mapping = {
    'EXPECTED_FAST_PROGRESSION_OR_HIGH_TUMOR_LOAD': 'Expected Fast Progression',
    'WISH_OR_REFUSAL_FROM_PATIENT_OR_FAMILY': 'Patient/Family Refusal',
    'COMORBIDITY_AND_OR_PERFORMANCE_OR_FUNCTIONAL_STATUS_OR_PRESENCE_OTHER_TUMOR': 'Comorbidity/Performance Status',
    'LIMITED_TUMOR_LOAD_OR_FEW_COMPLAINTS': 'Limited Tumor Load'
}
who_reason_pivot = who_reason_pivot.rename(columns=reason_label_mapping)

simplified_reasons = list(reason_label_mapping.values())
# Only reasons that actually occur in the data end up as pivot columns.
reasons_present = [label for label in simplified_reasons if label in who_reason_pivot.columns]

plt.figure(figsize=(10, 6))
for reason in reasons_present:
    plt.plot(
        who_reason_pivot['who_status'],
        who_reason_pivot[reason],
        marker='o',
        label=reason,
    )

plt.xlabel('WHO Performance Status')
# NOTE(review): the plotted percentages were computed upstream relative to all
# patients in each WHO group, not only the untreated ones - confirm this label.
plt.ylabel('Percentage of Untreated Patients (%)')
plt.title('Trends in Reasons for Not Receiving Treatment Across WHO Statuses')
plt.legend(title='Reason for Refrainment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(who_reason_pivot['who_status'])
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# One Kaplan-Meier panel per 10-year age band, comparing overall survival of
# treated vs untreated patients. Each panel is annotated with the share of
# untreated patients per refrainment reason (reason names cut to 4 characters),
# and one reason table per band is collected in `reason_percents_list` for the
# trend plot that follows.
# `.copy()` so that adding `age_group` below writes to an independent frame
# instead of a possible view of `df` (avoids SettingWithCopyWarning).
df_os = df.dropna(subset=['ageAtDiagnosis', 'os_time', 'os_event']).copy()

reason_percents_list = []

# Define bins dynamically based on the data.
# - int(): the column is float whenever the query returned NULL ages, and
#   range() rejects floats.
# - "+ 11": bands are half-open [lo, hi) (right=False), so the last edge must
#   be strictly greater than the maximum age; with "+ 10" the oldest patients
#   fell outside every band whenever (max - min) was a multiple of 10 and were
#   silently dropped from the analysis.
min_age = int(df_os['ageAtDiagnosis'].min())
max_age = int(df_os['ageAtDiagnosis'].max())
bins = list(range(min_age, max_age + 11, 10))
labels = [f"{i}-{i+9}" for i in bins[:-1]]
df_os['age_group'] = pd.cut(df_os['ageAtDiagnosis'], bins=bins, labels=labels, right=False)
# Order bands by their lower bound.
age_groups = sorted(df_os['age_group'].dropna().unique(), key=lambda x: int(x.split('-')[0]))

cols = 3
rows = (len(age_groups) + cols - 1) // cols
plt.figure(figsize=(20, rows * 5))

for i, age_group in enumerate(age_groups):
    ax = plt.subplot(rows, cols, i + 1)

    group_data = df_os[df_os['age_group'] == age_group]
    treated_df = group_data[group_data['treatment'] == 1]
    untreated_df = group_data[group_data['treatment'] == 0]

    # NOTE(review): these percentages are relative to ALL patients in the age
    # band (treated + untreated), whereas the in-panel annotation below is
    # relative to untreated patients only. The downstream trend plot labels
    # its axis "Percentage of Untreated Patients" - confirm which denominator
    # is intended.
    reason_counts = group_data['reasonRefrainmentFromTumorDirectedTreatment'].value_counts()
    reason_percents = (reason_counts / len(group_data)) * 100
    reason_percents = reason_percents.reset_index()
    reason_percents.columns = ['reason', 'percentage']
    reason_percents['age_group'] = age_group
    reason_percents_list.append(reason_percents)

    kmf = KaplanMeierFitter()
    for arm_label, arm_df in (('Treated', treated_df), ('Untreated', untreated_df)):
        if arm_df.empty:
            # Nothing to fit for this arm in this age band.
            continue
        kmf.fit(
            arm_df['observedOverallSurvivalFromTumorIncidenceDays'],
            event_observed=arm_df['os_event'],
            label=arm_label
        )
        # Draw on this panel's axes explicitly rather than relying on
        # pyplot's "current axes" state.
        kmf.plot_survival_function(ax=ax)

    # Reset for every panel: previously the text was only assigned when the
    # band had untreated patients, so a panel without them either raised a
    # NameError (first panel) or silently showed the previous panel's text.
    reason_text = ''
    if not untreated_df.empty:
        untreated_reason_counts = untreated_df['reasonRefrainmentFromTumorDirectedTreatment'].value_counts()
        percentages = (untreated_reason_counts / len(untreated_df)) * 100
        reason_text = ''.join(
            f"{reason[:4]}:{percent:.1f}%\n" for reason, percent in percentages.items()
        )

    ax.text(
        0.75, 0.7, reason_text,
        transform=ax.transAxes,
        fontsize=14,
        verticalalignment='center',
        horizontalalignment='left'
    )

    ax.set_title(f'Overall Survival for Age Group {age_group}')
    ax.set_xlabel('Time (days)')
    ax.set_ylabel('Overall Survival Fraction')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Stack the per-age-band reason tables and reshape them to wide form: one row
# per age band, one column per refrainment reason (0 where a reason did not
# occur). The band column is made an ordered categorical over `labels` so that
# sorting follows band order rather than string order.
age_reason_percentages = pd.concat(reason_percents_list, ignore_index=True)
age_reason_pivot = (
    age_reason_percentages
    .pivot(index='age_group', columns='reason', values='percentage')
    .fillna(0)
    .reset_index()
    .assign(
        age_group=lambda wide: pd.Categorical(
            wide['age_group'],
            categories=labels,
            ordered=True
        )
    )
    .sort_values('age_group')
    .rename(columns=reason_label_mapping)
)

simplified_reasons = list(reason_label_mapping.values())
# Only reasons that actually occur in the data end up as pivot columns.
reasons_present = [label for label in simplified_reasons if label in age_reason_pivot.columns]

plt.figure(figsize=(10, 6))
for reason in reasons_present:
    plt.plot(
        age_reason_pivot['age_group'],
        age_reason_pivot[reason],
        marker='o',
        label=reason,
    )

plt.xlabel('Age')
# NOTE(review): the plotted percentages were computed upstream relative to all
# patients in each age band, not only the untreated ones - confirm this label.
plt.ylabel('Percentage of Untreated Patients (%)')
plt.title('Trends in Reasons for Not Receiving Treatment Across Ages')
plt.legend(title='Reason for Refrainment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(age_reason_pivot['age_group'])
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Median-impute the three baseline covariates, then summarise them per arm.
# NOTE(review): this overwrites `df` in place, so re-running an earlier cell
# that calls dropna on these columns after this one will see imputed values
# instead of the original missing ones.
baseline_columns = ['ageAtDiagnosis', 'whoStatusPreTreatmentStart', 'cci']
for column in baseline_columns:
    df[column] = df[column].fillna(df[column].median())

treated = df[df['treatment'] == 1]
untreated = df[df['treatment'] == 0]

summaries = (
    ("Treated patients - Baseline characteristics:", treated),
    ("\nUntreated patients - Baseline characteristics:", untreated),
)
for heading, cohort in summaries:
    print(heading)
    print(cohort[baseline_columns].describe())

In [None]:
# Overlaid percentage histograms of the baseline covariates for treated vs
# untreated patients, one panel per covariate.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column, number of bins, panel title, x-axis label)
histogram_panels = [
    # Age Distribution
    ('ageAtDiagnosis', 20, 'Age Distribution by Treatment Status', 'Age at Diagnosis'),
    # WHO Performance Status Distribution
    ('whoStatusPreTreatmentStart', 5, 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    # Charlson Comorbidity Index Distribution
    ('cci', 10, 'CCI Distribution by Treatment Status', 'Charlson Comorbidity Index'),
]

for ax, (column, n_bins, title, xlabel) in zip(axes, histogram_panels):
    # Bin both arms over the same value range. Without a shared range each
    # histogram derives its edges from its own min/max, so the overlaid bars
    # cover different intervals and cannot be compared bar-to-bar.
    pooled = pd.concat([treated[column], untreated[column]])
    shared_range = (pooled.min(), pooled.max())

    sns.histplot(treated[column], color='blue', label='Treated',
                 bins=n_bins, binrange=shared_range, stat='percent', ax=ax)
    sns.histplot(untreated[column], color='orange', label='Untreated',
                 bins=n_bins, binrange=shared_range, stat='percent', ax=ax)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Percentage')
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Box plots of the baseline covariates split by treatment arm.
# The 0/1 indicator is mapped to display labels for the x-axis; any other
# value is left as it is.
df['treatment_label'] = df['treatment'].replace({0: 'No Treatment', 1: 'Treatment'})

# (column, panel title, y-axis label)
boxplot_panels = [
    # Age Boxplot
    ('ageAtDiagnosis', 'Age by Treatment Status', 'Age at Diagnosis'),
    # WHO Performance Status Boxplot
    ('whoStatusPreTreatmentStart', 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    # CCI Boxplot
    ('cci', 'CCI by Treatment Status', 'Charlson Comorbidity Index'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (column, title, ylabel) in zip(axes, boxplot_panels):
    sns.boxplot(x='treatment_label', y=column, data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Treatment')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()