In [None]:
import pandas as pd 
import pymysql
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
import seaborn as sns

In [None]:
# Open the MySQL connection; host and credentials come from the option file
# group named below, so nothing secret is embedded in the notebook.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis', 
    db = 'actin_personalization'
)

# Survival outcomes and baseline characteristics for episodes whose four
# pre-/post-surgery systemic chemotherapy and targeted-therapy flags are all
# either 0 or NULL.
query = '''
SELECT
    p.isAlive,
    d.ageAtDiagnosis,
    d.cci,
    d.observedOverallSurvivalFromTumorIncidenceDays,
    e.systemicTreatmentPlan,
    e.observedPfsDays,
    e.whoStatusPreTreatmentStart,
    e.hadProgressionEvent
FROM
    episode e
JOIN
    diagnosisTreatments dt ON e.diagnosisId = dt.diagnosisId
JOIN
    diagnosis d ON dt.diagnosisId = d.id AND dt.patientId = d.patientId
JOIN
    patient p ON dt.patientId = p.id
WHERE
       (e.hasHadPreSurgerySystemicChemotherapy = 0 OR e.hasHadPreSurgerySystemicChemotherapy IS NULL)
    AND (e.hasHadPostSurgerySystemicChemotherapy = 0 OR e.hasHadPostSurgerySystemicChemotherapy IS NULL)
    AND (e.hasHadPreSurgerySystemicTargetedTherapy = 0 OR e.hasHadPreSurgerySystemicTargetedTherapy IS NULL)
    AND (e.hasHadPostSurgerySystemicTargetedTherapy = 0 OR e.hasHadPostSurgerySystemicTargetedTherapy IS NULL)
'''

try:
    df = pd.read_sql(query, db_connection)
finally:
    # Release the connection even when the query raises, so a failed run
    # does not leave an open session behind in the kernel.
    db_connection.close()

df.head()

In [None]:
# Progression-free survival: duration in days plus a 0/1 event indicator.
# Any value other than 1 (including missing) is coded 0, i.e. no progression event.
df['pfs_time'] = df['observedPfsDays']
df['pfs_event'] = df['hadProgressionEvent'].eq(1).astype(int)

# Overall survival: the event is death, i.e. isAlive is anything other than 1.
# A missing isAlive is kept as NaN instead of being counted as a death, so the
# dropna(subset=[..., 'os_event']) applied before fitting can exclude those rows.
# The column is float (0.0 / 1.0 / NaN) so that it can hold the missing marker.
df['os_time'] = df['observedOverallSurvivalFromTumorIncidenceDays']
df['os_event'] = df['isAlive'].ne(1).astype(float).where(df['isAlive'].notna())

# Treated (1) when a systemic treatment plan is present and is not the literal
# string 'None'; untreated (0) otherwise.
df['treatment'] = (
    df['systemicTreatmentPlan'].notna() & df['systemicTreatmentPlan'].ne('None')
).astype(int)

df['treatment'].value_counts()

In [None]:
# Rows usable for the overall-survival fit: both duration and event flag present.
df_os = df.dropna(subset=['os_time', 'os_event'])

# Split on the binary treatment indicator.
treated_os = df_os.loc[df_os['treatment'].eq(1)]
untreated_os = df_os.loc[df_os['treatment'].eq(0)]

kmf = KaplanMeierFitter()

# Draw both arms on one explicitly created axes; the fitter is re-fitted per arm.
fig, ax = plt.subplots(figsize=(10, 6))
for arm_label, arm_data in (('Treated', treated_os), ('Untreated', untreated_os)):
    kmf.fit(arm_data['os_time'], event_observed=arm_data['os_event'], label=arm_label)
    kmf.plot_survival_function(ax=ax)

ax.set_title('Overall Survival for Treated vs Untreated Patients')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Overall Survival Fraction')
ax.legend()
plt.show()

In [None]:
# One Kaplan-Meier panel (treated vs untreated) per WHO performance status.
# The statuses are taken from df_os, the frame that is actually plotted, so no
# panel is empty by construction; they are ordered with the missing status last.
who_status_values = (
    df_os['whoStatusPreTreatmentStart']
    .drop_duplicates()
    .sort_values(na_position='last')
    .to_numpy()
)

# Size the grid from the number of statuses instead of assuming at most six.
n_cols = 3
n_rows = max(1, (len(who_status_values) + n_cols - 1) // n_cols)

fig = plt.figure(figsize=(20, n_rows * 5))

for i, who_status in enumerate(who_status_values):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)

    # NaN never compares equal to anything, so the missing-status group has to
    # be selected with isna() rather than ==.
    if pd.isna(who_status):
        in_group = df_os['whoStatusPreTreatmentStart'].isna()
    else:
        in_group = df_os['whoStatusPreTreatmentStart'] == who_status
    group_data = df_os[in_group]

    treated_os = group_data[group_data['treatment'] == 1]
    untreated_os = group_data[group_data['treatment'] == 0]

    kmf = KaplanMeierFitter()

    # An arm with no patients cannot be fitted, so it is skipped.
    if not treated_os.empty:
        kmf.fit(treated_os['os_time'],
                event_observed=treated_os['os_event'], label='Treated')
        kmf.plot_survival_function(ax=ax)

    if not untreated_os.empty:
        kmf.fit(untreated_os['os_time'],
                event_observed=untreated_os['os_event'], label='Untreated')
        kmf.plot_survival_function(ax=ax)

    ax.set_title(f'Overall Survival for WHO Status {who_status}')
    ax.set_xlabel('Time (days)')
    ax.set_ylabel('Overall Survival Fraction')
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Fill missing baseline covariates with the median of the respective column,
# computed over the whole frame. This updates df in place.
for covariate in ('whoStatusPreTreatmentStart', 'ageAtDiagnosis', 'cci'):
    covariate_median = df[covariate].median()
    df[covariate] = df[covariate].fillna(covariate_median)

# Split on the binary treatment indicator (after imputation).
treated = df.loc[df['treatment'] == 1]
untreated = df.loc[df['treatment'] == 0]

baseline_columns = ['ageAtDiagnosis', 'whoStatusPreTreatmentStart', 'cci']

print("Treated patients - Baseline characteristics:")
print(treated[baseline_columns].describe())

print("\nUntreated patients - Baseline characteristics:")
print(untreated[baseline_columns].describe())

In [None]:
# Panels left to right: (column, number of bins, title, x-axis label).
histogram_panels = [
    ('ageAtDiagnosis', 20, 'Age Distribution by Treatment Status', 'Age at Diagnosis'),
    ('whoStatusPreTreatmentStart', 5, 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    ('cci', 10, 'CCI Distribution by Treatment Status', 'Charlson Comorbidity Index'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, (column, n_bins, title, x_label) in zip(axes, histogram_panels):
    # Each group is normalised to percent on its own, then overlaid on the same axes.
    sns.histplot(treated[column], color='blue', label='Treated', bins=n_bins, stat='percent', ax=ax)
    sns.histplot(untreated[column], color='orange', label='Untreated', bins=n_bins, stat='percent', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Percentage')
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Human-readable group label used on the x-axis of every boxplot.
treatment_names = {0: 'No Treatment', 1: 'Treatment'}
df['treatment_label'] = df['treatment'].replace(treatment_names)

# Panels left to right: (column, title, y-axis label).
boxplot_panels = [
    ('ageAtDiagnosis', 'Age by Treatment Status', 'Age at Diagnosis'),
    ('whoStatusPreTreatmentStart', 'WHO Performance Status by Treatment Status', 'WHO Performance Status'),
    ('cci', 'CCI by Treatment Status', 'Charlson Comorbidity Index'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, (column, title, y_label) in zip(axes, boxplot_panels):
    sns.boxplot(x='treatment_label', y=column, data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Treatment')
    ax.set_ylabel(y_label)

fig.tight_layout()
plt.show()