In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kruskal
import pymysql

In [None]:
# Load every row of the `knownPalliativeTreatments` table into `km_df`.
# Connection settings come from the MySQL option file (group `RAnalysis`),
# so no credentials are written in the notebook itself.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis', 
    db = 'actin_personalization'
)

query = """  
SELECT 
    * 
FROM 
    knownPalliativeTreatments 
"""

# Close the connection even when the query raises, so a failed or interrupted
# run of this cell does not leave an open server connection behind.
try:
    km_df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

km_df.head()

# Assumption 5

In [None]:
# Compare the PFS distribution between tumour incidence years.
# Rows with incidence year 2022 are left out of every year-based comparison.
# NOTE(review): presumably because follow-up for 2022 is incomplete — confirm.
df = km_df[km_df['tumorIncidenceYear'] != 2022]

pfs_by_year = df.groupby('tumorIncidenceYear')['observedPfsDays'].median()

years = df['tumorIncidenceYear'].unique()
pfs_data = [df[df['tumorIncidenceYear'] == year]['observedPfsDays'].dropna() for year in years]
# A year can yield an empty sample (missing year value, or no observed PFS at
# all for that year); an empty sample makes the Kruskal-Wallis result NaN, so
# such groups are dropped, matching what the per-treatment cells do.
pfs_data = [sample for sample in pfs_data if len(sample) > 0]

# Kruskal-Wallis needs at least two groups to compare.
if len(pfs_data) > 1:
    kruskal_test = kruskal(*pfs_data)
    print("Kruskal-Wallis test results:", kruskal_test)
else:
    print("Kruskal-Wallis test skipped: fewer than two years with observed PFS values")

plt.figure(figsize=(10, 6))
sns.boxplot(x='tumorIncidenceYear', y='observedPfsDays', data=df)
plt.title("PFS Distribution by Start Year")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Compare the OS (from treatment start) distribution between tumour incidence
# years, using the same 2022-excluded frame `df` as the PFS comparison.
os_by_year = df.groupby('tumorIncidenceYear')['observedOsFromTreatmentStartDays'].median()

years = df['tumorIncidenceYear'].unique()
os_data = [df[df['tumorIncidenceYear'] == year]['observedOsFromTreatmentStartDays'].dropna() for year in years]
# A year can yield an empty sample (missing year value, or no observed OS at
# all for that year); an empty sample makes the Kruskal-Wallis result NaN, so
# such groups are dropped, matching what the per-treatment cells do.
os_data = [sample for sample in os_data if len(sample) > 0]

# Kruskal-Wallis needs at least two groups to compare.
if len(os_data) > 1:
    kruskal_test = kruskal(*os_data)
    print("Kruskal-Wallis test results:", kruskal_test)
else:
    print("Kruskal-Wallis test skipped: fewer than two years with observed OS values")

plt.figure(figsize=(10, 6))
sns.boxplot(x='tumorIncidenceYear', y='observedOsFromTreatmentStartDays', data=df)
plt.title("OS Distribution by Start Year")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Per-treatment check: for every systemic treatment plan, test whether PFS
# differs between tumour incidence years (Kruskal-Wallis) and draw one boxplot
# panel per tested plan. Results are collected in `pfs_kruskal_results` as
# [treatment, statistic, p-value] rows.
treatments = df['systemicTreatmentPlan'].dropna().unique()

num_cols = 4
# Ceiling division; at least one row so plt.subplots never receives nrows=0.
num_rows = max(1, (len(treatments) + num_cols - 1) // num_cols)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(18, num_rows * 4))
axes = axes.flatten()

pfs_kruskal_results = []
# Indices of the panels that actually received a boxplot.
plotted_panels = set()

for i, treatment in enumerate(treatments):
    treatment_data = df[df['systemicTreatmentPlan'] == treatment]
    pfs_data = [treatment_data[treatment_data['tumorIncidenceYear'] == year]['observedPfsDays'].dropna() for year in years]

    # Years without any observed PFS for this treatment cannot be tested.
    filtered_pfs_data = [data for data in pfs_data if len(data) > 0]

    # Kruskal-Wallis needs at least two non-empty year groups.
    if len(filtered_pfs_data) > 1:
        kruskal_test = kruskal(*filtered_pfs_data)
        pfs_kruskal_results.append([treatment, kruskal_test.statistic, kruskal_test.pvalue])

        # Plot the boxplot for each treatment in the respective subplot
        ax = axes[i]
        sns.boxplot(x='tumorIncidenceYear', y='observedPfsDays', data=treatment_data, ax=ax)
        ax.set_title(f"PFS Distribution by Year for {treatment}")
        # Pin the tick positions before relabelling so the rotated labels stay aligned.
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        plotted_panels.add(i)

# Remove every panel without a plot: the unused tail of the grid as well as
# the panels of treatments that were skipped above.
for j, panel in enumerate(axes):
    if j not in plotted_panels:
        fig.delaxes(panel)

# Lay out only after all titles and rotated labels exist, so the spacing
# accounts for them.
fig.tight_layout(pad=5.0)
plt.show()

In [None]:
# Tabulate the per-treatment PFS Kruskal-Wallis results, one row per tested
# treatment, and display the table.
pfs_kruskal_results_df = pd.DataFrame(
    data=pfs_kruskal_results,
    columns=['Treatment', 'Kruskal-Wallis Statistic', 'p-value'],
)

pfs_kruskal_results_df

In [None]:
# Per-treatment check: for every systemic treatment plan in `treatments`, test
# whether OS (from treatment start) differs between tumour incidence years
# (Kruskal-Wallis) and draw one boxplot panel per tested plan. Results are
# collected in `os_kruskal_results` as [treatment, statistic, p-value] rows.
num_cols = 4
# Ceiling division; at least one row so plt.subplots never receives nrows=0.
num_rows = max(1, (len(treatments) + num_cols - 1) // num_cols)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(18, num_rows * 4))
axes = axes.flatten()

os_kruskal_results = []
# Indices of the panels that actually received a boxplot.
plotted_panels = set()

for i, treatment in enumerate(treatments):
    treatment_data = df[df['systemicTreatmentPlan'] == treatment]
    os_data = [treatment_data[treatment_data['tumorIncidenceYear'] == year]['observedOsFromTreatmentStartDays'].dropna() for year in years]

    # Years without any observed OS for this treatment cannot be tested.
    filtered_os_data = [data for data in os_data if len(data) > 0]

    # Gate on the OS samples of *this* treatment; Kruskal-Wallis needs at
    # least two non-empty year groups.
    if len(filtered_os_data) > 1:
        kruskal_test = kruskal(*filtered_os_data)
        os_kruskal_results.append([treatment, kruskal_test.statistic, kruskal_test.pvalue])

        # Plot the boxplot for each treatment in the respective subplot
        ax = axes[i]
        sns.boxplot(x='tumorIncidenceYear', y='observedOsFromTreatmentStartDays', data=treatment_data, ax=ax)
        ax.set_title(f"OS Distribution by Year for {treatment}")
        # Pin the tick positions before relabelling so the rotated labels stay aligned.
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        plotted_panels.add(i)

# Remove every panel without a plot: the unused tail of the grid as well as
# the panels of treatments that were skipped above.
for j, panel in enumerate(axes):
    if j not in plotted_panels:
        fig.delaxes(panel)

# Lay out only after all titles and rotated labels exist, so the spacing
# accounts for them.
fig.tight_layout(pad=5.0)
plt.show()

In [None]:
# Tabulate the per-treatment OS Kruskal-Wallis results, one row per tested
# treatment, and display the table.
os_kruskal_results_df = pd.DataFrame(
    data=os_kruskal_results,
    columns=['Treatment', 'Kruskal-Wallis Statistic', 'p-value'],
)

os_kruskal_results_df

# Assumption 6

In [None]:
# Toggle read by the censoring cells below: when True, treatment plans with
# fewer than 50 episodes in total are dropped before the Kruskal-Wallis tests
# and the immunotherapy-only test is skipped.
exclude_small_sample_sizes = True

In [None]:
# Count episodes per treatment plan, split by the hadProgressionEvent flag.
# Column 0 is treated as "censored" and column 1 as "progression event" below.
pfs_event_counts = pd.pivot_table(
    km_df,
    index='systemicTreatmentPlan',
    columns='hadProgressionEvent',
    values='episodeId',
    aggfunc='count',
    fill_value=0,
)

# Pull the two count columns out once; all derived columns are built from them.
pfs_censored_counts = pfs_event_counts[0]
pfs_progression_counts = pfs_event_counts[1]

pfs_event_counts['censored_percentage'] = (
    pfs_censored_counts / (pfs_progression_counts + pfs_censored_counts) * 100
)

pfs_event_counts['total_events'] = pfs_progression_counts + pfs_censored_counts
pfs_event_counts['censor_events'] = pfs_censored_counts
pfs_event_counts['progression_events'] = pfs_progression_counts

# Bar chart of the share of censored episodes for every treatment plan.
plt.figure(figsize=(10, 6))
bars = pfs_event_counts['censored_percentage'].plot.bar()
bars.set_title("Censoring Percentage by Treatment Group (Using hadProgressionEvent)")
bars.set_ylabel("Censored Percentage")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

plt.show()

In [None]:
# Compare the PFS censoring percentage between treatment plans, separately for
# chemotherapy, immunotherapy and all plans combined.
#
# NOTE(review): `pfs_event_counts` has exactly one row per treatment plan, so
# every group handed to kruskal() below holds a single censored_percentage
# value. With one observation per group the statistic (barring ties) depends
# only on the number of groups, not on the percentages themselves. Consider a
# chi-squared test on the censor_events / progression_events counts instead.

# Only reset while the treatment plan is still the index, so re-running this
# cell does not keep inserting extra 'index' columns.
if pfs_event_counts.index.name == 'systemicTreatmentPlan':
    pfs_event_counts = pfs_event_counts.reset_index()

if exclude_small_sample_sizes:
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write to a slice (avoids SettingWithCopyWarning).
    pfs_event_counts = pfs_event_counts[pfs_event_counts['total_events'] >= 50].copy()

# Everything that is not one of the two listed immunotherapies is labelled chemotherapy.
pfs_event_counts['Treatment_Type'] = (
    pfs_event_counts['systemicTreatmentPlan']
    .isin(['PEMBROLIZUMAB', 'NIVOLUMAB'])
    .map({True: 'Immunotherapy', False: 'Chemotherapy'})
)

pfs_chemotherapy_data_filtered = pfs_event_counts[pfs_event_counts['Treatment_Type'] == 'Chemotherapy']
pfs_immunotherapy_data_filtered = pfs_event_counts[pfs_event_counts['Treatment_Type'] == 'Immunotherapy']
pfs_combined_data_filtered = pfs_event_counts

# For Chemotherapy group
pfs_chemotherapy_censoring_filtered = [
    pfs_chemotherapy_data_filtered.loc[
        pfs_chemotherapy_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
    ]
    for treatment in pfs_chemotherapy_data_filtered['systemicTreatmentPlan'].unique()
]
pfs_kruskal_chem_filtered = kruskal(*pfs_chemotherapy_censoring_filtered)
print(f"Kruskal-Wallis Test for Chemotherapy treatments (filtered): {pfs_kruskal_chem_filtered}")

# For Immunotherapy group (only run when small groups were not filtered out)
if not exclude_small_sample_sizes:
    pfs_immunotherapy_censoring_filtered = [
        pfs_immunotherapy_data_filtered.loc[
            pfs_immunotherapy_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
        ]
        for treatment in pfs_immunotherapy_data_filtered['systemicTreatmentPlan'].unique()
    ]
    pfs_kruskal_immu_filtered = kruskal(*pfs_immunotherapy_censoring_filtered)
    print(f"Kruskal-Wallis Test for Immunotherapy treatments (filtered): {pfs_kruskal_immu_filtered}")

# For combined Chemotherapy + Immunotherapy group
pfs_combined_censoring_filtered = [
    pfs_combined_data_filtered.loc[
        pfs_combined_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
    ]
    for treatment in pfs_combined_data_filtered['systemicTreatmentPlan'].unique()
]
pfs_kruskal_combined_filtered = kruskal(*pfs_combined_censoring_filtered)
print(f"Kruskal-Wallis Test for Combined treatments (filtered): {pfs_kruskal_combined_filtered}")


In [None]:
# Count episodes per treatment plan, split by the hadSurvivalEvent flag.
# Column 0 is treated as "censored" and column 1 as "event" below.
os_event_counts = pd.pivot_table(
    km_df,
    index='systemicTreatmentPlan',
    columns='hadSurvivalEvent',
    values='episodeId',
    aggfunc='count',
    fill_value=0,
)

# Pull the two count columns out once; all derived columns are built from them.
os_censored_counts = os_event_counts[0]
os_survival_event_counts = os_event_counts[1]

os_event_counts['censored_percentage'] = (
    os_censored_counts / (os_survival_event_counts + os_censored_counts) * 100
)

os_event_counts['total_events'] = os_survival_event_counts + os_censored_counts
os_event_counts['censor_events'] = os_censored_counts
# NOTE(review): this column holds the hadSurvivalEvent == 1 counts even though
# it is called 'progression_events'; the name is kept unchanged here.
os_event_counts['progression_events'] = os_survival_event_counts

# Bar chart of the share of censored episodes for every treatment plan.
plt.figure(figsize=(10, 6))
bars = os_event_counts['censored_percentage'].plot.bar()
bars.set_title("Censoring Percentage by Treatment Group (Using hadSurvivalEvent)")
bars.set_ylabel("Censored Percentage")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

plt.show()

In [None]:
# Compare the OS censoring percentage between treatment plans, separately for
# chemotherapy, immunotherapy and all plans combined.
#
# NOTE(review): `os_event_counts` has exactly one row per treatment plan, so
# every group handed to kruskal() below holds a single censored_percentage
# value. With one observation per group the statistic (barring ties) depends
# only on the number of groups, not on the percentages themselves. Consider a
# chi-squared test on the censored / event counts instead.

# Only reset while the treatment plan is still the index, so re-running this
# cell does not keep inserting extra 'index' columns.
if os_event_counts.index.name == 'systemicTreatmentPlan':
    os_event_counts = os_event_counts.reset_index()

if exclude_small_sample_sizes:
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write to a slice (avoids SettingWithCopyWarning).
    os_event_counts = os_event_counts[os_event_counts['total_events'] >= 50].copy()

# Everything that is not one of the two listed immunotherapies is labelled chemotherapy.
os_event_counts['Treatment_Type'] = (
    os_event_counts['systemicTreatmentPlan']
    .isin(['PEMBROLIZUMAB', 'NIVOLUMAB'])
    .map({True: 'Immunotherapy', False: 'Chemotherapy'})
)

os_chemotherapy_data_filtered = os_event_counts[os_event_counts['Treatment_Type'] == 'Chemotherapy']
os_immunotherapy_data_filtered = os_event_counts[os_event_counts['Treatment_Type'] == 'Immunotherapy']
os_combined_data_filtered = os_event_counts

# For Chemotherapy group
os_chemotherapy_censoring_filtered = [
    os_chemotherapy_data_filtered.loc[
        os_chemotherapy_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
    ]
    for treatment in os_chemotherapy_data_filtered['systemicTreatmentPlan'].unique()
]
os_kruskal_chem_filtered = kruskal(*os_chemotherapy_censoring_filtered)
print(f"Kruskal-Wallis Test for Chemotherapy treatments (filtered): {os_kruskal_chem_filtered}")

# For Immunotherapy group (only run when small groups were not filtered out)
if not exclude_small_sample_sizes:
    os_immunotherapy_censoring_filtered = [
        os_immunotherapy_data_filtered.loc[
            os_immunotherapy_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
        ]
        for treatment in os_immunotherapy_data_filtered['systemicTreatmentPlan'].unique()
    ]
    os_kruskal_immu_filtered = kruskal(*os_immunotherapy_censoring_filtered)
    print(f"Kruskal-Wallis Test for Immunotherapy treatments (filtered): {os_kruskal_immu_filtered}")

# For combined Chemotherapy + Immunotherapy group
os_combined_censoring_filtered = [
    os_combined_data_filtered.loc[
        os_combined_data_filtered['systemicTreatmentPlan'] == treatment, 'censored_percentage'
    ]
    for treatment in os_combined_data_filtered['systemicTreatmentPlan'].unique()
]
os_kruskal_combined_filtered = kruskal(*os_combined_censoring_filtered)
print(f"Kruskal-Wallis Test for Combined treatments (filtered): {os_kruskal_combined_filtered}")
