In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pymysql
import seaborn as sns
from scipy.stats import kruskal

In [None]:
# Load treatment rows joined with their episode and (optional) PFS measure.
# Connection credentials are read from the MySQL option file, so none are
# hardcoded in the notebook.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization'  # `db=` is a deprecated alias of `database=`
)

# NOTE: pfsMeasure is LEFT JOINed, so a treatment row appears once per matching
# pfsMeasure row (and once with type = NULL when the episode has none).
query = """  
SELECT 
    dt.*, 
    e.tumorIncidenceYear,
    e.hadProgressionEvent,
    pm.type
FROM 
    diagnosisTreatments dt
JOIN
    episode e 
    ON e.id = dt.episodeId
LEFT JOIN
    pfsMeasure pm 
    ON e.id = pm.episodeId;


"""

# Close the connection even when the query fails, so a failed run does not
# leave a dangling database session behind in the kernel.
try:
    df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

df.head()

# Assumption 5

In [None]:
# Compare the PFS distribution across tumor incidence years
# (Kruskal-Wallis test plus a boxplot per year).

# The pfsMeasure LEFT JOIN in the load query can repeat a treatment row once per
# matching measure; ignoring `type` and dropping exact duplicates keeps those
# repeats from inflating the per-year samples.
# NOTE(review): assumes rows that are identical apart from `type` are the same
# treatment record - confirm against the diagnosisTreatments schema.
pfs_rows = (
    df.drop(columns='type')
    .drop_duplicates()
    # Rows without a year or without a PFS value cannot contribute to the test.
    .dropna(subset=['tumorIncidenceYear', 'systemicTreatmentPlanPfs'])
)

# Median PFS per incidence year, shown alongside the test for context.
pfs_by_year = pfs_rows.groupby('tumorIncidenceYear')['systemicTreatmentPlanPfs'].median()
print("Median PFS by tumor incidence year:")
print(pfs_by_year)

# One sample per year. Grouping the cleaned rows guarantees that no sample is
# empty, which would otherwise make the test fail or return NaN.
pfs_data = [
    sample.to_numpy()
    for _, sample in pfs_rows.groupby('tumorIncidenceYear')['systemicTreatmentPlanPfs']
]

# Kruskal-Wallis needs at least two groups to compare.
if len(pfs_data) >= 2:
    kruskal_test = kruskal(*pfs_data)
    print("Kruskal-Wallis test results:", kruskal_test)
else:
    kruskal_test = None
    print("Kruskal-Wallis test skipped: fewer than two incidence years have PFS data.")

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='tumorIncidenceYear', y='systemicTreatmentPlanPfs', data=pfs_rows, ax=ax)
ax.set_title("PFS Distribution by Start Year")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Assumption 6

In [None]:
# Percentage of censored PFS measures within each systemic treatment plan.

# Count rows per (treatment plan, PFS measure type); rows whose `type` is
# missing are not counted under any column.
event_counts = df.pivot_table(index='systemicTreatmentPlan',
                              columns='type',
                              values='episodeId',
                              aggfunc='count',
                              fill_value=0)

# If one outcome never occurs in the data the pivot has no column for it;
# add it as all-zero so the percentage below does not raise a KeyError.
for outcome in ('CENSOR', 'PROGRESSION'):
    if outcome not in event_counts.columns:
        event_counts[outcome] = 0

# Plans with neither outcome yield 0 / 0, which pandas reports as NaN.
total_events = event_counts['PROGRESSION'] + event_counts['CENSOR']
event_counts['censored_percentage'] = event_counts['CENSOR'] / total_events * 100
print(event_counts[['censored_percentage']])

fig, ax = plt.subplots(figsize=(10, 6))
event_counts['censored_percentage'].plot(kind='bar', ax=ax)

ax.set_title("Censoring Percentage by Treatment Group")
ax.set_ylabel("Censored Percentage")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()

plt.show()