In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
import seaborn as sns


In [None]:
# Location of the source extract; the file is parsed as semicolon-delimited.
CRC_CSV_PATH = '/data/patient_like_me/ncr/K2400223.csv'

df_crc = pd.read_csv(CRC_CSV_PATH, sep=';')

# Show the first rows as the cell output to sanity-check the parse.
df_crc.head()

In [None]:
# Columns inspected to decide whether a patient received any treatment.
# A row counts as "no treatment" only when every one of these columns is
# 0 or missing (missing values are treated as 0).
treatment_flag_cols = [
    'tumgericht_ther',
    'deelname_studie',
    'chir',
    'rt',
    'chemort',
    'chemo',
    'target',
    'hipec',
]

no_treatment_conditions = (
    df_crc[treatment_flag_cols]
    .fillna(0)
    .eq(0)
    .all(axis=1)
)

# Binary indicator: 0 when all flags above are 0/missing, 1 otherwise.
df_crc['treatment'] = (~no_treatment_conditions).astype('int64')

# Cell output: number of patients in each arm.
df_crc['treatment'].value_counts()

In [None]:
# Build the progression-free-survival (PFS) analysis frame and print baseline
# characteristics per treatment arm. Produces `df`, `treated` and `untreated`
# for the Kaplan-Meier cell that follows.
df = df_crc[['treatment', 'pfs_int1', 'pfs_event1', 'perf_stat', 'leeft', 'cci']].copy()

# Coerce BEFORE dropping: errors='coerce' turns unparseable entries into NaN,
# so running dropna first would let those NaNs slip through into the
# survival fit.
for col in ['pfs_int1', 'pfs_event1']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna(subset=['pfs_int1', 'pfs_event1'])

# Exclude perf_stat == 9 (presumably an "unknown" code -- confirm against the
# data dictionary). Rows with missing perf_stat are kept and imputed below.
# .copy() makes the filtered frame independent so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[df['perf_stat'] != 9].copy()

# Median-impute the remaining gaps in the baseline covariates.
for col in ['perf_stat', 'leeft', 'cci']:
    df[col] = df[col].fillna(df[col].median())

treated = df[df['treatment'] == 1]
untreated = df[df['treatment'] == 0]

print("Treated patients - Baseline characteristics:")
print(treated[['leeft', 'perf_stat', 'cci']].describe())


print("\n Untreated patients - Baseline characteristics:")
print(untreated[['leeft', 'perf_stat', 'cci']].describe())


In [None]:
# Kaplan-Meier PFS curves for both arms, drawn on one shared Axes.
# Relies on `treated` / `untreated` prepared in the preceding cell.
fig, ax = plt.subplots(figsize=(10, 6))

kmf = KaplanMeierFitter()
for arm_label, arm_df in (('Treated', treated), ('Untreated', untreated)):
    # The same fitter is re-fitted per arm; each fit is plotted before the
    # next one overwrites it.
    kmf.fit(arm_df['pfs_int1'], event_observed=arm_df['pfs_event1'], label=arm_label)
    kmf.plot_survival_function(ax=ax)

# NOTE(review): the label says "%" but the curve is presumably drawn on a
# 0-1 probability scale -- confirm and adjust the label or the axis.
ax.set_title('PFS for Treated vs Untreated Patients')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Progression-Free Survival %')
plt.show()

In [None]:
# Overall-survival analysis: build the frame, split by arm, and plot
# Kaplan-Meier curves for both arms on one Axes.
df = df_crc[['treatment', 'vit_stat', 'vit_stat_int']].copy()

# Coerce BEFORE dropping: errors='coerce' turns unparseable entries into NaN,
# so running dropna first would leave those NaNs in the data handed to the
# fitter.
for col in ['vit_stat_int', 'vit_stat']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna(subset=['vit_stat_int', 'vit_stat'])

treated = df[df['treatment'] == 1]
untreated = df[df['treatment'] == 0]

kmf = KaplanMeierFitter()

fig, ax = plt.subplots(figsize=(10, 6))

for arm_label, arm_df in (('Treated', treated), ('Untreated', untreated)):
    # vit_stat == 1 is treated as the observed event; any other value is
    # handled as censored.
    kmf.fit(arm_df['vit_stat_int'], event_observed=(arm_df['vit_stat'] == 1), label=arm_label)
    kmf.plot_survival_function(ax=ax)

# NOTE(review): the label says "%" but the curve is presumably drawn on a
# 0-1 probability scale -- confirm and adjust the label or the axis.
ax.set_title('Overall Survival for Treated vs Untreated Patients')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Overall Survival %')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Baseline covariate distributions (age, performance status, Charlson
# Comorbidity Index) overlaid for treated vs untreated patients.
df = df_crc[['treatment', 'leeft', 'perf_stat', 'cci']].copy()

# Exclude perf_stat == 9 (presumably an "unknown" code -- confirm against the
# data dictionary). .copy() keeps the filtered frame independent so later
# cells can add columns to `df` without SettingWithCopyWarning.
df = df[df['perf_stat'] != 9].copy()

treated = df[df['treatment'] == 1]
untreated = df[df['treatment'] == 0]

# (legend label, colour, data) for each arm, drawn in this order.
arms = (
    ('Treated', 'blue', treated),
    ('Untreated', 'orange', untreated),
)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
ax_age, ax_perf, ax_cci = axes

# Plot the age (leeft) distribution.
# Both arms share one bin range so their bars use identical bin edges and
# the overlaid percentages are directly comparable.
age_range = (df['leeft'].min(), df['leeft'].max())
for arm_label, arm_color, arm_df in arms:
    sns.histplot(arm_df['leeft'], color=arm_color, label=arm_label, kde=True,
                 bins=20, binrange=age_range, stat='percent', ax=ax_age)
ax_age.set_title('Distribution of Age (leeft) by Treatment Status')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Percentage')
ax_age.legend()

# Plot the performance status (perf_stat) distribution.
# discrete=True centres unit-width bars on the integer values, so they line
# up with the 0-4 ticks set below (bins=5 over 0-4 gave 0.8-wide bins that
# did not).
for arm_label, arm_color, arm_df in arms:
    sns.histplot(arm_df['perf_stat'], color=arm_color, label=arm_label, kde=True,
                 discrete=True, stat='percent', ax=ax_perf)
ax_perf.set_title('Distribution of Performance Status (perf_stat) by Treatment Status')
ax_perf.set_xlabel('Performance Status')
ax_perf.set_xticks([0, 1, 2, 3, 4])
ax_perf.set_ylabel('Percentage')
ax_perf.legend()

# Plot the Charlson Comorbidity Index (cci) distribution, again with a
# shared bin range across arms.
cci_range = (df['cci'].min(), df['cci'].max())
for arm_label, arm_color, arm_df in arms:
    sns.histplot(arm_df['cci'], color=arm_color, label=arm_label, kde=True,
                 bins=10, binrange=cci_range, stat='percent', ax=ax_cci)
ax_cci.set_title('Distribution of Charlson Comorbidity Index (cci) by Treatment Status')
ax_cci.set_xlabel('Charlson Comorbidity Index')
ax_cci.set_ylabel('Percentage')
ax_cci.legend()

plt.tight_layout()
plt.show()


In [None]:

# Boxplots of the baseline covariates by treatment arm.
# Uses `df` from the preceding histogram cell (columns: treatment, leeft,
# perf_stat, cci).

# .assign returns a new frame instead of writing a column into `df`, which
# is a filtered slice from the earlier cell (direct assignment can raise
# SettingWithCopyWarning). Rebinding `df` keeps the column available under
# the same name.
df = df.assign(
    treatment_label=df['treatment'].replace({0: 'No Treatment', 1: 'Treatment'})
)

# (column, human-readable name) for each panel, left to right.
panels = (
    ('leeft', 'Age'),
    ('perf_stat', 'Performance Status'),
    ('cci', 'Charlson Comorbidity Index'),
)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, (col, pretty_name) in zip(axes, panels):
    sns.boxplot(x='treatment_label', y=col, data=df, ax=ax)
    ax.set_title(f'Boxplot of {pretty_name} ({col}) by Treatment Status')
    ax.set_xlabel('Treatment')
    ax.set_ylabel(pretty_name)

plt.tight_layout()
plt.show()
