### Import Essential Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Notebook-wide setup: silence all warnings and give every figure the same
# grid style and default size.
warnings.simplefilter('ignore')
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

### Load and Initial Data Inspection

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset/complete_diabetes_dataset.csv')

# Report the table dimensions, then preview the leading rows; the trailing
# expression is what the notebook renders as the cell output.
print(f"Dataset Shape: {df.shape}")
print("\nFirst Few Records:")
df.head(5)

### Dataset Overview

In [None]:
# Schema, per-column missing-value counts and summary statistics, separated
# by a horizontal rule for readability.
section_rule = "\n" + "="*60

print("Column Names and Data Types:")
print(df.dtypes)
print(section_rule)

print("\nMissing Values:")
print(df.isna().sum())
print(section_rule)

print("\nBasic Statistics:")
df.describe()

### Treatment Distribution

In [None]:
# Number of patients per treatment, most frequent first.
treatment_counts = df['treatment'].value_counts()

heading_font = {'fontsize': 14, 'fontweight': 'bold'}
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: absolute counts as a bar chart.
treatment_counts.plot.bar(ax=ax1, color='steelblue')
ax1.set_title('Treatment Distribution', **heading_font)
ax1.set_xlabel('Treatment')
ax1.set_ylabel('Number of Patients')
ax1.tick_params(axis='x', rotation=45)

# Right panel: the same counts as shares of the whole.
ax2.pie(
    treatment_counts,
    labels=treatment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax2.set_title('Treatment Distribution (Percentage)', **heading_font)

fig.tight_layout()
plt.show()

print("\nTreatment Distribution:")
print(treatment_counts)

### Outcome Analysis

In [None]:
# Outcome category frequencies plus the overall binary success rate (in %).
outcome_dist = df['outcome_category'].value_counts()
n_success = df['success'].sum()
n_failure = len(df) - n_success
success_rate = (n_success / len(df)) * 100

heading_font = {'fontsize': 14, 'fontweight': 'bold'}
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
category_ax, binary_ax = axes[0], axes[1]

# NOTE(review): value_counts() orders categories by frequency, so the
# green/orange/red colours follow that order rather than the category
# meaning -- confirm the colours line up with the intended categories.
outcome_dist.plot.bar(ax=category_ax, color=['green', 'orange', 'red'])
category_ax.set_title('Outcome Distribution', **heading_font)
category_ax.set_xlabel('Outcome Category')
category_ax.set_ylabel('Count')
category_ax.tick_params(axis='x', rotation=0)

binary_ax.bar(['Success', 'Failure'], [n_success, n_failure],
              color=['green', 'red'])
binary_ax.set_title('Binary Success Rate', **heading_font)
binary_ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

print(f"\nOverall Success Rate: {success_rate:.2f}%")
print("\nOutcome Distribution:")
print(outcome_dist)

### Success Rate by Treatment

In [None]:
# Per-treatment performance summary: success rate, mean HbA1c reduction and
# the number of non-null patient_id values on each treatment.
treatment_success = df.groupby('treatment').agg({
    'success': 'mean',
    'hba1c_reduction': 'mean',
    'patient_id': 'count'
})

treatment_success.columns = ['Success Rate', 'Avg HbA1c Reduction', 'Patient Count']
# Convert the success fraction to a percentage BEFORE rounding. Rounding the
# fraction to 3 decimals first would leave only one meaningful decimal in
# the percentage and make the 2-decimal rounding a no-op.
treatment_success['Success Rate'] = (treatment_success['Success Rate'] * 100).round(2)
treatment_success['Avg HbA1c Reduction'] = treatment_success['Avg HbA1c Reduction'].round(3)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: success rate per treatment against the overall success rate
# (`success_rate` is computed in the Outcome Analysis cell).
treatment_success['Success Rate'].plot(kind='bar', ax=ax1, color='teal')
ax1.set_title('Success Rate by Treatment', fontsize=14, fontweight='bold')
ax1.set_xlabel('Treatment')
ax1.set_ylabel('Success Rate (%)')
ax1.tick_params(axis='x', rotation=45)
ax1.axhline(y=success_rate, color='r', linestyle='--', label='Overall Average')
ax1.legend()

# Right panel: mean HbA1c reduction per treatment.
treatment_success['Avg HbA1c Reduction'].plot(kind='bar', ax=ax2, color='coral')
ax2.set_title('Average HbA1c Reduction by Treatment', fontsize=14, fontweight='bold')
ax2.set_xlabel('Treatment')
ax2.set_ylabel('HbA1c Reduction (%)')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nTreatment Performance:")
print(treatment_success)

### Patient Demographic and Clinical Feature Distributions

In [None]:
# One histogram per numeric patient feature, laid out on a 3x3 grid.
# Each entry is (column, panel title, x-axis label, bar colour); the list
# order is the panel order (row by row) and the column order of the summary
# table printed below.
histogram_specs = [
    ('age', 'Age Distribution', 'Age', 'skyblue'),
    ('bmi', 'BMI Distribution', 'BMI', 'lightcoral'),
    ('hba1c_baseline', 'Baseline HbA1c Distribution', 'HbA1c (%)', 'lightgreen'),
    # Mathtext superscript keeps the unit label ASCII-only; the original
    # label contained a mis-encoded superscript-two character.
    ('egfr', 'eGFR Distribution', 'eGFR (mL/min/1.73m$^2$)', 'plum'),
    ('diabetes_duration', 'Diabetes Duration Distribution', 'Duration (years)', 'gold'),
    ('fasting_glucose', 'Fasting Glucose Distribution', 'Fasting Glucose (mg/dL)', 'salmon'),
    ('c_peptide', 'C-Peptide Distribution', 'C-Peptide (ng/mL)', 'lightblue'),
    ('bp_diastolic', 'Diastolic BP Distribution', 'BP Diastolic (mmHg)', 'peachpuff'),
    ('alt', 'ALT Distribution', 'ALT (U/L)', 'thistle'),
]

fig, axes = plt.subplots(3, 3, figsize=(16, 14))

for ax, (column, title, xlabel, color) in zip(axes.flat, histogram_specs):
    # Drop missing values first: a NaN makes the automatic bin range
    # non-finite and the histogram call fail.
    ax.hist(df[column].dropna(), bins=30, color=color, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("\nDemographic Summary:")
print(df[[column for column, _, _, _ in histogram_specs]].describe())


### Comorbidity Prevalence

In [None]:
# Prevalence of each comorbidity flag: patient counts and share of the
# cohort, most common first.
comorbidities = ['cvd', 'ckd', 'nafld', 'hypertension', 'retinopathy']
comorbidity_prevalence = df[comorbidities].sum().sort_values(ascending=False)
comorbidity_pct = (comorbidity_prevalence / len(df) * 100).round(2)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

comorbidity_pct.plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Comorbidity Prevalence', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Percentage of Patients (%)')
axes[0].set_ylabel('Comorbidity')

# Count the two flag values explicitly so the bars always come out in
# No/Yes order. value_counts() orders by frequency, which would swap the
# bars under the fixed labels whenever "Yes" is the majority, and would
# yield a single bar if only one value occurs.
# Assumes the flag is coded 0 = no, 1 = yes (the sum below relies on the
# same coding) -- TODO confirm against the dataset.
prediabetes_flag = df['previous_prediabetes']
prediabetes_counts = pd.Series({
    0: int((prediabetes_flag == 0).sum()),
    1: int((prediabetes_flag == 1).sum()),
})
axes[1].bar(['No', 'Yes'], prediabetes_counts.values, color=['steelblue', 'coral'], edgecolor='black')
axes[1].set_title('Previous Prediabetes History', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Number of Patients')
total = prediabetes_counts.sum()
# Place the percentage labels just above the bars, scaled to the data
# rather than offset by a fixed number of patients.
label_offset = 0.01 * prediabetes_counts.max()
for i, v in enumerate(prediabetes_counts.values):
    share = v / total * 100 if total else 0.0
    axes[1].text(i, v + label_offset, f'{share:.1f}%', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nComorbidity Prevalence:")
for comorbidity, pct in comorbidity_pct.items():
    count = comorbidity_prevalence[comorbidity]
    print(f"{comorbidity.upper()}: {count} patients ({pct}%)")

prediabetes_pct = (df['previous_prediabetes'].sum() / len(df)) * 100
print(f"\nPREVIOUS_PREDIABETES: {df['previous_prediabetes'].sum()} patients ({prediabetes_pct:.2f}%)")


### Correlation Analysis

In [None]:
# Pairwise correlations between the numeric clinical features and the two
# outcome measures (HbA1c reduction and binary success).
numeric_cols = ['age', 'bmi', 'hba1c_baseline', 'diabetes_duration', 'fasting_glucose',
                'c_peptide', 'egfr', 'bp_systolic', 'bp_diastolic', 'alt',
                'ldl', 'hdl', 'triglycerides', 'hba1c_reduction', 'success']

correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Correlation Matrix of Clinical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Rank by absolute correlation so that strong negative relationships are
# not pushed out of the "top" list, and drop the target's self-correlation
# by label instead of assuming it sorts into first position.
print("\nTop Correlations with HbA1c Reduction:")
hba1c_corr = (correlation_matrix['hba1c_reduction']
              .drop('hba1c_reduction')
              .sort_values(key=abs, ascending=False))
print(hba1c_corr.head(10))

print("\nTop Correlations with Treatment Success:")
success_corr = (correlation_matrix['success']
                .drop('success')
                .sort_values(key=abs, ascending=False))
print(success_corr.head(10))

### Treatment Performance by Age Group

In [None]:
# Success rate (%) for every treatment / age-group pair: treatments as rows,
# age groups as columns.
mean_success_by_age = df.groupby(['treatment', 'age_group'])['success'].mean()
subgroup_analysis = mean_success_by_age.unstack() * 100

fig, ax = plt.subplots(figsize=(12, 6))
subgroup_analysis.plot.bar(ax=ax)
ax.set_title('Success Rate by Treatment and Age Group', fontsize=14, fontweight='bold')
ax.set_xlabel('Treatment')
ax.set_ylabel('Success Rate (%)')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print("\nSuccess Rate by Treatment and Age Group (%):")
print(subgroup_analysis.round(2))

### Treatment Performance by BMI Category

In [None]:
# Success rate (%) for every treatment / BMI-category pair: treatments as
# rows, BMI categories as columns.
mean_success_by_bmi = df.groupby(['treatment', 'bmi_category'])['success'].mean()
bmi_analysis = mean_success_by_bmi.unstack() * 100

fig, ax = plt.subplots(figsize=(12, 6))
bmi_analysis.plot.bar(ax=ax)
ax.set_title('Success Rate by Treatment and BMI Category', fontsize=14, fontweight='bold')
ax.set_xlabel('Treatment')
ax.set_ylabel('Success Rate (%)')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='BMI Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print("\nSuccess Rate by Treatment and BMI Category (%):")
print(bmi_analysis.round(2))

### Treatment Performance by Comorbidities

In [None]:
# One panel per comorbidity: success rate (%) per treatment, split by
# whether the comorbidity flag is set.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

# axes.flat walks the grid row by row, pairing each flag with its panel.
for ax, comorbidity in zip(axes.flat, ['cvd', 'ckd', 'nafld', 'hypertension']):
    flag_name = comorbidity.upper()

    mean_success = df.groupby(['treatment', comorbidity])['success'].mean()
    comorbidity_treatment = mean_success.unstack() * 100
    comorbidity_treatment.plot.bar(ax=ax)

    ax.set_title(f'Success Rate by Treatment and {flag_name}', fontweight='bold')
    ax.set_xlabel('Treatment')
    ax.set_ylabel('Success Rate (%)')
    # Labels are positional: they rely on the unstacked columns being in
    # "flag absent", "flag present" order.
    ax.legend(['No ' + flag_name, 'Has ' + flag_name])
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

### HbA1c Reduction Distribution by Treatment

In [None]:
# Box plot of HbA1c reduction per treatment, with treatments ordered from
# the highest to the lowest median reduction.
fig, ax = plt.subplots(figsize=(12, 6))

median_reduction = df.groupby('treatment')['hba1c_reduction'].median()
treatment_order = median_reduction.sort_values(ascending=False).index

# Work on a copy whose treatment column is an ordered categorical, so the
# plot follows the median-based order and `df` itself is left untouched.
df_sorted = df.assign(
    treatment=pd.Categorical(df['treatment'], categories=treatment_order, ordered=True)
)

sns.boxplot(data=df_sorted, x='treatment', y='hba1c_reduction', ax=ax)
ax.set_title('HbA1c Reduction Distribution by Treatment', fontsize=14, fontweight='bold')
ax.set_xlabel('Treatment')
ax.set_ylabel('HbA1c Reduction (%)')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print("\nHbA1c Reduction Statistics by Treatment:")
print(df.groupby('treatment')['hba1c_reduction'].describe().round(2))

## Key Questions

### Which treatments work best for specific patient profiles?

In [None]:
# For a handful of clinically motivated patient profiles, report the
# treatment with the highest observed success rate within that profile.
print("ANALYSIS: Optimal Treatment by Patient Context")
print("="*60)

# Each entry pairs a human-readable label with a boolean row mask on `df`.
contexts = [
    ("Young patients (age < 55) with low BMI (< 28)",
     (df['age'] < 55) & (df['bmi'] < 28)),
    ("Obese patients (BMI > 35)",
     df['bmi'] > 35),
    ("Patients with CVD",
     df['cvd'] == 1),
    ("Elderly patients (age > 70) with CKD",
     (df['age'] > 70) & (df['ckd'] == 1)),
    ("Severe diabetes (HbA1c > 11)",
     df['hba1c_baseline'] > 11)
]

# A treatment given to only one or two patients in a subgroup can show a
# 0% or 100% success rate by chance, so only treatments with enough patients
# are ranked. The threshold matches the one used for profile/treatment
# combinations in the patient-similarity analysis.
min_patients_per_treatment = 10

for context_name, condition in contexts:
    subset = df[condition]
    if subset.empty:
        continue

    per_treatment = subset.groupby('treatment').agg({
        'success': 'mean',
        'hba1c_reduction': 'mean',
        'patient_id': 'count'
    })
    eligible = per_treatment[per_treatment['patient_id'] >= min_patients_per_treatment]

    print(f"\n{context_name}:")
    print(f"Sample size: {len(subset)} patients")

    if eligible.empty:
        print(f"No treatment has at least {min_patients_per_treatment} patients in this group")
        continue

    # Break success-rate ties with the larger mean HbA1c reduction.
    best_treatment = eligible.sort_values(['success', 'hba1c_reduction'], ascending=False)
    top = best_treatment.iloc[0]

    print(f"Best treatment: {best_treatment.index[0]}")
    print(f"  Success rate: {top['success']*100:.1f}%")
    print(f"  Avg reduction: {top['hba1c_reduction']:.2f}%")
    print(f"  Patients on this treatment: {int(top['patient_id'])}")

### Are certain treatments biased toward specific demographics?

In [None]:
# Check whether the demographic make-up differs between treatments, using a
# chi-square test of independence on each demographic x treatment table.
print("ANALYSIS: Treatment Fairness Across Demographics")
print("="*60)

demographics = ['gender', 'ethnicity', 'age_group']

for demo in demographics:
    # Build the contingency table once and derive both the percentage view
    # and the test from it.
    counts = pd.crosstab(df[demo], df['treatment'])

    print(f"\n{demo.upper()} Distribution Across Treatments:")
    # Column-wise percentages: each treatment's patients sum to 100%.
    demo_treatment = counts.div(counts.sum(axis=0), axis='columns') * 100
    print(demo_treatment.round(2))

    chi2, p_value, dof, expected = stats.chi2_contingency(counts)
    print(f"Chi-square statistic: {chi2:.2f} (dof={dof})")
    print(f"Chi-square test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Distribution is statistically different across treatments")
    else:
        # A non-significant result is absence of evidence for a difference,
        # not proof that the groups are balanced.
        print("No statistically significant difference detected across treatments")

### What patient features most predict treatment success?

In [None]:
# Rank patient features by the strength (absolute value) of their linear
# correlation with the binary success flag, overall and per treatment.
print("ANALYSIS: Feature Importance for Treatment Success")
print("="*60)

feature_cols = ['age', 'bmi', 'hba1c_baseline', 'diabetes_duration', 'fasting_glucose',
                'c_peptide', 'egfr', 'bp_systolic', 'bp_diastolic', 'alt',
                'ldl', 'cvd', 'ckd', 'nafld', 'hypertension', 'previous_prediabetes']
columns_with_target = feature_cols + ['success']

# Correlation of every feature with success, excluding success itself.
overall_corr = df[columns_with_target].corr()['success'].drop('success')
success_correlations = overall_corr.sort_values(key=abs, ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
success_correlations.plot.barh(ax=ax, color='teal')
ax.set_title('Feature Correlation with Treatment Success', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Feature')
# Zero line separates positive from negative correlations.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
fig.tight_layout()
plt.show()

print("\nFeature Correlation with Success:")
print(success_correlations)

# Repeat the ranking within each treatment and show its three strongest features.
for treatment in df['treatment'].unique():
    treatment_df = df[df['treatment'] == treatment]
    treatment_corr = treatment_df[columns_with_target].corr()['success'].drop('success')
    strongest = treatment_corr.sort_values(key=abs, ascending=False)
    print(f"\n{treatment} - Top 3 success predictors:")
    print(strongest.head(3))


### How much does patient similarity matter for outcomes?

In [None]:
print("ANALYSIS: Impact of Patient Similarity on Outcomes")
print("="*60)

# Success statistics for every (age group, BMI category, HbA1c severity,
# treatment) combination, keeping combinations with at least 10 patients
# and ordering them from the highest to the lowest success rate.
profile_keys = ['age_group', 'bmi_category', 'hba1c_severity', 'treatment']

categorical_matches = df.groupby(profile_keys).agg(
    Success_Rate=('success', 'mean'),
    Patient_Count=('success', 'count'),
    Avg_Reduction=('hba1c_reduction', 'mean'),
).round(3)

has_enough_patients = categorical_matches['Patient_Count'] >= 10
categorical_matches = categorical_matches[has_enough_patients]
categorical_matches = categorical_matches.sort_values('Success_Rate', ascending=False)

print("\nTop 10 Patient Profile-Treatment Combinations:")
print(categorical_matches.head(10))

print("\nBottom 10 Patient Profile-Treatment Combinations:")
print(categorical_matches.tail(10))

# Spread (standard deviation) of the success rate across patient profiles,
# computed per treatment -- the last level of the index.
variance_by_group = categorical_matches.groupby(level='treatment')['Success_Rate'].std()
print("\nSuccess Rate Variability by Treatment:")
print(variance_by_group.sort_values(ascending=False))

### Clinical Marker Analysis by Treatment Success

In [None]:
# Bucket three continuous clinical markers into ordered categories (added to
# `df` as new columns), then compare success rates across those categories
# and across the previous-prediabetes flag.
#
# Each entry is (source column, new category column, bin edges, labels).
# The top edge is open-ended and the lowest edge is inclusive, so values of
# exactly 0 (e.g. a duration of 0 years) and values above the previous fixed
# upper bounds are categorised instead of silently becoming NaN.
marker_bins = [
    ('c_peptide', 'c_peptide_category',
     [0, 0.8, 1.5, np.inf], ['Low', 'Medium', 'High']),
    ('diabetes_duration', 'duration_category',
     [0, 3, 7, np.inf], ['Short', 'Medium', 'Long']),
    ('fasting_glucose', 'glucose_category',
     [0, 126, 180, np.inf], ['Controlled', 'Elevated', 'High']),
]
for source_col, category_col, bin_edges, bin_labels in marker_bins:
    df[category_col] = pd.cut(df[source_col], bins=bin_edges,
                              labels=bin_labels, include_lowest=True)

# Mean success and group size per category, computed once and reused for
# both the plots and the printed tables. observed=False is passed
# explicitly so empty categories still appear, regardless of the installed
# pandas version's default.
marker_stats = {
    group_col: df.groupby(group_col, observed=False)['success'].agg(['mean', 'count'])
    for group_col in ('c_peptide_category', 'duration_category',
                      'glucose_category', 'previous_prediabetes')
}

success_by_cpeptide = marker_stats['c_peptide_category']['mean'] * 100
success_by_duration = marker_stats['duration_category']['mean'] * 100
success_by_glucose = marker_stats['glucose_category']['mean'] * 100
success_by_prediabetes = marker_stats['previous_prediabetes']['mean'] * 100

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (series to plot, target axes, panel title, bar colour)
panels = [
    (success_by_cpeptide, axes[0, 0], 'Success Rate by C-Peptide Level', 'steelblue'),
    (success_by_duration, axes[0, 1], 'Success Rate by Diabetes Duration', 'coral'),
    (success_by_glucose, axes[1, 0], 'Success Rate by Fasting Glucose', 'lightgreen'),
    (success_by_prediabetes, axes[1, 1], 'Success Rate by Previous Prediabetes', 'plum'),
]
for series, ax, title, color in panels:
    series.plot(kind='bar', ax=ax, color=color, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Success Rate (%)')
    ax.tick_params(axis='x', rotation=0)

# Replace the raw flag values on the prediabetes panel with readable labels.
# Assumes both flag values are present and sort as no, yes -- TODO confirm.
axes[1, 1].set_xticks([0, 1])
axes[1, 1].set_xticklabels(['No', 'Yes'], rotation=0)

plt.tight_layout()
plt.show()

print("\nSuccess Rate by C-Peptide Level:")
print(marker_stats['c_peptide_category'].round(3))

print("\nSuccess Rate by Diabetes Duration:")
print(marker_stats['duration_category'].round(3))

print("\nSuccess Rate by Fasting Glucose:")
print(marker_stats['glucose_category'].round(3))

print("\nSuccess Rate by Previous Prediabetes:")
print(marker_stats['previous_prediabetes'].round(3))


### Treatment Performance by Key Clinical Markers

In [None]:
# Success rate (%) per treatment, broken down by each clinical-marker
# category and by prediabetes history; one panel per breakdown. Relies on
# the *_category columns that the clinical-marker cell adds to `df`.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

cpeptide_treatment = df.groupby(['treatment', 'c_peptide_category'])['success'].mean().unstack() * 100
duration_treatment = df.groupby(['treatment', 'duration_category'])['success'].mean().unstack() * 100
glucose_treatment = df.groupby(['treatment', 'glucose_category'])['success'].mean().unstack() * 100
prediabetes_treatment = df.groupby(['treatment', 'previous_prediabetes'])['success'].mean().unstack() * 100

# (axes, table, panel title, positional legend args, keyword legend args)
panels = [
    (axes[0, 0], cpeptide_treatment,
     'Success Rate by Treatment and C-Peptide Level', (), {'title': 'C-Peptide'}),
    (axes[0, 1], duration_treatment,
     'Success Rate by Treatment and Duration', (), {'title': 'Duration'}),
    (axes[1, 0], glucose_treatment,
     'Success Rate by Treatment and Fasting Glucose', (), {'title': 'Glucose'}),
    # The prediabetes panel overrides the legend entries with readable labels.
    (axes[1, 1], prediabetes_treatment,
     'Success Rate by Treatment and Prediabetes History',
     (['No Prediabetes', 'Had Prediabetes'],), {}),
]

for ax, table, title, legend_args, legend_kwargs in panels:
    table.plot.bar(ax=ax)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Treatment')
    ax.set_ylabel('Success Rate (%)')
    ax.legend(*legend_args, **legend_kwargs)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()