## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
import pickle
import json
import os
import warnings

# Notebook-wide display setup: silence all warnings, switch seaborn to its
# 'whitegrid' style, then set the default matplotlib figure size.
warnings.filterwarnings(action='ignore')
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## Load Data

In [None]:
# Read the raw patient table; later cells derive everything from `df`.
df = pd.read_csv('dataset/complete_diabetes_dataset.csv')

n_rows, n_cols = df.shape
print("Dataset loaded successfully")
print(f"Shape: {n_rows} patients, {n_cols} columns")
print("\nSample of raw data:")
# Trailing expression so the notebook renders the first rows.
df.head()

## Define Feature Lists

In [None]:
# Raw columns holding numeric measurements.
continuous_features = [
    'age',
    'hba1c_baseline', 'diabetes_duration', 'fasting_glucose', 'c_peptide',
    'egfr', 'bmi',
    'bp_systolic', 'bp_diastolic',
    'alt', 'ldl', 'hdl', 'triglycerides',
]

# Raw columns holding categories or yes/no flags.
binary_features = [
    'gender', 'ethnicity',
    'previous_prediabetes',
    'hypertension', 'ckd', 'cvd', 'nafld', 'retinopathy',
]

# Every raw input column, in the order used when building feature matrices.
feature_cols = [
    'age', 'gender', 'ethnicity',
    'hba1c_baseline', 'diabetes_duration', 'fasting_glucose', 'c_peptide',
    'egfr', 'bmi', 'bp_systolic', 'bp_diastolic',
    'alt', 'ldl', 'hdl', 'triglycerides',
    'previous_prediabetes', 'hypertension', 'ckd', 'cvd', 'nafld', 'retinopathy',
]

# One summary line per list, then the two target column names.
for label, columns in (
    ("Total features", feature_cols),
    ("Continuous features", continuous_features),
    ("Binary/Categorical features", binary_features),
):
    print(f"{label}: {len(columns)}")
print("\nTarget variables: treatment, hba1c_reduction")

## Feature Engineering - Create Derived Features

In [None]:
print("Creating engineered features...")
print("=" * 60)

# Work on a copy so the raw frame `df` is left untouched.
df_eng = df.copy()

# === EXISTING ENGINEERED FEATURES ===
# Rises as c_peptide drops below 2.0 and as diabetes_duration grows.
df_eng['insulin_deficiency_score'] = (2.0 - df_eng['c_peptide']) * (1 + df_eng['diabetes_duration'] / 15)

# c_peptide scaled down by the factor 1 / (1 + diabetes_duration / 10).
df_eng['beta_cell_reserve'] = df_eng['c_peptide'] * (1 / (1 + df_eng['diabetes_duration'] / 10))

# Product of fasting_glucose / 100 and baseline HbA1c.
df_eng['glucose_severity'] = (df_eng['fasting_glucose'] / 100) * df_eng['hba1c_baseline']

# Product of diabetes_duration and baseline HbA1c.
df_eng['disease_progression'] = df_eng['diabetes_duration'] * df_eng['hba1c_baseline']

# Sum of four offsets from fixed reference values (BMI 25, systolic 120,
# triglycerides 150, HDL 60), each divided by its own constant. The HDL term
# is reversed, so lower HDL raises the score.
df_eng['metabolic_syndrome_score'] = (
    (df_eng['bmi'] - 25) / 10 +
    (df_eng['bp_systolic'] - 120) / 40 +
    (df_eng['triglycerides'] - 150) / 100 +
    (60 - df_eng['hdl']) / 20
)

# cvd + hypertension + (age > 65); assumes cvd and hypertension are 0/1 flags.
df_eng['cv_risk_score'] = df_eng['cvd'] + df_eng['hypertension'] + (df_eng['age'] > 65).astype(int)

# Ordinal eGFR band: <=30 -> 3, (30, 60] -> 2, (60, 90] -> 1, >90 -> 0.
# The outer edges are open-ended so readings of exactly 0 or above 200 still
# fall into a band; with finite outer edges pd.cut yields NaN for them and the
# integer cast raises. A missing egfr value still raises here.
df_eng['kidney_severity'] = pd.cut(
    df_eng['egfr'],
    bins=[-np.inf, 30, 60, 90, np.inf],
    labels=[3, 2, 1, 0],
).astype(int)

# Sum of the five comorbidity columns (assumes each is a 0/1 flag).
df_eng['comorbidity_count'] = (
    df_eng['hypertension'] + df_eng['ckd'] + df_eng['cvd'] +
    df_eng['nafld'] + df_eng['retinopathy']
)

# === NEW: TREATMENT-EXCLUSIVE SCORE FEATURES ===
# Each score is a weighted sum of threshold tests and existing flag columns;
# the integer in front of a term is that term's weight.


def _flag(mask):
    """Convert a boolean Series to 0/1 integers so it can be weighted."""
    return mask.astype(int)


df_eng['glp1_exclusive_score'] = (
    _flag(df_eng['bmi'] > 35) * 3
    + df_eng['nafld'] * 2
    + _flag(df_eng['c_peptide'] > 1.0) * 1
)

df_eng['sglt2_exclusive_score'] = (
    df_eng['cvd'] * 4
    + _flag(df_eng['egfr'] >= 60) * 2
    + _flag(df_eng['bmi'] > 28) * 1
)

df_eng['insulin_exclusive_score'] = (
    _flag(df_eng['c_peptide'] < 0.8) * 5
    + _flag(df_eng['diabetes_duration'] > 12) * 3
    + _flag(df_eng['hba1c_baseline'] > 10.5) * 2
)

df_eng['metformin_exclusive_score'] = (
    _flag(df_eng['age'] < 60) * 2
    + _flag(df_eng['bmi'] < 30) * 3
    + _flag(df_eng['egfr'] > 60) * 2
    + _flag(df_eng['diabetes_duration'] < 5) * 2
)

# The c_peptide term only fires strictly between 0.8 and 1.5.
c_peptide_mid_range = (df_eng['c_peptide'] > 0.8) & (df_eng['c_peptide'] < 1.5)
df_eng['dpp4_exclusive_score'] = (
    _flag(df_eng['age'] > 70) * 3
    + df_eng['ckd'] * 3
    + _flag(c_peptide_mid_range) * 2
)

# Interaction terms: each multiplies two existing columns.

# BMI above 25, multiplied by the nafld column.
df_eng['obesity_nafld_interaction'] = df_eng['bmi'].sub(25).mul(df_eng['nafld'])

# cvd column multiplied by egfr / 100.
df_eng['cvd_kidney_interaction'] = df_eng['cvd'].mul(df_eng['egfr'].div(100))

# (2.0 - c_peptide) multiplied by baseline HbA1c.
df_eng['deficiency_severity_interaction'] = df_eng['c_peptide'].rsub(2.0).mul(df_eng['hba1c_baseline'])

# Names of the derived columns, kept in three groups so the summary below can
# report a count per group.
_original_group = [
    'insulin_deficiency_score', 'beta_cell_reserve', 'glucose_severity',
    'disease_progression', 'metabolic_syndrome_score', 'cv_risk_score',
    'kidney_severity', 'comorbidity_count',
]
_exclusive_group = [
    'glp1_exclusive_score', 'sglt2_exclusive_score', 'insulin_exclusive_score',
    'metformin_exclusive_score', 'dpp4_exclusive_score',
]
_interaction_group = [
    'obesity_nafld_interaction', 'cvd_kidney_interaction',
    'deficiency_severity_interaction',
]

# Flat list consumed by later cells; the order is group by group.
engineered_features = _original_group + _exclusive_group + _interaction_group

print(f"\nEngineered features created: {len(engineered_features)}")
print(f"  Original: {len(_original_group)}")
print(f"  Exclusive scores: {len(_exclusive_group)}")
print(f"  Interactions: {len(_interaction_group)}")
print(f"  Total: {len(engineered_features)}")

# Descriptive statistics of the derived columns, rounded to two decimals.
engineered_summary = df_eng[engineered_features].describe().round(2)
print("\nEngineered feature statistics:")
print(engineered_summary)

##  Update Feature Lists with Engineered Features

In [None]:
# Extend the raw feature lists with the derived columns. Membership in
# `continuous_features_eng` decides what a later cell standardises.
# NOTE(review): 'cv_risk_score' is an integer count yet is listed as
# continuous, while 'comorbidity_count' is listed with the discrete columns;
# confirm this split is intended.
continuous_features_eng = [
    *continuous_features,
    'insulin_deficiency_score', 'beta_cell_reserve', 'glucose_severity',
    'disease_progression', 'metabolic_syndrome_score', 'cv_risk_score',
    'obesity_nafld_interaction', 'cvd_kidney_interaction',
    'deficiency_severity_interaction',
]

binary_features_eng = [
    *binary_features,
    'kidney_severity', 'comorbidity_count',
    'glp1_exclusive_score', 'sglt2_exclusive_score', 'insulin_exclusive_score',
    'metformin_exclusive_score', 'dpp4_exclusive_score',
]

# Raw columns first, then the derived ones.
all_features = [*feature_cols, *engineered_features]

for label, columns in (
    ("Total features after engineering", all_features),
    ("Continuous features", continuous_features_eng),
    ("Binary/Categorical features", binary_features_eng),
):
    print(f"{label}: {len(columns)}")

## Feature Importance Analysis

In [None]:
print("Calculating feature importance scores...")
print("=" * 60)

# Numeric copy of the feature matrix: gender becomes 1 for 'Female' and 0
# otherwise, ethnicity becomes an integer code.
X_importance = df_eng[all_features].copy()
X_importance['gender'] = (X_importance['gender'] == 'Female').astype(int)
ethnicity_map = {'African': 0, 'Asian': 1, 'Caucasian': 2, 'Hispanic': 3, 'Other': 4}
X_importance['ethnicity'] = X_importance['ethnicity'].map(ethnicity_map)

y_importance = df_eng['hba1c_reduction']

# Tell the estimator which columns are discrete. With the default
# discrete_features='auto' a dense matrix is treated as all-continuous, so
# flags and integer codes would be scored with the continuous estimator.
discrete_columns = set(binary_features_eng)
discrete_mask = np.array([feat in discrete_columns for feat in all_features], dtype=bool)

mi_scores = mutual_info_regression(
    X_importance,
    y_importance,
    discrete_features=discrete_mask,
    random_state=42,  # fixed seed so the ranking is reproducible
)
# Index by feature name and rank from most to least informative.
mi_scores = pd.Series(mi_scores, index=all_features).sort_values(ascending=False)

print("\nTop 15 features by mutual information:")
for i, (feat, score) in enumerate(mi_scores.head(15).items(), 1):
    print(f"{i:2d}. {feat:30s} {score:.4f}")

# Horizontal bar chart of the 20 highest-scoring features.
plt.figure(figsize=(12, 8))
mi_scores.head(20).plot(kind='barh', color='steelblue')
plt.title('Top 20 Features by Mutual Information Score', fontweight='bold')
plt.xlabel('Mutual Information Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## Feature Selection

In [None]:
## Selecting top features
# `mi_scores` is already sorted in descending order, so its first `top_k`
# entries are the highest-scoring features.
top_k = 21
top_scores = mi_scores.head(top_k)
selected_features = top_scores.index.tolist()

print(f"\nSelected {top_k} features:")
for rank, (feat, score) in enumerate(top_scores.items(), start=1):
    # Anything not in the continuous list is reported as binary.
    feat_type = "binary"
    if feat in continuous_features_eng:
        feat_type = "continuous"
    print(f"{rank:2d}. {feat:30s} (MI: {score:.4f}, {feat_type})")

# Final lists used by the rest of the notebook; the two sub-lists keep the
# ranking order of the selection.
feature_cols_final = selected_features
continuous_features_final = [name for name in feature_cols_final if name in continuous_features_eng]
binary_features_final = [name for name in feature_cols_final if name in binary_features_eng]

print("\nFinal feature set:")
for label, columns in (
    ("Total", feature_cols_final),
    ("Continuous", continuous_features_final),
    ("Binary/Categorical", binary_features_final),
):
    print(f"  {label}: {len(columns)}")

## Encode Categorical Variables

In [None]:
# Encode the text columns as integers on a fresh copy of the engineered frame.
df_processed = df_eng.copy()

print("Encoding categorical variables...")
print("=" * 60)

# Label -> integer code tables; both are written to the metadata file later.
ethnicity_map = {'African': 0, 'Asian': 1, 'Caucasian': 2, 'Hispanic': 3, 'Other': 4}
treatment_map = {'Metformin': 0, 'GLP-1': 1, 'SGLT-2': 2, 'DPP-4': 3, 'Insulin': 4}

# gender: 1 where the value equals 'Female', 0 for anything else.
df_processed['gender'] = df_processed['gender'].eq('Female').astype(int)
print("\nGender encoding: Female -> 1, Male -> 0")

# Apply each table to its column and echo the mapping. Values missing from a
# table become NaN.
for column, title, mapping in (
    ('ethnicity', "\nEthnicity encoding:", ethnicity_map),
    ('treatment', "\nTreatment encoding:", treatment_map),
):
    df_processed[column] = df_processed[column].map(mapping)
    print(title)
    for label, code in mapping.items():
        print(f"  {label:10s} -> {code}")

print("\nEncoding complete")

## Standardize Features

In [None]:
## Standardizing continuous features

# Fit the scaler on the selected continuous columns and overwrite them with
# their standardised values. The fitted `scaler` is pickled by a later cell.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_processed[continuous_features_final])
df_processed[continuous_features_final] = scaled_values

standardized_summary = df_processed[continuous_features_final].describe().round(2)
print("\nAfter standardization:")
print(standardized_summary)

n_scaled = len(continuous_features_final)
n_untouched = len(binary_features_final)
print("\nStandardization complete")
print(f"  Continuous features standardized: {n_scaled}")
print(f"  Binary features unchanged: {n_untouched}")

## Prepare Final Dataset

In [None]:
# Keep only the selected features plus the two target columns.
target_cols = ['treatment', 'hba1c_reduction']
final_df = df_processed[[*feature_cols_final, *target_cols]].copy()

n_missing = final_df.isnull().sum().sum()
n_duplicates = final_df.duplicated().sum()

print("Final Dataset Preparation:")
print("=" * 60)
print(f"\nShape: {final_df.shape}")
print(f"Features: {len(feature_cols_final)}")
print("Target columns: treatment, hba1c_reduction")
print(f"\nMissing values: {n_missing}")
print(f"Duplicate rows: {n_duplicates}")

print("\nFeature Summary:")
for label, columns in (
    ("Continuous (scaled)", continuous_features_final),
    ("Binary/Categorical (encoded)", binary_features_final),
    ("Total features", feature_cols_final),
):
    print(f"  {label}: {len(columns)}")

# Rows per encoded treatment class, ordered by class code.
treatment_counts = final_df['treatment'].value_counts().sort_index()
print("\nTreatment distribution:")
print(treatment_counts)

print("\nSample of processed data:")
# Trailing expression so the notebook renders the first rows.
final_df.head()

## Data Quality Verification

In [None]:
## Data Quality Verification

# Summary of the two target columns.
reduction = final_df['hba1c_reduction']
print("\nTarget variable statistics:")
print(f"  Treatment classes: {final_df['treatment'].nunique()}")
print(f"  HbA1c reduction range: [{reduction.min():.2f}, {reduction.max():.2f}]")
print(f"  HbA1c reduction mean: {reduction.mean():.2f}")
print(f"  HbA1c reduction std: {reduction.std():.2f}")

# Three pass/fail checks; each prints True when the data is clean.
features_present = set(feature_cols_final).issubset(final_df.columns)
missing_total = final_df.isnull().sum().sum()
numeric_part = final_df.select_dtypes(include=[np.number])
has_infinite = np.isinf(numeric_part).any().any()

print("\nData quality checks:")
print(f"  All features present: {features_present}")
print(f"  No missing values: {missing_total == 0}")
print(f"  No infinite values: {not has_infinite}")

# Printed unconditionally; the checks above are informational only.
print("\nData is ready for model training")

## Create Output Directories

In [None]:
## Creating output directories
# Paths are relative to the current working directory. exist_ok=True makes
# the cell safe to re-run.
output_dirs = ['dataset/preprocessed', 'features']
for directory in output_dirs:
    os.makedirs(directory, exist_ok=True)

# Report from the same list that was created, so the message always names the
# real locations.
print("\nDirectories created:")
for directory in output_dirs:
    print(f"  {directory}/")

## Save Training Data

In [None]:
# Write the model-ready frame to CSV without the index column.
output_file = 'dataset/preprocessed/training_data.csv'
final_df.to_csv(output_file, index=False)

# File size is read back from disk after the write.
size_kb = os.path.getsize(output_file) / 1024
print(f"Training data saved: {output_file}")
print(f"  Shape: {final_df.shape}")
print(f"  Size: {size_kb:.2f} KB")

## Save Feature Scaler

In [None]:
# Pickle the fitted scaler so the same transformation can be reloaded later.
scaler_file = 'features/feature_scaler.pkl'
scaler_dir = os.path.dirname(scaler_file)
os.makedirs(scaler_dir, exist_ok=True)  # make sure the target folder exists

with open(scaler_file, 'wb') as handle:
    pickle.dump(scaler, handle)

size_kb = os.path.getsize(scaler_file) / 1024
print(f"Feature scaler saved: {scaler_file}")
print("  Scaler type: StandardScaler")
print(f"  Features scaled: {len(continuous_features_final)}")
print(f"  Size: {size_kb:.2f} KB")

## Save Preprocessing Metadata

In [None]:
# Everything needed to rebuild the feature matrix outside this notebook:
# final column lists, the label encodings and the derived-column names.
# NOTE(review): the gender encoding (Female -> 1) is not stored here; confirm
# whether consumers of this file need it.
metadata = dict(
    feature_cols=feature_cols_final,
    continuous_features=continuous_features_final,
    binary_features=binary_features_final,
    treatment_map=treatment_map,
    ethnicity_map=ethnicity_map,
    engineered_features=engineered_features,
    n_features=len(feature_cols_final),
)

metadata_file = 'features/preprocessing_metadata.json'

with open(metadata_file, 'w') as handle:
    json.dump(metadata, handle, indent=2)

print(f"Preprocessing metadata saved: {metadata_file}")
print("\nMetadata contents:")
# Entry counts for the list- and dict-valued fields.
for label, key in (
    ("Feature columns", 'feature_cols'),
    ("Continuous features", 'continuous_features'),
    ("Binary features", 'binary_features'),
    ("Engineered features", 'engineered_features'),
    ("Treatment classes", 'treatment_map'),
):
    print(f"  {label}: {len(metadata[key])}")
print(f"  Total features for model: {metadata['n_features']}")