In [None]:
from flask import Flask
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Flask application object for this module.
app = Flask(__name__)

# Model inputs, listed in the column order the rest of the script relies on.
feature_columns = [
    'Age', 'Sex', 'Marital', 'Income', 'Race', 'WaistCirc', 'BMI',
    'Albuminuria', 'UrAlbCr', 'UricAcid', 'BloodGlucose', 'HDL', 'Triglycerides',
]
# Columns that are label-encoded instead of being imputed and scaled.
categorical_columns = ['Sex', 'Marital', 'Race']
# Every other feature is treated as numerical; the original order is kept.
numerical_columns = list(
    filter(lambda name: name not in categorical_columns, feature_columns)
)
# Column used as the prediction target.
target_column = 'MetabolicSyndrome'

# Load CSV data
# The path lives in a single constant so the error message always names the
# file that was actually requested.
DATA_PATH = '/content/Metabolic Syndrome.csv'
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise with an actionable message while keeping the original traceback.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Please ensure the CSV file exists at that path."
    ) from err

# Report the relative frequency of each target class.
print("Class distribution:", data[target_column].value_counts(normalize=True))

# Feature engineering: derive two interaction terms from the raw measurements.
engineered_columns = ['WaistCirc_BMI', 'Triglycerides_HDL']
data['WaistCirc_BMI'] = data['BMI'] * data['WaistCirc']
# The small constant in the denominator guards against division by zero.
hdl_denominator = data['HDL'] + 1e-6
data['Triglycerides_HDL'] = data['Triglycerides'] / hdl_denominator

# Register the derived columns as (numerical) model features, in place.
feature_columns.extend(engineered_columns)
numerical_columns.extend(engineered_columns)

# Feature matrix and target vector.
y = data[target_column]
X = data[feature_columns]

# Fill gaps in the numerical columns with the per-column mean.
# NOTE(review): the imputer and the scaler below are fitted on the full
# dataset, before the train/test split performed later in this script, so
# test rows contribute to the learned means/variances - confirm this is
# acceptable.
imputer = SimpleImputer(strategy='mean').fit(X[numerical_columns])
X_numerical = imputer.transform(X[numerical_columns])
X_numerical_df = pd.DataFrame(X_numerical, columns=numerical_columns)

# Show the learned fill values for debugging.
print(
    "Imputer means:",
    {name: mean for name, mean in zip(numerical_columns, imputer.statistics_)},
)

# Standardise the imputed numerical features.
scaler = StandardScaler().fit(X_numerical_df)
X_numerical_scaled = scaler.transform(X_numerical_df)
X_numerical_scaled_df = pd.DataFrame(X_numerical_scaled, columns=numerical_columns)

# Encode categorical columns
# Each column gets its own LabelEncoder. Missing entries are temporarily
# replaced by the placeholder 'Missing' so the encoder can be fitted (the
# encoder therefore learns 'Missing' as a class when a column has gaps), and
# are then restored to missing in the encoded output.
encoders = {}
X_categorical = X[categorical_columns].copy()
for col in categorical_columns:
    # Remember which rows were originally missing before the placeholder is
    # introduced.
    missing_mask = X[col].isna()
    encoders[col] = LabelEncoder()
    codes = encoders[col].fit_transform(X_categorical[col].fillna('Missing'))
    # Cast to nullable Int64 *before* blanking the missing rows. Writing NaN
    # into a plain int64 column relies on implicit upcasting in setitem, which
    # pandas has deprecated; masking an Int64 column yields the same result
    # (<NA> where the source value was missing) without that upcast.
    encoded = pd.Series(codes, index=X_categorical.index).astype('Int64')
    X_categorical[col] = encoded.mask(missing_mask)

# Print mode for categorical columns
for col in categorical_columns:
    mode_value = X_categorical[col].mode()[0]
    print(f"{col} mode:", mode_value, "Encoded as:", encoders[col].inverse_transform([mode_value])[0])

# Join the scaled numerical block with the encoded categorical block and
# restore the canonical feature order. The categorical index is reset so both
# frames line up row by row.
X_processed = pd.concat(
    [X_numerical_scaled_df, X_categorical.reset_index(drop=True)],
    axis=1,
)[feature_columns]

# Any categorical value that is still missing is replaced by the column mode.
for col in categorical_columns:
    if not X_processed[col].isna().any():
        continue
    mode_value = int(X_processed[col].mode()[0])  # plain int fill value
    X_processed[col] = X_processed[col].fillna(mode_value)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Class-imbalance weight: negative/positive ratio of the training labels,
# damped by 0.7 to avoid overpredicting positives. Falls back to 1 when the
# training set contains no positive example.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
if positive_count > 0:
    scale_pos_weight = (negative_count / positive_count) * 0.7
else:
    scale_pos_weight = 1
print("Scale pos weight:", scale_pos_weight)

# Base XGBoost classifier; its hyperparameters are tuned by the grid search.
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
)

# Exhaustive grid evaluated with 5-fold cross-validation on accuracy.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search as the final model.
print("Best parameters:", grid_search.best_params_)
model = grid_search.best_estimator_

# Decision threshold applied to the positive-class probability; it is set
# above 0.5 to reduce false positives.
threshold = 0.65

# Score the held-out set and binarise the probabilities with the threshold.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_proba >= threshold, 1, 0)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy (threshold={threshold}): {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Show the per-feature importance reported by the fitted model.
print(
    "Feature importances:",
    {name: weight for name, weight in zip(feature_columns, model.feature_importances_)},
)

# Save model, imputer, scaler, and encoders
# Each fitted object is pickled to its own file in the working directory.
artifacts = {
    'model.pkl': model,
    'imputer.pkl': imputer,
    'scaler.pkl': scaler,
    'encoders.pkl': encoders,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

# Also export the trained booster in XGBoost's own JSON format. A pickled
# XGBoost model is tied to the library version that wrote it (loading it with
# a different version triggers XGBoost's "please export the model by calling
# Booster.save_model" warning), whereas the native format is the supported
# way to move a model between versions. The pickle files above are unchanged.
model.get_booster().save_model('model.json')

def predict_metabolic_syndrome(input_data):
    """
    Predict Metabolic Syndrome based on input data.

    The input is run through the same steps as the training data in this
    module: interaction features, mean imputation, scaling and label encoding.
    It relies on the module-level objects ``feature_columns``,
    ``numerical_columns``, ``categorical_columns``, ``imputer``, ``scaler``,
    ``encoders``, ``X``, ``X_processed``, ``model`` and ``threshold``.

    Parameters:
    input_data (dict): Dictionary with keys as feature names and values as input values.
                       Missing values should be None or empty. Numerical values that
                       cannot be converted to float are treated as missing; unknown
                       categorical labels fall back to the training mode.

    Returns:
    str: Prediction result ('Metabolic Syndrome Present' or 'No Metabolic Syndrome')
    """
    # Normalise the raw input: absent / None / '' become NaN, and numerical
    # entries are coerced to float (NaN when the conversion fails).
    row = {}
    for col in feature_columns:
        value = input_data.get(col, np.nan)
        if value is None or value == '':
            value = np.nan
        elif col in numerical_columns and not pd.isna(value):
            try:
                value = float(value)
            except (ValueError, TypeError):
                value = np.nan
        row[col] = value

    input_df = pd.DataFrame([row], columns=feature_columns)

    # Feature engineering for input (same formulas as for the training data)
    input_df['WaistCirc_BMI'] = input_df['WaistCirc'] * input_df['BMI']
    input_df['Triglycerides_HDL'] = input_df['Triglycerides'] / (input_df['HDL'] + 1e-6)

    # Impute missing numerical values with the means learned during training
    input_df[numerical_columns] = imputer.transform(input_df[numerical_columns])

    # Clip numerical values to the 1.5 * IQR fences of the training data to
    # avoid extreme outliers. `X` here is the module-level training frame; the
    # function must not bind a local named `X`, otherwise this lookup raises
    # UnboundLocalError.
    # NOTE(review): the training pipeline visible in this file does not clip
    # its inputs - confirm that clipping only at prediction time is intended.
    for col in numerical_columns:
        q1, q3 = X[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        input_df[col] = input_df[col].clip(lower_bound, upper_bound)

    # Scale numerical features
    input_df[numerical_columns] = scaler.transform(input_df[numerical_columns])

    # Encode categorical variables; missing or unseen labels use the mode of
    # the processed training column.
    for col in categorical_columns:
        raw_value = input_df[col].iloc[0]
        encoded = None
        if not (pd.isna(raw_value) or raw_value == ''):
            try:
                encoded = encoders[col].transform([raw_value])[0]
            except ValueError:
                # Label was not seen when the encoder was fitted.
                encoded = None
        if encoded is None:
            encoded = int(X_processed[col].mode()[0])
        input_df[col] = encoded
        input_df[col] = input_df[col].astype('Int64')

    # Print processed input for debugging (categoricals decoded back to labels)
    processed_dict = input_df.to_dict(orient='records')[0]
    for col in categorical_columns:
        if col in processed_dict and not pd.isna(processed_dict[col]):
            processed_dict[col] = encoders[col].inverse_transform([processed_dict[col]])[0]
    print("Processed input:", processed_dict)

    # Predict with custom threshold
    features = input_df[feature_columns].values
    proba = model.predict_proba(features)[0, 1]
    print(f"Prediction probability: {proba:.4f}")
    prediction = 1 if proba >= threshold else 0
    return 'Metabolic Syndrome Present' if prediction == 1 else 'No Metabolic Syndrome'



Categorical modes (encoded): {'Sex': np.int64(0), 'Marital': np.int64(1), 'Race': np.int64(5)}
Categorical modes (decoded): {'Sex': 'Female', 'Marital': 'Married', 'Race': 'White'}
Model feature importances: {'Age': np.float32(0.04220668), 'Sex': np.float32(0.09509988), 'Marital': np.float32(0.021859279), 'Income': np.float32(0.023370527), 'Race': np.float32(0.032343876), 'WaistCirc': np.float32(0.08677558), 'BMI': np.float32(0.042829763), 'Albuminuria': np.float32(0.01625666), 'UrAlbCr': np.float32(0.03445383), 'UricAcid': np.float32(0.023577383), 'BloodGlucose': np.float32(0.20534304), 'HDL': np.float32(0.05996352), 'Triglycerides': np.float32(0.10186309), 'WaistCirc_BMI': np.float32(0.0817603), 'Triglycerides_HDL': np.float32(0.13229656)}

Running Test Case 1
Processed input (unscaled): {'Age': 50.0, 'Sex': 'Female', 'Marital': 'Single', 'Income': 3000.0, 'Race': 'Black', 'WaistCirc': 95.0, 'BMI': 27.0, 'Albuminuria': 0.0, 'UrAlbCr': 5.0, 'UricAcid': 5.5, 'BloodGlucose': 110.0, 'HDL

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
