In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

In [3]:
# Load the dataset
df = pd.read_csv('pronostico_dataset.csv', delimiter=";")

# Drop the ID column
df.drop(columns='ID', inplace=True)

# Encode the target variable
le = LabelEncoder()
df['prognosis'] = le.fit_transform(df['prognosis'])

In [4]:
df.head()

Unnamed: 0,age,systolic_bp,diastolic_bp,cholesterol,prognosis
0,77.19634,85.288742,80.021878,79.957109,1
1,63.52985,99.379736,84.852361,110.382411,1
2,69.003986,111.349455,109.850616,100.828246,1
3,82.63821,95.056128,79.666851,87.066303,1
4,78.346286,109.154591,90.71322,92.51177,1


In [5]:
# Feature engineering
X = df[['age', 'systolic_bp', 'diastolic_bp', 'cholesterol']].copy()

# Add derived features
X['bp_ratio'] = X['systolic_bp'] / X['diastolic_bp'].replace(0, np.nan)
X['pulse_pressure'] = X['systolic_bp'] - X['diastolic_bp']
X['age_cholesterol'] = X['age'] * X['cholesterol']
X['bp_product'] = X['systolic_bp'] * X['diastolic_bp']
X['age_squared'] = X['age'] ** 2
X['high_cholesterol'] = (X['cholesterol'] > 100).astype(int)

In [6]:
# Handle outliers using IQR
for col in X.columns:
    Q1 = X[col].quantile(0.25)
    Q3 = X[col].quantile(0.75)
    IQR = Q3 - Q1
    X[col] = X[col].clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR)

# Check for NaNs and infinities
print("NaNs in X:", X.isna().sum())
print()
print("Infinities in X:", np.isinf(X).sum())

NaNs in X: age                 0
systolic_bp         0
diastolic_bp        0
cholesterol         0
bp_ratio            0
pulse_pressure      0
age_cholesterol     0
bp_product          0
age_squared         0
high_cholesterol    0
dtype: int64

Infinities in X: age                 0
systolic_bp         0
diastolic_bp        0
cholesterol         0
bp_ratio            0
pulse_pressure      0
age_cholesterol     0
bp_product          0
age_squared         0
high_cholesterol    0
dtype: int64


In [7]:
# Impute NaNs with median
X = X.fillna(X.median())
y = df['prognosis'].loc[X.index]

# Check class distribution
print("Class distribution:", pd.Series(y).value_counts(normalize=True))
minority_count = pd.Series(y).value_counts().min()
print("Minority class count:", minority_count)

Class distribution: prognosis
1    0.5145
0    0.4855
Name: proportion, dtype: float64
Minority class count: 2913


In [8]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE only if minority class is sufficient
if minority_count >= 10:
    smote = SMOTE(sampling_strategy=0.8, random_state=42, k_neighbors=3)
    try:
        X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
    except ValueError as e:
        print(f"SMOTE failed: {e}. Using class weights instead.")
        X_resampled, y_resampled = X_scaled, y
else:
    print("Minority class too small for SMOTE. Using class weights instead.")
    X_resampled, y_resampled = X_scaled, y

SMOTE failed: The specified ratio required to remove samples from the minority class while trying to generate new samples. Please increase the ratio.. Using class weights instead.


In [9]:
# Split the dataset (stratified)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Compute class weights for imbalance
class_weights = {0: len(y_train) / (2 * np.bincount(y_train)[0]), 1: len(y_train) / (2 * np.bincount(y_train)[1])}

In [10]:
# Define models with class weights
# cat_model = CatBoostClassifier(random_state=42, verbose=0, class_weights=class_weights)
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', scale_pos_weight=class_weights[1])
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
svm_model = SVC(random_state=42, class_weight='balanced', probability=True)
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')

In [11]:
# # Hyperparameter tuning for CatBoost
# cat_params = {
#     'iterations': [200, 500],
#     'depth': [4, 8],
#     'learning_rate': [0.01, 0.1]
# }
# cat_search = RandomizedSearchCV(cat_model, cat_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
# cat_search.fit(X_train, y_train)
# best_cat = cat_search.best_estimator_

In [12]:
# Hyperparameter tuning for LightGBM
lgb_params = {
    'n_estimators': [200, 400],
    'max_depth': [5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50]
}
lgb_search = RandomizedSearchCV(lgb_model, lgb_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
lgb_search.fit(X_train, y_train)
best_lgb = lgb_search.best_estimator_

[LightGBM] [Info] Number of positive: 2470, number of negative: 2330
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [13]:
# Hyperparameter tuning for XGBoost
xgb_params = {
    'n_estimators': [200, 400],
    'max_depth': [5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0]
}
xgb_search = RandomizedSearchCV(xgb_model, xgb_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
xgb_search.fit(X_train, y_train)
best_xgb = xgb_search.best_estimator_

In [14]:
# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_search = RandomizedSearchCV(rf_model, rf_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_

In [15]:
# Hyperparameter tuning for SVM
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto']
}
svm_search = RandomizedSearchCV(svm_model, svm_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
svm_search.fit(X_train, y_train)
best_svm = svm_search.best_estimator_

In [16]:
# Hyperparameter tuning for Decision Tree
dt_params = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt_search = RandomizedSearchCV(dt_model, dt_params, n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, random_state=42)
dt_search.fit(X_train, y_train)
best_dt = dt_search.best_estimator_

In [17]:
# Define Voting Classifier (soft voting)
voting_clf = VotingClassifier(
    estimators=[
#         ('cat', best_cat),
        ('lgb', best_lgb),
        ('xgb', best_xgb),
        ('rf', best_rf),
        ('svm', best_svm),
        ('dt', best_dt)
    ],
    voting='soft',
    n_jobs=-1
)

# Fit the Voting Classifier
voting_clf.fit(X_train, y_train)

In [18]:
# Initialize results dictionary
results = []

In [19]:
# Function to evaluate model and store metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-Score': f1_score(y_test, y_pred, average='weighted')
    }
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='accuracy')
    metrics['CV Accuracy'] = cv_scores.mean()
    metrics['CV Std'] = cv_scores.std()
    # Train accuracy
    y_train_pred = model.predict(X_train)
    metrics['Train Accuracy'] = accuracy_score(y_train, y_train_pred)
    return metrics

In [20]:
# Evaluate all models
models = [
#     ('CatBoost', best_cat),
    ('LightGBM', best_lgb),
    ('XGBoost', best_xgb),
    ('Random Forest', best_rf),
    ('SVM', best_svm),
    ('Decision Tree', best_dt),
    ('Voting Classifier', voting_clf)
]

for name, model in models:
    results.append(evaluate_model(name, model, X_train, X_test, y_train, y_test))

[LightGBM] [Info] Number of positive: 2470, number of negative: 2330
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 1976, number of negative: 1864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 1976, number of negative: 1864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 1976, number of negative: 1864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 1976, number of negative: 1864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 1976, number of negative: 1864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




In [21]:
# Create results DataFrame
results_df = pd.DataFrame(results)

In [22]:
# Print results table
print("\nModel Performance Metrics:")
print(results_df.to_string(index=False))


Model Performance Metrics:
            Model  Accuracy  Precision   Recall  F1-Score  CV Accuracy   CV Std  Train Accuracy
         LightGBM  0.770000   0.770238 0.770000  0.770042     0.735000 0.018838        0.769792
          XGBoost  0.759167   0.759122 0.759167  0.759119     0.735833 0.011346        0.803333
    Random Forest  0.761667   0.761633 0.761667  0.761641     0.729792 0.011422        0.893542
              SVM  0.771667   0.771691 0.771667  0.771677     0.742500 0.016866        0.746458
    Decision Tree  0.755833   0.756566 0.755833  0.755871     0.726042 0.011024        0.754792
Voting Classifier  0.771667   0.771667 0.771667  0.771667     0.745000 0.014811        0.788542


In [23]:
import joblib

# Save the trained SVM model
joblib.dump(best_svm, 'best_svm_model.pkl')

# Save the StandardScaler
joblib.dump(scaler, 'scaler.pkl')

# Save the LabelEncoder
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']