# Exploración con support vector machine

In [6]:
# Importamos librerias necesarias para realizar el analisis. 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# SVM
from sklearn.model_selection import (
    train_test_split,   
    StratifiedKFold,    
    GridSearchCV,       
    cross_validate,     
    ParameterGrid       
)
from sklearn.svm import SVC     
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Configure pandas and seaborn display options for more readable output.
pd.set_option('display.max_columns', None)  # None = do not truncate wide frames
sns.set(style="whitegrid", palette="muted", color_codes=True)

# Load the diabetes dataset (path is relative to the notebook's working
# directory) and display its size as (rows, columns).
df = pd.read_csv('../data/processed/diabetes_sample.csv')
df.shape


In [7]:
# Separate the feature matrix from the target column.
X = df.drop(columns=["Diabetes_012"])
Y = df["Diabetes_012"]

# Hold out 20% of the rows for testing. Stratifying on Y approximately
# preserves the class proportions in both partitions, and the fixed seed
# makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Stratified 5-fold splitter; shuffling is seeded so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

# Base classifier: SVC with 'balanced' class weights; every other
# hyperparameter is left at its default value.
model = SVC(class_weight='balanced', random_state=2)

# Print every hyperparameter of the estimator, sorted by name.
params = model.get_params()
print("\nParámetros del modelo SVM:")
for name in sorted(params):
    print(f"{name:25s} = {params[name]}")


Parámetros del modelo SVM:
C                         = 1.0
break_ties                = False
cache_size                = 200
class_weight              = balanced
coef0                     = 0.0
decision_function_shape   = ovr
degree                    = 3
gamma                     = scale
kernel                    = rbf
max_iter                  = -1
probability               = False
random_state              = 2
shrinking                 = True
tol                       = 0.001
verbose                   = False


In [None]:
# Hyperparameter search space, expressed as one sub-grid per kernel.
# `gamma` is a kernel coefficient that the linear kernel does not use, so
# crossing it with kernel='linear' would fit every linear candidate twice
# with identical results. Splitting the grid removes those redundant fits
# (12 candidates instead of 16).
c_values = [0.01, 0.1, 1, 10]
param_grid = [
    {
        'kernel': ['linear'],
        'C': c_values,
    },
    {
        'kernel': ['rbf'],
        'C': c_values,
        'gamma': ['scale', 'auto'],
    },
]

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',       # unweighted mean of the per-class F1 scores
    cv=cv,
    n_jobs=-1,                # run the fits on all available CPU cores
    verbose=2,
    return_train_score=True,  # also record train-fold scores in cv_results_
)

# Fit with hyperparameter search: every candidate is cross-validated on the
# training split only; the test split is not touched here.
# NOTE(review): the features reach SVC unscaled in the cells visible here —
# confirm the processed CSV is already standardized, since SVMs are sensitive
# to feature scale.
grid_search.fit(X_train, Y_train)

# best_params_ has no 'gamma' entry when a linear-kernel candidate wins.
print("\nMejores parámetros encontrados:", grid_search.best_params_)
print("Mejor F1 macro en CV:", grid_search.best_score_)