In [12]:
import os
from urllib.parse import quote_plus

import mlflow
import pandas as pd
from sqlalchemy import create_engine, inspect

In [18]:
from sklearn.model_selection import train_test_split

### Parámetros de conexión

In [13]:
# Connection parameters for the MySQL server. Each value can be overridden
# through an environment variable so that credentials do not have to be edited
# into (and committed with) the notebook; the fallbacks are the values this
# notebook originally hard-coded.
db_config = {
    "host": os.environ.get("TALLER_DB_HOST", "10.43.101.189"),
    # Environment values are strings, so the port is converted explicitly.
    "port": int(os.environ.get("TALLER_DB_PORT", "3306")),
    "user": os.environ.get("TALLER_DB_USER", "taller-mlflow"),
    "password": os.environ.get("TALLER_DB_PASSWORD", "mlflow"),
    "database": os.environ.get("TALLER_DB_NAME", "taller"),
}

# Address of the MLflow tracking server, overridable via MLFLOW_TRACKING_URI.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://10.43.101.189:5000")

### Carga de datos desde el servidor de `MySQL`

In [14]:
# Build the connection URL from db_config. The user and password are
# percent-encoded so that reserved characters (e.g. "@", ":" or "/") inside a
# credential cannot corrupt the URL.
connection_url = (
    "mysql+pymysql://"
    f"{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_url)

# Create the database inspector
inspector = inspect(engine)

# Get the list of tables in the connected database
tables = inspector.get_table_names()

# Print the list of tables
print("Tablas en la base de datos 'taller':", tables)


Tablas en la base de datos 'taller': ['iris_cleaned', 'iris_raw']


In [15]:
# Load the whole `iris_cleaned` table into a DataFrame through the engine.
df = pd.read_sql_table(table_name="iris_cleaned", con=engine)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Set up MLflow

In [16]:
# Point the MLflow client at the tracking server. `tracking_uri` is defined
# once in the connection-parameters cell; it is not re-declared here so the
# address has a single source of truth.
mlflow.set_tracking_uri(tracking_uri)

# Create/select the experiment that the runs of this notebook are logged under.
experiment_name = "iris_classification_experiment"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflows3/artifacts/1', creation_time=1742159536746, experiment_id='1', last_update_time=1742159536746, lifecycle_stage='active', name='iris_classification_experiment', tags={}>

### Separación del dataset

In [19]:
# Separate the feature columns from the target column.
X = df.drop(columns=['species'])
y = df['species']

# Hold out 20% of the rows for testing, stratified on the target. The fixed
# random_state makes the shuffled split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [21]:
# Display the training feature matrix as the cell output.
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
57,4.9,2.4,3.3,1.0
109,7.2,3.6,6.1,2.5
117,7.7,3.8,6.7,2.2
116,6.5,3.0,5.5,1.8
91,6.1,3.0,4.6,1.4
...,...,...,...,...
49,5.0,3.3,1.4,0.2
129,7.2,3.0,5.8,1.6
64,5.6,2.9,3.6,1.3
37,4.9,3.6,1.4,0.1


### Optimización de hiperparámetros con Optuna para `RandomForestClassifier`

In [None]:
# Define objective function
def objective(trial):
    """Score one sampled RandomForestClassifier configuration.

    Draws a hyperparameter set from ``trial``, evaluates it with 5-fold
    stratified cross-validation and logs the parameters and the mean score to
    a nested MLflow run.

    Cross-validation runs on ``X_train`` / ``y_train`` only, so the held-out
    test split never influences hyperparameter selection.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_float``
            (an Optuna trial, per the notebook section heading).

    Returns:
        The mean of the per-fold ``"average_precision"`` scores as a float.
    """
    with mlflow.start_run(nested=True):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 600),
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'min_samples_split': trial.suggest_int('min_samples_split', 3, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3, 20),
            'max_features': trial.suggest_float('max_features', 0.5, 1.0),
            'max_samples': trial.suggest_float('max_samples', 0.7, 1.0)
        }

        # Initialize model
        model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)

        # Define Stratified K-Folds
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Compute cross-validation scores using Average Precision, on the
        # training split only (the test split stays untouched for evaluation).
        # NOTE(review): if `species` has more than two classes, confirm that
        # the installed scikit-learn's "average_precision" scorer accepts
        # multiclass targets; otherwise the fold scores may come back as NaN.
        scores = cross_val_score(
            model, X_train, y_train, cv=skf, scoring="average_precision", n_jobs=-1
        )
        mean_score = float(np.mean(scores))

        # Log parameters and metrics
        mlflow.log_params(params)
        mlflow.log_metric("mean_auprc", mean_score)

        return mean_score