In [0]:
!pip install catboost

Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/74/0c/cff2c8fa0ccac3df0589334846d470c76295f7f3a0cc5954d87d81b1bdcc/catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/00/be/d59db2d1d52697c6adc9eacaf50e8965b6345cc143f671e1ed068818d5cf/graphviz-0.20.3-py3-none-any.whl.metadata
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.7 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/98.7 MB[0m [31m3.4 MB/s[0m eta [36m0:00:29[0m
[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


from sklearn.preprocessing import LabelEncoder

import mlflow

In [0]:
# Location of the input CSV (raw file in the prediccion_salarios GitHub repository).
DATA_URL = 'https://raw.githubusercontent.com/juankquintana/prediccion_salarios/main/Data/data_top10.csv'
data = pd.read_csv(DATA_URL)

# función para clasificar rangos de salarios
def classify_salary(salary):
    """Map a salary in USD to one of four salary-band labels.

    Bands (lower bound inclusive, upper bound exclusive):
        below 75000          -> 'Bajo'
        75000 up to 120000   -> 'Medio_Bajo'
        120000 up to 180000  -> 'Medio_Alto'
        180000 and above     -> 'Alto'

    A value that is not below any upper bound (this includes NaN, for which
    every comparison is False) falls through to 'Alto'.
    """
    # Exclusive upper bound of each band, in ascending order.
    upper_bounds = (
        (75000, 'Bajo'),
        (120000, 'Medio_Bajo'),
        (180000, 'Medio_Alto'),
    )
    for limit, label in upper_bounds:
        if salary < limit:
            return label
    return 'Alto'

In [0]:
# Mapping option 1: derive the salary_class label from salary_in_usd, then
# drop the raw salary column so only the class label remains.
data = (
    data
    .assign(salary_class=data['salary_in_usd'].apply(classify_salary))
    .drop(columns='salary_in_usd')
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11193 entries, 0 to 11192
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   experience_level    11193 non-null  object
 1   employment_type     11193 non-null  object
 2   job_title           11193 non-null  object
 3   employee_residence  11193 non-null  object
 4   remote_ratio        11193 non-null  int64 
 5   company_location    11193 non-null  object
 6   company_size        11193 non-null  object
 7   salary_class        11193 non-null  object
dtypes: int64(1), object(7)
memory usage: 699.7+ KB


In [0]:
# Features (X) are every column except the salary_class label; Y is the label.
X = data.drop('salary_class', axis=1)
Y = data['salary_class']

# Positional indices of the categorical (non-numeric) feature columns, later
# passed to CatBoost as cat_features. They are derived from the column dtypes
# rather than hard-coded, so the list stays correct if columns are added,
# removed or reordered. For the schema shown by data.info() this yields
# [0, 1, 2, 3, 5, 6] (remote_ratio, at position 4, is the only numeric column).
indices_var_cat = [
    X.columns.get_loc(column_name)
    for column_name in X.select_dtypes(exclude='number').columns
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [0]:
import mlflow
import mlflow.catboost
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Resolve the MLflow experiment that will hold the runs, creating it on first use.
experiment_name = "/Users/jk.sepulveda@uniandes.edu.co/CatBoost JK2"

# The lookup yields None when no experiment with this name exists yet.
experiment = mlflow.get_experiment_by_name(experiment_name)

# Create the experiment if it is missing; otherwise reuse the existing id.
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)

In [0]:

# Train and evaluate the CatBoost classifier inside an MLflow run attached to
# the experiment identified by experiment_id, then log parameters, model and metric.
with mlflow.start_run(experiment_id=experiment_id):
    # Model hyperparameters (kept as individual names, as in the original cell).
    iterations, learning_rate, depth = 285, 0.041, 9
    bagging_temperature = 0.5397474893002537
    random_strength = 3.020207704599305
    cat_features = indices_var_cat
    verbose, loss_function = 0, 'MultiClass'

    # Hyperparameters recorded in MLflow, listed in logging order.
    # cat_features is passed to the model but is not logged.
    logged_params = {
        "iterations": iterations,
        "learning_rate": learning_rate,
        "depth": depth,
        "verbose": verbose,
        "loss_function": loss_function,
        "bagging_temperature": bagging_temperature,
        "random_strength": random_strength,
    }

    # Build and fit the classifier on the training split.
    modelo_CAT = CatBoostClassifier(cat_features=cat_features, **logged_params)
    modelo_CAT.fit(X_train, y_train)

    # Score the held-out split.
    y_pred = modelo_CAT.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log each hyperparameter to the active run.
    for param_name, param_value in logged_params.items():
        mlflow.log_param(param_name, param_value)

    # Store the fitted model as a run artifact.
    mlflow.catboost.log_model(modelo_CAT, "CatBoost-model")

    # Record and display the test accuracy.
    mlflow.log_metric("accuracy", accuracy)
    print(f'Accuracy: {accuracy:.2f}')



Accuracy: 0.42
