In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


from sklearn.preprocessing import LabelEncoder

In [0]:
# Read the salary dataset as a raw CSV hosted in the project's GitHub repository.
DATASET_URL = 'https://raw.githubusercontent.com/juankquintana/prediccion_salarios/main/Data/data_top10.csv'
data = pd.read_csv(DATASET_URL)

# Helper that buckets a salary into a named salary range
def classify_salary(salary, thresholds=(75000, 120000, 180000),
                    labels=('Bajo', 'Medio_Bajo', 'Medio_Alto', 'Alto')):
    """Return the label of the salary band that ``salary`` falls into.

    Bands are half-open on the right: a salary equal to a threshold belongs
    to the band above it. With the defaults:

    * ``salary < 75000``            -> ``'Bajo'``
    * ``75000 <= salary < 120000``  -> ``'Medio_Bajo'``
    * ``120000 <= salary < 180000`` -> ``'Medio_Alto'``
    * ``salary >= 180000``          -> ``'Alto'``

    Parameters
    ----------
    salary : number
        Salary to classify.
    thresholds : sequence of numbers, optional
        Upper bounds (exclusive) of every band except the last one, in
        ascending order.
    labels : sequence of str, optional
        One label per band, lowest band first. Must contain exactly one more
        element than ``thresholds``.

    Returns
    -------
    str or None
        The band label, or ``None`` when ``salary`` is missing (NaN/None).

    Raises
    ------
    ValueError
        If ``labels`` does not have ``len(thresholds) + 1`` elements.
    """
    if len(labels) != len(thresholds) + 1:
        raise ValueError("labels must have exactly one more element than thresholds")

    # A missing salary compares False against every threshold, so without this
    # guard it would fall through and be labelled as the top band.
    if pd.isna(salary):
        return None

    # Thresholds are ascending, so the first upper bound the salary is below
    # identifies its band; no lower-bound comparison is needed.
    for upper_bound, label in zip(thresholds, labels):
        if salary < upper_bound:
            return label
    return labels[-1]
    

# Option 1 mapping: build the `salary_class` target by classifying every
# `salary_in_usd` value, then remove the raw salary column it was derived from.
data = (
    data
    .assign(salary_class=lambda frame: frame['salary_in_usd'].apply(classify_salary))
    .drop(columns='salary_in_usd')
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11193 entries, 0 to 11192
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   experience_level    11193 non-null  object
 1   employment_type     11193 non-null  object
 2   job_title           11193 non-null  object
 3   employee_residence  11193 non-null  object
 4   remote_ratio        11193 non-null  int64 
 5   company_location    11193 non-null  object
 6   company_size        11193 non-null  object
 7   salary_class        11193 non-null  object
dtypes: int64(1), object(7)
memory usage: 699.7+ KB


In [0]:
# Columns whose values are expanded into indicator (one-hot) columns
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]

# Apply one-hot encoding: every listed column is replaced by one indicator
# column per distinct value; the remaining columns are kept as they are.
# NOTE(review): `data` is overwritten here, so re-running only this cell fails
# because the listed columns no longer exist; re-run from the loading cell.
data = pd.get_dummies(data=data, columns=categorical_columns)
data.head(n=3)

Unnamed: 0,remote_ratio,salary_class,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_CT,employment_type_FT,employment_type_PT,job_title_Analyst,job_title_Data Analyst,job_title_Data Architect,job_title_Data Engineer,job_title_Data Scientist,job_title_Engineer,job_title_Machine Learning Engineer,job_title_Manager,job_title_Research Scientist,job_title_Software Engineer,employee_residence_Africa,employee_residence_Asia,employee_residence_Europe,employee_residence_North America,employee_residence_Oceania,employee_residence_South America,company_location_Africa,company_location_Asia,company_location_Europe,company_location_North America,company_location_Oceania,company_location_South America,company_size_L,company_size_M,company_size_S
0,0,Medio_Bajo,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
1,100,Medio_Bajo,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
2,0,Medio_Bajo,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0


In [0]:
# Target is the salary band; features are every other column of the frame.
Y = data['salary_class']
X = data.drop(columns='salary_class')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on Y; consider `stratify=Y` if the
# salary bands turn out to be imbalanced (confirm with Y.value_counts()).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [0]:
import mlflow
import mlflow.sklearn


# Name of the MLflow experiment that the training run is recorded under
experiment_name = "/Users/jk.sepulveda@uniandes.edu.co/Random Forest JK"  

# Look the experiment up by name
experiment = mlflow.get_experiment_by_name(experiment_name)

# Create the experiment when the lookup found nothing; otherwise reuse the id
# of the existing one.
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)

In [0]:
# Train and evaluate the random forest inside an MLflow run that is attached
# to the experiment identified by `experiment_id`.
with mlflow.start_run(experiment_id=experiment_id):
    # Model hyperparameters
    n_estimators = 1000
    max_depth = 10
    max_features = 5

    # Build the classifier with those hyperparameters and fit it on the training split
    modelo_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=42,
    )
    modelo_rf.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = modelo_rf.predict(X_test)

    # Record the hyperparameters on the run (logged names differ from the
    # scikit-learn argument names and are kept as-is)
    for param_name, param_value in (
        ("num_trees", n_estimators),
        ("maxdepth", max_depth),
        ("max_feat", max_features),
    ):
        mlflow.log_param(param_name, param_value)

    # Log the fitted model to the run under "random-forest-model"
    mlflow.sklearn.log_model(modelo_rf, "random-forest-model")

    # Score the test predictions, record the accuracy on the run and show it
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    print(accuracy)





0.4211701652523448
