In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv("../data/clean_offers.csv")

In [10]:
df.dtypes

url                                   object
title                                 object
company                               object
rating                               float64
salary                               float64
contract                              object
schedule                              object
description                           object
education                             object
experience                           float64
age                                   object
skills                                object
city                                  object
department                            object
category                              object
description_lem                       object
gestion_planeacion                     int64
finanzas_contabilidad                  int64
auditoria_calidad                      int64
recursos_humanos                       int64
logistica_operaciones                  int64
desarrollo_tecnologia_empresarial      int64
lenguajes_

In [4]:
conteo_ciudades = df['salary'].value_counts().sort_values(ascending=False).reset_index()
conteo_ciudades.columns = ['salary', 'frecuencia']
pd.set_option('display.max_rows', None)  # Muestra todas las filas
print(conteo_ciudades)

        salary  frecuencia
0    1423500.0         182
1    2000000.0          89
2    1800000.0          69
3    2500000.0          61
4    1500000.0          56
5    3000000.0          51
6    4000000.0          27
7    3500000.0          27
8    1600000.0          23
9    2200000.0          22
10   1300000.0          19
11   2300000.0          19
12   1700000.0          18
13   2600000.0          16
14   2800000.0          15
15   1900000.0          15
16   1935000.0          13
17   1423000.0          11
18   3274000.0          10
19   2100000.0          10
20   2540000.0           9
21   5000000.0           9
22   2700000.0           9
23   3200000.0           9
24   3300000.0           9
25   2400000.0           8
26   2123500.0           7
27   4500000.0           6
28   4300000.0           6
29   1623500.0           5
30   1452000.0           5
31   3800000.0           5
32   1885000.0           5
33   2880000.0           4
34   3600000.0           4
35   3476000.0           3
3

In [5]:
df['salary'].describe()

count    1.208000e+03
mean     2.333511e+06
std      1.009999e+06
min      1.160000e+06
25%      1.527750e+06
50%      2.000000e+06
75%      2.800000e+06
max      8.898000e+06
Name: salary, dtype: float64

In [6]:
df['salary'].value_counts().sort_index()


salary
1160000.0      1
1300000.0     19
1309600.0      1
1350000.0      1
1400000.0      3
1420000.0      1
1423000.0     11
1423500.0    182
1425000.0      3
1425500.0      1
1432000.0      1
1432500.0      1
1445000.0      1
1449999.0      1
1450000.0      1
1452000.0      5
1457000.0      1
1460000.0      1
1463008.0      1
1467000.0      1
1468000.0      1
1473500.0      1
1494675.0      2
1500000.0     56
1508000.0      1
1513500.0      1
1520812.0      2
1521000.0      1
1530000.0      1
1544530.0      1
1549000.0      1
1549998.0      1
1550000.0      3
1553536.0      1
1554000.0      1
1558541.0      1
1566000.0      1
1568000.0      1
1571500.0      2
1573200.0      1
1589000.0      2
1594500.0      2
1599998.0      1
1600000.0     23
1609000.0      2
1610000.0      1
1614200.0      1
1620000.0      1
1623000.0      2
1623500.0      5
1629000.0      1
1630000.0      1
1639200.0      2
1650000.0      2
1667000.0      2
1670000.0      1
1676500.0      1
1680000.0      3
1695000

In [7]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler



# ====================
# Paso 1: Crear columnas adicionales de experiencia
# ====================
df['experiencia2'] = df['experience'] ** 2
df['experiencia_log'] = np.log1p(df['experience'])

# ====================
# Paso 2: Separar datos con y sin salario
# ====================
df_con_salario = df[df['salary'].notna()].copy()
df_sin_salario = df[df['salary'].isna()].copy()

# ====================
# Paso 3: Preparar variables
# ====================
target = 'salary'
y = np.log1p(df_con_salario[target])  # log(1 + salario)
X = df_con_salario.drop(columns=[target, 'url', 'title', 'company', 'rating', 'description', 'schedule'])

# Mismas columnas para el DataFrame sin salario
X_missing = df_sin_salario.drop(columns=['salary', 'url', 'title', 'company', 'rating', 'description', 'schedule'])

categorical_cols = X.select_dtypes(include='object').columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ====================
# Preprocesamiento
# ====================
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# ====================
# Pipeline
# ====================
rf = RandomForestRegressor(n_estimators=200, max_depth=None,
                           min_samples_split=5, min_samples_leaf=2,
                           random_state=42)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', SelectFromModel(estimator=rf, threshold='median')),
    ('regressor', rf)
])

# ====================
# Entrenamiento
# ====================
model_pipeline.fit(X, y)

# ====================
# Predicción para los que no tienen salario
# ====================
y_missing_log = model_pipeline.predict(X_missing)
df_sin_salario['salary_pred'] = np.expm1(y_missing_log)  # inversa de log1p

# ====================
# Unión final del DataFrame
# ====================
df_final = pd.concat([
    df_con_salario,
    df_sin_salario
], ignore_index=True)

# El salario estimado estará en 'salary_pred' para los que lo tenían vacío

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# División de los datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenamiento del modelo
model_pipeline.fit(X_train, y_train)

# Predicciones sobre el conjunto de prueba
y_pred_log = model_pipeline.predict(X_test)

# Métricas en escala logarítmica
mse_log = mean_squared_error(y_test, y_pred_log)
r2_log = r2_score(y_test, y_pred_log)

# Métricas en escala real (salario)
y_test_real = np.expm1(y_test)
y_pred_real = np.expm1(y_pred_log)

mse_real = mean_squared_error(y_test_real, y_pred_real)
r2_real = r2_score(y_test_real, y_pred_real)

print(f"--- Evaluación en escala logarítmica ---")
print(f"MSE (log): {mse_log:.4f}")
print(f"R² (log): {r2_log:.4f}")

print(f"\n--- Evaluación en escala real ---")
print(f"MSE (real): {mse_real:.2f}")
print(f"R² (real): {r2_real:.4f}")


--- Evaluación en escala logarítmica ---
MSE (log): 0.0781
R² (log): 0.4907

--- Evaluación en escala real ---
MSE (real): 825072274115.11
R² (real): 0.3807


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Suponiendo que 'df' ya existe antes

# Definir variables
categorical = ['contract', 'education', 'city', 'category', 'department', 'company']
binary_vars = ['gestion_planeacion', 'finanzas_contabilidad', 'auditoria_calidad',
               'recursos_humanos', 'logistica_operaciones', 'desarrollo_tecnologia_empresarial',
               'lenguajes_bases_datos', 'bi_visualizacion', 'sistemas_operativos', 'microsoft_office',
               'erp_crm', 'gis_cad_diseno', 'web_programacion', 'adobe_suite', 'otros_tech',
               'big_data_procesamiento', 'redes', 'personales', 'interpersonales', 'comerciales_servicio',
               'sectores_especificos', 'otros_operativos', 'idiomas']
numerical = ['experience'] + binary_vars

# Eliminar filas con nulos solo para modelar, sin tocar el df original
df_modelo = df.dropna(subset=['salary', 'experience'])

# Variables predictoras y objetivo (sin eliminar columnas del df original)
X = df_modelo[categorical + numerical]
y = df_modelo['salary']

# Preprocesamiento
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
], remainder='passthrough')  # mantiene numéricas y binarias

# División
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelos a evaluar
modelos = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'KNeighbors': KNeighborsRegressor(),
    'SVR': SVR()
}

# Evaluación
resultados = []
for nombre, modelo in modelos.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', modelo)
    ])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    resultados.append({'Modelo': nombre, 'MSE': mse, 'R2': r2})

# Mostrar resultados
df_resultados = pd.DataFrame(resultados).sort_values(by='R2', ascending=False)
print(df_resultados)


  model = cd_fast.sparse_enet_coordinate_descent(


             Modelo           MSE        R2
4      RandomForest  7.815920e+11  0.413338
5  GradientBoosting  8.527436e+11  0.359932
6        KNeighbors  9.048840e+11  0.320795
3      DecisionTree  9.653607e+11  0.275402
2             Lasso  9.898431e+11  0.257025
1             Ridge  1.334565e+12 -0.001723
0  LinearRegression  1.334565e+12 -0.001723
7               SVR  1.470968e+12 -0.104106
