In [2]:
# librerias y configuración

import pandas as pd
from pycaret.regression import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV

In [2]:
# cargar datos

train = pd.read_csv('../data/train.csv')

train.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2,3,2,17,140250,47,100,41,1
1,2,3,2,17,135000,47,100,41,1
2,1,2,2,5,100000,47,100,41,1
3,1,2,0,36,270000,47,100,41,0
4,1,2,2,17,26005,40,0,41,0


In [3]:
# se usa pycaret para comparar los distintos modelos de regresión

reg = setup(data=train, target='salary_in_usd', session_id=1) 

Unnamed: 0,Description,Value
0,Session id,1
1,Target,salary_in_usd
2,Target type,Regression
3,Original data shape,"(500, 9)"
4,Transformed data shape,"(500, 9)"
5,Transformed train set shape,"(350, 9)"
6,Transformed test set shape,"(150, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [4]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,33706.0139,2592708265.056,49043.6121,0.5013,0.491,0.4754,0.111
gbr,Gradient Boosting Regressor,34502.9183,2800782851.7951,51239.5261,0.4531,0.5004,0.4657,0.034
rf,Random Forest Regressor,35807.7761,2935108314.1632,52251.2469,0.4305,0.5114,0.496,0.042
et,Extra Trees Regressor,37979.5177,3056052767.4477,53918.2314,0.3821,0.5157,0.522,0.038
ada,AdaBoost Regressor,40720.1828,3255120796.3956,55707.6558,0.3394,0.6089,0.7338,0.022
en,Elastic Net,41318.1934,3566384283.7048,57967.4447,0.3032,0.6712,0.8554,0.015
ridge,Ridge Regression,41326.1484,3600234041.6571,58009.0511,0.3024,0.6707,0.8113,0.015
lr,Linear Regression,41347.0671,3608687029.1892,58083.3097,0.3008,0.6727,0.8114,0.361
lasso,Lasso Regression,41347.0828,3608681464.8822,58083.2038,0.3008,0.6727,0.8115,0.016
lar,Least Angle Regression,41347.0671,3608687029.1892,58083.3097,0.3008,0.6727,0.8114,0.014


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

LGBMRegressor(random_state=1)

In [3]:
# de entre los 5 mejores modelos según pycaret, se cambian sus hiperparametros y se comparan sus RMSE

df = pd.read_csv("../data/train.csv")

X = df.drop("salary_in_usd", axis=1) 
y = df["salary_in_usd"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
# Gradient Boosting Regressor

param_grid_gbr = {
'n_estimators': [50, 100, 150],
'max_depth': [3, 5, 7],
'learning_rate': [0.1, 0.01, 0.001]
}

gbr_grid = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gbr, cv=3, n_jobs=-1)
gbr_grid.fit(X_train, y_train)
y_pred_gbr = gbr_grid.predict(X_test)
rmse_gbr = mean_squared_error(y_test, y_pred_gbr, squared=False)
print("Best RMSE for Gradient Boosting Regressor:", rmse_gbr)
print("Best parameters for Gradient Boosting Regressor:", gbr_grid.best_params_)

Best RMSE for Gradient Boosting Regressor: 46570.50685114079
Best parameters for Gradient Boosting Regressor: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [None]:
# Random Forest Regressor



In [None]:
# Extra Trees Regressor



In [4]:
# Light Gradient Boosting Machine

param_grid_lgbm = {
'num_leaves': [31, 50, 100],
'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150],
'max_depth': [3, 5, 7]
}

lgbm_grid = GridSearchCV(lgbm.LGBMRegressor(random_state=42), param_grid_lgbm, cv=3, n_jobs=-1)
lgbm_grid.fit(X_train, y_train)
y_pred_lgbm = lgbm_grid.predict(X_test)
rmse_lgbm = mean_squared_error(y_test, y_pred_lgbm, squared=False)
print("Best RMSE for LGBM Regressor:", rmse_lgbm)
print("Best parameters for LGBM Regressor:", lgbm_grid.best_params_)

Best RMSE for LGBM Regressor: 47511.86490370402
Best parameters for LGBM Regressor: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'num_leaves': 31}


In [None]:
# AdaBoost Regressor



In [None]:
# se entrena el mejor modelo (LightGBM) con todo train y se predice test

df = pd.read_csv("../data/train.csv")

X_train = df.drop("salary_in_usd", axis=1)
y_train = df["salary_in_usd"]
X_test = pd.read_csv('../data/test.csv')

params = {
'learning_rate': 0.1,
'max_depth': 5,
'n_estimators': 150,
'num_leaves': 31,
}

lgbm_model = lgbm.LGBMRegressor(**params)

lgbm_model.fit(X_train, y_train)

y_pred_lgbm = lgbm_model.predict(X_test)

muestra = pd.DataFrame()

muestra['id'] = range(len(y_pred_lgbm))
muestra['salary_in_usd'] = pd.DataFrame(y_pred_lgbm)

muestra.to_csv('../data/muestra.csv', index=False)