In [1]:
# librerias y configuración

import pandas as pd
from pycaret.regression import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor, BayesianRidge, Ridge, OrthogonalMatchingPursuit

In [2]:
# Load the training data from the CSV file into `train`.

train = pd.read_csv('../data/train.csv')

In [3]:
# PyCaret is used to compare the different regression models.
# The experiment is configured on `train` with `salary_in_usd` as the target.
# NOTE(review): session_id=1 presumably fixes the random seed — confirm against the PyCaret docs.

reg = setup(data=train, target='salary_in_usd', session_id=1)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,salary_in_usd
2,Target type,Regression
3,Original data shape,"(500, 166)"
4,Transformed data shape,"(500, 166)"
5,Transformed train set shape,"(350, 166)"
6,Transformed test set shape,"(150, 166)"
7,Numeric features,165
8,Preprocess,True
9,Imputation type,simple


In [4]:
# Run PyCaret's model comparison on the experiment configured by setup();
# as the last expression of the cell, its result is shown as the cell output.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,32056.426,2550248006.1688,48344.1074,0.5165,0.4722,0.4477,0.031
gbr,Gradient Boosting Regressor,32618.3871,2592625167.5851,48589.628,0.513,0.4813,0.4596,0.059
br,Bayesian Ridge,33653.6982,2512855895.798,48513.5857,0.5114,0.4995,0.5071,0.029
ridge,Ridge Regression,34356.0714,2589983966.8475,49328.8218,0.4945,0.5305,0.5264,0.021
omp,Orthogonal Matching Pursuit,34608.4644,2604060257.1004,49515.2999,0.4879,0.5192,0.513,0.022
rf,Random Forest Regressor,33804.6728,2788763122.1167,50653.5264,0.4661,0.4943,0.4736,0.055
llar,Lasso Least Angle Regression,35461.5959,2781444557.4716,50843.8217,0.4623,0.5521,0.5725,0.028
knn,K Neighbors Regressor,37062.0063,3062215999.0895,53102.8744,0.4102,0.5557,0.6164,0.027
par,Passive Aggressive Regressor,35286.2005,3216335125.1718,54856.1633,0.3762,0.5154,0.5012,0.027
lightgbm,Light Gradient Boosting Machine,39292.6493,3094062809.5232,54300.6725,0.37,0.5698,0.6039,0.031


In [5]:
# For the top 5 models according to PyCaret, the hyperparameters are tuned and
# the resulting RMSE values are compared. This cell prepares the shared
# train / hold-out split used by the cells that follow.

df = pd.read_csv("../data/train.csv")

# Target column on one side, every remaining column as features on the other.
y = df["salary_in_usd"]
X = df.drop(columns="salary_in_usd")

# 80/20 split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [6]:
# Huber Regressor: grid-search `epsilon` and `alpha` with 3-fold CV on the
# training split, then report the RMSE of the search's predictions on the
# held-out split.
# NOTE(review): no `scoring` is passed to GridSearchCV, so candidates are ranked
# by the estimator's default score rather than by RMSE — confirm this is intended.

param_grid_hr = {
    'epsilon': [1.35, 1.5, 1.75],
    'alpha': [0.0001, 0.001, 0.01]
}

hr_grid = GridSearchCV(HuberRegressor(), param_grid_hr, cv=3, n_jobs=-1)
hr_grid.fit(X_train, y_train)
y_pred_hr = hr_grid.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_hr = mean_squared_error(y_test, y_pred_hr) ** 0.5
print("Best RMSE for Huber Regressor:", rmse_hr)
print("Best parameters for Huber Regressor:", hr_grid.best_params_)

Best RMSE for Huber Regressor: 43746.017800068905
Best parameters for Huber Regressor: {'alpha': 0.0001, 'epsilon': 1.75}


In [7]:
# Gradient Boosting Regressor: grid-search the number of trees, tree depth and
# learning rate with 3-fold CV, then report the RMSE on the held-out split.
# NOTE(review): no `scoring` is passed to GridSearchCV, so candidates are ranked
# by the estimator's default score rather than by RMSE — confirm this is intended.

param_grid_gbr = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# random_state is fixed on the estimator so repeated runs build the same trees.
gbr_grid = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gbr, cv=3, n_jobs=-1)
gbr_grid.fit(X_train, y_train)
y_pred_gbr = gbr_grid.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_gbr = mean_squared_error(y_test, y_pred_gbr) ** 0.5
print("Best RMSE for Gradient Boosting Regressor:", rmse_gbr)
print("Best parameters for Gradient Boosting Regressor:", gbr_grid.best_params_)

Best RMSE for Gradient Boosting Regressor: 43958.950400169655
Best parameters for Gradient Boosting Regressor: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}


In [8]:
# Bayesian Ridge: grid-search the four prior hyperparameters (alpha_1, alpha_2,
# lambda_1, lambda_2) with 3-fold CV, then report the RMSE on the held-out split.
# NOTE(review): no `scoring` is passed to GridSearchCV, so candidates are ranked
# by the estimator's default score rather than by RMSE — confirm this is intended.

param_grid_br = {
    'alpha_1': [1e-7, 1e-6, 1e-5],
    'alpha_2': [1e-7, 1e-6, 1e-5],
    'lambda_1': [1e-7, 1e-6, 1e-5],
    'lambda_2': [1e-7, 1e-6, 1e-5],
}


br = BayesianRidge()
br_grid = GridSearchCV(br, param_grid_br, cv=3, n_jobs=-1)
br_grid.fit(X_train, y_train)
y_pred_br = br_grid.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_br = mean_squared_error(y_test, y_pred_br) ** 0.5
print("Best RMSE for Bayesian Ridge:", rmse_br)
print("Best parameters for Bayesian Ridge:", br_grid.best_params_)

Best RMSE for Bayesian Ridge: 46827.6094342724
Best parameters for Bayesian Ridge: {'alpha_1': 1e-07, 'alpha_2': 1e-07, 'lambda_1': 1e-06, 'lambda_2': 1e-05}


In [9]:
# Ridge Regression: grid-search the regularisation strength, solver and
# iteration cap with 3-fold CV, then report the RMSE on the held-out split.
# NOTE(review): no `scoring` is passed to GridSearchCV, so candidates are ranked
# by the estimator's default score rather than by RMSE — confirm this is intended.

param_grid_rr = {
    'alpha': [0.1, 1, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'max_iter': [100, 500, 1000],
}

rr = Ridge(random_state=42)
rr_grid = GridSearchCV(rr, param_grid_rr, cv=3, n_jobs=-1)
rr_grid.fit(X_train, y_train)
y_pred_rr = rr_grid.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_rr = mean_squared_error(y_test, y_pred_rr) ** 0.5
print("Best RMSE for Ridge Regression:", rmse_rr)
print("Best parameters for Ridge Regression:", rr_grid.best_params_)

Best RMSE for Ridge Regression: 45873.02566418878
Best parameters for Ridge Regression: {'alpha': 10, 'max_iter': 100, 'solver': 'sag'}


In [10]:
# Orthogonal Matching Pursuit: grid-search the number of non-zero coefficients,
# the intercept flag and the normalize flag with 3-fold CV, then report the RMSE
# on the held-out split.
# NOTE(review): no `scoring` is passed to GridSearchCV, so candidates are ranked
# by the estimator's default score rather than by RMSE — confirm this is intended.
# NOTE(review): the `normalize` option appears to have been deprecated and later
# removed from this estimator in newer scikit-learn releases — verify against
# the installed version before re-running.

param_grid_omp = {
    'n_nonzero_coefs': [5, 10, 15],
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

omp_grid = GridSearchCV(OrthogonalMatchingPursuit(), param_grid_omp, cv=3, n_jobs=-1)
omp_grid.fit(X_train, y_train)
y_pred_omp = omp_grid.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_omp = mean_squared_error(y_test, y_pred_omp) ** 0.5
print("Best RMSE for Orthogonal Matching Pursuit:", rmse_omp)
print("Best parameters for Orthogonal Matching Pursuit:", omp_grid.best_params_)

Best RMSE for Orthogonal Matching Pursuit: 45614.654753841736
Best parameters for Orthogonal Matching Pursuit: {'fit_intercept': True, 'n_nonzero_coefs': 10, 'normalize': False}


In [11]:
# The parameters of the best model (Huber Regressor) are adjusted by hand to
# squeeze out the lowest RMSE on the held-out split.
# NOTE(review): the RMSE below is measured on the same X_test / y_test split the
# values are being tuned against, so it is likely optimistic — confirm before
# treating it as an unbiased estimate.
# This cell rebinds y_pred_hr and rmse_hr from the grid-search cell.

hr_model = HuberRegressor(epsilon=1.54, alpha=0.0001, fit_intercept=True)
hr_model.fit(X_train, y_train)
y_pred_hr = hr_model.predict(X_test)
# Take the square root of the MSE explicitly instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecate and then remove.
rmse_hr = mean_squared_error(y_test, y_pred_hr) ** 0.5
print("Best RMSE for Huber Regressor:", rmse_hr)

Best RMSE for Huber Regressor: 43436.89388992864


In [12]:
# The chosen model is refitted on the whole training file and used to predict
# the test file; the predictions are written out as a sample CSV.
# Note: this cell rebinds X_train, y_train and X_test, which earlier held the
# 80/20 split — after running it, y_test no longer corresponds to X_test.

# Load both data files.
df = pd.read_csv("../data/train.csv")
X_test = pd.read_csv('../data/test.csv')

# Separate the target from the features of the full training frame.
y_train = df["salary_in_usd"]
X_train = df.drop(columns="salary_in_usd")

# Define the model with the hand-picked parameters and train it.
hr_model2 = HuberRegressor(epsilon=1.54, alpha=0.0001, fit_intercept=True)
hr_model2.fit(X_train, y_train)

# Predict the test file.
y_pred_hr2 = hr_model2.predict(X_test)

# Assemble the sample file: a running 0-based id next to each prediction.
muestra = pd.DataFrame({
    'id': range(len(y_pred_hr2)),
    'salary_in_usd': y_pred_hr2,
})
muestra.to_csv('../data/muestra.csv', index=False)