In [74]:
import numpy as np
import pandas as pd

import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin

In [75]:
# Load the salaries dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv('salaries_final.csv')

In [76]:
# Gradient-boosting regressor with explicitly spelled-out hyperparameters.
model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
)

In [77]:
# Separate the regression target from the remaining feature columns.
y = data['salary_in_usd']
X = data.drop(columns='salary_in_usd')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Show the loaded DataFrame as this cell's output.
data

Unnamed: 0,experience_level,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,SE,68,85847,ES,Remoto,ES,L
1,SE,36,175000,CA,Remoto,CA,M
2,SE,36,120000,CA,Remoto,CA,M
3,SE,6,222200,US,Presencial,US,L
4,SE,6,136000,US,Presencial,US,L
...,...,...,...,...,...,...,...
2900,MI,36,48000,RU,Remoto,US,S
2901,SE,72,144000,US,Hibrido,US,L
2902,EN,44,100000,US,Hibrido,US,L
2903,EN,24,120000,US,Remoto,US,M


In [79]:
# Names of the categorical feature columns.
variables_categoricas = [
    'experience_level',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
]
# Single-element list holding the job-title column name.
job_title = [
    'job_title',
]

In [80]:
# Wrapper class that lets LabelEncoder run as a feature transformer
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Label-encode a single feature column with the transformer API.

    ``LabelEncoder`` works on 1-D data, while pipeline-style callers invoke
    ``fit(X, y)`` / ``transform(X)`` with the features in ``X``. This wrapper
    therefore encodes ``X`` (not ``y``) and returns a 2-D column so the result
    can be stacked next to the output of other transformers.
    """

    def __init__(self):
        self.encoder = LabelEncoder()

    @staticmethod
    def _as_1d(X):
        """Return ``X`` as a 1-D array, accepting a 1-D input or a single 2-D column.

        Raises:
            ValueError: if ``X`` is not 1-D and is not a 2-D input with exactly
                one column.
        """
        values = np.asarray(X)
        if values.ndim == 2 and values.shape[1] == 1:
            values = values[:, 0]
        if values.ndim != 1:
            raise ValueError(
                'LabelEncoderTransformer expects exactly one column, '
                f'got an input of shape {values.shape}'
            )
        return values

    def fit(self, X, y=None):
        """Learn the label mapping from the feature column ``X``.

        Args:
            X: 1-D sequence, or 2-D array/DataFrame with exactly one column.
            y: ignored; accepted only for API compatibility.

        Returns:
            self, to allow call chaining.
        """
        self.encoder.fit(self._as_1d(X))
        return self

    def transform(self, X, y=None):
        """Encode the feature column ``X`` with the mapping learned in ``fit``.

        Args:
            X: 1-D sequence, or 2-D array/DataFrame with exactly one column.
            y: ignored; accepted only for API compatibility.

        Returns:
            Array of shape (n_samples, 1) holding the encoded labels.
        """
        # Reshape to a column: callers that stack transformer outputs need 2-D results.
        return self.encoder.transform(self._as_1d(X)).reshape(-1, 1)


In [81]:
# One-hot encode the categorical features. handle_unknown='ignore' makes a
# category that was absent from the fitting data encode as all zeros instead
# of raising at transform/predict time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the columns listed in variables_categoricas are routed to a transformer.
# NOTE(review): 'job_title' is not listed in any transformer, so with
# ColumnTransformer's default remainder='drop' it never reaches the model.
# Confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, variables_categoricas)
    ])

# Chain preprocessing and the estimator. The final step is named 'classifier'
# although `model` is a regressor; the name is kept so lookups by step name
# continue to work.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])


In [82]:
# Fit the preprocessing + model pipeline on the training split.
pipeline.fit(X_train, y_train)

In [83]:
# Predict on the held-out split.
y_pred = pipeline.predict(X_test)

# Regression metrics; RMSE is derived from MSE so both are on record.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One metric per output line.
print(f'R2: {r2}', f'RMSE: {rmse}', f'MAE: {mae}', sep='\n')

R2: 0.29724673078565433
RMSE: 38170.5783265751
MAE: 31087.231950367124


In [None]:
# Serialize the pipeline object to 'pipeline.joblib' in the working directory.
joblib.dump(pipeline, 'pipeline.joblib')