In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from sklearn.preprocessing import LabelEncoder

In [19]:
data = pd.read_csv('https://raw.githubusercontent.com/juankquintana/prediccion_salarios/main/Models/Classification/data_top10.csv')
print(data.head(3))

  experience_level employment_type                  job_title  salary_in_usd  \
0               MI              FT             Data Scientist         111740   
1               SE              FT             Data Scientist         115000   
2               SE              FT  Machine Learning Engineer         112000   

  employee_residence  remote_ratio company_location company_size  
0      North America             0    North America            M  
1      North America           100    North America            M  
2      North America             0    North America            M  


In [20]:
# función para clasificar rangos de salarios
def classify_salary(salary):
    if salary < 75000:
        return 'Bajo'
    elif 75000 <= salary < 120000:
        return 'Medio_Bajo'
    elif 120000 <= salary < 180000:
        return 'Medio_Alto'
    else:
        return 'Alto'

In [21]:
# Mapeo Opcion 1 salary_class
data['salary_class'] = data['salary_in_usd'].apply(classify_salary)
data.head(3)

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,salary_class
0,MI,FT,Data Scientist,111740,North America,0,North America,M,Medio_Bajo
1,SE,FT,Data Scientist,115000,North America,100,North America,M,Medio_Bajo
2,SE,FT,Machine Learning Engineer,112000,North America,0,North America,M,Medio_Bajo


In [22]:
data = data.drop('salary_in_usd', axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11193 entries, 0 to 11192
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   experience_level    11193 non-null  object
 1   employment_type     11193 non-null  object
 2   job_title           11193 non-null  object
 3   employee_residence  11193 non-null  object
 4   remote_ratio        11193 non-null  int64 
 5   company_location    11193 non-null  object
 6   company_size        11193 non-null  object
 7   salary_class        11193 non-null  object
dtypes: int64(1), object(7)
memory usage: 699.7+ KB


In [23]:
# Codificar variables categóricas
codificacion = {}
categorical_cols = ['experience_level', 'employment_type', 'job_title', 
                    'employee_residence', 'company_location', 'company_size']

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    codificacion[col] = le 

In [24]:
codificacion

{'experience_level': LabelEncoder(),
 'employment_type': LabelEncoder(),
 'job_title': LabelEncoder(),
 'employee_residence': LabelEncoder(),
 'company_location': LabelEncoder(),
 'company_size': LabelEncoder()}

In [25]:
data.head(5)

Unnamed: 0,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size,salary_class
0,2,1,4,3,0,3,1,Medio_Bajo
1,3,1,4,3,100,3,1,Medio_Bajo
2,3,1,6,3,0,3,1,Medio_Bajo
3,3,1,6,3,100,3,1,Alto
4,3,1,4,3,0,3,1,Alto


In [31]:
X = data.drop('salary_class', axis=1)
Y = data['salary_class']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [33]:
model = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.05,
    n_estimators=100,
    random_state=42
)

In [34]:
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35
[LightGBM] [Info] Number of data points in the train set: 8954, number of used features: 7
[LightGBM] [Info] Start training from score -1.133616
[LightGBM] [Info] Start training from score -2.216393
[LightGBM] [Info] Start training from score -1.101520
[LightGBM] [Info] Start training from score -1.440684


In [35]:
# Hacer predicciones y evaluar
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.42
