# ABC Corporation

## Imports y carga de archivos

In [1]:
# importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
import re

In [2]:
# cargar datos
df_abc = pd.read_csv('files/hr.csv')

## Columnas

- age: Edad del empleado.
- attrition: Indica si el empleado dejó la empresa (Yes/No).
- business_travel: Frecuencia de viajes laborales (por ejemplo, Travel_Rarely, Travel_Frequently).
- daily_rate: Tarifa diaria estimada basada en el salario.
- department: Departamento en el que trabaja el empleado.
- distance_from_home: Distancia desde el hogar hasta el lugar de trabajo.
- education: Nivel educativo del empleado representado en una escala numérica.
- education_field: Campo o área de estudios del empleado.
- employee_count: Valor constante (normalmente 1), que indica un empleado por registro.
- employee_number: Identificador único del empleado.
- environment_satisfaction: Nivel de satisfacción con el entorno laboral.
- gender: Género del empleado.
- hourly_rate: Tarifa por hora estimada.
- job_involvement: Nivel de implicación del empleado en su trabajo.
- job_level: Nivel jerárquico del puesto.
- job_role: Rol o puesto específico del empleado.
- job_satisfaction: Nivel de satisfacción con el trabajo.
- marital_status: Estado civil del empleado.
- monthly_income: Ingreso mensual estimado.
- monthly_rate: Tarifa mensual estimada.
- num_companies_worked: Número de empresas en las que ha trabajado previamente.
- over18: Indica si el empleado es mayor de 18 años.
- over_time: Indica si el empleado realiza horas extra (Yes/No).
- percent_salaryHike: Porcentaje de incremento salarial.
- performance_rating: Evaluación del desempeño del empleado.
- relationship_satisfaction: Nivel de satisfacción con las relaciones laborales.
- standard_hours: Horas estándar de trabajo.
- stock_option_level: Nivel de opciones sobre acciones asignadas.
- total_working_years: Total de años de experiencia laboral.
- training_times_last_year: Número de formaciones recibidas en el último año.
- work_life_balance: Nivel de equilibrio entre vida personal y laboral.
- years_at_company: Años que el empleado lleva en la empresa.
- years_in_current_role: Años en el rol actual.
- years_since_last_promotion: Años transcurridos desde la última promoción.
- years_with_curr_manager: Años trabajando con el gerente actual

## Exploración inicial

In [6]:
df_abc.drop_duplicates()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80.0,0,8,0.0,1,6,4,0,5.0
1,49.0,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,,1,10,3.0,3,10,7,1,7.0
2,37.0,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,,0,7,3.0,3,0,0,0,0.0
3,33.0,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80.0,0,8,3.0,3,8,7,3,0.0
4,27.0,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80.0,1,6,3.0,3,2,2,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36.0,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80.0,1,17,3.0,3,5,2,0,3.0
1466,39.0,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80.0,1,9,5.0,3,7,7,1,7.0
1467,27.0,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80.0,1,6,0.0,3,6,2,0,3.0
1468,49.0,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80.0,0,17,3.0,2,9,6,0,8.0


In [9]:
df_abc.columns = [re.sub(r'(?<!^)([A-Z])', r'_\1', col).lower() for col in df_abc.columns]

In [None]:
print(sorted(df_abc['age'].unique()))

[18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 56.0, 59.0, nan, 55.0, 57.0, 58.0, 60.0]


In [13]:
df_abc['attrition'].unique()

array(['Yes', 'No'], dtype=object)

In [None]:
df_abc['business_travel'].unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel', nan],
      dtype=object)

In [16]:
df_abc['daily_rate'].nunique()

886

In [17]:
df_abc['department'].unique()

array(['Sales', 'Research & Development', nan, 'Human Resources'],
      dtype=object)

In [42]:
print(sorted(df_abc['distance_from_home'].unique()))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [20]:
df_abc['education'].unique()

array([2, 1, 4, 3, 5], dtype=int64)

In [21]:
df_abc['education_field'].unique()

array(['Life Sciences', 'Other', 'Medical', nan, 'Marketing',
       'Technical Degree', 'Human Resources'], dtype=object)

In [22]:
df_abc['employee_count'].unique()

array([1], dtype=int64)

In [23]:
df_abc['employee_number'].nunique()

1470

In [24]:
df_abc['environment_satisfaction'].unique()

array([2, 3, 4, 1], dtype=int64)

In [25]:
df_abc['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [40]:
print(sorted(df_abc['hourly_rate'].unique()))

[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [28]:
df_abc['job_involvement'].unique()

array([3, 2, 4, 1], dtype=int64)

In [29]:
df_abc['job_level'].unique()

array([2, 1, 3, 4, 5], dtype=int64)

In [30]:
df_abc['job_role'].unique()

array([' sALES eXECUTIVE ', ' rESEARCH sCIENTIST ',
       ' lABORATORY tECHNICIAN ', ' mANUFACTURING dIRECTOR ',
       ' hEALTHCARE rEPRESENTATIVE ', ' mANAGER ',
       ' sALES rEPRESENTATIVE ', ' rESEARCH dIRECTOR ',
       ' hUMAN rESOURCES '], dtype=object)

In [32]:
df_abc['job_satisfaction'].unique()

array([ 4.,  2.,  3.,  1., nan])

In [33]:
df_abc['marital_status'].unique()

array(['Single', 'Married', 'Divorced', nan, 'Marreid'], dtype=object)

In [34]:
df_abc['monthly_income'].nunique()

1336

In [35]:
df_abc['monthly_rate'].nunique()

1427

In [38]:
sorted(df_abc['num_companies_worked'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [43]:
df_abc['over18'].unique()

array(['Y'], dtype=object)

In [44]:
df_abc['over_time'].unique()

array(['Yes', 'No', nan], dtype=object)

In [46]:
sorted(df_abc['percent_salary_hike'].unique())

[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]

In [47]:
df_abc['performance_rating'].unique()

array([3, 4], dtype=int64)

In [48]:
df_abc['relationship_satisfaction'].unique()

array([1, 4, 2, 3], dtype=int64)

In [49]:
df_abc['standard_hours'].unique()

array([80., nan])

In [50]:
df_abc['stock_option_level'].unique()

array([0, 1, 3, 2], dtype=int64)

In [None]:
print(sorted(df_abc['total_working_years'].unique()))
# no hay 39

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40]


In [53]:
df_abc['training_times_last_year'].unique()

array([ 0.,  3.,  2.,  5.,  1.,  4., nan,  6.])

In [54]:
df_abc['work_life_balance'].unique()

array([1, 3, 2, 4], dtype=int64)

In [60]:
print(sorted(df_abc['years_at_company'].unique()))
# no hay 28, 35, 38, 39

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 36, 37, 40]


In [58]:
sorted(df_abc['years_in_current_role'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [62]:
sorted(df_abc['years_since_last_promotion'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [65]:
print(sorted(df_abc['years_with_curr_manager'].unique()))

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, nan, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0]
