# ABC Corporation

## Imports y carga de archivos

In [1]:
# importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
import re

In [2]:
# Load the HR dataset from a CSV file (the path is relative to the working directory)
df_abc = pd.read_csv('../files/hr.csv')

## Columnas

- age: Edad del empleado.
- attrition: Indica si el empleado dejó la empresa (Yes/No).
- business_travel: Frecuencia de viajes laborales (por ejemplo, Travel_Rarely, Travel_Frequently).
- daily_rate: Tarifa diaria estimada basada en el salario.
- department: Departamento en el que trabaja el empleado.
- distance_from_home: Distancia desde el hogar hasta el lugar de trabajo.
- education: Nivel educativo del empleado representado en una escala numérica.
- education_field: Campo o área de estudios del empleado.
- employee_count: Valor constante (normalmente 1), que indica un empleado por registro.
- employee_number: Identificador único del empleado.
- environment_satisfaction: Nivel de satisfacción con el entorno laboral.
- gender: Género del empleado.
- hourly_rate: Tarifa por hora estimada.
- job_involvement: Nivel de implicación del empleado en su trabajo.
- job_level: Nivel jerárquico del puesto.
- job_role: Rol o puesto específico del empleado.
- job_satisfaction: Nivel de satisfacción con el trabajo.
- marital_status: Estado civil del empleado.
- monthly_income: Ingreso mensual estimado.
- monthly_rate: Tarifa mensual estimada.
- num_companies_worked: Número de empresas en las que ha trabajado previamente.
- over18: Indica si el empleado es mayor de 18 años.
- over_time: Indica si el empleado realiza horas extra (Yes/No).
- percent_salary_hike: Porcentaje de incremento salarial.
- performance_rating: Evaluación del desempeño del empleado.
- relationship_satisfaction: Nivel de satisfacción con las relaciones laborales.
- standard_hours: Horas estándar de trabajo.
- stock_option_level: Nivel de opciones sobre acciones asignadas.
- total_working_years: Total de años de experiencia laboral.
- training_times_last_year: Número de formaciones recibidas en el último año.
- work_life_balance: Nivel de equilibrio entre vida personal y laboral.
- years_at_company: Años que el empleado lleva en la empresa.
- years_in_current_role: Años en el rol actual.
- years_since_last_promotion: Años transcurridos desde la última promoción.
- years_with_curr_manager: Años trabajando con el gerente actual.

## Exploración inicial

In [3]:
# Show the first rows of the DataFrame
df_abc.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80.0,0,8,0.0,1,6,4,0,5.0
1,49.0,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,,1,10,3.0,3,10,7,1,7.0
2,37.0,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,,0,7,3.0,3,0,0,0,0.0
3,33.0,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80.0,0,8,3.0,3,8,7,3,0.0
4,27.0,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80.0,1,6,3.0,3,2,2,2,2.0


In [4]:
# Draw 5 random rows; the fixed random_state keeps the sample reproducible across re-runs
df_abc.sample(5, random_state=42)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1166,48.0,No,Travel_Frequently,365,Research & Development,4,5,Medical,1,1644,...,2,80.0,1,23,3.0,3,2,2,2,2.0
1150,35.0,No,Travel_Rarely,819,Research & Development,18,5,Life Sciences,1,1621,...,4,80.0,0,16,2.0,3,16,15,1,10.0
481,34.0,No,,254,Research & Development,1,2,Life Sciences,1,649,...,4,80.0,1,6,3.0,3,6,5,1,3.0
390,43.0,No,Travel_Rarely,982,Research & Development,12,3,Life Sciences,1,520,...,3,80.0,1,25,3.0,3,25,10,3,
1425,33.0,No,Travel_Rarely,501,,15,2,Medical,1,2009,...,1,,1,10,6.0,3,9,7,8,1.0


In [5]:
# Show the last rows of the DataFrame
df_abc.tail()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1469,34.0,No,Travel_Rarely,628,Research & Development,8,3,Medical,1,2068,...,1,,0,6,3.0,4,4,3,1,2.0
1470,28.0,No,Travel_Rarely,866,Sales,5,3,Medical,1,1469,...,4,,0,6,4.0,3,5,4,1,
1471,53.0,No,Travel_Rarely,1084,Research & Development,13,2,Medical,1,250,...,3,,2,5,3.0,3,4,2,1,3.0
1472,24.0,Yes,Travel_Rarely,240,Human Resources,22,1,Human Resources,1,1714,...,3,80.0,1,1,2.0,3,1,0,0,0.0
1473,45.0,No,Travel_Rarely,1339,Research & Development,7,3,Life Sciences,1,86,...,3,,1,25,2.0,3,1,0,0,0.0


In [6]:
# Show the size of the DataFrame as (rows, columns)
df_abc.shape

(1474, 35)

In [7]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
df_abc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1474 entries, 0 to 1473
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1401 non-null   float64
 1   Attrition                 1474 non-null   object 
 2   BusinessTravel            1357 non-null   object 
 3   DailyRate                 1474 non-null   int64  
 4   Department                1445 non-null   object 
 5   DistanceFromHome          1474 non-null   int64  
 6   Education                 1474 non-null   int64  
 7   EducationField            1416 non-null   object 
 8   EmployeeCount             1474 non-null   int64  
 9   EmployeeNumber            1474 non-null   int64  
 10  EnvironmentSatisfaction   1474 non-null   int64  
 11  Gender                    1474 non-null   object 
 12  HourlyRate                1474 non-null   int64  
 13  JobInvolvement            1474 non-null   int64  
 14  JobLevel

In [8]:
# Flag every row that is an exact repeat of an earlier row (True = duplicate)
df_abc.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1469    False
1470     True
1471     True
1472     True
1473     True
Length: 1474, dtype: bool

In [11]:
# List the original column names before any cleaning.
# df_abc is used here because the working copy is only created in a later cell;
# referencing it at this point would fail on a fresh kernel.
df_abc.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

## Limpieza

Se detectaron 4 filas duplicadas completas al final del conjunto de datos. Dado que cada registro representa un empleado único y no existe una dimensión temporal en el dataset, estos duplicados no aportan información adicional y se consideran un artefacto del proceso de carga de datos. Por ello, se procedió a su eliminación.

In [10]:
# Remove fully duplicated rows.
# drop_duplicates() returns a new DataFrame rather than modifying its caller,
# so the result has to be assigned; otherwise the duplicates silently remain.
# .copy() makes the working frame independent of the raw df_abc.
df_abc_copy = df_abc.drop_duplicates().copy()
# Sanity check: no exact duplicate rows may remain after the clean-up
assert not df_abc_copy.duplicated().any()
df_abc_copy.head().T

Unnamed: 0,0,1,2,3,4
Age,41.0,49.0,37.0,33.0,27.0
Attrition,Yes,No,Yes,No,No
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EmployeeNumber,1,2,4,5,7


In [12]:
# Convert CamelCase column labels to snake_case: insert "_" before every capital
# letter that is not the first character, then lower-case the whole label
df_abc_copy.columns = df_abc_copy.columns.map(
    lambda nombre: re.sub(r'(?<!^)([A-Z])', r'_\1', nombre).lower()
)
df_abc_copy.columns

Index(['age', 'attrition', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_count',
       'employee_number', 'environment_satisfaction', 'gender', 'hourly_rate',
       'job_involvement', 'job_level', 'job_role', 'job_satisfaction',
       'marital_status', 'monthly_income', 'monthly_rate',
       'num_companies_worked', 'over18', 'over_time', 'percent_salary_hike',
       'performance_rating', 'relationship_satisfaction', 'standard_hours',
       'stock_option_level', 'total_working_years', 'training_times_last_year',
       'work_life_balance', 'years_at_company', 'years_in_current_role',
       'years_since_last_promotion', 'years_with_curr_manager'],
      dtype='object')

In [14]:
# Drop columns that are not needed for the analysis.
# employee_count is kept on purpose: it makes counting employees easier.
df_abc_copy = df_abc_copy.drop(['over18', 'standard_hours'], axis='columns')

In [18]:
# List the current column names of the working copy
df_abc_copy.columns

Index(['age', 'attrition', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_count',
       'employee_number', 'environment_satisfaction', 'gender', 'hourly_rate',
       'job_involvement', 'job_level', 'job_role', 'job_satisfaction',
       'marital_status', 'monthly_income', 'monthly_rate',
       'num_companies_worked', 'over_time', 'percent_salary_hike',
       'performance_rating', 'relationship_satisfaction', 'stock_option_level',
       'total_working_years', 'training_times_last_year', 'work_life_balance',
       'years_at_company', 'years_in_current_role',
       'years_since_last_promotion', 'years_with_curr_manager'],
      dtype='object')

In [15]:
# Normalise every column label: trim surrounding whitespace, lower-case it,
# and collapse each run of whitespace into a single "_"
df_abc_copy.columns = [
    re.sub(r"\s+", "_", nombre.strip().lower())
    for nombre in df_abc_copy.columns
]

In [17]:
# Inspect the distinct values present in 'attrition'
df_abc_copy['attrition'].unique()

array(['Yes', 'No'], dtype=object)

En la fase de visualización, convertir las siguientes escalas numéricas en etiquetas:

categorías de work_life_balance: 1=‘Bad’, 2=‘Good’, 3=‘Better’, 4=‘Best’

environment_satisfaction 1=‘Low’ 2=‘Medium’ 3=‘High’ 4=‘Very High’

job_involvement 1=‘Low’ 2=‘Medium’ 3=‘High’ 4=‘Very High’

job_satisfaction 1=‘Low’ 2=‘Medium’ 3=‘High’ 4=‘Very High’

performance_rating 1=‘Low’ 2=‘Good’ 3=‘Excellent’ 4=‘Outstanding’

relationship_satisfaction 1=‘Low’ 2=‘Medium’ 3=‘High’ 4=‘Very High’

In [20]:
# Text (object-dtype) columns whose content gets normalised below
columnas_texto = df_abc_copy.select_dtypes(include=['object']).columns

# Normalise the values of each text column:
#   1. turn hyphens and underscores into spaces,
#   2. trim whitespace at both ends,
#   3. capitalise the first letter of every word.
for col in columnas_texto:
    df_abc_copy[col] = (
        df_abc_copy[col]
        .str.replace(r'[-_]', ' ', regex=True)
        .str.strip()
        .str.title()
    )

print("¡Limpieza de contenido completada!")

¡Limpieza de contenido completada!


In [21]:
# Inspect the distinct values present in 'job_role'
df_abc_copy['job_role'].unique()

array(['Sales Executive', 'Research Scientist', 'Laboratory Technician',
       'Manufacturing Director', 'Healthcare Representative', 'Manager',
       'Sales Representative', 'Research Director', 'Human Resources'],
      dtype=object)

In [24]:
# Inspect the distinct values present in 'business_travel' (NaN included)
df_abc_copy['business_travel'].unique()

array(['Travel Rarely', 'Travel Frequently', 'Non Travel', nan],
      dtype=object)

In [None]:
# Pending clean-up tasks:
# - 'age': convert from float to int
# - 'education': map categories to 1='Below College', 2=‘College’, 3=‘Bachelor’, 4=‘Master’, 5=‘Doctor’
# - 'marreid': presumably a misspelled 'Married' value to correct in 'marital_status' — TODO confirm
# - handle null values