In [1]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt


# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
pd.set_option('display.max_colwidth', None) # para visualizar todo el contenido de los valores
#pd.set_option('display.max_rows', None) # Para visualizar todo el contenido de las filas

In [2]:
datos = pd.read_csv('Datos/datos_empresa.csv', index_col=0)
datos_limpios = pd.read_csv('Datos/datos_empresa_V1.csv', index_col=0)


In [3]:
#Creamos Dataframe de las columnas de los df a comparar

datos_col = pd.DataFrame(list(datos.columns), columns=['datos'])
datos_col
datos_limpio_col = pd.DataFrame(list(datos_limpios.columns), columns=['datos2'])
datos_limpio_col


Unnamed: 0,datos2
0,Age
1,Attrition
2,BusinessTravel
3,DailyRate
4,Department
5,DistanceFromHome
6,Education
7,EducationField
8,employeenumber
9,EnvironmentSatisfaction


In [4]:
# mergeamos las columnas para saber si tenemos duplicados

df_merge = datos_col.merge(datos_limpio_col, left_on='datos', right_on='datos2', how = 'left')
df_merge

Unnamed: 0,datos,datos2
0,Age,Age
1,Attrition,Attrition
2,BusinessTravel,BusinessTravel
3,DailyRate,DailyRate
4,Department,Department
5,DistanceFromHome,DistanceFromHome
6,Education,Education
7,EducationField,EducationField
8,employeecount,
9,employeenumber,employeenumber


In [5]:
#filtro para saber columnas que habíamos borrado

filtro = df_merge['datos2'].isna()
df_filtro = df_merge[filtro]

df_filtro

Unnamed: 0,datos,datos2
8,employeecount,
21,Over18,
35,SameAsMonthlyIncome,
37,Salary,
38,RoleDepartament,
39,NUMBERCHILDREN,


In [6]:
comparacion = df_merge['datos'] == df_merge['datos2']
comparacion

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21    False
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35    False
36     True
37    False
38    False
39    False
40     True
dtype: bool

In [7]:
#borramos columnas que no necesitamos

datos.drop( columns = ['employeecount','Over18','SameAsMonthlyIncome', 'Salary', 'RoleDepartament', 'NUMBERCHILDREN'], inplace= True)

In [8]:
# transformamos columnas para su mejor visulatización las convertimos todos a lower

new_columns = {column : column.lower() for column in datos}

new_columns

datos.rename(columns = new_columns, inplace= True)

datos.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework
0,51,No,,"684,0$",,6,3,,1620.0,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,No,13,30,3,,0,,5,30.0,20,,15,15,1972,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,2590.0,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,14,30,1,,1,340.0,5,30.0,33,,11,9,1971,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,3190.0,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,No,11,30,4,,0,220.0,3,,22,,11,15,1981,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,,19,30,2,,2,,2,,20,,5,6,1976,False
4,46,No,,"1319,0$",,3,3,Technical Degree,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,No,12,30,4,,1,,5,30.0,19,,2,8,1977,0


In [None]:
# Transformamos columnas

In [9]:
# Hemos cambiado las edades que estaban escritas para que sean numéricas. Lo cambiamos a tipo int.
# Estas edades son las que tenían en el 2023.

datos['age'].unique()

age = datos['age'].str.replace('fifty-eight', '58 ').str.replace('fifty-eight', '47 ').str.replace('thirty-six', '36').str.replace('fifty-five', '55').str.replace('fifty-two', '52').str.replace('thirty-one', '31').str.replace('twenty-six', '36').str.replace('thirty-seven', '37').str.replace('thirty-seven', '37').str.replace('thirty-two', '32').str.replace('twenty-four', '24').str.replace('forty-seven', '47').str.replace('thirty', '30')

datos['age'] = age

datos['age'] = datos['age'].astype(int)

datos['age'].unique()

array([51, 52, 42, 47, 46, 48, 59, 41, 56, 38, 55, 40, 58, 35, 45, 33, 36,
       34, 53, 43, 60, 32, 37, 49, 39, 50, 44, 30, 29, 31, 54, 57, 27, 28,
       26, 25, 24, 23, 22, 21, 20, 19, 18])

In [10]:
# No la tocamos

datos['attrition'].unique()

array(['No', 'Yes'], dtype=object)

In [11]:
# Reemplazamos las barras bajas por espacios y los guiones por espacio. Lo capitalizamos.

businesstravel = datos['businesstravel'].str.replace('_', ' ').str.replace('-', ' ').str.capitalize()

datos['businesstravel'] = businesstravel

datos['businesstravel'].unique()

array([nan, 'Travel rarely', 'Travel frequently', 'Non travel'],
      dtype=object)

In [12]:
# Eliminamos los $ y ,0 con un .replace. También hemos pasado los nan y luego cambiamos el tipo a float

dailyrate = datos['dailyrate'].str.replace('$', '').str.replace(',0', '').replace('nan', np.nan)

datos['dailyrate'] = dailyrate

datos['dailyrate'] = datos['dailyrate'].astype(float)

datos['dailyrate'].unique()


array([ 684.,  699.,  532.,  359., 1319.,  117., 1435.,  635., 1276.,
        840.,  247., 1369.,  201., 1360.,  692., 1398.,  286., 1402.,
        819.,  884., 1238.,  515., 1223.,  202.,  928.,  607.,  266.,
        429.,  589.,   nan, 1180., 1282.,  776.,  665.,  526., 1034.,
       1403., 1499.,  580.,  859.,  263., 1376.,  885., 1003., 1321.,
        394., 1372., 1333.,  228.,  737.,  823.,  667.,  301.,  573.,
       1329.,  630., 1063., 1017., 1296.,  939., 1355., 1448.,  200.,
       1202.,  404.,  208.,  813.,  465., 1189., 1001., 1394.,  161.,
        288.,  682., 1354.,  147.,  119., 1413.,  452.,  334., 1132.,
        982.,  480., 1099.,  672., 1379.,  583., 1492., 1050.,  469.,
        237., 1440., 1291., 1157., 1336., 1224.,  735., 1389.,  638.,
       1240.,  194., 1339.,  111., 1469.,  470., 1232., 1249.,  798.,
        549.,  570.,  541.,  164., 1117.,  619.,  319.,  956., 1245.,
       1397.,  527.,  213.,  882.,  330.,  406.,  217.,  481.,  669.,
       1465.,  685.,

In [13]:
# Eliminamos espacios de principio y final. Ver que hacer con nan.

department = datos['department'].str.strip()
datos['department'] = department

datos['department'].unique()

array([nan, 'Research & Development', 'Sales', 'Human Resources'],
      dtype=object)

In [14]:
# Hemos cambiado el tipo de int a str y eliminado el menos de los negativos. Lo pasamos a int.

datos_distance = datos['distancefromhome'].astype(str)

distancefromhome = datos_distance.str.replace('-', '')

datos['distancefromhome'] = distancefromhome

datos['distancefromhome'] = datos['distancefromhome'].astype(int)

datos['distancefromhome'].unique()

array([ 6,  1,  4,  2,  3, 22, 25,  9,  7, 23, 10, 12, 14, 13, 15,  8, 42,
       28, 37,  5, 16, 35, 26, 24, 29, 17, 21, 18, 30, 27, 20, 31, 39, 11,
       19, 33, 34, 46, 36, 45, 47, 32, 41, 49, 48, 38, 43, 40, 44])

In [15]:
datos['education'].unique() # No hay que hacerle nada

array([3, 4, 2, 1, 5])

In [16]:
datos['educationfield'].unique() # ya está capitalizado. Nos falta ver que hacer con NaN

array([nan, 'Life Sciences', 'Technical Degree', 'Medical', 'Other',
       'Marketing', 'Human Resources'], dtype=object)

In [17]:
# Tiene NaN y tiene duplicados. Esto es porque nos dimos cuenta que no son individuos, si no respuestas de estas personas realizadas en distintas encuestas.
# Posible borrado porque creemos que no nos aporta nada.

datos['employeenumber'].unique() 

array(['162,0', '259,0', '319,0', ..., '2012,0', '2023,0', '2040,0'],
      dtype=object)

In [18]:
# tomar la lista e iterar por ella para hacer la fusión. Documentar porque nos quedamos con los duplicados

datos_duplicados = datos['employeenumber'][datos['employeenumber'].duplicated()].unique() 
datos_duplicados

array([nan, '1541,0', '1947,0', '1954,0', '307,0', '374,0', '524,0',
       '569,0', '1044,0', '1053,0', '1069,0', '1131,0', '1135,0',
       '1140,0', '1157,0', '1160,0', '1161,0', '1162,0', '1185,0',
       '1195,0', '1778,0', '1797,0', '1804,0', '1816,0', '1821,0',
       '1849,0', '1869,0', '1898,0', '1911,0', '1927,0', '423,0', '424,0',
       '433,0', '440,0', '447,0', '455,0', '460,0', '465,0', '470,0',
       '475,0', '478,0', '482,0', '495,0', '501,0', '502,0', '507,0',
       '517,0', '522,0', '523,0', '525,0', '526,0', '530,0', '544,0',
       '376,0', '381,0', '388,0', '389,0', '401,0', '416,0', '430,0',
       '438,0', '446,0', '448,0', '454,0', '458,0', '476,0', '483,0',
       '488,0', '500,0', '534,0', '586,0', '595,0', '608,0', '616,0',
       '621,0', '653,0', '663,0', '682,0', '710,0', '717,0', '721,0',
       '722,0', '724,0', '725,0', '728,0', '733,0', '734,0', '742,0',
       '747,0', '762,0', '783,0', '789,0', '793,0', '803,0', '809,0',
       '823,0', '842,0', '

In [19]:
datos['employeenumber'].duplicated().sum()

534

In [20]:
# No tocar.

datos['environmentsatisfaction'].unique()

array([ 1,  3,  4,  2, 42, 37, 35, 25, 27, 31, 39, 21, 15, 14, 33, 19, 12,
       13, 28, 47, 36, 29, 24, 46, 16, 22, 41, 49, 11, 48, 18, 10, 45, 38,
       17, 20, 26, 43])

In [21]:
# Vamos a sustituir los 0 y 1 por la siniciales de su género y cambiarlo a tipo objeto.
dic_map = {0:'M', 1:'F'}

datos["gender"] = datos["gender"].map(dic_map)

datos['gender'].unique()

array(['M', 'F'], dtype=object)

In [23]:
# pasar a int y antes cambiar el Not Avaiable a NaN

hourlyrate = datos['hourlyrate'].replace('Not Available', '0')

datos['hourlyrate'] = hourlyrate.astype(int)

datos['hourlyrate'] = datos['hourlyrate'].replace(0, 'NaN')

datos['hourlyrate'].unique() 

ValueError: invalid literal for int() with base 10: 'NaN'

In [24]:
datos['jobinvolvement'].unique() # no tocar

array([3, 2, 4, 1])

In [25]:
datos['joblevel'].unique() # no tocar

array([5, 4, 3, 2, 1])

In [26]:
# Todo pasado a minusculas, dejando la primera en mayusculas y eliminando los espacios que habia al inicio y final
datos['jobrole'] = datos['jobrole'].str.lower().str.title().str.strip()
datos['jobrole'].unique()

array(['Research Director', 'Manager', 'Sales Executive',
       'Manufacturing Director', 'Research Scientist',
       'Healthcare Representative', 'Laboratory Technician',
       'Sales Representative', 'Human Resources'], dtype=object)

In [27]:
datos['jobsatisfaction'].unique() # no tocar

array([3, 4, 1, 2])

In [28]:
# Sustituyo los que están mal escritos para que sean iguales a los que si creando un diccionario previamente. Comprobar moda
replacements = {
    'divorced': 'Divorced',
    'Marreid': 'Married'
}

datos['maritalstatus'] = datos['maritalstatus'].replace(replacements)
datos['maritalstatus'].unique() 

array([nan, 'Married', 'Divorced', 'Single'], dtype=object)

In [29]:
datos['monthlyincome'].unique() # Tiene nan, hay que cambiarlo a NaN y pasarlo a float

array(['19537,0', '19999,0', '19232,0', '17169,0', nan, '17174,0',
       '16595,0', '19973,0', '13402,0', '13206,0', '19545,0', '18041,0',
       '19246,0', '10748,0', '16752,0', '6201,0', '19845,0', '4001,0',
       '10447,0', '16064,0', '3210,0', '10266,0', '10475,0', '6162,0',
       '4721,0', '4615,0', '16959,0', '10306,0', '5406,0', '5902,0',
       '10855,0', '5914,0', '6646,0', '13973,0', '13320,0', '6687,0',
       '4735,0', '13872,0', '19045,0', '16015,0', '9613,0', '11510,0',
       '4306,0', '17046,0', '5067,0', '3692,0', '19847,0', '2308,0',
       '5747,0', '10422,0', '6347,0', '2348,0', '3072,0', '12490,0',
       '8020,0', '17068,0', '8943,0', '19272,0', '5577,0', '2691,0',
       '7403,0', '8823,0', '3579,0', '17779,0', '18213,0', '13577,0',
       '19190,0', '17123,0', '19187,0', '10008,0', '7988,0', '7083,0',
       '4723,0', '3407,0', '2929,0', '12031,0', '15427,0', '5126,0',
       '9619,0', '5010,0', '19033,0', '10400,0', '2793,0', '5674,0',
       '19197,0', '841

In [30]:
datos['monthlyrate'].unique() # no tocar

array([ 6462,  5678,  4933, ..., 15302, 26956, 16642])

In [31]:
datos['numcompaniesworked'].unique() # no tocar

array([7, 0, 1, 3, 2, 4, 8, 9, 5, 6])

In [32]:
# Función para convertir la coma en punto y los objet a float

columns = ['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeenumber',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'standardhours', 'stockoptionlevel', 'totalworkingyears',
       'trainingtimeslastyear', 'worklifebalance', 'yearsatcompany',
       'yearsincurrentrole', 'yearssincelastpromotion', 'yearswithcurrmanager',
       'datebirth', 'remotework']

performancerating = datos['performancerating'].str.replace(',', '.')

datos['performancerating'] = performancerating
datos['performancerating']

0       3.0
1       3.0
2       3.0
3       3.0
4       3.0
       ... 
1609    NaN
1610    3.0
1611    3.0
1612    NaN
1613    3.0
Name: performancerating, Length: 1614, dtype: object

In [33]:
# overtime mirar qué hacer con los nan. ¿Moda?

datos['overtime'].unique()

array(['No', nan, 'Yes'], dtype=object)

In [34]:
datos['percentsalaryhike'].unique()

suma = datos['percentsalaryhike'].value_counts()
suma

percentsalaryhike
11    232
13    230
12    225
14    220
15    110
18     98
17     88
16     86
19     82
20     60
22     59
21     51
23     29
24     25
25     19
Name: count, dtype: int64

In [35]:
suma = datos['performancerating'].value_counts()
print(suma)

performancerating = datos['performancerating'].str.replace(',0', '')


datos['performancerating'] = performancerating

datos['performancerating']

performancerating
3.0    1205
4.0     214
Name: count, dtype: int64


0       3.0
1       3.0
2       3.0
3       3.0
4       3.0
       ... 
1609    NaN
1610    3.0
1611    3.0
1612    NaN
1613    3.0
Name: performancerating, Length: 1614, dtype: object

In [36]:
# No hacemos cambios de momento

datos['relationshipsatisfaction'].unique()

array([3, 1, 4, 2])

In [37]:
#borramos standard hours porque no tiene sentido tiene muchos nan y 80h es imposible
datos.drop( columns = ['standardhours'], inplace= True)

In [38]:
# No he hecho nada en esta columna parece estra bien
datos['stockoptionlevel'].unique()

array([0, 1, 2, 3])

In [39]:
# En cuanto gestionemos los nan cambiar a INT

datos['totalworkingyears'].unique()

totalworkingyears = datos['totalworkingyears'].str.replace(',0', '')


datos['totalworkingyears'] = totalworkingyears

datos['totalworkingyears']

0       NaN
1        34
2        22
3       NaN
4       NaN
       ... 
1609    NaN
1610    NaN
1611      9
1612     12
1613    NaN
Name: totalworkingyears, Length: 1614, dtype: object

In [40]:
# En cuanto gestionemos los nan cambiar a INT

datos['totalworkingyears'].unique()

array([nan, '34', '22', '28', '20', '21', '33', '40', '18', '25', '15',
       '17', '26', '16', '24', '14', '23', '27', '19', '11', '38', '37',
       '13', '12', '29', '10', '36', '35', '9', '31', '32', '8', '7',
       '30', '6', '5', '4', '3', '2', '1', '0'], dtype=object)

In [41]:
# trainingtimeslastyear no hacer nada aparentemente toda la información es correcta

datos['trainingtimeslastyear'].unique()

array([5, 3, 2, 0, 1, 4, 6])

In [42]:
# Ver qué hacemos con los NaN y convertir a INT

datos['worklifebalance'].unique()

worklifebalance = datos['worklifebalance'].str.replace(',0', '')


datos['worklifebalance'] = worklifebalance

datos['worklifebalance'] 

0         3
1         3
2       NaN
3       NaN
4         3
       ... 
1609      3
1610      2
1611      3
1612      3
1613      3
Name: worklifebalance, Length: 1614, dtype: object

In [43]:
# todo aparenta estar correcto

datos['yearsatcompany'].unique()

array([20, 33, 22, 19, 21, 18, 24, 31, 26, 16, 23, 15, 17, 32, 14, 13, 25,
       12, 11, 37, 40, 36, 27, 29, 10,  9, 30,  8,  7, 34,  6,  5,  4,  2,
        3,  1,  0])

In [44]:
# Ver qué hacemos con los NaN y convertir a INT creo que esta la tendremos que borrar porque no hay casi datos

datos['yearsincurrentrole'].unique()

yearsincurrentrole = datos['yearsincurrentrole'].str.replace(',0', '')


datos['yearsincurrentrole'] = yearsincurrentrole

datos['yearsincurrentrole']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1609    NaN
1610    NaN
1611    NaN
1612    NaN
1613    NaN
Name: yearsincurrentrole, Length: 1614, dtype: object

In [45]:
datos['yearssincelastpromotion'].unique()

datos[['yearssincelastpromotion']]

Unnamed: 0,yearssincelastpromotion
0,15
1,11
2,11
3,5
4,2
...,...
1609,3
1610,0
1611,0
1612,0


In [46]:
# no tenemos que tocar nada

datos['yearswithcurrmanager'].unique()

array([15,  9,  6,  8,  7, 11, 10, 12,  4,  0,  5, 17,  2, 14,  1, 13,  3,
       16])

In [47]:
# esta ok

datos['datebirth'].unique()

array([1972, 1971, 1981, 1976, 1977, 1975, 1964, 1982, 1967, 1985, 1968,
       1983, 1965, 1988, 1978, 1990, 1987, 1989, 1970, 1980, 1963, 1991,
       1986, 1974, 1984, 1973, 1979, 1993, 1994, 1992, 1969, 1966, 1996,
       1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005])

In [48]:
diccionario_mapa = {'Yes':'Yes', '1':'Yes', 'False':'No', '0':'No', 'True':'Yes'}

datos["remotework"] = datos["remotework"].map(diccionario_mapa)

datos['remotework'].unique()

array(['Yes', 'No'], dtype=object)

In [49]:
datos.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeenumber',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager', 'datebirth',
       'remotework'],
      dtype='object')

In [50]:
# qué columnas tienen nulos

nulos_cat = datos[datos.columns[datos.isnull().any()]].select_dtypes(include = "O").columns
print("Las columnas categóricas que tienen nulos son : \n ")
print(nulos_cat)

Las columnas categóricas que tienen nulos son : 
 
Index(['businesstravel', 'department', 'educationfield', 'employeenumber',
       'maritalstatus', 'monthlyincome', 'overtime', 'performancerating',
       'totalworkingyears', 'worklifebalance', 'yearsincurrentrole'],
      dtype='object')


In [51]:
# Vemos la forma de estas columnas

for col in nulos_cat:
    print(f"La distribución de las categorías para la columna {col.upper()}")
    display(datos[col].value_counts() / datos.shape[0])
    print("........................")

La distribución de las categorías para la columna BUSINESSTRAVEL


businesstravel
Travel rarely        0.363073
Travel frequently    0.102230
Non travel           0.056382
Name: count, dtype: float64

........................
La distribución de las categorías para la columna DEPARTMENT


department
Research & Development    0.121437
Sales                     0.056382
Human Resources           0.009294
Name: count, dtype: float64

........................
La distribución de las categorías para la columna EDUCATIONFIELD


educationfield
Life Sciences       0.216233
Medical             0.171004
Marketing           0.064436
Technical Degree    0.042751
Other               0.036555
Human Resources     0.007435
Name: count, dtype: float64

........................
La distribución de las categorías para la columna EMPLOYEENUMBER


employeenumber
482,0     0.001239
530,0     0.001239
507,0     0.001239
517,0     0.001239
522,0     0.001239
            ...   
161,0     0.000620
164,0     0.000620
190,0     0.000620
194,0     0.000620
2040,0    0.000620
Name: count, Length: 1079, dtype: float64

........................
La distribución de las categorías para la columna MARITALSTATUS


maritalstatus
Married     0.271995
Single      0.201363
Divorced    0.123296
Name: count, dtype: float64

........................
La distribución de las categorías para la columna MONTHLYINCOME


monthlyincome
6347,0     0.002478
5304,0     0.002478
2657,0     0.001859
2258,0     0.001859
5405,0     0.001239
             ...   
3102,0     0.000620
4556,0     0.000620
4230,0     0.000620
4859,0     0.000620
19431,0    0.000620
Name: count, Length: 668, dtype: float64

........................
La distribución de las categorías para la columna OVERTIME


overtime
No     0.422553
Yes    0.158612
Name: count, dtype: float64

........................
La distribución de las categorías para la columna PERFORMANCERATING


performancerating
3.0    0.746592
4.0    0.132590
Name: count, dtype: float64

........................
La distribución de las categorías para la columna TOTALWORKINGYEARS


totalworkingyears
10    0.089219
8     0.053284
6     0.052045
9     0.042751
5     0.040892
7     0.034696
4     0.033457
1     0.032838
12    0.021066
3     0.019827
14    0.018587
13    0.018587
11    0.017968
15    0.017348
16    0.017348
20    0.017348
18    0.016729
21    0.014250
17    0.013631
2     0.013011
22    0.011152
19    0.010533
24    0.008674
23    0.008055
28    0.008055
26    0.004957
0     0.004957
29    0.003717
36    0.003717
25    0.003717
33    0.003717
37    0.003098
27    0.003098
31    0.002478
30    0.001859
32    0.001859
35    0.001859
40    0.001859
34    0.001239
38    0.000620
Name: count, dtype: float64

........................
La distribución de las categorías para la columna WORKLIFEBALANCE


worklifebalance
3    0.565675
2    0.222429
4    0.096035
1    0.048947
Name: count, dtype: float64

........................
La distribución de las categorías para la columna YEARSINCURRENTROLE


yearsincurrentrole
2     0.006815
7     0.003098
0     0.002478
4     0.001859
1     0.001859
11    0.001239
6     0.001239
3     0.001239
13    0.000620
12    0.000620
Name: count, dtype: float64

........................
