In [1]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt


# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
pd.set_option('display.max_colwidth', None) # para visualizar todo el contenido de los valores
#pd.set_option('display.max_rows', None) # Para visualizar todo el contenido de las filas

In [2]:
datos = pd.read_csv('Datos/datos_empresa.csv', index_col=0)
datos_limpios = pd.read_csv('Datos/datos_empresa_V1.csv', index_col=0)


In [3]:
#Creamos Dataframe de las columnas de los df a comparar

datos_col = pd.DataFrame(list(datos.columns), columns=['datos'])
datos_col
datos_limpio_col = pd.DataFrame(list(datos_limpios.columns), columns=['datos2'])
datos_limpio_col


Unnamed: 0,datos2
0,Age
1,Attrition
2,BusinessTravel
3,DailyRate
4,Department
5,DistanceFromHome
6,Education
7,EducationField
8,employeenumber
9,EnvironmentSatisfaction


In [4]:
# mergeamos las columnas para saber si tenemos duplicados

df_merge = datos_col.merge(datos_limpio_col, left_on='datos', right_on='datos2', how = 'left')
df_merge

Unnamed: 0,datos,datos2
0,Age,Age
1,Attrition,Attrition
2,BusinessTravel,BusinessTravel
3,DailyRate,DailyRate
4,Department,Department
5,DistanceFromHome,DistanceFromHome
6,Education,Education
7,EducationField,EducationField
8,employeecount,
9,employeenumber,employeenumber


In [5]:
#filtro para saber columnas que habíamos borrado

filtro = df_merge['datos2'].isna()
df_filtro = df_merge[filtro]

df_filtro

Unnamed: 0,datos,datos2
8,employeecount,
21,Over18,
35,SameAsMonthlyIncome,
37,Salary,
38,RoleDepartament,
39,NUMBERCHILDREN,


In [6]:
comparacion = df_merge['datos'] == df_merge['datos2']
comparacion

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21    False
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35    False
36     True
37    False
38    False
39    False
40     True
dtype: bool

In [7]:
datos.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'employeecount',
       'employeenumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NUMCOMPANIESWORKED',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TOTALWORKINGYEARS', 'TrainingTimesLastYear', 'WORKLIFEBALANCE',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YEARSWITHCURRMANAGER', 'SameAsMonthlyIncome', 'DateBirth', 'Salary',
       'RoleDepartament', 'NUMBERCHILDREN', 'RemoteWork'],
      dtype='object')

In [8]:
#borramos columnas que no necesitamos

datos.drop( columns = ['employeecount','Over18','SameAsMonthlyIncome', 'Salary', 'RoleDepartament', 'NUMBERCHILDREN','employeenumber','StandardHours'], inplace= True)

In [9]:
# transformamos columnas para su mejor visulatización las convertimos todos a lower

new_columns = {column : column.lower() for column in datos}

new_columns

datos.rename(columns = new_columns, inplace= True)

datos.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework
0,51,No,,"684,0$",,6,3,,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,No,13,30,3,0,,5,30.0,20,,15,15,1972,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,14,30,1,1,340.0,5,30.0,33,,11,9,1971,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,No,11,30,4,0,220.0,3,,22,,11,15,1981,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,,19,30,2,2,,2,,20,,5,6,1976,False
4,46,No,,"1319,0$",,3,3,Technical Degree,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,No,12,30,4,1,,5,30.0,19,,2,8,1977,0


In [63]:
# Transformamos columnas

In [11]:
# Hemos cambiado las edades que estaban escritas para que sean numéricas. Lo cambiamos a tipo int.
# Estas edades son las que tenían en el 2023.

datos['age'].unique()

age = datos['age'].str.replace('fifty-eight', '58 ').str.replace('fifty-eight', '47 ').str.replace('thirty-six', '36').str.replace('fifty-five', '55').str.replace('fifty-two', '52').str.replace('thirty-one', '31').str.replace('twenty-six', '36').str.replace('thirty-seven', '37').str.replace('thirty-seven', '37').str.replace('thirty-two', '32').str.replace('twenty-four', '24').str.replace('forty-seven', '47').str.replace('thirty', '30')

datos['age'] = age

datos['age'] = datos['age']

datos['age'].unique()

array(['51', '52', '42', '47', '46', '48', '59', '41', '56', '38', '55',
       '40', '58', '35', '45', '33', '36', '34', '53', '43', '60', '32',
       '37', '49', '39', '50', '44', '30', '58 ', '29', '31', '54', '57',
       '27', '28', '26', '25', '24', '23', '22', '21', '20', '19', '18'],
      dtype=object)

In [12]:
# No la tocamos

datos['attrition'].unique()

array(['No', 'Yes'], dtype=object)

In [13]:
# Reemplazamos las barras bajas por espacios y los guiones por espacio. Lo capitalizamos.



businesstravel = datos['businesstravel'].str.replace('_', ' ').str.replace('-', ' ').str.capitalize()

datos['businesstravel'] = businesstravel

datos['businesstravel'].unique()

array([nan, 'Travel rarely', 'Travel frequently', 'Non travel'],
      dtype=object)

In [14]:
# Eliminamos los $ y ,0 con un .replace. También hemos pasado los nan y luego cambiamos el tipo a float

dailyrate = datos['dailyrate'].str.replace('$', '')
datos['dailyrate'] = dailyrate

datos['dailyrate'] = datos['dailyrate']

datos['dailyrate'].unique()


array(['684,0', '699,0', '532,0', '359,0', '1319,0', '117,0', '1435,0',
       '635,0', '1276,0', '840,0', '247,0', '1369,0', '201,0', '1360,0',
       '692,0', '1398,0', '286,0', '1402,0', '819,0', '884,0', '1238,0',
       '515,0', '1223,0', '202,0', '928,0', '607,0', '266,0', '429,0',
       '589,0', 'nan', '1180,0', '1282,0', '776,0', '665,0', '526,0',
       '1034,0', '1403,0', '1499,0', '580,0', '859,0', '263,0', '1376,0',
       '885,0', '1003,0', '1321,0', '394,0', '1372,0', '1333,0', '228,0',
       '737,0', '823,0', '667,0', '301,0', '573,0', '1329,0', '630,0',
       '1063,0', '1017,0', '1296,0', '939,0', '1355,0', '1448,0', '200,0',
       '1202,0', '404,0', '208,0', '813,0', '465,0', '1189,0', '1001,0',
       '1394,0', '161,0', '288,0', '682,0', '1354,0', '147,0', '119,0',
       '1413,0', '452,0', '334,0', '1132,0', '982,0', '480,0', '1099,0',
       '672,0', '1379,0', '583,0', '1492,0', '1050,0', '469,0', '237,0',
       '1440,0', '1291,0', '1157,0', '1336,0', '1224,0',

In [15]:
# Eliminamos espacios de principio y final. Ver que hacer con nan.

department = datos['department'].str.strip()
datos['department'] = department

datos['department'].unique()

array([nan, 'Research & Development', 'Sales', 'Human Resources'],
      dtype=object)

In [17]:
# Hemos cambiado el tipo de int a str y eliminado el menos de los negativos. Lo pasamos a int.

datos_distance = datos['distancefromhome'].astype(str)

distancefromhome = datos_distance.str.replace('-', '')

datos['distancefromhome'] = distancefromhome

datos['distancefromhome'] = datos['distancefromhome']

datos['distancefromhome'].unique()

array(['6', '1', '4', '2', '3', '22', '25', '9', '7', '23', '10', '12',
       '14', '13', '15', '8', '42', '28', '37', '5', '16', '35', '26',
       '24', '29', '17', '21', '18', '30', '27', '20', '31', '39', '11',
       '19', '33', '34', '46', '36', '45', '47', '32', '41', '49', '48',
       '38', '43', '40', '44'], dtype=object)

In [18]:
datos['education'].unique() # No hay que hacerle nada

array([3, 4, 2, 1, 5], dtype=int64)

In [19]:
datos['educationfield'].unique() # ya está capitalizado. Nos falta ver que hacer con NaN

array([nan, 'Life Sciences', 'Technical Degree', 'Medical', 'Other',
       'Marketing', 'Human Resources'], dtype=object)

In [20]:
# No tocar.

datos['environmentsatisfaction'].unique()

array([ 1,  3,  4,  2, 42, 37, 35, 25, 27, 31, 39, 21, 15, 14, 33, 19, 12,
       13, 28, 47, 36, 29, 24, 46, 16, 22, 41, 49, 11, 48, 18, 10, 45, 38,
       17, 20, 26, 43], dtype=int64)

In [21]:
# Vamos a sustituir los 0 y 1 por la siniciales de su género y cambiarlo a tipo objeto.
dic_map = {0:'M', 1:'F'}

datos["gender"] = datos["gender"].map(dic_map)

datos['gender'].unique()

array(['M', 'F'], dtype=object)

In [22]:
# cambiar el Not Avaiable a NaN

hourlyrate = datos['hourlyrate'].replace('Not Available', 'NaN')

datos['hourlyrate'].unique() 

array([51, 65, 58, 82, 45, 99, 91, 64, 55, 68, 49, 61, 79, 31, 69, 48, 80,
       74, 98, 59, 33, 56, 66, 57, 53, 87, 81, 84, 32, 41, 92, 47, 'NaN',
       43, 86, 30, 42, 88, 96, 67, 62, 72, 78, 89, 52, 50, 90, 37, 94, 76,
       60, 46, 83, 100, 40, 97, 54, 75, 39, 85, 63, 44, 93, 36, 35, 73,
       71, 70, 38, 77, 95, 34], dtype=object)

In [23]:
datos['jobinvolvement'].unique() # no tocar

array([3, 2, 4, 1], dtype=int64)

In [24]:
datos['joblevel'].unique() # no tocar

array([5, 4, 3, 2, 1], dtype=int64)

In [25]:
# Todo pasado a minusculas, dejando la primera en mayusculas y eliminando los espacios que habia al inicio y final
datos['jobrole'] = datos['jobrole'].str.lower().str.title().str.strip()
datos['jobrole'].unique()

array(['Research Director', 'Manager', 'Sales Executive',
       'Manufacturing Director', 'Research Scientist',
       'Healthcare Representative', 'Laboratory Technician',
       'Sales Representative', 'Human Resources'], dtype=object)

In [26]:
datos['jobsatisfaction'].unique() # no tocar

array([3, 4, 1, 2], dtype=int64)

In [27]:
# Sustituyo los que están mal escritos para que sean iguales a los que si creando un diccionario previamente. Comprobar moda
replacements = {
    'divorced': 'Divorced',
    'Marreid': 'Married'
}

datos['maritalstatus'] = datos['maritalstatus'].replace(replacements)
datos['maritalstatus'].unique() 

array([nan, 'Married', 'Divorced', 'Single'], dtype=object)

In [28]:
datos['monthlyrate'].unique() # no tocar

array([ 6462,  5678,  4933, ..., 15302, 26956, 16642], dtype=int64)

In [29]:
datos['numcompaniesworked'].unique() # no tocar

array([7, 0, 1, 3, 2, 4, 8, 9, 5, 6], dtype=int64)

In [30]:
# overtime mirar qué hacer con los nan. ¿Moda?

datos['overtime'].unique()

array(['No', nan, 'Yes'], dtype=object)

In [31]:
datos['percentsalaryhike'].unique()

suma = datos['percentsalaryhike'].value_counts()
suma

percentsalaryhike
11    232
13    230
12    225
14    220
15    110
18     98
17     88
16     86
19     82
20     60
22     59
21     51
23     29
24     25
25     19
Name: count, dtype: int64

In [32]:
# No hacemos cambios de momento

datos['relationshipsatisfaction'].unique()

array([3, 1, 4, 2], dtype=int64)

In [33]:
# No he hecho nada en esta columna parece estra bien
datos['stockoptionlevel'].unique()

array([0, 1, 2, 3], dtype=int64)

In [34]:
# En cuanto gestionemos los nan cambiar a INT

datos['totalworkingyears'].unique()

array([nan, '34,0', '22,0', '28,0', '20,0', '21,0', '33,0', '40,0',
       '18,0', '25,0', '15,0', '17,0', '26,0', '16,0', '24,0', '14,0',
       '23,0', '27,0', '19,0', '11,0', '38,0', '37,0', '13,0', '12,0',
       '29,0', '10,0', '36,0', '35,0', '9,0', '31,0', '32,0', '8,0',
       '7,0', '30,0', '6,0', '5,0', '4,0', '3,0', '2,0', '1,0', '0,0'],
      dtype=object)

In [35]:
# trainingtimeslastyear no hacer nada aparentemente toda la información es correcta

datos['trainingtimeslastyear'].unique()

array([5, 3, 2, 0, 1, 4, 6], dtype=int64)

In [36]:
# todo aparenta estar correcto

datos['yearsatcompany'].unique()

array([20, 33, 22, 19, 21, 18, 24, 31, 26, 16, 23, 15, 17, 32, 14, 13, 25,
       12, 11, 37, 40, 36, 27, 29, 10,  9, 30,  8,  7, 34,  6,  5,  4,  2,
        3,  1,  0], dtype=int64)

In [37]:
datos['yearssincelastpromotion'].unique()



array([15, 11,  5,  2,  4,  7,  0,  1, 13, 14,  8, 12,  3,  6, 10,  9],
      dtype=int64)

In [38]:
# no tenemos que tocar nada

datos['yearswithcurrmanager'].unique()

array([15,  9,  6,  8,  7, 11, 10, 12,  4,  0,  5, 17,  2, 14,  1, 13,  3,
       16], dtype=int64)

In [39]:
# esta ok

datos['datebirth'].unique()

array([1972, 1971, 1981, 1976, 1977, 1975, 1964, 1982, 1967, 1985, 1968,
       1983, 1965, 1988, 1978, 1990, 1987, 1989, 1970, 1980, 1963, 1991,
       1986, 1974, 1984, 1973, 1979, 1993, 1994, 1992, 1969, 1966, 1996,
       1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005],
      dtype=int64)

In [40]:
diccionario_mapa = {'Yes':'Yes', '1':'Yes', 'False':'No', '0':'No', 'True':'Yes'}

datos["remotework"] = datos["remotework"].map(diccionario_mapa)

datos['remotework'].unique()

array(['Yes', 'No'], dtype=object)

In [41]:
datos.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager', 'datebirth',
       'remotework'],
      dtype='object')

In [42]:
# qué columnas tienen nulos

nulos_cat = datos[datos.columns[datos.isnull().any()]].select_dtypes(include = "O").columns
print("Las columnas categóricas que tienen nulos son : \n ")
print(nulos_cat)

Las columnas categóricas que tienen nulos son : 
 
Index(['businesstravel', 'department', 'educationfield', 'maritalstatus',
       'monthlyincome', 'overtime', 'performancerating', 'totalworkingyears',
       'worklifebalance', 'yearsincurrentrole'],
      dtype='object')


In [43]:
# Vemos la forma de estas columnas

for col in nulos_cat:
    print(f"La distribución de las categorías para la columna {col.upper()}")
    display(datos[col].value_counts() / datos.shape[0])
    print("........................")

La distribución de las categorías para la columna BUSINESSTRAVEL


businesstravel
Travel rarely        0.363073
Travel frequently    0.102230
Non travel           0.056382
Name: count, dtype: float64

........................
La distribución de las categorías para la columna DEPARTMENT


department
Research & Development    0.121437
Sales                     0.056382
Human Resources           0.009294
Name: count, dtype: float64

........................
La distribución de las categorías para la columna EDUCATIONFIELD


educationfield
Life Sciences       0.216233
Medical             0.171004
Marketing           0.064436
Technical Degree    0.042751
Other               0.036555
Human Resources     0.007435
Name: count, dtype: float64

........................
La distribución de las categorías para la columna MARITALSTATUS


maritalstatus
Married     0.271995
Single      0.201363
Divorced    0.123296
Name: count, dtype: float64

........................
La distribución de las categorías para la columna MONTHLYINCOME


monthlyincome
6347,0     0.002478
5304,0     0.002478
2657,0     0.001859
2258,0     0.001859
5405,0     0.001239
             ...   
3102,0     0.000620
4556,0     0.000620
4230,0     0.000620
4859,0     0.000620
19431,0    0.000620
Name: count, Length: 668, dtype: float64

........................
La distribución de las categorías para la columna OVERTIME


overtime
No     0.422553
Yes    0.158612
Name: count, dtype: float64

........................
La distribución de las categorías para la columna PERFORMANCERATING


performancerating
3,0    0.746592
4,0    0.132590
Name: count, dtype: float64

........................
La distribución de las categorías para la columna TOTALWORKINGYEARS


totalworkingyears
10,0    0.089219
8,0     0.053284
6,0     0.052045
9,0     0.042751
5,0     0.040892
7,0     0.034696
4,0     0.033457
1,0     0.032838
12,0    0.021066
3,0     0.019827
14,0    0.018587
13,0    0.018587
11,0    0.017968
15,0    0.017348
16,0    0.017348
20,0    0.017348
18,0    0.016729
21,0    0.014250
17,0    0.013631
2,0     0.013011
22,0    0.011152
19,0    0.010533
24,0    0.008674
23,0    0.008055
28,0    0.008055
26,0    0.004957
0,0     0.004957
29,0    0.003717
36,0    0.003717
25,0    0.003717
33,0    0.003717
37,0    0.003098
27,0    0.003098
31,0    0.002478
30,0    0.001859
32,0    0.001859
35,0    0.001859
40,0    0.001859
34,0    0.001239
38,0    0.000620
Name: count, dtype: float64

........................
La distribución de las categorías para la columna WORKLIFEBALANCE


worklifebalance
3,0    0.565675
2,0    0.222429
4,0    0.096035
1,0    0.048947
Name: count, dtype: float64

........................
La distribución de las categorías para la columna YEARSINCURRENTROLE


yearsincurrentrole
2,0     0.006815
7,0     0.003098
0,0     0.002478
4,0     0.001859
1,0     0.001859
11,0    0.001239
6,0     0.001239
3,0     0.001239
13,0    0.000620
12,0    0.000620
Name: count, dtype: float64

........................


A partir de Aquí hacemos las funciones

In [None]:
#creo un csv nuevo

In [44]:
datos.to_csv('Datos/datos_empresa_V1.1.csv')

In [53]:
datos = pd.read_csv('Datos/datos_empresa_V1.1.csv', index_col=0)

In [54]:
datos.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework
0,51,No,,6840,,6,3,,1,M,51.0,3,5,Research Director,3,,195370.0,6462,7,No,13,30,3,0,,5,30.0,20,,15,15,1972,Yes
1,52,No,,6990,,1,4,Life Sciences,3,M,65.0,2,5,Manager,3,,199990.0,5678,0,,14,30,1,1,340.0,5,30.0,33,,11,9,1971,Yes
2,42,No,Travel rarely,5320,Research & Development,4,2,Technical Degree,3,M,58.0,3,5,Manager,4,Married,192320.0,4933,1,No,11,30,4,0,220.0,3,,22,,11,15,1981,Yes
3,47,No,Travel rarely,3590,,2,4,Medical,1,F,82.0,3,4,Research Director,3,Married,171690.0,26703,3,,19,30,2,2,,2,,20,,5,6,1976,No
4,46,No,,13190,,3,3,Technical Degree,1,F,45.0,4,4,Sales Executive,1,Divorced,,7739,2,No,12,30,4,1,,5,30.0,19,,2,8,1977,No


In [55]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   businesstravel            842 non-null    object 
 3   dailyrate                 1490 non-null   object 
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   environmentsatisfaction   1614 non-null   int64  
 9   gender                    1614 non-null   object 
 10  hourlyrate                1530 non-null   float64
 11  jobinvolvement            1614 non-null   int64  
 12  joblevel                  1614 non-null   int64  
 13  jobrole                   1614 non-null   object 
 14  jobsatisfacti

In [56]:
cols = ['dailyrate', 'monthlyincome',  'performancerating', 'totalworkingyears', 'worklifebalance', 'yearsincurrentrole']

def cambiar_float(datos, cols):
    for col in cols:
        datos[col] = datos[col].apply(lambda dato: float(dato.replace(",", ".")) if isinstance(dato, str) else np.nan)



In [57]:
cambiar_float(datos, cols)

In [58]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   businesstravel            842 non-null    object 
 3   dailyrate                 1490 non-null   float64
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   environmentsatisfaction   1614 non-null   int64  
 9   gender                    1614 non-null   object 
 10  hourlyrate                1530 non-null   float64
 11  jobinvolvement            1614 non-null   int64  
 12  joblevel                  1614 non-null   int64  
 13  jobrole                   1614 non-null   object 
 14  jobsatisfacti

In [129]:
cols2 = ['age', 'distancefromhome', 'hourlyrate']


        
                

In [52]:
datatexto = datos.select_dtypes(include= ['object']).columns
datatexto

Index(['attrition', 'businesstravel', 'department', 'educationfield', 'gender',
       'jobrole', 'maritalstatus', 'overtime', 'remotework'],
      dtype='object')

In [None]:
# numeros ['age', 'distancefromhome', 'hourlyrate']

In [126]:
datos.sample(2)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework
1342,32,No,Non travel,1184.0,,1,3,Life Sciences,3,F,70,2,1,Laboratory Technician,2,Married,,3974,6,No,20,,3,0,5.0,3,3.0,3,,0,2,1991,Yes
97,36,No,Travel frequently,469.0,,3,3,,3,M,46,3,1,Research Scientist,2,Married,3692.0,9256,1,No,12,3.0,3,0,12.0,2,2.0,11,,0,7,1987,No


In [120]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   object 
 1   attrition                 1614 non-null   object 
 2   businesstravel            842 non-null    object 
 3   dailyrate                 1490 non-null   float64
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   object 
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   environmentsatisfaction   1614 non-null   int64  
 9   gender                    1614 non-null   object 
 10  hourlyrate                1614 non-null   object 
 11  jobinvolvement            1614 non-null   int64  
 12  joblevel                  1614 non-null   int64  
 13  jobrole                   1614 non-null   object 
 14  jobsatisfacti