In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, KFold
from sklearn.metrics import accuracy_score

# Función chequeo

In [158]:
def chequeator(df_to_submit, num_sub):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                df_to_submit.to_csv("submission" + str(num_sub) + ".csv", index = False)
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")


---
---
---

## Columnas

In [3]:
df_columns = pd.read_csv('data/columns_meaning.csv', index_col=0)
df_columns.head()

Unnamed: 0_level_0,Description
Column,Unnamed: 1_level_1
0,Case_ID registered in Hospital
1,Unique code for the Hospital
2,Unique code for the type of Hospital
3,City Code of the Hospital
4,Region Code of the Hospital


In [4]:
dict_columns = {}
for i in range(0,18):
    value = df_columns.iloc[i,0]
    dict_columns[i] = value

In [5]:
dict_columns

{0: 'Case_ID registered in Hospital',
 1: 'Unique code for the Hospital',
 2: 'Unique code for the type of Hospital',
 3: 'City Code of the Hospital',
 4: 'Region Code of the Hospital',
 5: 'Number of Extra rooms available in the Hospital',
 6: 'Department overlooking the case',
 7: 'Code for the Ward type',
 8: 'Code for the Ward Facility',
 9: 'Condition of Bed in the Ward',
 10: 'Unique Patient Id',
 11: 'City Code for the patient',
 12: 'Admission Type registered by the Hospital',
 13: 'Severity of the illness recorded at the time of admission',
 14: 'Number of Visitors with the patient',
 15: 'Age of the patient',
 16: 'Deposit at the Admission Time',
 17: 'Stay Days by the patient'}

## Se carga dftrain

In [6]:
dftrain = pd.read_csv('data/hospital_train.csv')
dftrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
1,159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
2,309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
3,279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
4,147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60


## Se comienza a analizar df_train

In [7]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       100000 non-null  int64  
 1   1       100000 non-null  int64  
 2   2       100000 non-null  object 
 3   3       100000 non-null  int64  
 4   4       100000 non-null  object 
 5   5       100000 non-null  int64  
 6   6       100000 non-null  object 
 7   7       100000 non-null  object 
 8   8       100000 non-null  object 
 9   9       99967 non-null   float64
 10  10      100000 non-null  int64  
 11  11      98517 non-null   float64
 12  12      100000 non-null  object 
 13  13      100000 non-null  object 
 14  14      100000 non-null  int64  
 15  15      100000 non-null  object 
 16  16      100000 non-null  float64
 17  17      100000 non-null  object 
dtypes: float64(3), int64(6), object(9)
memory usage: 13.7+ MB


In [8]:
dftrain.isna().sum()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9       33
10       0
11    1483
12       0
13       0
14       0
15       0
16       0
17       0
dtype: int64

## Tratamiento de valores NaN

In [9]:
dftrain.loc[:,'9'].unique()

array([ 4.,  2.,  3.,  1., nan])

In [10]:
dict_columns[9]

'Condition of Bed in the Ward'

In [11]:
dftrain.loc[:,'11'].unique()

array([ 2., 15.,  5.,  6.,  8.,  7.,  4.,  1.,  9., nan, 23., 31., 12.,
       22.,  3., 30., 16., 10., 32., 27., 20., 14., 13., 21., 26., 19.,
       18., 28., 24., 25., 11., 34., 35., 37., 33., 29., 36., 38.])

In [12]:
dict_columns[11] # La columna 11 no será utilizada en el modelo.

'City Code for the patient'

## Podrían rellenarse los NaN de la columna 9 con la moda.

In [13]:
dftrain.loc[:,'9'].mode()

0    2.0
dtype: float64

In [14]:
dftrain.loc[:,'9'].fillna(dftrain.loc[:,'9'].mode()[0], inplace = True)

In [15]:
dftrain.isna().sum()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11    1483
12       0
13       0
14       0
15       0
16       0
17       0
dtype: int64

In [16]:
dftrain.nunique()

0     100000
1         32
2          7
3         11
4          3
5         16
6          5
7          6
8          6
9          4
10     59097
11        37
12         3
13         3
14        28
15        10
16      6355
17        11
dtype: int64

## Análisis columna por columna --> Tratamiento que se le da a cada una

# Columna TARGET (17)

In [17]:
dftrain.loc[:,'17'].nunique()

11

In [18]:
dftrain.loc[:,'17'].unique()

array(['0-10', '21-30', '11-20', '51-60', '31-40', '71-80',
       'More than 100 Days', '41-50', '81-90', '61-70', '91-100'],
      dtype=object)

## Se le realiza LabelEncoder

In [19]:
leTARGET = LabelEncoder()

In [20]:
dftrain.loc[:,'17'] = leTARGET.fit_transform(dftrain.loc[:,'17'])
dftrain.loc[:,'17']

0         0
1         2
2         1
3         5
4         5
         ..
99995    10
99996     5
99997     3
99998     1
99999     4
Name: 17, Length: 100000, dtype: int32

In [21]:
list(leTARGET.inverse_transform([0,1,2,3,4,5,6,7,8,9,10]))

['0-10',
 '11-20',
 '21-30',
 '31-40',
 '41-50',
 '51-60',
 '61-70',
 '71-80',
 '81-90',
 '91-100',
 'More than 100 Days']

# Columna 0 - Case ID

In [22]:
dftrain.loc[:,'0'].nunique()

100000

## Columna 0 --> No será modificada y no participará del modelo
---
# Columna 1 - Código único de hospital

In [23]:
dftrain.loc[:,'1'].nunique()

32

In [24]:
dftrain.loc[:,'1'].unique()

array([ 6, 23,  2, 32, 14, 15, 12, 19, 11, 21, 26, 27, 29,  9,  8, 28, 24,
       10, 17, 25, 30, 18, 16, 22,  5,  1,  7, 31,  4, 13,  3, 20],
      dtype=int64)

## Columna 1 --> Se aplica One Hot Encoding

In [25]:
dummies_CH1 = pd.get_dummies(dftrain.loc[:,'1'], prefix='CH1')
dummies_CH1.head()

Unnamed: 0,CH1_1,CH1_2,CH1_3,CH1_4,CH1_5,CH1_6,CH1_7,CH1_8,CH1_9,CH1_10,...,CH1_23,CH1_24,CH1_25,CH1_26,CH1_27,CH1_28,CH1_29,CH1_30,CH1_31,CH1_32
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



---
# Columna 2 - Código único de tipo de hospital

In [26]:
dftrain.loc[:,'2'].nunique()

7

In [27]:
dftrain.loc[:,'2'].unique()

array(['a', 'c', 'f', 'b', 'd', 'e', 'g'], dtype=object)

## Columna 2 --> Se aplica One Hot Encoding

In [28]:
dummies_CTH2 = pd.get_dummies(dftrain.loc[:,'2'], prefix='CTH2')
dummies_CTH2

Unnamed: 0,CTH2_a,CTH2_b,CTH2_c,CTH2_d,CTH2_e,CTH2_f,CTH2_g
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
99995,1,0,0,0,0,0,0
99996,0,1,0,0,0,0,0
99997,1,0,0,0,0,0,0
99998,0,0,0,0,0,1,0



---
# Columna 3 - Código de ciudad del hospital

In [29]:
dftrain.loc[:,'3'].nunique()

11

In [30]:
dftrain.loc[:,'3'].unique()

array([ 6,  5,  9,  1,  7,  2,  3,  4, 11, 13, 10], dtype=int64)

## Columna 3 --> Se aplica One Hot Encoding

In [31]:
dummies_CCH3 = pd.get_dummies(dftrain.loc[:,'3'], prefix='CCH3')
dummies_CCH3

Unnamed: 0,CCH3_1,CCH3_2,CCH3_3,CCH3_4,CCH3_5,CCH3_6,CCH3_7,CCH3_9,CCH3_10,CCH3_11,CCH3_13
0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,1,0,0,0
99996,0,0,0,0,0,0,0,0,0,1,0
99997,0,0,0,0,0,1,0,0,0,0,0
99998,0,0,0,0,0,0,0,1,0,0,0



---
# Columna 4 - Código de región del hospital

In [32]:
dftrain.loc[:,'4'].nunique()

3

In [33]:
dftrain.loc[:,'4'].unique()

array(['X', 'Z', 'Y'], dtype=object)

## Columna 4 --> Altamente relacionada con columna 3 --> No se utilizará columna 4 en el modelo

In [34]:
dftrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0
1,159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,2
2,309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,1
3,279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,5
4,147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,5


In [35]:
dftrain.drop(['4'], axis=1, inplace=True)

In [36]:
dftrain.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,5



---
# Columna 5 - Número de habitaciones extra disponibles en el hospital

In [37]:
dftrain.loc[:,'5'].nunique()

16

In [38]:
dftrain.loc[:,'5'].unique()

array([ 2,  4,  3,  5,  6,  7,  1,  8, 10,  9,  0, 11, 14, 21, 12, 24],
      dtype=int64)

## Columna 5 --> Se dejará tal cual está, ya que una mayor disponibilidad de habitaciones podría permitir un mayor tiempo de estadía de los pacientes.
---
# Columna 6 - Departamento encargado del caso

In [39]:
dftrain.loc[:,'6'].nunique()

5

In [40]:
dftrain.loc[:,'6'].unique()

array(['gynecology', 'anesthesia', 'radiotherapy', 'TB & Chest disease',
       'surgery'], dtype=object)

## Una vez hecho el LabelEncoder para nuestro TARGET, se evalúa la media de estadía de cada persona dependiendo del departamento que lleva adelante el caso.

In [41]:
dftrain[dftrain['6'] == 'gynecology'].mean()[-1]

2.7146796369346253

In [42]:
dftrain[dftrain['6'] == 'anesthesia'].mean()[-1]

2.524724681494278

In [43]:
dftrain[dftrain['6'] == 'radiotherapy'].mean()[-1]

2.8346615781793694

In [44]:
dftrain[dftrain['6'] == 'TB & Chest disease'].mean()[-1]

2.5706924315619966

In [45]:
dftrain[dftrain['6'] == 'surgery'].mean()[-1]

3.296675191815857

## Columna 6 --> Todos tienen valores de media similares, por lo que se decide realizarle One Hot Encoding

In [46]:
dummies_DEP6 = pd.get_dummies(dftrain.loc[:,'6'], prefix='DEP6')
dummies_DEP6

Unnamed: 0,DEP6_TB & Chest disease,DEP6_anesthesia,DEP6_gynecology,DEP6_radiotherapy,DEP6_surgery
0,0,0,1,0,0
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
99995,0,0,1,0,0
99996,0,0,1,0,0
99997,0,0,1,0,0
99998,0,0,1,0,0



---
# Columna 7 - Código del tipo de sala

In [47]:
dftrain.loc[:,'7'].nunique()

6

In [48]:
dftrain.loc[:,'7'].unique()

array(['R', 'Q', 'S', 'P', 'T', 'U'], dtype=object)

## Una vez hecho el LabelEncoder para nuestro TARGET, se evalúa la media de estadía de cada persona dependiendo del tipo de sala

In [49]:
dftrain[dftrain['7'] == 'P'].mean()[-1]

2.1412140575079874

In [50]:
dftrain[dftrain['7'] == 'Q'].mean()[-1]

2.2920630157539383

In [51]:
dftrain[dftrain['7'] == 'R'].mean()[-1]

2.647196029776675

In [52]:
dftrain[dftrain['7'] == 'S'].mean()[-1]

3.402611802390046

In [53]:
dftrain[dftrain['7'] == 'T'].mean()[-1]

2.807017543859649

In [54]:
dftrain[dftrain['7'] == 'U'].mean()[-1]

1.6666666666666667

## Columna 7 --> Todos tienen valores de media similares, por lo que se decide realizarle One Hot Encoding

In [55]:
dummies_TS7 = pd.get_dummies(dftrain.loc[:,'7'], prefix='TS7')
dummies_TS7

Unnamed: 0,TS7_P,TS7_Q,TS7_R,TS7_S,TS7_T,TS7_U
0,0,0,1,0,0,0
1,0,1,0,0,0,0
2,0,0,0,1,0,0
3,0,0,0,1,0,0
4,0,0,0,1,0,0
...,...,...,...,...,...,...
99995,0,0,1,0,0,0
99996,0,0,1,0,0,0
99997,0,1,0,0,0,0
99998,0,0,0,1,0,0



---
# Columna 8 - Código de las instalaciones de la sala

In [56]:
dftrain.loc[:,'8'].nunique()

6

In [57]:
dftrain.loc[:,'8'].unique()

array(['F', 'B', 'E', 'C', 'D', 'A'], dtype=object)

## Una vez hecho el LabelEncoder para nuestro TARGET, se evalúa la media de estadía de cada persona dependiendo de las instalaciones de la sala

In [58]:
dftrain[dftrain['8'] == 'A'].mean()[-1]

2.7611717403790923

In [59]:
dftrain[dftrain['8'] == 'B'].mean()[-1]

2.9204085388295913

In [60]:
dftrain[dftrain['8'] == 'C'].mean()[-1]

2.3202544118964434

In [61]:
dftrain[dftrain['8'] == 'D'].mean()[-1]

2.8727361424477103

In [62]:
dftrain[dftrain['8'] == 'E'].mean()[-1]

2.5684570369082795

In [63]:
dftrain[dftrain['8'] == 'F'].mean()[-1]

2.7367016794922203

## Columna 8 --> Todos tienen valores de media similares, por lo que se decide realizarle One Hot Encoding

In [64]:
dummies_IS8 = pd.get_dummies(dftrain.loc[:,'8'], prefix='IS8')
dummies_IS8

Unnamed: 0,IS8_A,IS8_B,IS8_C,IS8_D,IS8_E,IS8_F
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,1,0,0,0,0
4,0,0,0,0,1,0
...,...,...,...,...,...,...
99995,0,1,0,0,0,0
99996,0,0,0,0,0,1
99997,0,0,0,0,0,1
99998,0,1,0,0,0,0



---
# Columna 9 - Condición de la cama en la sala

In [65]:
dftrain.loc[:,'9'].nunique()

4

In [66]:
dftrain.loc[:,'9'].unique()

array([4., 2., 3., 1.])

## Columna 9 --> Se dejará tal cual está, ya que un mejor estado de la cama podría derivar en un mayor tiempo de estadía de los pacientes.
---
# Columna 10 - Id de paciente

In [67]:
dftrain.loc[:,'10'].nunique()

59097

In [68]:
dftrain.loc[:,'10'].unique()

array([ 45810, 128565,  46565, ...,  59825,  38625, 113798], dtype=int64)

## Columna 10 --> Aplicar One Hot Encoding agregaría miles de columnas al dataframe, por otro lado, dejarlo tal cual se encuentra ponderaría más a un paciente que a otro. Finalmente se decide no utilizar esta columna en el análisis.

In [69]:
dftrain.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,5


In [70]:
dftrain.drop(['10'], axis=1, inplace=True)

In [71]:
dftrain.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,11,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,2.0,Urgent,Moderate,2,21-30,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,15.0,Trauma,Moderate,4,51-60,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,5.0,Urgent,Moderate,2,71-80,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,6.0,Emergency,Moderate,4,11-20,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,8.0,Urgent,Moderate,2,51-60,3398.0,5



---
# Columna 11 - Código de ciudad del paciente

In [72]:
dftrain.loc[:,'11'].nunique()

37

In [73]:
dftrain.loc[:,'11'].unique()

array([ 2., 15.,  5.,  6.,  8.,  7.,  4.,  1.,  9., nan, 23., 31., 12.,
       22.,  3., 30., 16., 10., 32., 27., 20., 14., 13., 21., 26., 19.,
       18., 28., 24., 25., 11., 34., 35., 37., 33., 29., 36., 38.])

## Columna 11 --> No será utilizada en el modelo.

In [74]:
dftrain.drop(['11'], axis=1, inplace=True)

In [75]:
dftrain.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,Urgent,Moderate,2,21-30,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,Trauma,Moderate,4,51-60,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,Urgent,Moderate,2,71-80,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,Emergency,Moderate,4,11-20,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,Urgent,Moderate,2,51-60,3398.0,5



---
# Columna 12 - Tipo de admisión registrada en el hospital

In [76]:
dftrain.loc[:,'12'].nunique()

3

In [77]:
dftrain.loc[:,'12'].unique()

array(['Urgent', 'Trauma', 'Emergency'], dtype=object)

## Una vez hecho el LabelEncoder para nuestro TARGET, se evalúa la media de estadía de cada persona dependiendo del tipo de admisión con el que fue registrada en el hospital

In [78]:
dftrain[dftrain['12'] == 'Urgent'].mean()[-1]

2.499275171323142

In [79]:
dftrain[dftrain['12'] == 'Trauma'].mean()[-1]

2.9102173044155077

In [80]:
dftrain[dftrain['12'] == 'Emergency'].mean()[-1]

2.5286560999812266

## Columna 12 --> Todos tienen valores de media similares, por lo que se decide realizarle One Hot Encoding

In [81]:
dummies_TA12 = pd.get_dummies(dftrain.loc[:,'12'], prefix='TA12')
dummies_TA12

Unnamed: 0,TA12_Emergency,TA12_Trauma,TA12_Urgent
0,0,0,1
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
99995,1,0,0
99996,0,0,1
99997,0,1,0
99998,0,1,0



---
# Columna 13 - Severidad de la enfermedad registrado al ingreso al hospital

In [82]:
dftrain.loc[:,'13'].nunique()

3

In [83]:
dftrain.loc[:,'13'].unique()

array(['Moderate', 'Extreme', 'Minor'], dtype=object)

## Se realiza ENCODING PERSONALIZADO para ponderar las categorías correctamente

In [84]:
condicion = [(dftrain.loc[:,'13'] == 'Minor'), 
    (dftrain.loc[:,'13'] == 'Moderate'),
    (dftrain.loc[:,'13'] == 'Extreme')]
encoding = [1, 2, 3]
dftrain.loc[:,'13'] = np.select(condicion, encoding)

In [85]:
dftrain.head()

Unnamed: 0,0,1,2,3,5,6,7,8,9,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,Urgent,2,2,21-30,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,Trauma,2,4,51-60,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,Urgent,2,2,71-80,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,Emergency,2,4,11-20,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,Urgent,2,2,51-60,3398.0,5


## Columna 13 --> Se le realizó ENCODING personalizado para ponderar categorías
---
# Columna 14 - Número de visitantes con el paciente

In [86]:
dftrain.loc[:,'14'].nunique()

28

In [87]:
dftrain.loc[:,'14'].unique()

array([ 2,  4,  6,  3,  8,  5,  7, 12, 10,  9, 15, 20,  1, 13, 11, 16, 14,
       30, 21, 24,  0, 18, 17, 22, 25, 19, 23, 32], dtype=int64)

In [88]:
for i in np.sort(dftrain.loc[:,'14'].unique()):
    print('Número visitantes:', i, '--> ', dftrain[dftrain['14'] == i].mean()[-1])

Número visitantes: 0 -->  1.7777777777777777
Número visitantes: 1 -->  1.5041322314049588
Número visitantes: 2 -->  1.9281734886504904
Número visitantes: 3 -->  2.14228780697975
Número visitantes: 4 -->  3.097172859450727
Número visitantes: 5 -->  4.446875
Número visitantes: 6 -->  4.981848739495798
Número visitantes: 7 -->  5.767287234042553
Número visitantes: 8 -->  6.1228175797712225
Número visitantes: 9 -->  6.826603325415677
Número visitantes: 10 -->  7.941747572815534
Número visitantes: 11 -->  7.892857142857143
Número visitantes: 12 -->  7.955271565495208
Número visitantes: 13 -->  8.55
Número visitantes: 14 -->  9.0
Número visitantes: 15 -->  8.868852459016393
Número visitantes: 16 -->  8.98876404494382
Número visitantes: 17 -->  10.0
Número visitantes: 18 -->  9.95
Número visitantes: 19 -->  10.0
Número visitantes: 20 -->  9.26923076923077
Número visitantes: 21 -->  10.0
Número visitantes: 22 -->  9.166666666666666
Número visitantes: 23 -->  10.0
Número visitantes: 24 -->  10.

## Columna 14 --> Tendencia de aumento de estadía en función del número de visitantes. Se deja tal cual está
---
# Columna 15 - Edad del paciente

In [89]:
dftrain.loc[:,'15'].nunique()

10

In [90]:
dftrain.loc[:,'15'].unique()

array(['21-30', '51-60', '71-80', '11-20', '31-40', '0-10', '61-70',
       '41-50', '81-90', '91-100'], dtype=object)

## Se le realiza LabelEncoder (se espera que, mientras mayor es el paciente, más tarde en recuperarse)

In [91]:
le15 = LabelEncoder()

In [92]:
dftrain.loc[:,'15'] = le15.fit_transform(dftrain.loc[:,'15'])
dftrain.loc[:,'15']

0        2
1        5
2        7
3        1
4        5
        ..
99995    5
99996    2
99997    3
99998    4
99999    4
Name: 15, Length: 100000, dtype: int32

In [93]:
le15.inverse_transform([0,1,2,3,4,5,6,7,8,9])

array(['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70',
       '71-80', '81-90', '91-100'], dtype=object)

In [94]:
for i in np.sort(dftrain.loc[:,'15'].unique()):
    print('Label rango edad:', i, '--> ', dftrain[dftrain['15'] == i].mean()[-1])

Label rango edad: 0 -->  2.4335410176531673
Label rango edad: 1 -->  2.330016895062887
Label rango edad: 2 -->  2.443112828195222
Label rango edad: 3 -->  2.5893490533046912
Label rango edad: 4 -->  2.729505344270445
Label rango edad: 5 -->  2.8238778465183625
Label rango edad: 6 -->  2.8591880945653196
Label rango edad: 7 -->  2.9355039034776436
Label rango edad: 8 -->  3.2868954113269946
Label rango edad: 9 -->  3.33953488372093


## Aquí se observa que la estadía media aumenta en función del rango de edad, por lo que el LabelEncoder podría considerarse correcto

## Columna 15 --> Se realizó LabelEncoder en función del rango de edad. Se observó una estadía media en aumento en función del rango de edad
---
# Columna 16 - Depósito al momento de la admisión

In [95]:
dftrain.loc[:,'16'].nunique()

6355

In [96]:
dftrain.loc[:,'16'].unique()

array([2817., 4498., 4573., ..., 2721., 2370., 7373.])

In [97]:
print('Depósito inicial mínimo:', dftrain.loc[:,'16'].unique().min())

Depósito inicial mínimo: 1800.0


In [98]:
print('Depósito inicial máximo:', dftrain.loc[:,'16'].unique().max())

Depósito inicial máximo: 10842.0


In [99]:
print('Rango 1:', dftrain[dftrain['16'] <= 2805].mean()[-1])
print('Rango 2:', dftrain[(dftrain['16'] > 2805) & (dftrain['16'] <= 3810)].mean()[-1])
print('Rango 3:', dftrain[(dftrain['16'] > 3810) & (dftrain['16'] <= 4815)].mean()[-1])
print('Rango 4:', dftrain[(dftrain['16'] > 4815) & (dftrain['16'] <= 5820)].mean()[-1])
print('Rango 5:', dftrain[(dftrain['16'] > 5820) & (dftrain['16'] <= 6825)].mean()[-1])
print('Rango 6:', dftrain[(dftrain['16'] > 6825) & (dftrain['16'] <= 7830)].mean()[-1])
print('Rango 7:', dftrain[(dftrain['16'] > 7830) & (dftrain['16'] <= 8835)].mean()[-1])
print('Rango 8:', dftrain[(dftrain['16'] > 8835) & (dftrain['16'] <= 9840)].mean()[-1])
print('Rango 10:', dftrain[dftrain['16'] > 9840].mean()[-1])

Rango 1: 3.9408233276157802
Rango 2: 3.250784762874353
Rango 3: 2.6942007857478196
Rango 4: 2.453789188660164
Rango 5: 2.5751290384247754
Rango 6: 2.8903350872334532
Rango 7: 3.1004989308624378
Rango 8: 3.654891304347826
Rango 10: 4.050632911392405


## Se observa que para depósitos tiende a haber una estadía más corta, mientras que depósitos más extremos (superiores e inferiores) tienden prolongarla.
## Dos posibilidades:
## - Dejarlo tal cual aparece en el dataframe (1ra OPCIÓN)
## - Ponderar los rangos (2da OPCIÓN)

In [100]:
# CÓDIGO PARA LA 2da OPCIÓN

#cond = [(dftrain_V2['16'] <= 2805), 
#    ((dftrain_V2['16'] > 2805) & (dftrain_V2['16'] <= 3810)),
#    ((dftrain_V2['16'] > 3810) & (dftrain_V2['16'] <= 4815)),
#    ((dftrain_V2['16'] > 4815) & (dftrain_V2['16'] <= 5820)),
#    ((dftrain_V2['16'] > 5820) & (dftrain_V2['16'] <= 6825)),
#    ((dftrain_V2['16'] > 6825) & (dftrain_V2['16'] <= 7830)),
#    ((dftrain_V2['16'] > 7830) & (dftrain_V2['16'] <= 8835)),
#    ((dftrain_V2['16'] > 8835) & (dftrain_V2['16'] <= 9840)),
#    (dftrain_V2['16'] > 9840)]
#encod = [3,2,1,1,1,1,2,2,3]
#dftrain_V2.iloc[:,16] = np.select(cond, encod)


In [101]:
dftrain

Unnamed: 0,0,1,2,3,5,6,7,8,9,12,13,14,15,16,17
0,161528,6,a,6,2,gynecology,R,F,4.0,Urgent,2,2,2,2817.0,0
1,159472,23,a,6,4,gynecology,Q,F,2.0,Trauma,2,4,5,4498.0,2
2,309765,2,c,5,2,anesthesia,S,F,3.0,Urgent,2,2,7,4573.0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,Emergency,2,4,1,7202.0,5
4,147791,14,a,1,3,gynecology,S,E,2.0,Urgent,2,2,5,3398.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,a,9,3,gynecology,R,B,3.0,Emergency,2,6,5,3966.0,10
99996,254763,28,b,11,2,gynecology,R,F,2.0,Urgent,2,3,2,4005.0,5
99997,69788,6,a,6,3,gynecology,Q,F,3.0,Trauma,1,2,3,5215.0,3
99998,204442,32,f,9,2,gynecology,S,B,4.0,Trauma,2,3,4,5092.0,1


---

## Agregado de los One Hot Encodings

In [102]:
dftrain = pd.concat([dftrain, dummies_CH1, dummies_CTH2, dummies_CCH3, dummies_DEP6, dummies_TS7, dummies_IS8, dummies_TA12], axis = 1)

In [103]:
dftrain

Unnamed: 0,0,1,2,3,5,6,7,8,9,12,...,TS7_U,IS8_A,IS8_B,IS8_C,IS8_D,IS8_E,IS8_F,TA12_Emergency,TA12_Trauma,TA12_Urgent
0,161528,6,a,6,2,gynecology,R,F,4.0,Urgent,...,0,0,0,0,0,0,1,0,0,1
1,159472,23,a,6,4,gynecology,Q,F,2.0,Trauma,...,0,0,0,0,0,0,1,0,1,0
2,309765,2,c,5,2,anesthesia,S,F,3.0,Urgent,...,0,0,0,0,0,0,1,0,0,1
3,279614,32,f,9,3,gynecology,S,B,4.0,Emergency,...,0,0,1,0,0,0,0,1,0,0
4,147791,14,a,1,3,gynecology,S,E,2.0,Urgent,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,a,9,3,gynecology,R,B,3.0,Emergency,...,0,0,1,0,0,0,0,1,0,0
99996,254763,28,b,11,2,gynecology,R,F,2.0,Urgent,...,0,0,0,0,0,0,1,0,0,1
99997,69788,6,a,6,3,gynecology,Q,F,3.0,Trauma,...,0,0,0,0,0,0,1,0,1,0
99998,204442,32,f,9,2,gynecology,S,B,4.0,Trauma,...,0,0,1,0,0,0,0,0,1,0


## Eliminación de las columnas codificadas

In [104]:
dftrain_opcion1 = dftrain.drop(['1', '2', '3', '6', '7', '8', '12'], axis=1)

In [105]:
dftrain_opcion1.head()

Unnamed: 0,0,5,9,13,14,15,16,17,CH1_1,CH1_2,...,TS7_U,IS8_A,IS8_B,IS8_C,IS8_D,IS8_E,IS8_F,TA12_Emergency,TA12_Trauma,TA12_Urgent
0,161528,2,4.0,2,2,2,2817.0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,159472,4,2.0,2,4,5,4498.0,2,0,0,...,0,0,0,0,0,0,1,0,1,0
2,309765,2,3.0,2,2,7,4573.0,1,0,1,...,0,0,0,0,0,0,1,0,0,1
3,279614,3,4.0,2,4,1,7202.0,5,0,0,...,0,0,1,0,0,0,0,1,0,0
4,147791,3,2.0,2,2,5,3398.0,5,0,0,...,0,0,0,0,0,1,0,0,0,1


## Set_index columna de Case_ID

In [106]:
dftrain_opcion1.set_index('0', inplace=True)

In [107]:
dftrain_opcion1.head()

Unnamed: 0_level_0,5,9,13,14,15,16,17,CH1_1,CH1_2,CH1_3,...,TS7_U,IS8_A,IS8_B,IS8_C,IS8_D,IS8_E,IS8_F,TA12_Emergency,TA12_Trauma,TA12_Urgent
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
161528,2,4.0,2,2,2,2817.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
159472,4,2.0,2,4,5,4498.0,2,0,0,0,...,0,0,0,0,0,0,1,0,1,0
309765,2,3.0,2,2,7,4573.0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
279614,3,4.0,2,4,1,7202.0,5,0,0,0,...,0,0,1,0,0,0,0,1,0,0
147791,3,2.0,2,2,5,3398.0,5,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [108]:
dftrain_opcion1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 161528 to 69647
Data columns (total 77 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   5                        100000 non-null  int64  
 1   9                        100000 non-null  float64
 2   13                       100000 non-null  int32  
 3   14                       100000 non-null  int64  
 4   15                       100000 non-null  int32  
 5   16                       100000 non-null  float64
 6   17                       100000 non-null  int32  
 7   CH1_1                    100000 non-null  uint8  
 8   CH1_2                    100000 non-null  uint8  
 9   CH1_3                    100000 non-null  uint8  
 10  CH1_4                    100000 non-null  uint8  
 11  CH1_5                    100000 non-null  uint8  
 12  CH1_6                    100000 non-null  uint8  
 13  CH1_7                    100000 non-null  uint8  
 14  

In [109]:
dftrain_opcion1['17'] = dftrain_opcion1['17'].astype('category')

In [110]:
dftrain_opcion1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 161528 to 69647
Data columns (total 77 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   5                        100000 non-null  int64   
 1   9                        100000 non-null  float64 
 2   13                       100000 non-null  int32   
 3   14                       100000 non-null  int64   
 4   15                       100000 non-null  int32   
 5   16                       100000 non-null  float64 
 6   17                       100000 non-null  category
 7   CH1_1                    100000 non-null  uint8   
 8   CH1_2                    100000 non-null  uint8   
 9   CH1_3                    100000 non-null  uint8   
 10  CH1_4                    100000 non-null  uint8   
 11  CH1_5                    100000 non-null  uint8   
 12  CH1_6                    100000 non-null  uint8   
 13  CH1_7                    100000 non-null

---

## Defino x e y

In [111]:
x1 = np.array(dftrain_opcion1.drop(['17'], axis=1))
x1.shape

(100000, 76)

In [112]:
y = np.array(dftrain_opcion1['17'])
y.shape

(100000,)

## MODELO 1 - RandomForestClassifier

In [113]:
x_train, x_test, y_train, y_test = train_test_split(x1, y, test_size=0.2, random_state=42)
print('x_train:', x_train.shape)
print('x_test:', x_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
print('\n---------------\n')
modelo1 = RandomForestClassifier(n_estimators=200, warm_start=False, random_state= 42)
modelo1.fit(x_train, y_train)
print('Score TRAIN:', modelo1.score(x_train,y_train))
print('Score TEST:', modelo1.score(x_test,y_test))
y_pred_test = modelo1.predict(x_test)

x_train: (80000, 76)
x_test: (20000, 76)
y_train: (80000,)
y_test: (20000,)

---------------

Score TRAIN: 0.999775
Score TEST: 0.35215


In [114]:
accuracy_score(y_test, y_pred_test)

0.35215

In [115]:
modelo1.fit(x1, y)

RandomForestClassifier(n_estimators=200, random_state=42)

In [116]:
y_pred_total = modelo1.predict(x1)

In [117]:
accuracy_score(y, y_pred_total)

0.99961

## Cargo dataset de TEST

In [139]:
dftest = pd.read_csv('data/hospital_test.csv')
dftest.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,314114,19,a,7,Y,4,gynecology,S,C,2.0,59278,8.0,Emergency,Minor,2,41-50,4778.0
1,208989,15,c,5,Z,3,gynecology,S,F,2.0,102253,15.0,Trauma,Moderate,3,31-40,5734.0
2,305872,17,e,1,X,4,gynecology,R,E,4.0,5828,4.0,Emergency,Minor,3,71-80,5064.0
3,266099,3,c,3,Z,4,TB & Chest disease,R,A,2.0,56642,9.0,Urgent,Extreme,4,31-40,3254.0
4,13228,6,a,6,X,4,gynecology,R,F,1.0,116266,8.0,Emergency,Minor,3,21-30,4639.0


In [140]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133744 entries, 0 to 133743
Data columns (total 17 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       133744 non-null  int64  
 1   1       133744 non-null  int64  
 2   2       133744 non-null  object 
 3   3       133744 non-null  int64  
 4   4       133744 non-null  object 
 5   5       133744 non-null  int64  
 6   6       133744 non-null  object 
 7   7       133744 non-null  object 
 8   8       133744 non-null  object 
 9   9       133704 non-null  float64
 10  10      133744 non-null  int64  
 11  11      131927 non-null  float64
 12  12      133744 non-null  object 
 13  13      133744 non-null  object 
 14  14      133744 non-null  int64  
 15  15      133744 non-null  object 
 16  16      133744 non-null  float64
dtypes: float64(3), int64(6), object(8)
memory usage: 17.3+ MB


## Igualando df TEST a df TRAIN

In [141]:
dftest.isna().sum()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9       40
10       0
11    1817
12       0
13       0
14       0
15       0
16       0
dtype: int64

In [142]:
dftest.loc[:,'9'].fillna(dftest.loc[:,'9'].mode()[0], inplace = True)
dummies_CH1_test = pd.get_dummies(dftest.loc[:,'1'], prefix='CH1')
dummies_CTH2_test = pd.get_dummies(dftest.loc[:,'2'], prefix='CTH2')
dummies_CCH3_test = pd.get_dummies(dftest.loc[:,'3'], prefix='CCH3')
dummies_DEP6_test = pd.get_dummies(dftest.loc[:,'6'], prefix='DEP6')
dummies_TS7_test = pd.get_dummies(dftest.loc[:,'7'], prefix='TS7')
dummies_IS8_test = pd.get_dummies(dftest.loc[:,'8'], prefix='IS8')
dftest.drop(['10'], axis=1, inplace=True)
dftest.drop(['11'], axis=1, inplace=True)
dummies_TA12_test = pd.get_dummies(dftest.loc[:,'12'], prefix='TA12')
#-----------------------------------------------------------------------
condicion = [(dftest.loc[:,'13'] == 'Minor'), 
    (dftest.loc[:,'13'] == 'Moderate'),
    (dftest.loc[:,'13'] == 'Extreme')]
encoding = [1, 2, 3]
dftest.loc[:,'13'] = np.select(condicion, encoding)
#-----------------------------------------------------------------------
le15_test = LabelEncoder()
dftest.loc[:,'15'] = le15_test.fit_transform(dftest.loc[:,'15'])
#-----------------------------------------------------------------------
dftest = pd.concat([dftest, dummies_CH1_test, dummies_CTH2_test, dummies_CCH3_test, dummies_DEP6_test, dummies_TS7_test, dummies_IS8_test, dummies_TA12_test], axis = 1)
#-----------------------------------------------------------------------
dftest_opcion1 = dftest.drop(['1', '2', '3', '4', '6', '7', '8', '12'], axis=1)
#-----------------------------------------------------------------------
dftest_opcion1.set_index('0', inplace=True)

In [143]:
dftest_opcion1.head()

Unnamed: 0_level_0,5,9,13,14,15,16,CH1_1,CH1_2,CH1_3,CH1_4,...,TS7_U,IS8_A,IS8_B,IS8_C,IS8_D,IS8_E,IS8_F,TA12_Emergency,TA12_Trauma,TA12_Urgent
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
314114,4,2.0,1,2,4,4778.0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
208989,3,2.0,2,3,3,5734.0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
305872,4,4.0,1,3,7,5064.0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
266099,4,2.0,3,4,3,3254.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
13228,4,1.0,1,3,2,4639.0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [144]:
x_conjunto_test = np.array(dftest_opcion1)
x_conjunto_test.shape

(133744, 76)

In [145]:
y_prediccion_submit = modelo1.predict(x_conjunto_test)

In [146]:
y_prediccion_submit

array([1, 5, 1, ..., 2, 2, 2], dtype=int64)

In [147]:
y_submit = leTARGET.inverse_transform(y_prediccion_submit)

In [148]:
y_submit

array(['11-20', '51-60', '11-20', ..., '21-30', '21-30', '21-30'],
      dtype=object)

## Cargo dataframe SAMPLE

In [160]:
sample = pd.read_csv('data/sample_submission.csv')
sample.head()

Unnamed: 0,id,days
0,314114,11-20
1,208989,31-40
2,305872,81-90
3,266099,21-30
4,13228,31-40


In [156]:
dftest_opcion1.index

Int64Index([314114, 208989, 305872, 266099,  13228, 181868, 306919, 281736,
            102076, 294069,
            ...
            296167, 190912, 225943, 225880, 173771, 318155, 144850, 180676,
             39933, 116673],
           dtype='int64', name='0', length=133744)

In [157]:
submission1 = pd.DataFrame({"id": dftest_opcion1.index, "days": y_submit})
submission1.head()

Unnamed: 0,id,days
0,314114,11-20
1,208989,51-60
2,305872,11-20
3,266099,51-60
4,13228,21-30


In [161]:
chequeator(submission1, 1)

You're ready to submit!


---