In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from scipy import stats
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense,BatchNormalization, Normalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import log_loss
from skmultilearn.model_selection import IterativeStratification




In [8]:
df = pd.read_csv('train.csv', index_col='id')
df

Unnamed: 0_level_0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,1166,D-penicillamine,16839,F,N,N,N,N,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0,C
7901,1492,Placebo,17031,F,N,Y,N,N,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0,C
7902,1576,D-penicillamine,25873,F,N,N,Y,S,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0,D
7903,3584,D-penicillamine,22960,M,N,Y,N,N,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0,D


In [23]:
# Separamos en variables dependientes y independientes
X = df.drop('Status', axis=1)
y = df['Status']

### Algunas ingenieria de variables

In [10]:
class Age_Format (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de Age, calculando el valor real edad desde los dias de nacimiento de cada paciente.
    def transform(self,X):
        X_copy = X.copy()
        
        new_age = []
        for i in X_copy['Age']:
            days = i
            age = days/365
            new_age.append(round(age))
        
        X_copy = X_copy.drop('Age', axis = 1)
        X_copy['Age'] = new_age
        
        return X_copy

In [11]:
Age_ = Age_Format()
X = Age_.transform(X)

In [12]:
class Sgot_Range (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de Sgot, calculando en base a analisis, los pacientes con mayor o menor
    # rango. Si supera el nivel 100, se marcara como 1, en caso contrario 0
    def transform(self,X):
        X_copy = X.copy()
        
        Sgot_survival = []
        for i in X_copy['SGOT']:
            if i > 100:
                Sgot_survival.append(1)
            else: Sgot_survival.append(0)
        
        X_copy['Sgot_survival'] = Sgot_survival
        
        return X_copy

In [13]:
Sgot_ =  Sgot_Range()
X = Sgot_.transform(X)

In [14]:
class Bilirubin_Range (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de Bilirubin, calculando en base a analisis, los pacientes con mayor o menor
    # rango. Si supera el nivel 3, se marcara como 1, en caso contrario 0
    def transform(self,X):
        X_copy = X.copy()
        
        Bilirubin_survival = []
        for i in X_copy['Bilirubin']:
            if i > 1.5:
                Bilirubin_survival.append(1)
            else: Bilirubin_survival.append(0)
        
        X_copy['Bilirubin_survival'] = Bilirubin_survival
        
        return X_copy

In [15]:
Bilirubin_ = Bilirubin_Range()
X = Bilirubin_.transform(X)

In [16]:
class Copper_Range (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de Copper, calculando en base a analisis, los pacientes con mayor o menor
    # rango. Si supera el nivel 60, se marcara como 1, en caso contrario 0
    def transform(self,X):
        X_copy = X.copy()
        
        Copper_survival = []
        for i in X_copy['Copper']:
            if i >= 60:
                Copper_survival.append(1)
            else: Copper_survival.append(0)
        
        X_copy['Copper_survival'] = Copper_survival
        
        return X_copy

In [17]:
Copper_ = Copper_Range()
X = Copper_.transform(X)

### Creamos caracteristica cluster

### Transformacion distribucion de variables 

In [18]:
class Tranformation_Distribution (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de X_copy, devolviendo el valor devuelto por boxcox (no el valor lambda).
    def transform(self,X):
        X_copy = X.copy()    
        column_names = ['Bilirubin','Cholesterol','Copper','Alk_Phos','SGOT','Tryglicerides','Prothrombin']
        
        for i in column_names:
            transformed_col,_ = stats.boxcox(X_copy[i])
            X_copy[i] = transformed_col
        
        X_copy = X_copy.drop('N_Days', axis = 1)
        
        return X_copy

In [19]:
transformers_ = Tranformation_Distribution()
X = transformers_.transform(X)

In [20]:
class Clustering (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):       
        return self
    
    # Iteramos por el nombre de columna de X_copy, devolviendo el valor devuelto por boxcox (no el valor lambda).
    def transform(self,X):
        X_copy = X.copy()    
        
        # pipeline para escalar numericos y codificar categoricos
        numerical_columns = X_copy.select_dtypes(include='number')
        categorical_columns = X_copy.select_dtypes(exclude='number')


        # automatizado de transformacion categoricas y numericas (Normalizado y OneHot)
        categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore',sparse_output=False, drop='first'))
        numerical_pipeline = make_pipeline(MinMaxScaler())

        preprocessing = ColumnTransformer([
            ('num', numerical_pipeline, numerical_columns.columns),
            ('cat', categorical_pipeline, categorical_columns.columns),
        ], remainder='passthrough')

        # Unimos en un pipeline global
        pipeline_full = make_pipeline(preprocessing)

        X_processed = pipeline_full.fit_transform(X_copy)

        
        kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=42)
        y_kmeans = kmeans.fit_predict(X_processed)
        
        
        # transformar X_processed a series de pandas(dataframe)
        X_copy = pd.DataFrame(X_processed, columns= preprocessing.get_feature_names_out())
        # unimos vector de predicciones con df
        X_copy['cluster'] = list(y_kmeans)
        
        return X_copy

In [21]:
Clustering_ =  Clustering()
X = Clustering_.transform(X)

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   num__Bilirubin           7905 non-null   float64
 1   num__Cholesterol         7905 non-null   float64
 2   num__Albumin             7905 non-null   float64
 3   num__Copper              7905 non-null   float64
 4   num__Alk_Phos            7905 non-null   float64
 5   num__SGOT                7905 non-null   float64
 6   num__Tryglicerides       7905 non-null   float64
 7   num__Platelets           7905 non-null   float64
 8   num__Prothrombin         7905 non-null   float64
 9   num__Stage               7905 non-null   float64
 10  num__Age                 7905 non-null   float64
 11  num__Sgot_survival       7905 non-null   float64
 12  num__Bilirubin_survival  7905 non-null   float64
 13  num__Copper_survival     7905 non-null   float64
 14  cat__Drug_Placebo       

In [15]:
# Guardamos csv con variables predictoras para realizar en otro notebook la agrupacion por cluster(buscando patrones).
X.to_csv('clustering_data.csv', index=False)

### Pipeline de procesado de datos

In [56]:
# Automatizado de clases con funciones de ingenieria de variables.
class_pipeline = make_pipeline(Age_, Sgot_, Bilirubin_, Copper_, transformers_, Clustering_)

X_processed = class_pipeline.fit_transform(X)
X_processed

Unnamed: 0,num__Bilirubin,num__Cholesterol,num__Albumin,num__Copper,num__Alk_Phos,num__SGOT,num__Tryglicerides,num__Platelets,num__Prothrombin,num__Stage,...,num__Bilirubin_survival,num__Copper_survival,cat__Drug_Placebo,cat__Sex_M,cat__Ascites_Y,cat__Hepatomegaly_Y,cat__Spiders_Y,cat__Edema_S,cat__Edema_Y,cluster
0,0.703920,0.611883,0.518657,0.751284,0.668167,0.675003,0.297584,0.662675,0.270725,0.666667,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,0.462568,0.667585,0.589552,0.549236,0.641062,0.574429,0.431628,0.596806,0.580596,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.771384,0.588312,0.593284,0.696402,0.545185,0.531684,0.197319,0.273453,0.685430,1.000000,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0
3,0.319490,0.516217,0.574627,0.532648,0.676068,0.350958,0.464528,0.413174,0.524815,0.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.523705,0.648336,0.630597,0.549236,0.586411,0.549417,0.464528,0.471058,0.504469,1.000000,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,0.423839,0.602462,0.597015,0.447931,0.672468,0.387205,0.746485,0.562874,0.331830,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7901,0.462568,0.523829,0.548507,0.546026,0.641062,0.592505,0.384677,0.429142,0.360187,1.000000,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
7902,0.674492,0.449006,0.458955,0.506860,0.514139,0.343234,0.290848,0.275449,0.791054,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
7903,0.377195,0.500329,0.294776,0.413554,0.537216,0.274392,0.539435,0.317365,0.504469,1.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1


In [57]:
# codificamos y
scaler_y = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_y = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [49]:
encoded_y

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

### Prediccion con redes, Kfolds

In [58]:
# Convertir los datos a matrices NumPy
X_processed = X_processed.to_numpy()
scores = []

FOLDS = 5
skf = IterativeStratification(n_splits=FOLDS)

# Define la arquitectura del modelo fuera del bucle
modelo = tf.keras.Sequential([
    tf.keras.layers.Dense(25, activation='relu', input_dim=22),
    BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(25, activation='relu'),
    BatchNormalization(),
    tf.keras.layers.Dense(25, activation='relu'),
    BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(3, activation='softmax'),
])

optimizador = tf.keras.optimizers.Adam(learning_rate=0.0005)
modelo.compile(optimizer=optimizador, loss='categorical_crossentropy', metrics=['accuracy'])

for i, (train_index, test_index) in enumerate(skf.split(X_processed, encoded_y)):
    # Separar en conjunto de entrenamiento y validación
    X_train, X_val = X_processed[train_index], X_processed[test_index]
    y_train, y_val = encoded_y[train_index], encoded_y[test_index]

    # Ajustar y entrenar el modelo de red neuronal
    early_stop = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
    filepath = './checkpoints/checkpoint'
    best_model = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, save_best_only=True, save_weights_only=True,
                                                    monitor='val_loss', mode='min')
    history = modelo.fit(X_train, y_train, batch_size=32, callbacks=[early_stop, best_model], epochs=400,
                         validation_data=(X_val, y_val))
    modelo.load_weights(filepath)
    modelo.save(f'model{i+1}.h5')
    score = modelo.evaluate(X_val, y_val)
    scores.append(score)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400


Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78/400
Epoch 79/400
Epoch 80/400
Epoch 81/400
Epoch 82/400
Epoch 83/400
Epoch 84/400
Epoch 85/400
Epoch 86/400
Epoch 87/400
Epoch 88/400
Epoch 89/400
Epoch 90/400
Epoch 91/400
Epoch 92/400
Epoch 93/400
Epoch 94/400
Epoch 95/400
Epoch 96/400
Epoch 97/400
Epoch 98/400
Epoch 99/400
Epoch 100/400
Epoch 101/400
Epoch 102/400
Epoch 103/400
Epoch 104/400
Epoch 105/400
Epoch 106/400
Epoch 107/400
Epoch 108/400
Epoch 109/400
Epoch 110/400
Epoch 111/400
Epoch 112/400
Epoch 113/400
Epoch 114/400


Epoch 115/400
Epoch 116/400
Epoch 117/400
Epoch 118/400
Epoch 119/400
Epoch 120/400
Epoch 121/400
Epoch 122/400
Epoch 123/400
Epoch 124/400
Epoch 125/400
Epoch 126/400
Epoch 127/400
Epoch 128/400
Epoch 129/400
Epoch 130/400
Epoch 131/400
Epoch 132/400
Epoch 133/400
Epoch 134/400
Epoch 135/400
Epoch 136/400
Epoch 137/400
Epoch 138/400
Epoch 139/400
Epoch 140/400
Epoch 141/400
Epoch 142/400
Epoch 143/400
Epoch 144/400
Epoch 145/400
Epoch 146/400
Epoch 147/400
Epoch 148/400
Epoch 149/400
Epoch 150/400
Epoch 151/400
Epoch 152/400
Epoch 153/400
Epoch 154/400
Epoch 155/400
Epoch 156/400
Epoch 157/400
Epoch 158/400
Epoch 159/400
Epoch 160/400
Epoch 161/400
Epoch 162/400
Epoch 163/400


  saving_api.save_model(


Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400


Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400


### Prediccion de los datos de prueba

In [62]:
X_prueba = pd.read_csv('test.csv')
X_prueba

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.90,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.90,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.00,126.0,221.0,9.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13171,2870,Placebo,12279,F,N,N,N,N,1.3,302.0,3.43,75.0,1345.0,145.00,44.0,181.0,10.6,3.0
5267,13172,1770,Placebo,24803,F,N,N,N,N,0.5,219.0,4.09,121.0,663.0,79.05,94.0,311.0,9.7,3.0
5268,13173,3707,D-penicillamine,16990,F,N,Y,N,N,0.8,315.0,4.09,13.0,1637.0,170.50,70.0,426.0,10.9,3.0
5269,13174,1216,Placebo,11773,F,N,N,N,N,0.7,329.0,3.80,52.0,678.0,57.00,126.0,306.0,10.2,1.0


In [63]:
id_ = X_prueba['id']
X_prueba = X_prueba.drop('id', axis =1)

In [64]:
# Procesamos con pipeline los datos nuevos
X_prueba = class_pipeline.fit_transform(X_prueba)
X_prueba

Unnamed: 0,num__Bilirubin,num__Cholesterol,num__Albumin,num__Copper,num__Alk_Phos,num__SGOT,num__Tryglicerides,num__Platelets,num__Prothrombin,num__Stage,...,num__Bilirubin_survival,num__Copper_survival,cat__Drug_Placebo,cat__Sex_M,cat__Ascites_Y,cat__Hepatomegaly_Y,cat__Spiders_Y,cat__Edema_S,cat__Edema_Y,cluster
0,0.548898,0.802130,0.526119,0.563208,0.690757,0.617593,0.429352,0.734531,0.533102,0.333333,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.524165,0.848367,0.843284,0.636867,0.622180,0.617593,0.621233,0.329341,0.379368,0.333333,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.674940,0.194980,0.373134,0.493991,0.541666,0.344679,0.472231,0.301397,0.869938,1.000000,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0.319848,0.587029,0.705224,0.465961,0.335660,0.551006,0.239510,0.415170,0.533102,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.590251,0.561563,0.376866,0.687168,0.586317,0.549469,0.551134,0.317365,0.317735,0.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.570754,0.600263,0.548507,0.591804,0.640718,0.601358,0.134156,0.237525,0.533102,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5267,0.245778,0.441087,0.794776,0.687168,0.409579,0.388727,0.445665,0.497006,0.284614,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5268,0.424264,0.618151,0.794776,0.239503,0.690905,0.657897,0.331565,0.726547,0.595664,0.666667,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
5269,0.377594,0.635960,0.686567,0.518555,0.418301,0.273509,0.551134,0.487026,0.435419,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [65]:
X_prueba.shape

(5271, 22)

In [67]:
# Predecimos con nuestros modelos ya entrenado
preds_= []
for i in range(1, FOLDS+1):
    print(f'Fold{i}')
    model_fold = load_model(f'model{i}.h5')
    probas = model_fold.predict(X_prueba)
    preds_.append(probas)

Fold1
Fold2
Fold3
Fold4
Fold5


In [68]:
# Media de nuestros modelos (por folds)
preds_mean_prueba = np.mean(preds_, axis = 0)
preds_mean_prueba

array([[0.69986314, 0.02534635, 0.27479053],
       [0.84966356, 0.05987134, 0.09046517],
       [0.05200012, 0.01333658, 0.9346633 ],
       ...,
       [0.84398365, 0.03103164, 0.12498473],
       [0.95308304, 0.01394567, 0.03297127],
       [0.27706164, 0.03465195, 0.6882864 ]], dtype=float32)

In [69]:
# Guardar en listas cada etiqueta con la probabilidad
Status_C = []
Status_CL = []
Status_D = []
for i in range(len(preds_mean_prueba)):
    Status_C.append(preds_mean_prueba[i, 0])
    Status_CL.append(preds_mean_prueba[i, 1])
    Status_D.append(preds_mean_prueba[i, 2])

In [70]:
# Creamos la submission
submission_test = {'id': id_, 'Status_C':Status_C, 'Status_CL':Status_CL,'Status_D':Status_D}

In [71]:
data = pd.DataFrame(submission_test)
data

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.699863,0.025346,0.274791
1,7906,0.849664,0.059871,0.090465
2,7907,0.052000,0.013337,0.934663
3,7908,0.931669,0.008079,0.060252
4,7909,0.706233,0.015465,0.278302
...,...,...,...,...
5266,13171,0.800708,0.076500,0.122791
5267,13172,0.926864,0.003515,0.069621
5268,13173,0.843984,0.031032,0.124985
5269,13174,0.953083,0.013946,0.032971


In [72]:
#Guardamos
data.to_csv('prediction_V3.csv', index=False)