In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from scipy import stats
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense,BatchNormalization, Normalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import log_loss
from sklearn.preprocessing import normalize
import keras_tuner as kt
from keras_tuner import Objective
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel




In [2]:
# Load the training data, using the 'id' column as the index, and display it.
df = pd.read_csv('train.csv', index_col='id')
df

Unnamed: 0_level_0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,1166,D-penicillamine,16839,F,N,N,N,N,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0,C
7901,1492,Placebo,17031,F,N,Y,N,N,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0,C
7902,1576,D-penicillamine,25873,F,N,N,Y,S,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0,D
7903,3584,D-penicillamine,22960,M,N,Y,N,N,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0,D


In [17]:
# Split into independent variables (X) and the dependent variable (y, the 'Status' column)
X = df.drop('Status', axis=1)
y = df['Status']

### Algo de ingeniería de variables

In [4]:
class Age_Format (BaseEstimator, TransformerMixin):
    """Convert the ``Age`` column from days to whole years.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer works in a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``Age`` is ``round(Age / 365)`` as an integer.

        The column is dropped and re-added, so ``Age`` ends up as the last
        column of the returned frame. The input frame is not modified.
        """
        X_copy = X.copy()

        # Vectorised replacement for the per-row loop; to_numpy() keeps the
        # assignment positional, like assigning a plain list.
        age_years = (X_copy['Age'] / 365).round().astype('int64').to_numpy()

        X_copy = X_copy.drop('Age', axis=1)
        X_copy['Age'] = age_years

        return X_copy

In [5]:
# Create the age transformer (reused later in class_pipeline) and apply it;
# X is overwritten with the transformed frame.
Age_ = Age_Format()
X = Age_.transform(X)

In [6]:
class Sgot_Range (BaseEstimator, TransformerMixin):
    """Add a binary ``Sgot_survival`` flag derived from the ``SGOT`` column.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer works in a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Sgot_survival`` = 1 where ``SGOT > 100``, else 0.

        Missing values compare as False and therefore get 0. The input frame
        is not modified.
        """
        X_copy = X.copy()

        # Vectorised comparison instead of a per-row loop; to_numpy() keeps
        # the assignment positional, like assigning a plain list.
        X_copy['Sgot_survival'] = (X_copy['SGOT'] > 100).astype('int64').to_numpy()

        return X_copy

In [7]:
# Create the SGOT flag transformer (reused later in class_pipeline) and apply it;
# X is overwritten with the transformed frame.
Sgot_ =  Sgot_Range()
X = Sgot_.transform(X)

In [8]:
class Bilirubin_Range (BaseEstimator, TransformerMixin):
    """Add a binary ``Bilirubin_survival`` flag derived from the ``Bilirubin`` column.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer works in a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Bilirubin_survival`` = 1 where ``Bilirubin > 1.5``, else 0.

        Missing values compare as False and therefore get 0. The input frame
        is not modified.
        """
        # NOTE(review): an earlier comment on this class gave the threshold as 3,
        # but the code has always compared against 1.5 — confirm which is intended.
        X_copy = X.copy()

        # Vectorised comparison instead of a per-row loop; to_numpy() keeps
        # the assignment positional, like assigning a plain list.
        X_copy['Bilirubin_survival'] = (X_copy['Bilirubin'] > 1.5).astype('int64').to_numpy()

        return X_copy

In [9]:
# Create the bilirubin flag transformer (reused later in class_pipeline) and apply it;
# X is overwritten with the transformed frame.
Bilirubin_ = Bilirubin_Range()
X = Bilirubin_.transform(X)

In [10]:
class Copper_Range (BaseEstimator, TransformerMixin):
    """Add a binary ``Copper_survival`` flag derived from the ``Copper`` column.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer works in a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Copper_survival`` = 1 where ``Copper >= 60``, else 0.

        The comparison is inclusive (a value of exactly 60 is flagged).
        Missing values compare as False and therefore get 0. The input frame
        is not modified.
        """
        X_copy = X.copy()

        # Vectorised comparison instead of a per-row loop; to_numpy() keeps
        # the assignment positional, like assigning a plain list.
        X_copy['Copper_survival'] = (X_copy['Copper'] >= 60).astype('int64').to_numpy()

        return X_copy

In [11]:
# Create the copper flag transformer (reused later in class_pipeline) and apply it;
# X is overwritten with the transformed frame.
Copper_ = Copper_Range()
X = Copper_.transform(X)

### Transformación de la distribución de las variables

In [12]:
class Tranformation_Distribution (BaseEstimator, TransformerMixin):
    """Box-Cox transform the skewed measurement columns and drop ``N_Days``.

    ``fit`` estimates one Box-Cox lambda per column and stores them in
    ``lambdas_``; ``transform`` then applies those stored lambdas, so data
    seen later is transformed with the parameters learnt during ``fit``.

    For backward compatibility, calling ``transform`` on an instance that was
    never fitted estimates the lambdas from the frame it is given.
    """

    # Columns that receive the Box-Cox transform (scipy requires them to be strictly positive).
    _BOXCOX_COLUMNS = ('Bilirubin', 'Cholesterol', 'Copper', 'Alk_Phos', 'SGOT',
                       'Tryglicerides', 'Prothrombin')

    def fit(self, X, y=None):
        """Estimate and store the Box-Cox lambda of every transformed column."""
        # stats.boxcox returns (transformed_values, lambda); only lambda is kept.
        self.lambdas_ = {col: stats.boxcox(X[col])[1] for col in self._BOXCOX_COLUMNS}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the Box-Cox columns transformed and ``N_Days`` removed.

        The input frame is not modified.
        """
        X_copy = X.copy()
        fitted_lambdas = getattr(self, 'lambdas_', None)

        for col in self._BOXCOX_COLUMNS:
            if fitted_lambdas is None:
                # Not fitted: estimate lambda from this data (legacy behaviour).
                transformed_col, _ = stats.boxcox(X_copy[col])
            else:
                # Fitted: reuse the lambda learnt in fit().
                transformed_col = stats.boxcox(X_copy[col], lmbda=fitted_lambdas[col])
            X_copy[col] = transformed_col

        X_copy = X_copy.drop('N_Days', axis=1)
        return X_copy

In [13]:
# Create the Box-Cox transformer (reused later in class_pipeline) and apply it;
# X is overwritten with the transformed frame.
transformers_ = Tranformation_Distribution()
X = transformers_.transform(X)

In [14]:
class Clustering (BaseEstimator, TransformerMixin):
    """Scale/encode every column and append a 2-cluster KMeans label.

    Numeric columns are min-max scaled, non-numeric columns are one-hot
    encoded (first category dropped), and a KMeans model with two clusters
    is run on the result.

    ``fit`` learns the scaler, the encoder and the cluster centres and stores
    them in ``preprocessing_`` and ``kmeans_``; ``transform`` reuses them.
    For backward compatibility, calling ``transform`` on an instance that was
    never fitted fits temporary copies on the frame it is given.
    """

    @staticmethod
    def _build_steps(X):
        """Create the unfitted (preprocessing, kmeans) pair for the columns of ``X``."""
        numerical_columns = X.select_dtypes(include='number').columns
        categorical_columns = X.select_dtypes(exclude='number').columns

        categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
        numerical_pipeline = make_pipeline(MinMaxScaler())

        preprocessing = ColumnTransformer([
            ('num', numerical_pipeline, numerical_columns),
            ('cat', categorical_pipeline, categorical_columns),
        ], remainder='passthrough')

        kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=42)
        return preprocessing, kmeans

    def fit(self, X, y=None):
        """Fit the scaler/encoder on ``X`` and KMeans on the processed matrix."""
        preprocessing, kmeans = self._build_steps(X)
        kmeans.fit(preprocessing.fit_transform(X))
        self.preprocessing_ = preprocessing
        self.kmeans_ = kmeans
        return self

    def transform(self, X):
        """Return the processed features as a DataFrame plus a ``cluster`` column.

        Column names come from the ColumnTransformer (``num__...`` /
        ``cat__...``). The returned frame is built from the processed array,
        so it does not carry over the index of ``X``.
        """
        X_copy = X.copy()

        if hasattr(self, 'kmeans_'):
            # Fitted: reuse the learnt scaling, encoding and cluster centres.
            preprocessing = self.preprocessing_
            X_processed = preprocessing.transform(X_copy)
            y_kmeans = self.kmeans_.predict(X_processed)
        else:
            # Not fitted: fit throw-away steps on this data (legacy behaviour).
            preprocessing, kmeans = self._build_steps(X_copy)
            X_processed = preprocessing.fit_transform(X_copy)
            y_kmeans = kmeans.fit_predict(X_processed)

        # Turn the processed array back into a DataFrame and attach the labels.
        X_copy = pd.DataFrame(X_processed, columns=preprocessing.get_feature_names_out())
        X_copy['cluster'] = list(y_kmeans)

        return X_copy

In [15]:
# Create the scaling/encoding/clustering transformer (reused later in class_pipeline)
# and apply it; X is overwritten with the processed frame.
Clustering_ =  Clustering()
X = Clustering_.transform(X)

In [16]:
# Summarise the columns and dtypes of X after the clustering step.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   num__Bilirubin           7905 non-null   float64
 1   num__Cholesterol         7905 non-null   float64
 2   num__Albumin             7905 non-null   float64
 3   num__Copper              7905 non-null   float64
 4   num__Alk_Phos            7905 non-null   float64
 5   num__SGOT                7905 non-null   float64
 6   num__Tryglicerides       7905 non-null   float64
 7   num__Platelets           7905 non-null   float64
 8   num__Prothrombin         7905 non-null   float64
 9   num__Stage               7905 non-null   float64
 10  num__Age                 7905 non-null   float64
 11  num__Sgot_survival       7905 non-null   float64
 12  num__Bilirubin_survival  7905 non-null   float64
 13  num__Copper_survival     7905 non-null   float64
 14  cat__Drug_Placebo       

### Pipeline de procesado de datos

In [18]:
# Chain the feature-engineering transformers into a single pipeline.
class_pipeline = make_pipeline(Age_, Sgot_, Bilirubin_, Copper_, transformers_, Clustering_)

# Feed the pipeline the raw features taken straight from df. The cells above
# reassign X to each transformer's output, so when the notebook is run top to
# bottom X no longer has the raw columns (e.g. 'Age') that the first steps expect.
X_processed = class_pipeline.fit_transform(df.drop('Status', axis=1))
X_processed

Unnamed: 0,num__Bilirubin,num__Cholesterol,num__Albumin,num__Copper,num__Alk_Phos,num__SGOT,num__Tryglicerides,num__Platelets,num__Prothrombin,num__Stage,...,num__Bilirubin_survival,num__Copper_survival,cat__Drug_Placebo,cat__Sex_M,cat__Ascites_Y,cat__Hepatomegaly_Y,cat__Spiders_Y,cat__Edema_S,cat__Edema_Y,cluster
0,0.703920,0.611883,0.518657,0.751284,0.668167,0.675003,0.297584,0.662675,0.270725,0.666667,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,0.462568,0.667585,0.589552,0.549236,0.641062,0.574429,0.431628,0.596806,0.580596,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.771384,0.588312,0.593284,0.696402,0.545185,0.531684,0.197319,0.273453,0.685430,1.000000,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0
3,0.319490,0.516217,0.574627,0.532648,0.676068,0.350958,0.464528,0.413174,0.524815,0.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.523705,0.648336,0.630597,0.549236,0.586411,0.549417,0.464528,0.471058,0.504469,1.000000,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,0.423839,0.602462,0.597015,0.447931,0.672468,0.387205,0.746485,0.562874,0.331830,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7901,0.462568,0.523829,0.548507,0.546026,0.641062,0.592505,0.384677,0.429142,0.360187,1.000000,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
7902,0.674492,0.449006,0.458955,0.506860,0.514139,0.343234,0.290848,0.275449,0.791054,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
7903,0.377195,0.500329,0.294776,0.413554,0.537216,0.274392,0.539435,0.317365,0.504469,1.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1


In [19]:
# One-hot encode the target y (despite its name, scaler_y is a OneHotEncoder).
scaler_y = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_y = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [20]:
# Display the one-hot encoded target.
encoded_y

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [26]:
# Drop the less important features (see the Random Forest selection below).
# NOTE(review): this hard-coded list keeps 'num__Stage' and omits 'cluster', whereas the
# selection printed further down lists 'cluster' and not 'num__Stage' — confirm which is intended.
X_processed_new = X_processed[['num__Bilirubin', 'num__Cholesterol', 'num__Albumin', 'num__Copper',
                               'num__Alk_Phos', 'num__SGOT', 'num__Tryglicerides', 'num__Platelets',
                               'num__Prothrombin', 'num__Stage', 'num__Age']]

In [27]:
# Training / validation split: hold out 20% of the rows, stratified on the raw Status labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_new,
    encoded_y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y,
)

### Random Forest: detección de características importantes

In [23]:
# Fit a Random Forest inside SelectFromModel to pick features by importance
# (no threshold is passed, so SelectFromModel's default applies).
# y_train is the one-hot encoded target.
# NOTE(review): when the notebook runs top to bottom, X_train already contains only the
# hard-coded columns chosen above, so this selection no longer sees the full feature set.
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state=42))
sel.fit(X_train, y_train)

In [24]:
# Names of the columns kept by the selector, and how many there are.
selected_feat= X_train.columns[(sel.get_support())]
len(selected_feat)

11

In [25]:
# Show the selected column names.
print(selected_feat)

Index(['num__Bilirubin', 'num__Cholesterol', 'num__Albumin', 'num__Copper',
       'num__Alk_Phos', 'num__SGOT', 'num__Tryglicerides', 'num__Platelets',
       'num__Prothrombin', 'num__Age', 'cluster'],
      dtype='object')


### Keras tuner

In [28]:
def model(hp):
    """Build and compile one candidate network for Keras Tuner.

    Tuned hyperparameters: number of hidden layers (``n_hidden``, 0-5),
    units per hidden layer (``n_neurons``, 1-10) and the Adam learning rate
    (``learning_rate``, 1e-4 to 1e-3, log-sampled). The network ends in a
    3-unit softmax and is compiled with categorical cross-entropy.

    Args:
        hp: the hyperparameter container supplied by the tuner.

    Returns:
        The compiled Sequential model.
    """
    # Search space.
    depth = hp.Int('n_hidden', min_value=0, max_value=5, default=2)
    width = hp.Int('n_neurons', min_value=1, max_value=10)
    lr = hp.Float('learning_rate', min_value=1e-4, max_value=1e-3, sampling='log')
    # Still registered so the search space is unchanged, but its value is not
    # used: the optimizer is always Adam.
    hp.Choice('optimizer', values=['adam'])
    adam = tf.keras.optimizers.Adam(learning_rate=lr)

    # Input block.
    # NOTE(review): no adapt() call for this Normalization layer is visible in
    # the notebook — confirm it is meant to be used without learnt statistics.
    net = tf.keras.Sequential()
    net.add(Normalization())
    net.add(tf.keras.layers.Flatten())

    # Hidden blocks: Dense(relu) followed by batch normalisation, `depth` times.
    hidden_layers = []
    for _ in range(depth):
        hidden_layers.append(tf.keras.layers.Dense(width, activation='relu'))
        hidden_layers.append(BatchNormalization())
    for layer in hidden_layers:
        net.add(layer)

    # Output layer: one probability per class.
    net.add(tf.keras.layers.Dense(3, activation='softmax'))
    net.compile(
        loss='categorical_crossentropy',
        optimizer=adam,
        metrics=['accuracy'],
    )

    return net

In [29]:
# Random search over the `model` hypermodel: up to 50 trials, minimising val_loss.
random_search_tuner = kt.RandomSearch(
    model, Objective("val_loss", direction="min"), max_trials=50, overwrite=True, seed=42
)
# Stop a trial after 10 epochs without improvement in the monitored metric
# (EarlyStopping's default) and restore the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# NOTE(review): the held-out split (X_test, y_test) is the validation data here and is
# evaluated again below, so that later score is not an independent estimate.
random_search_tuner.search(X_train, y_train, epochs=100, callbacks=[early_stop], validation_data=(X_test, y_test))

Trial 50 Complete [00h 00m 42s]
val_loss: 0.5307214260101318

Best val_loss So Far: 0.5064519047737122
Total elapsed time: 00h 40m 33s


In [30]:
# Load the best models found so far; only the first of the three returned is kept.
top3_models = random_search_tuner.get_best_models(num_models=3)
best_models_kerastuner = top3_models[0]




In [31]:
# Evaluate the chosen model on the held-out split (the same data used as validation during the search).
resultados = best_models_kerastuner.evaluate(X_test, y_test)



### Predicción de los datos de prueba

In [32]:
# Load the test set (here 'id' stays a regular column) and display it.
X_prueba = pd.read_csv('test.csv')
X_prueba

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.90,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.90,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.00,126.0,221.0,9.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13171,2870,Placebo,12279,F,N,N,N,N,1.3,302.0,3.43,75.0,1345.0,145.00,44.0,181.0,10.6,3.0
5267,13172,1770,Placebo,24803,F,N,N,N,N,0.5,219.0,4.09,121.0,663.0,79.05,94.0,311.0,9.7,3.0
5268,13173,3707,D-penicillamine,16990,F,N,Y,N,N,0.8,315.0,4.09,13.0,1637.0,170.50,70.0,426.0,10.9,3.0
5269,13174,1216,Placebo,11773,F,N,N,N,N,0.7,329.0,3.80,52.0,678.0,57.00,126.0,306.0,10.2,1.0


In [33]:
# Keep the ids for the submission file, then remove them from the features.
id_ = X_prueba['id']
X_prueba = X_prueba.drop('id', axis =1)

In [34]:
# Run the new data through the feature-engineering pipeline.
# NOTE(review): fit_transform is called on the test rows, so the Box-Cox lambdas, the
# min-max scaling and the clusters are estimated from the test set rather than reused
# from training — confirm this is intended.
X_prueba = class_pipeline.fit_transform(X_prueba)
# This bare expression is not the last statement of the cell, so it displays nothing.
X_prueba
# Same 11 columns the model was trained on. This reuses the name X_processed_new,
# replacing the training features held under it.
X_processed_new = X_prueba[['num__Bilirubin', 'num__Cholesterol', 'num__Albumin', 'num__Copper',
                               'num__Alk_Phos', 'num__SGOT', 'num__Tryglicerides', 'num__Platelets',
                               'num__Prothrombin', 'num__Stage', 'num__Age']]

In [35]:
# Sanity check on the shape: 11 feature columns are expected.
X_processed_new.shape

(5271, 11)

In [36]:
# Predict class probabilities for the test rows with the already-trained model.
prueba_prediction = best_models_kerastuner.predict(X_processed_new)
prueba_prediction



array([[0.7112268 , 0.01647778, 0.27229545],
       [0.79708594, 0.06832565, 0.13458838],
       [0.37382925, 0.13741499, 0.48875576],
       ...,
       [0.85339254, 0.01094687, 0.13566057],
       [0.9792865 , 0.01215435, 0.00855917],
       [0.2299512 , 0.02145754, 0.7485913 ]], dtype=float32)

In [37]:
# Split the prediction matrix into one probability list per label
# (column 0 -> Status_C, column 1 -> Status_CL, column 2 -> Status_D).
Status_C = list(prueba_prediction[:, 0])
Status_CL = list(prueba_prediction[:, 1])
Status_D = list(prueba_prediction[:, 2])

In [38]:
# Assemble the submission columns: the ids plus one probability column per label.
submission_test = {'id': id_, 'Status_C':Status_C, 'Status_CL':Status_CL,'Status_D':Status_D}

In [39]:
# Build the submission DataFrame and display it.
data = pd.DataFrame(submission_test)
data

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.711227,0.016478,0.272295
1,7906,0.797086,0.068326,0.134588
2,7907,0.373829,0.137415,0.488756
3,7908,0.921904,0.001336,0.076760
4,7909,0.729518,0.001523,0.268959
...,...,...,...,...
5266,13171,0.732925,0.050910,0.216165
5267,13172,0.898830,0.000372,0.100797
5268,13173,0.853393,0.010947,0.135661
5269,13174,0.979286,0.012154,0.008559


In [40]:
# Save the submission to disk, without the DataFrame index.
data.to_csv('prediction_V4.csv', index=False)