In [1]:
# numpy and pandas
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Generic
import os
import matplotlib.pyplot as plt

### Global paths

In [3]:
path_root = "/home/";
path_project = os.path.join(path_root, "jesusprada", "proyecto_python","x-ray");
path_scripts = os.path.join(path_project, "scripts");
path_data = os.path.join(path_project, "predictions");
path_data

'/home/jesusprada/proyecto_python/x-ray/predictions'

### Global parameters

In [4]:
fixed_var = ["dataset", "target"]
min_transactions = 3;
max_n_var = 300; # Maximum possible number of variables
samples_per_var = 1; # n_variables*samples_per_var <= n_samples;
min_n_categories = 5; # Minimum number of categories per variable to set as maximum
min_size_others = 3; # Minimun number of low frequency categories to apply grouping

## Read Data

### Data train

In [5]:
dat_train = pd.read_csv(os.path.join(path_data, 'X_train_augmented.csv'), sep=',');
dat_train['pred'] = pd.read_csv(os.path.join(path_data, "predictions_train_augmented.csv"), header=None)
dat_train['dataset'] = "train";
dat_train.columns

Index(['Unnamed: 0', 'patientid', 'offset', 'sex', 'age', 'finding',
       'intubated', 'went_icu', 'needed_supplemental_O2', 'extubated',
       'temperature', 'pO2_saturation', 'leukocyte_count', 'neutrophil_count',
       'lymphocyte_count', 'view', 'modality', 'date', 'location', 'folder',
       'filename', 'doi', 'url', 'license', 'clinical_notes', 'other_notes',
       'V27', 'survival', 'pred', 'dataset'],
      dtype='object')

### Data val

In [6]:
dat_val = pd.read_csv(os.path.join(path_data, 'X_val_augmented.csv'), sep=',');
dat_val['pred'] = pd.read_csv(os.path.join(path_data, "predictions_val_augmented.csv"), header=None)
dat_val['dataset'] = "val";
dat_val.columns


Index(['Unnamed: 0', 'patientid', 'offset', 'sex', 'age', 'finding',
       'intubated', 'went_icu', 'needed_supplemental_O2', 'extubated',
       'temperature', 'pO2_saturation', 'leukocyte_count', 'neutrophil_count',
       'lymphocyte_count', 'view', 'modality', 'date', 'location', 'folder',
       'filename', 'doi', 'url', 'license', 'clinical_notes', 'other_notes',
       'V27', 'survival', 'pred', 'dataset'],
      dtype='object')

### Data test

In [7]:
dat_test = pd.read_csv(os.path.join(path_data, 'X_test_augmented.csv'), sep=',');
dat_test['pred'] = pd.read_csv(os.path.join(path_data, "predictions_test_augmented.csv"), header=None)
dat_test['dataset'] = "test";
dat_test.columns

Index(['Unnamed: 0', 'patientid', 'offset', 'sex', 'age', 'finding',
       'intubated', 'went_icu', 'needed_supplemental_O2', 'extubated',
       'temperature', 'pO2_saturation', 'leukocyte_count', 'neutrophil_count',
       'lymphocyte_count', 'view', 'modality', 'date', 'location', 'folder',
       'filename', 'doi', 'url', 'license', 'clinical_notes', 'other_notes',
       'survival', 'pred', 'dataset'],
      dtype='object')

In [8]:
columns_names = dat_train.columns & dat_test.columns
dat_train = dat_train[columns_names]
dat_val = dat_val[columns_names]
dat_test = dat_test[columns_names]

dat = pd.concat([dat_train, dat_val, dat_test], axis=0, ignore_index=True)
print(dat_train.shape)
print(dat_val.shape)
print(dat_test.shape)
print(dat.shape)
dat.groupby(["dataset", "survival"]).size()


(63, 29)
(23, 29)
(12, 29)
(98, 29)


dataset  survival
test     0            5
         1            7
train    0           53
         1           10
val      0           20
         1            3
dtype: int64

# Preprocessing

### Select variables

In [9]:
# Select vars
target = dat["survival"]
selected_vars = ["dataset",  "offset", "sex", "age", "view", "location", "pred"]
dat = dat[selected_vars]
dat["target"] = target

### Remove constant variables

In [10]:
constant_vars = dat.columns[dat.nunique() <= 1]
dat = dat[dat.columns[dat.nunique() > 1]]
#dat.loc[:, (dat != dat.iloc[0]).any()] 

### Remove not informed variables

In [11]:
threshold_nas = 0.4
index_na = dat.apply(lambda x: x.isna().sum(), axis=0)/dat.shape[0] < threshold_nas

dat = dat.iloc[:, index_na.values]

In [12]:
dat

Unnamed: 0,dataset,offset,sex,age,view,location,pred,target
0,train,0.0,M,65.0,PA,"Cho Ray Hospital, Ho Chi Minh City, Vietnam",0.003224,0
1,train,3.0,M,65.0,PA,"Cho Ray Hospital, Ho Chi Minh City, Vietnam",0.009367,0
2,train,5.0,M,65.0,PA,"Cho Ray Hospital, Ho Chi Minh City, Vietnam",0.018515,0
3,train,6.0,M,65.0,PA,"Cho Ray Hospital, Ho Chi Minh City, Vietnam",0.088303,0
4,train,7.0,F,29.0,PA,"Mount Sinai Hospital, Toronto, Ontario, Canada",0.001410,0
...,...,...,...,...,...,...,...,...
93,test,,,69.0,,,0.010421,0
94,test,,M,65.0,L,Italy,0.104683,0
95,test,,M,71.0,PA,,0.000630,0
96,test,,M,62.0,PA,Italy,0.013853,0


### Fill missing values

In [13]:
dat.columns[dat.dtypes == object]

Index(['dataset', 'sex', 'view', 'location'], dtype='object')

In [14]:
fixed_var = ["dataset", "target"]
categorical_vars = ['sex', 'view', 'location']
numerical_vars = list(set(dat.columns) - set(categorical_vars) - set(fixed_var))
numerical_vars

['pred', 'age', 'offset']

In [15]:
method_fill_nan = 'mean'
numerical_dat = dat[numerical_vars]
categorical_dat = dat[categorical_vars]

if(method_fill_nan == 'mean'):
    numerical_dat.apply(lambda x: x.fillna(x.mean(), inplace=True), axis=0)
    #categorical_dat.apply(lambda x: x.fillna(x.mode()[0], inplace=True), axis=0)
    

Concatenamos los datos numericos con missing values imputados con los categoricos para tener el dataset completo

In [16]:
#dat = pd.concat(categorical_dat, numerical_dat)
#dat

#datos = pd.concat((numerical_dat, dat[categorical_dat].reset_index()), axis=1)
#del datos['index']

In [17]:
#X = dat
#from sklearn.preprocessing import Imputer
#imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
#imputer = imputer.fit(X[:, 1:3])
#X[:, 1:3] = imputer.transform(X[:, 1:3])

### One hot enconding

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
categorical_dat.nunique()
threshold_categories = 10
index_gruop_categories = categorical_dat.apply(lambda x: x.nunique(), axis=0)> threshold_categories
index_gruop_categories

sex         False
view        False
location     True
dtype: bool

In [53]:
categorical_dat["sex"].value_counts()
categorical_dat.apply(lambda x: x.value_counts()/x.shape[0], axis=0)

Unnamed: 0,sex,view,location
AP,,0.040816,
AP Supine,,0.102041,
Axial,,0.071429,
"Cairo, Egypt",,,0.030612
"Cho Ray Hospital, Ho Chi Minh City, Vietnam",,,0.040816
F,0.5,,
"Hospital of Wuhan University, Wuhan, China",,,0.05102
Italy,,,0.153061
"Jiangxi Provincial People's Hospital, Nanchang, China",,,0.010204
L,,0.102041,


In [21]:
ohe = OneHotEncoder(sparse = False)
categorical_dat = categorical_dat.replace(np.nan,'None')
ohe_fit = ohe.fit(categorical_dat)
X_ohe = pd.DataFrame(ohe.fit_transform(categorical_dat))
X_ohe.columns = pd.DataFrame(ohe_fit.get_feature_names())


Concatenamos los datos categoricos con one hot enconding y los numericos para tener el dataset completo

In [22]:
datos = pd.concat((X_ohe, numerical_dat.reset_index()), axis=1)
datos = pd.concat((datos, dat[fixed_var].reset_index()), axis=1)
del datos['index']

In [23]:
datos.shape

(98, 31)

Comprobamos que ahora todos los datos son numericos menos fecha salida que lo quitaremos por ser una importante del modelo

In [24]:
datos.columns[datos.dtypes == object]

Index(['dataset'], dtype='object')

### Escalado

In [25]:
#fecha_salida_values = datos['fecha_salida']
y = datos[fixed_var]
del datos['target']
del datos['dataset']

In [26]:
from sklearn.preprocessing import scale

In [27]:
datos_scale = pd.DataFrame(scale(datos))
datos_scale.columns = datos.columns

In [28]:
datos = datos_scale
datos['target'] = y['target']
datos['dataset'] =  y['dataset']

### Select train/val/test

In [29]:
datos["dataset"].value_counts()


train    63
val      23
test     12
Name: dataset, dtype: int64

In [58]:
X_train = datos[datos.dataset=='train']
y_train = X_train["target"]
del X_train["dataset"]
del X_train["target"]
X_train.shape

(63, 29)

In [59]:
X_val = datos[datos.dataset=='val']
y_val = X_val["target"]
del X_val["dataset"]
del X_val["target"]
X_val.shape

(23, 29)

In [60]:
X_test = datos[datos.dataset=='test']
y_test = X_test["target"]
del X_test["dataset"]
del X_test["target"]
X_test.shape

(12, 29)

## Train model

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier

In [35]:
from sklearn.metrics import roc_auc_score as metric;

In [36]:
from xgboost import XGBClassifier

In [37]:
import multiprocessing

In [38]:
random_state = 1;
nthread = multiprocessing.cpu_count() - 1;
nthread
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1);

Vamos a calcular los parametros de la SVM segun Cherskaskky

In [39]:
n = X_train.shape[0];
d = X_train.shape[1];
m = np.mean(y_train);
s = np.std(y_train);
C_cherk = np.max([np.abs(m + 3*s),np.abs(m - 3*s)]);
gamma_cherk = np.power(0.2, 1/d)

In [40]:
# Regresion Logística
regularization_values = ['l1', 'l2', 'none'];
penalty_values = [1, 10, 100];

# SVM
C_values = [C_cherk, 5e-03, 4.5e-03, 4e-03];
gamma_kernel_values = [gamma_cherk, 3.26e-09, 3.255e-09, 3.25e-09];

# Arbol de Decision
max_depth_values = [None, 6, 20];
min_samples_split_values = [2, 5, 20];
min_samples_leaf_values = [1, 5, 20];
max_features_values = [None, 1, 2];

# Random Forest
ntrees_values = [10, 100, 1000];

# Xgboost
nrounds_values = [10, 100]
eta_values = [0.3, 0.99]
gamma_values = [0, 1]
max_depth_values = [6, 20]
min_child_weight_values = [1, 20]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [1, 20]
lambda_values = [0, 1]
alpha_values = [0, 1]

In [41]:
params_values = [{'model': 'logistic regression',
                  'regularization': regularization_values,
                 'penalty': penalty_values},
                 {'model': 'svm',
                  'C': C_values,
                 'gamma_kernel': gamma_kernel_values},
                 {'model': 'decision tree',
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'random forest',
                  'n_trees': ntrees_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'xgboost',
                  'nrounds': nrounds_values,
                  'eta': eta_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values,
                 'lambda': lambda_values,
                 'alpha': alpha_values}]

In [42]:
params_values = [{'model': 'svm',
                  'C': C_values,
                 'gamma_kernel': gamma_kernel_values},
                 {'model': 'decision tree',
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'random forest',
                  'n_trees': ntrees_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'xgboost',
                  'nrounds': nrounds_values,
                  'eta': eta_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values,
                 'lambda': lambda_values,
                 'alpha': alpha_values}]

In [43]:
total_iteraciones = 0
for params in params_values:
    if params['model'] == 'logistic regression':
        n = len(params['regularization'])*len(params['penalty'])
    elif params['model'] == 'svm':
        n = len(params['C'])*len(params['gamma_kernel'])
    elif params['model'] == 'decision tree':
        n = len(params['max_depth'])*len(params['min_samples_split'])*len(params['min_samples_leaf'])*len(params['max_features'])
    elif params['model'] == 'random forest':
        n = len(params['n_trees'])*len(params['max_depth'])*len(params['min_samples_split'])*len(params['min_samples_leaf'])*len(params['max_features'])
    elif params['model'] == 'xgboost':
        n = len(params['nrounds'])*len(params['eta'])*len(params['gamma'])*len(params['max_depth'])*len(params['min_child_weight'])*len(params['subsample'])*len(params['colsample_bytree'])*len(params['num_parallel_tree'])*len(params['lambda'])*len(params['alpha'])
    total_iteraciones = total_iteraciones + n;
    print(str(n)+ ' iteraciones de ' + str(params['model']))
print(str(total_iteraciones)+ ' iteraciones en total')     

16 iteraciones de svm
54 iteraciones de decision tree
162 iteraciones de random forest
1024 iteraciones de xgboost
1256 iteraciones en total


In [44]:
grid_results = pd.DataFrame();
num_iter = 0
for params in params_values:
    
    # Logistic Regression
    if params['model'] == 'logistic regression':
        for regularization in params['regularization']:
            for penalty in params['penalty']:  
                
                # Actualizar contador
                num_iter += 1; 
                
                # print control iteracion modelo
                print('Inicio de iteracion ' + str(num_iter) + 
                      '. Regularizacion = ' + str(regularization) + 
                      ', Lambda = '  + str(penalty) +
                      '\n')
                
                # Entrenar modelo
                if regularization == 'l1':
                    model = LogisticRegression(penalty = regularization, solver = 'liblinear', C = penalty, random_state = random_state)
                else:
                    model = LogisticRegression(penalty = regularization,solver = 'lbfgs', C = penalty, random_state = random_state)
               
                model.fit(X_train, np.array(y_train))

                # Generar predicciones
                pred_train_p = model.predict_proba(X_train)
                pred_val_p = model.predict_proba(X_val)

                # Calcular métricas de evaluación
                auc_train = metric(y_train, pred_train_p[:, 1])
                auc_val = metric(y_val, pred_val_p[:, 1])                                            

                print('Fin de iteracion ' + str(num_iter) + 
                     '. Regularizacion = ' + str(regularization) + 
                      ', Lambda = '  + str(penalty) +
                      '. AUC train = '  + str(auc_train) + 
                      ' -  AUC val = '  + str(auc_val)  + 
                      '\n')
                grid_results = grid_results.append(pd.DataFrame(data={'model':'Logistic Regression',
                                                                      'params': [{'regularization':[regularization],
                                                                                  'penalty':[penalty]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val]},
                                                               columns=['model','params', 'auc_train', 'auc_val']), 
                                                   ignore_index=True)
                
     
    # SVM
    if params['model'] == 'svm':
        for C in params['C']:
            for gamma_kernel in params['gamma_kernel']:  
                
                # Actualizar contador
                num_iter += 1; 
                
                # print control iteracion modelo
                print('Inicio de iteracion ' + str(num_iter) + 
                      '. C = ' + str(C) + 
                      ', gamma = '  + str(gamma_kernel) +
                      '\n')
                
                # Entrenar modelo               
                model = SVC(C = C, gamma = gamma_kernel, probability = True, random_state = random_state)
               
                model.fit(X_train, np.array(y_train))

                # Generar predicciones
                pred_train_p = model.predict_proba(X_train)
                pred_val_p = model.predict_proba(X_val)

                # Calcular métricas de evaluación
                auc_train = metric(y_train, pred_train_p[:, 1])
                auc_val = metric(y_val, pred_val_p[:, 1])                                            

                print('Fin de iteracion ' + str(num_iter) + 
                     '. C = ' + str(C) + 
                      ', gamma = '  + str(gamma_kernel) +
                      '. AUC train = '  + str(auc_train) + 
                      ' -  AUC val = '  + str(auc_val)  + 
                      '\n')
                grid_results = grid_results.append(pd.DataFrame(data={'model':'SVM',
                                                                     'params': [{'C':[C],
                                                                              'gamma_kernel':[gamma_kernel]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val]},
                                                               columns=['model','params', 'auc_train', 'auc_val']), 
                                                   ignore_index=True)
                
    # Decision Tree
    if params['model'] == 'decision tree':
        for max_depth in params['max_depth']:
            for min_samples_split in params['min_samples_split']:  
                for min_samples_leaf in params['min_samples_leaf']:  
                    for max_features in params['max_features']:  
                
                        # Actualizar contador
                        num_iter += 1; 

                        # print control iteracion modelo
                        print('Inicio de iteracion ' + str(num_iter) + 
                              '. max_depth = ' + str(max_depth) + 
                              ', min_samples_split = '  + str(min_samples_split) +
                              ', min_samples_leaf = '  + str(min_samples_leaf) +
                              ', max_features = '  + str(max_features) +
                              '\n')

                        # Entrenar modelo               
                        model = DecisionTreeClassifier(max_depth = max_depth,
                                                      min_samples_split = min_samples_split,
                                                      min_samples_leaf = min_samples_leaf,
                                                      max_features = max_features, random_state = random_state)

                        model.fit(X_train, np.array(y_train))

                        # Generar predicciones
                        pred_train_p = model.predict_proba(X_train)
                        pred_val_p = model.predict_proba(X_val)

                        # Calcular métricas de evaluación
                        auc_train = metric(y_train, pred_train_p[:, 1])
                        auc_val = metric(y_val, pred_val_p[:, 1])                                            

                        print('Fin de iteracion ' + str(num_iter) + 
                             '. max_depth = ' + str(max_depth) + 
                              ', min_samples_split = '  + str(min_samples_split) +
                              ', min_samples_leaf = '  + str(min_samples_leaf) +
                              ', max_features = '  + str(max_features) +
                              '. AUC train = '  + str(auc_train) + 
                              ' -  AUC val = '  + str(auc_val)  + 
                              '\n')
                        grid_results = grid_results.append(pd.DataFrame(data={'model':'decision tree',
                                                                              'params': [{'max_depth':[max_depth],
                                                                                          'min_samples_split':[min_samples_split],
                                                                                          'min_samples_leaf':[min_samples_leaf],
                                                                                          'max_features':[max_features]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val]},
                                                                       columns=['model','params', 'auc_train', 'auc_val']), 
                                                           ignore_index=True)  
                        
    
    # Random Forest
    if params['model'] == 'random forest':
        for n_trees in params['n_trees']:
            for max_depth in params['max_depth']:
                for min_samples_split in params['min_samples_split']:  
                    for min_samples_leaf in params['min_samples_leaf']:  
                        for max_features in params['max_features']:  
                
                            # Actualizar contador
                            num_iter += 1; 

                            # print control iteracion modelo
                            print('Inicio de iteracion ' + str(num_iter) + 
                                  '. n_trees = ' + str(n_trees) + 
                                  ', max_depth = ' + str(max_depth) + 
                                  ', min_samples_split = '  + str(min_samples_split) +
                                  ', min_samples_leaf = '  + str(min_samples_leaf) +
                                  ', max_features = '  + str(max_features) +
                                  '\n')

                            # Entrenar modelo               
                            model = RandomForestClassifier(n_estimators = n_trees,
                                                          max_depth = max_depth,
                                                          min_samples_split = min_samples_split,
                                                          min_samples_leaf = min_samples_leaf,
                                                          max_features = max_features, random_state = random_state)

                            model.fit(X_train, np.array(y_train))

                            # Generar predicciones
                            pred_train_p = model.predict_proba(X_train)
                            pred_val_p = model.predict_proba(X_val)

                            # Calcular métricas de evaluación
                            auc_train = metric(y_train, pred_train_p[:, 1])
                            auc_val = metric(y_val, pred_val_p[:, 1])                                            

                            print('Fin de iteracion ' + str(num_iter) + 
                                 '. n_trees = ' + str(n_trees) + 
                                  ', max_depth = ' + str(max_depth) + 
                                  ', min_samples_split = '  + str(min_samples_split) +
                                  ', min_samples_leaf = '  + str(min_samples_leaf) +
                                  ', max_features = '  + str(max_features) +
                                  '. AUC train = '  + str(auc_train) + 
                                  ' -  AUC val = '  + str(auc_val)  + 
                                  '\n')
                            grid_results = grid_results.append(pd.DataFrame(data={'model':'random forest',
                                                                                  'params': [{'n_trees':[n_trees],
                                                                                              'max_depth':[max_depth],
                                                                                              'min_samples_split':[min_samples_split],
                                                                                              'min_samples_leaf':[min_samples_leaf],
                                                                                              'max_features':[max_features]}],
                                                                          'auc_train':[auc_train],
                                                                          'auc_val':[auc_val]},
                                                                           columns=['model','params', 'auc_train', 'auc_val']), 
                                                               ignore_index=True)  
    
    # XGBOOST
    if params['model'] == 'xgboost':
         for nrounds in params['nrounds']:
            for eta in params['eta']:
                for gamma in params['gamma']:
                    for max_depth in params['max_depth']:
                        for min_child_weight in params['min_child_weight']:
                            for subsample in params['subsample']:
                                for colsample_bytree in params['colsample_bytree']:
                                    for num_parallel_tree in params['num_parallel_tree']:
                                        for lamda in params['lambda']:
                                            for alpha in params['alpha']:
                                            
                                                # Actualizar contador
                                                num_iter += 1; 

                                                # print control iteracion modelo
                                                print('Inicio de iteracion ' + str(num_iter) + 
                                                      '. Parametro nrounds = ' + str(nrounds) + 
                                                      ', parametro eta = '  + str(eta) +
                                                      ', parametro gamma = '  + str(gamma) +
                                                      ', parametro max_depth = '  + str(max_depth) +
                                                      ', parametro min_child_weight = '  + str(min_child_weight) +
                                                      ', parametro subsample = '  + str(subsample) +
                                                      ', parametro colsample_bytree = '  + str(colsample_bytree) +
                                                      ', parametro num_parallel_tree = '  + str(num_parallel_tree) +
                                                      ', parametro lambda = '  + str(lamda) +
                                                      ', parametro alpha = '  + str(alpha) + 
                                                      '\n')
                                                # Entrenar modelo
                                                model = XGBClassifier(nthread = nthread, 
                                                                      scale_pos_weight = scale_pos_weight,
                                                                      random_state = random_state,
                                                                      n_estimators = nrounds,
                                                                      learning_rate = eta, 
                                                                      gamma = gamma,
                                                                      max_depth = max_depth,
                                                                      min_child_weight = min_child_weight ,
                                                                      subsample = subsample,
                                                                      colsample_bytree = colsample_bytree,
                                                                      num_parallel_tree = num_parallel_tree,
                                                                      reg_lambda = lamda,
                                                                      reg_alpha = alpha)
                                                model.fit(X_train, np.array(y_train))

                                                # Generar predicciones
                                                pred_train_p = model.predict_proba(X_train)
                                                pred_val_p = model.predict_proba(X_val)

                                                # Calcular métricas de evaluación
                                                auc_train = metric(y_train, pred_train_p[:, 1])
                                                auc_val = metric(y_val, pred_val_p[:, 1])                                            

                                                print('Fin de iteracion ' + str(num_iter) + 
                                                      '. Parametro nrounds = ' + str(nrounds) + 
                                                      ', parametro eta = ' + str(eta) + 
                                                      ', parametro gamma = '  + str(gamma) +
                                                      ', parametro max_depth = '  + str(max_depth) +
                                                      ', parametro min_child_weight = '  + str(min_child_weight) +
                                                      ', parametro subsample = '  + str(subsample) +
                                                      ', parametro colsample_bytree = '  + str(colsample_bytree) +
                                                      ', parametro num_parallel_tree = '  + str(num_parallel_tree) +
                                                      ', parametro lambda = '  + str(lamda) +
                                                      ', parametro alpha = '  + str(alpha) + 
                                                      '. AUC train = '  + str(auc_train) + 
                                                      ' -  AUC val = '  + str(auc_val)  + 
                                                      '\n')
                                                grid_results = grid_results.append(pd.DataFrame(data={'model':'xgboost',
                                                                                              'params': [{'nrounds':[nrounds],
                                                                                              'eta':[eta],
                                                                                              'gamma':[gamma],
                                                                                              'max_depth':[max_depth],
                                                                                              'min_child_weight':[min_child_weight],
                                                                                              'subsample':[subsample],
                                                                                              'colsample_bytree':[colsample_bytree],
                                                                                              'num_parallel_tree':[num_parallel_tree],
                                                                                              'lamda':[lamda],
                                                                                              'alpha':[alpha]}],
                                                                                              'auc_train':[auc_train],
                                                                                              'auc_val':[auc_val]},
                                                                                               columns=['model', 'params', 'auc_train', 'auc_val']), 
                                                                                   ignore_index=True)
                               

Inicio de iteracion 1. C = 1.2550029618940957, gamma = 0.9460140490375635

Fin de iteracion 1. C = 1.2550029618940957, gamma = 0.9460140490375635. AUC train = 1.0 -  AUC val = 1.0

Inicio de iteracion 2. C = 1.2550029618940957, gamma = 3.26e-09

Fin de iteracion 2. C = 1.2550029618940957, gamma = 3.26e-09. AUC train = 0.909433962264151 -  AUC val = 0.85

Inicio de iteracion 3. C = 1.2550029618940957, gamma = 3.255e-09

Fin de iteracion 3. C = 1.2550029618940957, gamma = 3.255e-09. AUC train = 0.909433962264151 -  AUC val = 0.85

Inicio de iteracion 4. C = 1.2550029618940957, gamma = 3.25e-09

Fin de iteracion 4. C = 1.2550029618940957, gamma = 3.25e-09. AUC train = 0.9056603773584906 -  AUC val = 0.8

Inicio de iteracion 5. C = 0.005, gamma = 0.9460140490375635

Fin de iteracion 5. C = 0.005, gamma = 0.9460140490375635. AUC train = 1.0 -  AUC val = 1.0

Inicio de iteracion 6. C = 0.005, gamma = 3.26e-09

Fin de iteracion 6. C = 0.005, gamma = 3.26e-09. AUC train = 0.9075471698113208 - 


Fin de iteracion 46. max_depth = 20, min_samples_split = 2, min_samples_leaf = 1, max_features = 2. AUC train = 1.0 -  AUC val = 0.4

Inicio de iteracion 47. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = None

Fin de iteracion 47. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = None. AUC train = 0.9377358490566039 -  AUC val = 0.85

Inicio de iteracion 48. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = 1

Fin de iteracion 48. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = 1. AUC train = 0.6311320754716981 -  AUC val = 0.5

Inicio de iteracion 49. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = 2

Fin de iteracion 49. max_depth = 20, min_samples_split = 2, min_samples_leaf = 5, max_features = 2. AUC train = 0.6311320754716981 -  AUC val = 0.5

Inicio de iteracion 50. max_depth = 20, min_samples_split = 2, min_samples_leaf = 20, max_features = N

Fin de iteracion 79. n_trees = 10, max_depth = 6, min_samples_split = 2, min_samples_leaf = 20, max_features = 2. AUC train = 0.6462264150943396 -  AUC val = 0.125

Inicio de iteracion 80. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = None

Fin de iteracion 80. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = None. AUC train = 0.9943396226415095 -  AUC val = 0.8999999999999999

Inicio de iteracion 81. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = 1

Fin de iteracion 81. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = 1. AUC train = 1.0 -  AUC val = 0.65

Inicio de iteracion 82. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = 2

Fin de iteracion 82. n_trees = 10, max_depth = 6, min_samples_split = 5, min_samples_leaf = 1, max_features = 2. AUC train = 0.9924528301886792 -  AUC val =

Fin de iteracion 110. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 5, max_features = None. AUC train = 0.9528301886792452 -  AUC val = 0.85

Inicio de iteracion 111. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 5, max_features = 1

Fin de iteracion 111. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 5, max_features = 1. AUC train = 0.9254716981132076 -  AUC val = 0.425

Inicio de iteracion 112. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 5, max_features = 2

Fin de iteracion 112. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 5, max_features = 2. AUC train = 0.8962264150943396 -  AUC val = 0.13333333333333333

Inicio de iteracion 113. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = None

Fin de iteracion 113. n_trees = 10, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = None. AUC train

Fin de iteracion 139. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 5, max_features = 2. AUC train = 0.9905660377358491 -  AUC val = 0.5666666666666667

Inicio de iteracion 140. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = None

Fin de iteracion 140. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = None. AUC train = 0.8952830188679245 -  AUC val = 0.7916666666666666

Inicio de iteracion 141. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = 1

Fin de iteracion 141. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = 1. AUC train = 0.9226415094339623 -  AUC val = 0.6

Inicio de iteracion 142. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = 2

Fin de iteracion 142. n_trees = 100, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = 

Fin de iteracion 167. n_trees = 100, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = None. AUC train = 0.8952830188679245 -  AUC val = 0.7916666666666666

Inicio de iteracion 168. n_trees = 100, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = 1

Fin de iteracion 168. n_trees = 100, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = 1. AUC train = 0.9226415094339623 -  AUC val = 0.6

Inicio de iteracion 169. n_trees = 100, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = 2

Fin de iteracion 169. n_trees = 100, max_depth = 20, min_samples_split = 5, min_samples_leaf = 20, max_features = 2. AUC train = 0.9047169811320754 -  AUC val = 0.4666666666666667

Inicio de iteracion 170. n_trees = 100, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = None

Fin de iteracion 170. n_trees = 100, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_fea

Fin de iteracion 196. n_trees = 1000, max_depth = 6, min_samples_split = 5, min_samples_leaf = 20, max_features = 2. AUC train = 0.8999999999999999 -  AUC val = 0.55

Inicio de iteracion 197. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = None

Fin de iteracion 197. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = None. AUC train = 0.9811320754716981 -  AUC val = 0.85

Inicio de iteracion 198. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = 1

Fin de iteracion 198. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = 1. AUC train = 0.9924528301886792 -  AUC val = 0.8666666666666667

Inicio de iteracion 199. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = 2

Fin de iteracion 199. n_trees = 1000, max_depth = 6, min_samples_split = 20, min_samples_leaf = 1, max_features = 2. AU

Fin de iteracion 224. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = None. AUC train = 0.9811320754716981 -  AUC val = 0.85

Inicio de iteracion 225. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = 1

Fin de iteracion 225. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = 1. AUC train = 0.9943396226415094 -  AUC val = 0.8666666666666667

Inicio de iteracion 226. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = 2

Fin de iteracion 226. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 1, max_features = 2. AUC train = 0.9943396226415094 -  AUC val = 0.8666666666666666

Inicio de iteracion 227. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 5, max_features = None

Fin de iteracion 227. n_trees = 1000, max_depth = 20, min_samples_split = 20, min_samples_leaf = 5,

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [45]:
grid_results

Unnamed: 0,model,params,auc_train,auc_val
0,SVM,"{'C': [1.2550029618940957], 'gamma_kernel': [0...",1.000000,1.00
1,SVM,"{'C': [1.2550029618940957], 'gamma_kernel': [3...",0.909434,0.85
2,SVM,"{'C': [1.2550029618940957], 'gamma_kernel': [3...",0.909434,0.85
3,SVM,"{'C': [1.2550029618940957], 'gamma_kernel': [3...",0.905660,0.80
4,SVM,"{'C': [0.005], 'gamma_kernel': [0.946014049037...",1.000000,1.00
...,...,...,...,...
231,random forest,"{'n_trees': [1000], 'max_depth': [20], 'min_sa...",0.900000,0.55
232,xgboost,"{'nrounds': [10], 'eta': [0.3], 'gamma': [0], ...",0.775472,0.50
233,xgboost,"{'nrounds': [10], 'eta': [0.3], 'gamma': [0], ...",0.631132,0.50
234,xgboost,"{'nrounds': [10], 'eta': [0.3], 'gamma': [0], ...",0.631132,0.50


In [54]:
grid_results[grid_results.model=='random forest'<+.groupby(['model'], sort=False)['auc_val'].max().sort_values()

model
random forest    1.0
Name: auc_val, dtype: float64

In [66]:
best_params = grid_results[grid_results.model=='random forest'].iloc[grid_results['auc_val'].idxmax()]
best_params

model                                            random forest
params       {'n_trees': [10], 'max_depth': [6], 'min_sampl...
auc_train                                                    1
auc_val                                                    0.9
Name: 70, dtype: object

In [67]:
print('Train data size = ' + str(X_train.shape))
print('Train target size = ' + str(y_train.shape))
print('Validation data size = ' + str(X_val.shape))
print('Validation target size = ' + str(y_val.shape))

# Combinar train y validación
X_train = pd.concat((X_train,X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

del X_val, y_val

print('Train data size = ' + str(X_train.shape))
print('Train target size = ' + str(y_train.shape))

Train data size = (86, 29)
Train target size = (86,)


NameError: name 'X_val' is not defined

In [68]:
# Logistic Regression
if best_params['model'] == 'logistic regression':       

    # Entrenar modelo
    if best_params['params']['regularization'] == 'l1':
        model = LogisticRegression(penalty = best_params['params']['regularization'][0], solver = 'liblinear', C = best_params['params']['penalty'][0], random_state = random_state)
    else:
        model = LogisticRegression(penalty = best_params['params']['regularization'][0],solver = 'lbfgs', C = best_params['params']['penalty'][0], random_state = random_state)


# SVM
elif best_params['model'] == 'SVM':

    model = SVC(C = best_params['params']['C'][0], gamma = best_params['params']['gamma_kernel'][0], probability = True, 
                random_state = random_state)             


# Decision Tree
elif best_params['model'] == 'decision tree':
    model = DecisionTreeClassifier(max_depth = int(best_params['params']['max_depth'][0]),
                                                  min_samples_split = int(best_params['params']['min_samples_split'][0]),
                                                  min_samples_leaf = int(best_params['params']['min_samples_leaf'][0]),
                                                  max_features = int(best_params['params']['max_features'][0]), 
                                   random_state = random_state)


# Random Forest
elif best_params['model'] == 'random forest':
    model = RandomForestClassifier(n_estimators = int(best_params['params']['n_trees'][0]),
                                                      max_depth = int(best_params['params']['max_depth'][0]),
                                                      min_samples_split = int(best_params['params']['min_samples_split'][0]),
                                                      min_samples_leaf = int(best_params['params']['min_samples_leaf'][0]),
                                                      max_features = int(best_params['params']['max_features'][0]), 
                                                      random_state = random_state)

# XGBOOST
elif best_params['model'] == 'xgboost':
    model = XGBClassifier(nthread = nthread, 
                                                              scale_pos_weight = scale_pos_weight,
                                                              random_state = random_state,
                                                              n_estimators = int(best_params['params']['nrounds'][0]), 
                                                              learning_rate = best_params['params']['eta'][0], 
                                                              gamma = best_params['params']['gamma'][0],
                                                              max_depth = int(best_params['params']['max_depth'][0]),
                                                              min_child_weight = best_params['params']['min_child_weight'][0],
                                                              subsample = best_params['params']['subsample'][0],
                                                              colsample_bytree = best_params['params']['colsample_bytree'][0],
                                                              num_parallel_tree  = int(best_params['params']['num_parallel_tree'][0]),
                                                              reg_lambda = best_params['params']['lamda'][0],
                                                              reg_alpha = best_params['params']['alpha'][0])

# Entrenar modelo
model.fit(X_train, np.array(y_train))

# Generar predicciones
pred_train_p = model.predict_proba(X_train)
pred_test_p = model.predict_proba(X_test)

# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:, 1])
auc_test = metric(y_test, pred_test_p[:, 1]) 

results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':best_params['model'],'auc_train':[auc_train],'auc_test':[auc_test]}, columns=['model',  'auc_train', 'auc_test']), ignore_index=True)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [69]:
results

Unnamed: 0,model,auc_train,auc_test
0,SVM,1.0,0.457143
