In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
from sklearn import preprocessing

from matplotlib.legend_handler import HandlerLine2D
from sklearn import metrics

import graphviz
from IPython.display import Image  
import pydotplus
from io import StringIO

import plotly_express as px

from collections import Counter

import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
class AbstractClassificationProblem:
    """Shared training and evaluation helpers for the binary fraud classifiers.

    Subclasses are expected to set ``clf_model`` (an estimator exposing
    ``fit``/``predict``), ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    ``train`` stores the test-set predictions in ``y_predict``, which the
    metric helpers read, so it must be called before any of them.
    """

    # Display names for class 0 and class 1, in that order.
    labels = ['No Fraude', 'Fraude']

    def train(self):
        """Fit ``clf_model`` on the training split and cache test predictions."""
        print("Comienza Entrenamiento")
        self.print_self()
        self.clf_model.fit(self.X_train, self.y_train)
        self.y_predict = self.clf_model.predict(self.X_test)
        print("Entrenado")

    def show_confusion_matrix(self):
        """Plot the confusion matrix of ``clf_model`` on the test split."""
        self.print_self()
        ConfusionMatrixDisplay.from_estimator(estimator=self.clf_model,
                                              X=self.X_test,
                                              y=self.y_test,
                                              display_labels=self.labels)
        plt.show()

    def show_classification_report(self):
        """Print the per-class precision/recall/F1 report for the test split."""
        self.print_self()
        print(classification_report(self.y_test, self.y_predict, target_names=self.labels))

    def accuracy(self):
        """Return the accuracy of the cached test predictions."""
        return accuracy_score(self.y_test, self.y_predict)

    def recall(self):
        """Return the recall of the cached test predictions."""
        return recall_score(self.y_test, self.y_predict)

    def precision_score(self):
        """Return the precision of the cached test predictions."""
        return precision_score(self.y_test, self.y_predict)

    def f1_score(self):
        """Return the F1 score of the cached test predictions."""
        return f1_score(self.y_test, self.y_predict)

    def print_self(self):
        """Hook for subclasses to print a description of the model; no-op here."""
        pass

    def _positive_class_scores(self):
        """Return a continuous score for the positive class on the test split.

        ROC analysis needs a ranking score: with hard 0/1 predictions the
        curve collapses to a single operating point. Uses column 1 of
        ``predict_proba`` when the model offers it (assumes the model was
        fitted on both classes, so column 1 is the positive class) and falls
        back to the cached hard predictions otherwise.
        """
        predict_proba = getattr(self.clf_model, "predict_proba", None)
        if predict_proba is None:
            return self.y_predict
        return predict_proba(self.X_test)[:, 1]

    def auc(self):
        """Return the area under the ROC curve on the test split."""
        # Inside a method body these names resolve to the module-level
        # sklearn functions, not to the methods of the same name.
        false_positive_rate, true_positive_rate, _ = roc_curve(self.y_test, self._positive_class_scores())
        return auc(false_positive_rate, true_positive_rate)

    def roc_curve(self):
        """Plot the ROC curve for the test split, with the AUC in the title."""
        fpr, tpr, _ = roc_curve(self.y_test, self._positive_class_scores(), pos_label=1)
        roc_auc = auc(fpr, tpr)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label='ROC')
        ax.plot([0, 1], [0, 1], color='navy', linestyle='--', label='random')
        ax.set_title(f'AUC: {roc_auc}')
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        # The lines carry labels, so draw the legend that shows them.
        ax.legend()
        plt.show()
    



class AbstractDecisionTree(AbstractClassificationProblem):
    """Decision-tree classifier wrapper; subclasses choose the split criterion.

    ``criterion`` is forwarded to ``DecisionTreeClassifier`` and ``tipo`` is
    the human-readable name printed by ``print_self``.
    """
    criterion = ""
    tipo = ""

    def __init__(self, dataset, X, X_train, X_test, y_train, y_test, target, max_depth=None, min_samples_leaf=1):
        """Store the data splits and build the (unfitted) tree.

        Args:
            dataset: frame with an ``isFraud`` column; only its unique values
                are read.
            X: feature frame; only its column names are read.
            X_train, X_test, y_train, y_test: train/test splits used by
                ``train`` and the metric helpers.
            target: accepted for backward compatibility; not read here.
            max_depth: forwarded to ``DecisionTreeClassifier``.
            min_samples_leaf: forwarded to ``DecisionTreeClassifier``.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_test = y_test
        self.y_train = y_train
        self.clf_model = DecisionTreeClassifier(criterion=self.criterion,
                                                random_state=42,
                                                max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf,
                                                max_features=None,
                                                max_leaf_nodes=None,
                                                class_weight=None,
                                                splitter='best')

        self.target = list(dataset['isFraud'].unique())
        self.feature_names = list(X.columns)

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

    def show_matrix(self):
        """Render the fitted tree as a PNG image (requires Graphviz)."""
        # Export once into an in-memory buffer; an earlier export whose
        # result was immediately discarded has been removed.
        dot_data = StringIO()
        export_graphviz(self.clf_model,
                        out_file=dot_data,
                        filled=True,
                        rounded=True,
                        special_characters=True,
                        feature_names=self.feature_names,
                        class_names=self.labels)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        return Image(graph.create_png())

    def print_self(self):
        """Print the tree type and the hyper-parameters it was built with."""
        print("**", "Arbol de decision - " + self.tipo, "**")
        print('**', "max_depth=" + str(self.max_depth) + ",", "min_samples_leaf=" + str(self.min_samples_leaf), "**")


class GiniDecisionTree(AbstractDecisionTree):
    """Decision tree whose splits are chosen with the ``gini`` criterion."""
    tipo = "Gini Index"
    criterion = "gini"

class InformationGainDecisionTree(AbstractDecisionTree):
    """Decision tree whose splits are chosen with the ``entropy`` criterion."""
    tipo = "Information Gain"
    criterion = "entropy"

# https://scikit-learn.org/stable/modules/neural_networks_supervised.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
class AbstractNeuralNetwork(AbstractClassificationProblem):
    """MLP classifier wrapper; subclasses choose the optimisation solver.

    ``solver`` is forwarded to ``MLPClassifier`` and ``tipo`` is the
    human-readable name printed by ``print_self``.
    """
    solver = ''
    tipo = ''

    def __init__(self, X, y, X_train, X_test, y_train, y_test, alpha=1e-5, hidden_layer_sizes=(15,), max_iter=5000):
        """Standardise the splits and build the (unfitted) network.

        Args:
            X, y: stored unchanged on the instance.
            X_train, X_test: feature splits; both are standardised with a
                scaler fitted on ``X_train`` only.
            y_train: training labels; must expose ``.values`` (it is
                flattened to one dimension).
            y_test: test labels, stored unchanged.
            alpha, hidden_layer_sizes, max_iter: forwarded to
                ``MLPClassifier``.
        """
        self.X = X
        self.y = y

        # https://scikit-learn.org/stable/modules/preprocessing.html
        # Fit the scaler on the training split only and reuse it for the test
        # split, so both are standardised with the same mean and variance and
        # no test-set statistics are used.
        self.scaler = preprocessing.StandardScaler().fit(X_train)
        self.X_train = self._scale(X_train)
        self.X_test = self._scale(X_test)

        # https://numpy.org/doc/stable/reference/generated/numpy.ravel.html
        self.y_train = y_train.values.ravel()

        self.y_test = y_test

        self.alpha = alpha
        self.hidden_layer_sizes = hidden_layer_sizes
        self.max_iter = max_iter

        self.clf_model = MLPClassifier(solver=self.solver,
                                       alpha=alpha,
                                       hidden_layer_sizes=hidden_layer_sizes,
                                       random_state=42,
                                       max_iter=max_iter)

    def print_self(self):
        """Print the network type and the hyper-parameters it was built with."""
        print("**", "Red Neuronal - " + self.tipo, "**")
        print("**",
              "alpha=" + str(self.alpha),
              "hidden_layer_sizes=" + str(self.hidden_layer_sizes),
              "max_iter=" + str(self.max_iter),
              "**")

    def _scale(self, X):
        """Standardise ``X`` with the scaler fitted on the training split."""
        return self.scaler.transform(X)


class LBFGSNeuralNetwork(AbstractNeuralNetwork):
    """MLP trained with the ``lbfgs`` solver."""
    tipo = "LBFGS"
    solver = 'lbfgs'

class SGDNeuralNetwork(AbstractNeuralNetwork):
    """MLP trained with the ``sgd`` solver."""
    tipo = "SGD"
    solver = 'sgd'

class AdamNeuralNetwork(AbstractNeuralNetwork):
    """MLP trained with the ``adam`` solver."""
    tipo = 'ADAM'
    solver = 'adam'

class Comparator:
    """Runs the same report or metric over a list of models, in list order.

    Every model must expose the method of the same name that each
    ``Comparator`` method forwards to.
    """

    # Class-level fallback; each instance replaces it in __init__.
    models = []

    def __init__(self, models):
        self.models = models

    def _show_each(self, method_name):
        """Call a display method on every model, printing a spacer after each."""
        for current in self.models:
            getattr(current, method_name)()
            print("\n")

    def _print_each(self, method_name):
        """Print the value returned by a metric method, one line per model."""
        for current in self.models:
            print(getattr(current, method_name)())

    def show_confusion_matrix(self):
        """Show each model's confusion matrix."""
        self._show_each("show_confusion_matrix")

    def show_classification_report(self):
        """Show each model's classification report."""
        self._show_each("show_classification_report")

    def accuracy(self):
        """Print each model's accuracy."""
        self._print_each("accuracy")

    def recall(self):
        """Print each model's recall."""
        self._print_each("recall")

    def precision_score(self):
        """Print each model's precision."""
        self._print_each("precision_score")

    def f1_score(self):
        """Print each model's F1 score."""
        self._print_each("f1_score")

    def auc(self):
        """Print each model's AUC."""
        self._print_each("auc")

In [None]:
class ParameterTuning():
    """Sweep one ``DecisionTreeClassifier`` hyper-parameter and plot train/test AUC."""

    # Hyper-parameters that ``train`` can sweep, mapped to the value used
    # while a different parameter is being swept.
    _TUNABLE_DEFAULTS = {
        'max_depth': None,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
    }

    def __init__(self, decisionTreeCriterion, X_train, X_test, y_train, y_test):
        """Store the split criterion and the train/test splits used by ``train``."""
        self.X_train = X_train
        self.X_test = X_test
        self.y_test = y_test
        self.y_train = y_train
        self.decisionTreeCriterion = decisionTreeCriterion

    @staticmethod
    def _roc_auc(y_true, y_pred):
        """Return the area under the ROC curve for the given predictions."""
        false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred)
        return auc(false_positive_rate, true_positive_rate)

    def train(self, parameters_to_tune_array, parameter_name):
        """Fit one tree per candidate value and plot train vs. test AUC.

        Args:
            parameters_to_tune_array: candidate values for the parameter.
            parameter_name: one of ``max_depth``, ``min_samples_split`` or
                ``min_samples_leaf``.

        Raises:
            ValueError: if ``parameter_name`` is not one of the supported
                names (previously such a name was ignored and every tree was
                trained with identical settings).
        """
        if parameter_name not in self._TUNABLE_DEFAULTS:
            raise ValueError(
                "unsupported parameter {!r}; expected one of {}".format(
                    parameter_name, sorted(self._TUNABLE_DEFAULTS)))

        print("parameters tuning for {}:{}".format(parameter_name, parameters_to_tune_array))
        train_results = []
        test_results = []
        for curr_parameter in parameters_to_tune_array:
            # Start from the defaults and override only the swept parameter.
            # min_samples_leaf is now forwarded too; before, it was assigned
            # but never reached the classifier.
            tree_params = dict(self._TUNABLE_DEFAULTS)
            tree_params[parameter_name] = curr_parameter

            dt = DecisionTreeClassifier(criterion=self.decisionTreeCriterion,
                                        random_state=42,
                                        splitter='best',
                                        **tree_params)
            dt.fit(self.X_train, self.y_train)

            train_results.append(self._roc_auc(self.y_train, dt.predict(self.X_train)))
            test_results.append(self._roc_auc(self.y_test, dt.predict(self.X_test)))

        line1, = plt.plot(parameters_to_tune_array, train_results, 'b', label='Train AUC')
        line2, = plt.plot(parameters_to_tune_array, test_results, 'r', label='Test AUC')
        plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
        plt.ylabel('AUC score')
        plt.xlabel(parameter_name)
        plt.show()

    def get_best_max_depth(self, hasta):
        """Sweep ``max_depth`` over the integers 1..``hasta``."""
        max_depths = np.linspace(1, hasta, hasta, dtype="int", endpoint=True)
        self.train(max_depths, 'max_depth')

    def get_best_min_samples_split(self):
        """Sweep ``min_samples_split`` over ten values from 0.1 to 1.0."""
        min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
        self.train(min_samples_splits, 'min_samples_split')

    def get_best_min_samples_leaf(self):
        """Sweep ``min_samples_leaf`` over five values from 0.001 to 0.005."""
        min_samples_leafs = np.linspace(0.001, 0.005, 5, endpoint=True)
        self.train(min_samples_leafs, 'min_samples_leaf')
        

# ANALISIS DE DATOS

## Levantamos los datos

In [None]:
# Load the transactions dataset from the notebook's working directory.
df = pd.read_csv("./Fraud.csv")

# Preview the first rows.
df.head()

## Tamaño del dataset

In [None]:
df.shape

## Tipos de los datos

In [None]:
df.dtypes

## Estadísticas descriptivas de los datos

In [None]:
df.describe()

## Se borran las columnas 'nameOrig' y 'nameDest'

In [None]:
# Drop the two name columns. Reassigning with errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError for missing columns.
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

## Revisamos si hay valores perdidos (None, NaN) en el resto del dataset

In [None]:
df.isnull().sum()

## Correlación de los datos

### Correlación de los datos contra la variable 'isFraud'

In [None]:
# Correlate numeric columns only: 'type' is still a string column here, and
# pandas >= 2.0 raises when DataFrame.corr() meets non-numeric data.
df.select_dtypes(include='number').corr()["isFraud"].sort_values()

### Correlación de los datos entre sí

In [None]:
# Heatmap of pairwise correlations between the numeric columns. Non-numeric
# columns are excluded first because pandas >= 2.0 raises on them in corr().
plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

## Cantidad de fraudes y no fraudes que hay en el dataset

In [None]:
df["isFraud"].value_counts()

## Cantidad de fraudes y no fraudes que hay en el dataset (Normalizado)

In [None]:
df["isFraud"].value_counts(normalize=True)

## Cantidad de tipos de transacciones que hay en el dataset

In [None]:
df['type'].value_counts()

## Transacciones fraudulentas y no fraudulentas diferenciadas por su tipo

In [None]:
# Bar chart of transaction counts per type, split by the isFraud flag.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=df, x="type", hue="isFraud", ax=ax)

## Porcentajes de transacciones fraudulentas de cada tipo de transacción

In [None]:
# Number of fraudulent transactions per type. Summing isFraud counts the
# frauds (the column is compared against 0/1 elsewhere in this notebook);
# the previous version counted every transaction of each type, which did not
# match the chart title.
df_type_fraud = df.groupby('type')['isFraud'].sum().reset_index(name='IsFraud')

pie_porcentaje_transacciones_fraudulentas = px.pie(df_type_fraud, values="IsFraud", names='type', title='Transacciones Fraudulentas', color_discrete_sequence=px.colors.sequential.RdBu)
pie_porcentaje_transacciones_fraudulentas.show()

## Mapeo el type a números

In [None]:
# Encode the transaction type as an integer code, then drop the text column.
mapping_type = {
    'CASH_IN': 0,
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'TRANSFER': 3,
    'DEBIT': 4,
}
df['type_numeric'] = df['type'].map(mapping_type)
df.drop(columns='type', inplace=True)

In [None]:
# Feature matrix: every column except the label.
X = df.drop('isFraud',axis=1)
# Label kept as a one-column DataFrame (double brackets), not a Series.
y = df[['isFraud']]

# Primeras pruebas con datos desbalanceados

## Separación de los datos de entrenamiento (80%) y datos para testing (20%)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Entreno el Arbol de Decision (Gini Impurity)

In [None]:
# Baseline Gini tree on the unbalanced split; no max_depth or
# min_samples_leaf is passed, so the constructor defaults apply.
dtGini_baseline = GiniDecisionTree(dataset=df,
                          X=X,
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'])
dtGini_baseline.train()

### Medidas de performance

#### Matriz de Confusion

In [None]:
dtGini_baseline.show_confusion_matrix()

#### Reporte

In [None]:
dtGini_baseline.show_classification_report()

#### Accuracy

In [None]:
dtGini_baseline.accuracy()

#### Recall

In [None]:
dtGini_baseline.recall()

#### Precision Score

In [None]:
dtGini_baseline.precision_score()

#### F1 Score

In [None]:
dtGini_baseline.f1_score()

In [None]:
dtGini_baseline.auc()

In [None]:
dtGini_baseline.roc_curve()

### Impresion del Arbol

In [None]:
dtGini_baseline.show_matrix()

### AUC (Area Under Curve) como métrica de evaluación

## Entreno el Árbol de Decisión (Information Gain)

In [None]:
# Baseline entropy-criterion tree on the unbalanced split; no max_depth or
# min_samples_leaf is passed, so the constructor defaults apply.
dtInfoGain_baseline = InformationGainDecisionTree(dataset=df,
                                                  X=X,
                                                  X_train=X_train, 
                                                  X_test=X_test, 
                                                  y_train=y_train,
                                                  y_test=y_test, 
                                                  target=df['isFraud'])
dtInfoGain_baseline.train()

In [None]:
# Disabled experiment: one-hot "dummies" for "type" instead of a fixed integer per value; the result appeared to be the same.
#df2 = pd.read_csv("./Fraud.csv")
#df2.drop(['nameOrig', 'nameDest'], axis=1, inplace=True)
# Getting Dummies from all other categorical vars
#for col in df2.dtypes[df2.dtypes == 'object'].index:
#    for_dummy = df2.pop(col)
#    df2 = pd.concat([df2, pd.get_dummies(for_dummy, prefix=col)], axis=1)
#X2 = df2.drop('isFraud',axis=1)
#y2 = df2[['isFraud']]
#X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
#dtInfoGain2 = InformationGainDecisionTree(df2, X=X2, 
#                                         X_train=X2_train, 
#                                         X_test=X2_test, 
#                                         y_train=y2_train, 
#                                         y_test=y2_test, 
#                                         target=df2['isFraud'],
#                                         max_depth=5, 
#                                         min_samples_leaf=5)
#dtInfoGain2.train()
#dtInfoGain2.show_confusion_matrix()
#dtInfoGain2.show_classification_report()

### Medidas de Performance

#### Matriz de Confusion

In [None]:
dtInfoGain_baseline.show_confusion_matrix()

#### Reporte

In [None]:
dtInfoGain_baseline.show_classification_report()

#### Accuracy

In [None]:
dtInfoGain_baseline.accuracy()

#### Recall

In [None]:
dtInfoGain_baseline.recall()

#### Precision

In [None]:
dtInfoGain_baseline.precision_score()

### Impresion del Arbol

In [None]:
dtInfoGain_baseline.show_matrix()

### AUC (Area Under Curve) como métrica de evaluación

In [None]:
dtInfoGain_baseline.auc()

## Comparo parámetros para max_depth

In [None]:
# Sweep max_depth from 1 to 30 for an entropy tree on the unbalanced split
# and plot train vs. test AUC.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test)

parameterTuning_InfoGain.get_best_max_depth(30)

In [None]:
# Sweep min_samples_split for an entropy tree on the unbalanced split.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test)
parameterTuning_InfoGain.get_best_min_samples_split()

In [None]:
# Sweep min_samples_leaf for an entropy tree on the unbalanced split.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test)
parameterTuning_InfoGain.get_best_min_samples_leaf()

In [None]:
dtInfoGain_baseline.roc_curve()

## Entreno Red Neuronal

In [None]:
# L-BFGS neural network with one hidden layer of 15 units, capped at 200
# iterations, trained on the unbalanced split.
nn = LBFGSNeuralNetwork(X=X,
                        y=y,
                        X_train=X_train, 
                        X_test=X_test, 
                        y_train=y_train, 
                        y_test=y_test, 
                        alpha=1e-5, 
                        hidden_layer_sizes=(15,), 
                        max_iter=200)
nn.train()

### Medidas de Performance

#### Matriz de Confusion

In [None]:
nn.show_confusion_matrix()

#### Reporte

In [None]:
nn.show_classification_report()

#### Accuracy

In [None]:
nn.accuracy()

#### Recall

In [None]:
nn.recall()

#### Precision

In [None]:
nn.precision_score()

# Agrego un Comparador de modelos

In [None]:
# Train the three models that are compared below, all on the same unbalanced
# split. Both trees are limited to max_depth=5 and min_samples_leaf=5.
dtGini = GiniDecisionTree(df,
                          X=X, 
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'],
                          max_depth=5, 
                          min_samples_leaf=5)
dtGini.train()

dtInfoGain = InformationGainDecisionTree(df,
                                         X=X, 
                                         X_train=X_train, 
                                         X_test=X_test, 
                                         y_train=y_train, 
                                         y_test=y_test, 
                                         target=df['isFraud'],
                                         max_depth=5, 
                                         min_samples_leaf=5)
dtInfoGain.train()

# Rebinds `nn` to a freshly trained network with the same settings as the
# earlier neural-network cell.
nn = LBFGSNeuralNetwork(X=X,
                        y=y,
                        X_train=X_train, 
                        X_test=X_test, 
                        y_train=y_train, 
                        y_test=y_test, 
                        alpha=1e-5, 
                        hidden_layer_sizes=(15,), 
                        max_iter=200)

nn.train()

In [None]:
# Models are reported in this order by every Comparator method.
models = [
    dtGini, dtInfoGain, nn
]

comparator = Comparator(models)

In [None]:
comparator.show_confusion_matrix()

In [None]:
comparator.show_classification_report()

In [None]:
comparator.accuracy()

In [None]:
comparator.recall()

In [None]:
comparator.precision_score()

# Primeras pruebas con datos Balanceados

In [None]:
# Count the rows of each class and plot the distribution as a bar chart.
target_count = df["isFraud"].value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
target_count.plot(kind='bar', title='Count (isFraud)');

## Random undersampling

In [None]:
# Separate the classes.
target_0 = df[df["isFraud"] == 0]
target_1 = df[df["isFraud"] == 1]
# Class counts, taken per label rather than from the frequency order of
# value_counts(), so they stay correct whichever class is larger.
target_0_count, target_1_count = len(target_0), len(target_1)

# Down-sample class 0 to the size of class 1; the fixed seed makes the drawn
# subset repeatable.
target_0_undersample = target_0.sample(target_1_count, random_state=42)
# Merge back into a single frame.
test_undersample = pd.concat([target_0_undersample, target_1], axis=0)
# Show counts and plot.
print('Random under-sampling:')
test_undersample["isFraud"].value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# Features/label and an 80/20 split of the under-sampled frame.
X_undersample = test_undersample.drop('isFraud',axis=1)
y_undersample = test_undersample[['isFraud']]
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(X_undersample, y_undersample, test_size=0.2, random_state=42)

In [None]:
len(X_undersample)

In [None]:
# Gini tree trained and evaluated on the under-sampled split.
dtGini_undersample = GiniDecisionTree(test_undersample,
                                      X=X_undersample, 
                                      X_train=X_undersample_train, 
                                      X_test=X_undersample_test, # NOTE(review): testing should use the original dataset, but this passes the under-sampled test split
                                      y_train=y_undersample_train, 
                                      y_test=y_undersample_test, # NOTE(review): testing should use the original dataset, but this passes the under-sampled test labels
                                      target=test_undersample['isFraud'],
                                      max_depth=5,
                                      min_samples_leaf=5)
dtGini_undersample.train()

In [None]:
dtGini_undersample.show_classification_report()
dtGini_undersample.show_confusion_matrix()

## Random oversampling

In [None]:
# Separate the classes.
target_0 = df[df["isFraud"] == 0]
target_1 = df[df["isFraud"] == 1]
# Class counts, taken per label rather than from the frequency order of
# value_counts(), so they stay correct whichever class is larger.
target_0_count, target_1_count = len(target_0), len(target_1)

# Up-sample class 1 (with replacement) to the size of class 0; the fixed
# seed makes the drawn rows repeatable.
target_1_oversample = target_1.sample(target_0_count, replace=True, random_state=42)
# Merge back into a single frame.
test_oversample = pd.concat([target_1_oversample, target_0], axis=0)
# Show counts and plot.
print('Random over-sampling:')
print(test_oversample["isFraud"].value_counts())
test_oversample["isFraud"].value_counts().plot(kind='bar', title='Count (isFraud)');


In [None]:
# Features/label and an 80/20 split of the over-sampled frame.
# NOTE(review): the frame contains repeated class-1 rows (sampled with
# replacement), so copies of one row can land in both train and test.
X_oversample = test_oversample.drop('isFraud',axis=1)
y_oversample = test_oversample[['isFraud']]
X_oversample_train, X_oversample_test, y_oversample_train, y_oversample_test = train_test_split(X_oversample, y_oversample, test_size=0.2, random_state=42)

In [None]:
# Gini tree trained and evaluated on the over-sampled split.
dtGini_oversample = GiniDecisionTree(test_oversample,
                                      X=X_oversample, 
                                      X_train=X_oversample_train, 
                                      X_test=X_oversample_test, # NOTE(review): testing should use the original dataset, but this passes the over-sampled test split
                                      y_train=y_oversample_train, 
                                      y_test=y_oversample_test, # NOTE(review): testing should use the original dataset, but this passes the over-sampled test labels
                                      target=test_oversample['isFraud'],
                                      max_depth=5,
                                      min_samples_leaf=5)
dtGini_oversample.train()

In [None]:
dtGini_oversample.show_classification_report()
dtGini_oversample.show_confusion_matrix()

## SMOTE

In [None]:
# Resample only the training split. Fitting SMOTE on the full X/y would put
# the rows of the original test split (and synthetic points built from them)
# into the balanced data, leaking them into any model later scored on X_test.
# The fixed seed makes the synthetic samples repeatable.
oversample = SMOTE(random_state=42)
X_balanced, y_balanced = oversample.fit_resample(X_train, y_train)

In [None]:
len(X)

In [None]:
len(X_balanced)

In [None]:
y.value_counts()

In [None]:
y_balanced.value_counts()

In [None]:
# 80/20 split of the SMOTE-balanced data, with a fixed seed.
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [None]:
# Gini tree trained on the SMOTE-balanced training data and evaluated on the
# original (unbalanced) test split.
dtGiniBalanced = GiniDecisionTree(df,
                                  X=X, 
                                  X_train=X_balanced_train, 
                                  X_test=X_test, # testing must be done on the original dataset
                                  y_train=y_balanced_train, 
                                  y_test=y_test, # testing must be done on the original dataset
                                  target=df['isFraud'],
                                  max_depth=5,
                                  min_samples_leaf=5)
dtGiniBalanced.train()

In [None]:
dtGiniBalanced.show_confusion_matrix()

In [None]:
dtGiniBalanced.show_classification_report()

In [None]:
dtGiniBalanced.accuracy()

In [None]:
dtGiniBalanced.precision_score()

In [None]:
dtGiniBalanced.recall()

# GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 receives the complement (1 - x).
# NOTE(review): linspace(..., 2) yields only two candidates (0.0 and 0.99).
weights = np.linspace(0.0,0.99,2)

# Parameter grid: one class_weight dict per candidate weight.
param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]}

# Grid search on the training split, scored by F1 with stratified folds.
# The labels are flattened to one dimension because y_train is a one-column
# DataFrame and scikit-learn expects a 1-D target here.
gridsearch = GridSearchCV(estimator= lr, 
                          param_grid= param_grid,
                          cv=StratifiedKFold(), 
                          n_jobs=-1, 
                          scoring='f1', 
                          verbose=2).fit(X_train, y_train.values.ravel())

# Plot the mean cross-validated score against the weight given to class 1.
sns.set_style('whitegrid')
plt.figure(figsize=(12,8))
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1- weights)})
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(data=weigh_data, x='weight', y='score')
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)


# RandomizedSearchCV

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# XGBoost classifier to be tuned.
clf1 = xgb.XGBClassifier()

# Search space. Only max_depth and min_child_weight are searched; the other
# candidate ranges are left commented out.
param_dist = {
        #'n_estimators':range(80,200,4),
        'max_depth':range(2,15,1),
        #'learning_rate':np.linspace(0.01,2,20),
        #'subsample':np.linspace(0.7,0.9,20),
        #'colsample_bytree':np.linspace(0.5,0.98,10),
        'min_child_weight':range(1,9,1)
        }

# Fixed seed so the sampled parameter combination is repeatable.
grid = RandomizedSearchCV(clf1,param_dist,cv = 3,scoring = 'neg_log_loss',n_iter=1,n_jobs = -1,random_state=42)

# Fit on the training split. The previous call passed df.values, which
# includes the isFraud column itself among the features (target leakage).
grid.fit(X_train,np.ravel(y_train))
# Best estimator found by the search.
best_estimator = grid.best_estimator_
print(best_estimator)
# Mean cross-validated score of the best estimator (negative log loss).
print(grid.best_score_)

## Comparo parámetros para datos Balanceados

In [None]:
# Sweep max_depth from 1 to 30 for an entropy tree on the balanced split
# and plot train vs. test AUC.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_balanced_train,
                                            X_test=X_balanced_test,
                                            y_train=y_balanced_train,
                                            y_test=y_balanced_test)

parameterTuning_InfoGain.get_best_max_depth(30)

In [None]:
# Sweep min_samples_split for an entropy tree on the balanced split.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_balanced_train,
                                            X_test=X_balanced_test,
                                            y_train=y_balanced_train,
                                            y_test=y_balanced_test)
parameterTuning_InfoGain.get_best_min_samples_split()

In [None]:
# Sweep min_samples_leaf for an entropy tree on the balanced split.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_balanced_train,
                                            X_test=X_balanced_test,
                                            y_train=y_balanced_train,
                                            y_test=y_balanced_test)
parameterTuning_InfoGain.get_best_min_samples_leaf()