## Imports

In [None]:
import Bibliotecas
%matplotlib inline
import Modelos
import Comparador
import ParameterTuning

### Timing

In [None]:
inicio_ppio = time.time()

# ANALISIS DE DATOS

## Levantamos los datos

In [None]:
df = pd.read_csv("./Fraud.csv")

df.head()

## Tamaño del dataset

In [None]:
df.shape

## Tipos de los datos

In [None]:
df.dtypes

## Estadisticas descriptivas de los datos

In [None]:
df.describe()

## Se borran las columnas 'nameOrig' y 'nameDest'

In [None]:
df.drop(['nameOrig', 'nameDest'], axis=1, inplace=True)

## Revisamos si hay valores perdidos (None, NaN) en el resto del dataset

In [None]:
df.isnull().sum()

## Correlación de los datos

### Correlacion de los datos contra la variable 'isFraud'

In [None]:
# Correlate only the numeric/boolean columns: 'type' still holds strings at this
# point, and pandas >= 2.0 raises on non-numeric columns instead of skipping them.
df.select_dtypes(include=["number", "bool"]).corr()["isFraud"].sort_values()

### Correlacion de los datos entre si

In [None]:
# Heatmap of pairwise correlations. Only numeric/boolean columns are used because
# the string column 'type' is still present here and pandas >= 2.0 would raise on it.
plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), annot=True)

## Cantidad de fraudes y no fraudes que hay en el dataset

In [None]:
df["isFraud"].value_counts()

## Cantidad de fraudes y no fraudes que hay en el dataset (Normalizado)

In [None]:
df["isFraud"].value_counts(normalize=True)

## Cantidad de tipos de transacciones que hay en el dataset

In [None]:
df['type'].value_counts()

## Transacciones fraudulentas y no fraudulentas diferenciadas por su tipo

In [None]:
plt.figure(figsize=(15, 8))
sns.countplot(x="type", data=df, hue="isFraud")

## Porcentajes de transacciones fraudulentas de cada tipo de transaccion

In [None]:
# Share of fraudulent transactions per transaction type.
# Only rows flagged as fraud are counted; counting every row would chart the
# overall type distribution under a "fraudulent transactions" title.
# https://docs.python.org/3/library/collections.html#collections.Counter
fraud_counts_by_type = Counter(df.loc[df['isFraud'] == 1, 'type'])
df_type_fraud = pd.DataFrame(list(fraud_counts_by_type.items()), columns=['type', 'IsFraud'])

pie_porcentaje_transacciones_fraudulentas = px.pie(df_type_fraud, values="IsFraud", names='type', title='Transacciones Fraudulentas', color_discrete_sequence=px.colors.sequential.RdBu)
pie_porcentaje_transacciones_fraudulentas.show()

## Mapeo el type a números

In [None]:
# Swap the categorical 'type' column for an integer-coded 'type_numeric' column.
mapping_type = dict(CASH_IN=0, CASH_OUT=1, PAYMENT=2, TRANSFER=3, DEBIT=4)
type_codes = df['type'].map(mapping_type)
df['type_numeric'] = type_codes
df.drop(columns=['type'], inplace=True)

In [None]:
X = df.drop('isFraud',axis=1)
y = df[['isFraud']]

# Primeras pruebas con datos desbalanceados

## Separación de los datos de entrenamiento (80%) y datos para testing (20%)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Entreno el Arbol de Decision (Gini Impurity)

In [None]:
# Baseline model: Gini decision tree fitted on the original (unbalanced) data,
# using the 80/20 train/test split created above.
dtGini_baseline = GiniDecisionTree(dataset=df,
                          X=X,
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'])
dtGini_baseline.train()

### Medidas de performance

#### Matriz de Confusion

In [None]:
dtGini_baseline.show_confusion_matrix()

#### Reporte

In [None]:
dtGini_baseline.show_classification_report()

#### Accuracy

In [None]:
dtGini_baseline.accuracy()

#### Recall

In [None]:
dtGini_baseline.recall()

#### Precision Score

In [None]:
dtGini_baseline.precision_score()

#### F1 Score

In [None]:
dtGini_baseline.f1_score()

In [None]:
dtGini_baseline.auc()

### AUC (Area Under Curve) como métrica de evaluación

In [None]:
dtGini_baseline.roc_curve()

### Impresion del Arbol

In [None]:
#dtGini_baseline.show_matrix()

## Entreno el Arbol de Decision (Information Gain)

In [None]:
# Baseline model: Information Gain decision tree fitted on the same unbalanced
# 80/20 split used for the Gini baseline.
dtInfoGain_baseline = InformationGainDecisionTree(dataset=df,
                                                  X=X,
                                                  X_train=X_train, 
                                                  X_test=X_test, 
                                                  y_train=y_train,
                                                  y_test=y_test, 
                                                  target=df['isFraud'])
dtInfoGain_baseline.train()

In [None]:
# Experiment: encode "type" with one-hot dummy columns instead of assigning a
# fixed integer to each category, then train an Information Gain tree on it.
df2 = pd.read_csv("./Fraud.csv").drop(columns=['nameOrig', 'nameDest'])

# Replace every remaining object-typed column with its dummy columns.
categorical_columns = df2.dtypes[df2.dtypes == 'object'].index
for col in categorical_columns:
    encoded = pd.get_dummies(df2.pop(col), prefix=col)
    df2 = pd.concat([df2, encoded], axis=1)

y2 = df2[['isFraud']]
X2 = df2.drop(columns='isFraud')
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

dtInfoGain2 = InformationGainDecisionTree(
    df2,
    X=X2,
    X_train=X2_train,
    X_test=X2_test,
    y_train=y2_train,
    y_test=y2_test,
    target=df2['isFraud'],
)
dtInfoGain2.train()
dtInfoGain2.show_classification_report()

In [None]:
# AUC of the one-hot ("dummies") tree next to a baseline AUC.
# NOTE(review): dtInfoGain2 is an Information Gain tree but the comparison model
# here is the Gini baseline; dtInfoGain_baseline may be the intended counterpart — confirm.
print(dtInfoGain2.auc())
dtGini_baseline.auc()

### Medidas de Performance

#### Matriz de Confusion

In [None]:
dtInfoGain_baseline.show_confusion_matrix()

#### Reporte

In [None]:
dtInfoGain_baseline.show_classification_report()

#### Accuracy

In [None]:
dtInfoGain_baseline.accuracy()

#### Recall

In [None]:
dtInfoGain_baseline.recall()

#### Precision

In [None]:
dtInfoGain_baseline.precision_score()

### Impresion del Arbol

In [None]:
#dtInfoGain_baseline.show_matrix()

### AUC (Area Under Curve) como métrica de evaluación

In [None]:
dtInfoGain_baseline.auc()

In [None]:
dtInfoGain_baseline.roc_curve()

## Comparo parámetros para max_depth, min_samples_split y min_samples_leaf (baseline)

In [None]:
# Per-parameter exploration for the entropy criterion on the unbalanced split.
# NOTE(review): X_test/y_test are handed to ParameterTuning; if it ranks candidates
# by their score on that split, the test set is steering model selection — confirm.
parameterTuning_InfoGain = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
infogain_best_max_depth_baseline = parameterTuning_InfoGain.get_best_max_depth(30)
infogain_best_max_depth_baseline = ParameterTuning.list_only_parameters(infogain_best_max_depth_baseline)
# min_samples_split: two runs (default argument and 20) merged by get_best_result.
infogain_best_min_samples_split_baseline_1 = parameterTuning_InfoGain.get_best_min_samples_split()
infogain_best_min_samples_split_baseline_2 = parameterTuning_InfoGain.get_best_min_samples_split(20)
infogain_best_min_samples_split_baseline = ParameterTuning.get_best_result(infogain_best_min_samples_split_baseline_1, infogain_best_min_samples_split_baseline_2)
# min_samples_leaf: same two-run scheme.
infogain_best_min_samples_leaf_baseline_1 = parameterTuning_InfoGain.get_best_min_samples_leaf()
infogain_best_min_samples_leaf_baseline_2 = parameterTuning_InfoGain.get_best_min_samples_leaf(20)
infogain_best_min_samples_leaf_baseline = ParameterTuning.get_best_result(infogain_best_min_samples_leaf_baseline_1, infogain_best_min_samples_leaf_baseline_2)

In [None]:
# Per-parameter exploration for the gini criterion on the unbalanced split.
# NOTE(review): X_test/y_test are handed to ParameterTuning; if it ranks candidates
# by their score on that split, the test set is steering model selection — confirm.
parameterTuning_Gini = ParameterTuning(decisionTreeCriterion="gini",
                                            X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test)
# max_depth candidates, reduced to the bare parameter values.
gini_best_max_depth_baseline = parameterTuning_Gini.get_best_max_depth(30)
gini_best_max_depth_baseline = ParameterTuning.list_only_parameters(gini_best_max_depth_baseline)
# min_samples_split: two runs (default argument and 20) merged by get_best_result.
gini_best_min_samples_split_baseline_1 = parameterTuning_Gini.get_best_min_samples_split()
gini_best_min_samples_split_baseline_2 = parameterTuning_Gini.get_best_min_samples_split(20)
gini_best_min_samples_split_baseline = ParameterTuning.get_best_result(gini_best_min_samples_split_baseline_1, gini_best_min_samples_split_baseline_2)
# min_samples_leaf: same two-run scheme.
gini_best_min_samples_leaf_baseline_1 = parameterTuning_Gini.get_best_min_samples_leaf()
gini_best_min_samples_leaf_baseline_2 = parameterTuning_Gini.get_best_min_samples_leaf(20)
gini_best_min_samples_leaf_baseline = ParameterTuning.get_best_result(gini_best_min_samples_leaf_baseline_1, gini_best_min_samples_leaf_baseline_2)

# Pruebas con datos Balanceados

In [None]:
# Tally the rows in each isFraud class, print both totals and chart them.
target_count = df["isFraud"].value_counts()
for class_label in (0, 1):
    print(f'Class {class_label}:', target_count[class_label])
target_count.plot(kind='bar', title='Count (isFraud)');

## Random undersampling

### TODO: creo que NO debería usarse df, sino X_train ++ y_train. Lo mismo para over/smote. La parte de test no debería aparecer en ningún dataset de entrenamiento.

In [None]:
# Random under-sampling of the majority class.
# Resample ONLY the training split: building the balanced set from the full df
# would put rows of X_test/y_test into the training data (leakage), inflating the
# scores later measured on that test split.
train_df = pd.concat([X_train, y_train], axis=1)

# Class counts, looked up by label so they do not depend on value_counts() ordering
class_counts = train_df["isFraud"].value_counts()
target_0_count, target_1_count = class_counts[0], class_counts[1]
# Separate classes
target_0 = train_df[train_df["isFraud"] == 0]
target_1 = train_df[train_df["isFraud"] == 1]

# Down-sample class 0 to match the class 1 count (seeded for reproducibility)
target_0_undersample = target_0.sample(target_1_count, random_state=42)
# Merge back to single df
test_undersample = pd.concat([target_0_undersample, target_1], axis=0)
# Show counts and plot
print('Random under-sampling:')
test_undersample["isFraud"].value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# Split the under-sampled frame into features/target and then 80/20.
# In the visible cells the models are evaluated on X_test/y_test from the original
# split; y_undersample_test is only inspected with value_counts().
X_undersample = test_undersample.drop('isFraud',axis=1)
y_undersample = test_undersample[['isFraud']]
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(X_undersample, y_undersample, test_size=0.2, random_state=42)

In [None]:
len(X_undersample)

In [None]:
y_undersample_test.value_counts()

In [None]:
y_test.value_counts()

In [None]:
# Gini tree trained on the under-sampled data and evaluated on the original test split.
dtGini_undersample = GiniDecisionTree(test_undersample,
                                      X=X_undersample, 
                                      X_train=X_undersample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_undersample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_undersample['isFraud'])
dtGini_undersample.train()

In [None]:
dtGini_undersample.show_classification_report()
dtGini_undersample.show_confusion_matrix()

## Random oversampling

In [None]:
# Random over-sampling of the minority class.
# Resample ONLY the training split: building the balanced set from the full df
# would copy rows of X_test/y_test into the training data (leakage), inflating the
# scores later measured on that test split.
train_df = pd.concat([X_train, y_train], axis=1)

# Class counts, looked up by label so they do not depend on value_counts() ordering
class_counts = train_df["isFraud"].value_counts()
target_0_count, target_1_count = class_counts[0], class_counts[1]
# Separate classes
target_0 = train_df[train_df["isFraud"] == 0]
target_1 = train_df[train_df["isFraud"] == 1]

# Up-sample class 1 (with replacement) to match the class 0 count (seeded for reproducibility)
target_1_oversample = target_1.sample(target_0_count, replace=True, random_state=42)
# Merge back to single df
test_oversample = pd.concat([target_1_oversample, target_0], axis=0)
# Show counts and plot
print('Random over-sampling:')
print(test_oversample["isFraud"].value_counts())
test_oversample["isFraud"].value_counts().plot(kind='bar', title='Count (isFraud)');


In [None]:
# Split the over-sampled frame into features/target and then 80/20.
# In the visible cells only the *_train parts are used; evaluation is done on
# X_test/y_test from the original split.
X_oversample = test_oversample.drop('isFraud',axis=1)
y_oversample = test_oversample[['isFraud']]
X_oversample_train, X_oversample_test, y_oversample_train, y_oversample_test = train_test_split(X_oversample, y_oversample, test_size=0.2, random_state=42)

In [None]:
# Gini tree trained on the over-sampled data and evaluated on the original test split.
dtGini_oversample = GiniDecisionTree(test_oversample,
                                      X=X_oversample, 
                                      X_train=X_oversample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_oversample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_oversample['isFraud'])
dtGini_oversample.train()

In [None]:
dtGini_oversample.show_classification_report()
dtGini_oversample.show_confusion_matrix()

## SMOTE

In [None]:
# Synthesize minority-class samples from the TRAINING split only. Fitting SMOTE on
# the full X, y would interpolate from rows that are later used as test data
# (leakage). Seeded so the synthetic samples are reproducible.
oversample = SMOTE(random_state=42)
X_smote, y_smote = oversample.fit_resample(X_train, y_train)

In [None]:
len(X)

In [None]:
len(X_smote)

In [None]:
y.value_counts()

In [None]:
y_smote.value_counts()

In [None]:
# 80/20 split of the SMOTE-resampled data. In the visible cells only the *_train
# parts are used; evaluation is done on X_test/y_test from the original split.
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [None]:
# Gini tree trained on the SMOTE-resampled data and evaluated on the original test split.
# The dataset/X/target arguments still refer to the original df, unlike the
# under/over-sampling models, which pass their resampled frames.
dtGini_smote = GiniDecisionTree(df,
                                  X=X, 
                                  X_train=X_smote_train, 
                                  X_test=X_test, # testing must use the original dataset
                                  y_train=y_smote_train, 
                                  y_test=y_test, # testing must use the original dataset
                                  target=df['isFraud'])
dtGini_smote.train()

In [None]:
dtGini_smote.show_confusion_matrix()

In [None]:
dtGini_smote.show_classification_report()

In [None]:
dtGini_smote.accuracy()

In [None]:
dtGini_smote.precision_score()

In [None]:
dtGini_smote.recall()

## Comparo parámetros para datos Balanceados

### undersample

In [None]:
# Per-parameter exploration, entropy criterion, trained on the under-sampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_InfoGain_undersample = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_undersample_train,
                                            X_test=X_test,
                                            y_train=y_undersample_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
infogain_best_max_depth_undersample = parameterTuning_InfoGain_undersample.get_best_max_depth(30)
infogain_best_max_depth_undersample = ParameterTuning.list_only_parameters(infogain_best_max_depth_undersample)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
infogain_best_min_samples_split_undersample_1 = parameterTuning_InfoGain_undersample.get_best_min_samples_split()
infogain_best_min_samples_split_undersample_2 = parameterTuning_InfoGain_undersample.get_best_min_samples_split(20)
infogain_best_min_samples_split_undersample = ParameterTuning.get_best_result(infogain_best_min_samples_split_undersample_1, infogain_best_min_samples_split_undersample_2)

# min_samples_leaf: same two-run scheme.
infogain_best_min_samples_leaf_undersample_1 = parameterTuning_InfoGain_undersample.get_best_min_samples_leaf()
infogain_best_min_samples_leaf_undersample_2 = parameterTuning_InfoGain_undersample.get_best_min_samples_leaf(20)
infogain_best_min_samples_leaf_undersample = ParameterTuning.get_best_result(infogain_best_min_samples_leaf_undersample_1, infogain_best_min_samples_leaf_undersample_2)

In [None]:
# Per-parameter exploration, gini criterion, trained on the under-sampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_Gini_undersample = ParameterTuning(decisionTreeCriterion="gini",
                                            X_train=X_undersample_train,
                                            X_test=X_test,
                                            y_train=y_undersample_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
gini_best_max_depth_undersample = parameterTuning_Gini_undersample.get_best_max_depth(30)
gini_best_max_depth_undersample = ParameterTuning.list_only_parameters(gini_best_max_depth_undersample)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
gini_best_min_samples_split_undersample_1 = parameterTuning_Gini_undersample.get_best_min_samples_split()
gini_best_min_samples_split_undersample_2 = parameterTuning_Gini_undersample.get_best_min_samples_split(20)
gini_best_min_samples_split_undersample = ParameterTuning.get_best_result(gini_best_min_samples_split_undersample_1, gini_best_min_samples_split_undersample_2)

# min_samples_leaf: same two-run scheme.
gini_best_min_samples_leaf_undersample_1 = parameterTuning_Gini_undersample.get_best_min_samples_leaf()
gini_best_min_samples_leaf_undersample_2 = parameterTuning_Gini_undersample.get_best_min_samples_leaf(20)
gini_best_min_samples_leaf_undersample = ParameterTuning.get_best_result(gini_best_min_samples_leaf_undersample_1, gini_best_min_samples_leaf_undersample_2)

### oversample

In [None]:
# Per-parameter exploration, entropy criterion, trained on the over-sampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_InfoGain_over = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_oversample_train,
                                            X_test=X_test,
                                            y_train=y_oversample_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
infogain_best_max_depth_oversample = parameterTuning_InfoGain_over.get_best_max_depth(30)
infogain_best_max_depth_oversample = ParameterTuning.list_only_parameters(infogain_best_max_depth_oversample)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
infogain_best_min_samples_split_oversample_1 = parameterTuning_InfoGain_over.get_best_min_samples_split()
infogain_best_min_samples_split_oversample_2 = parameterTuning_InfoGain_over.get_best_min_samples_split(20)
infogain_best_min_samples_split_oversample = ParameterTuning.get_best_result(infogain_best_min_samples_split_oversample_1, infogain_best_min_samples_split_oversample_2)

# min_samples_leaf: same two-run scheme.
infogain_best_min_samples_leaf_oversample_1 = parameterTuning_InfoGain_over.get_best_min_samples_leaf()
infogain_best_min_samples_leaf_oversample_2 = parameterTuning_InfoGain_over.get_best_min_samples_leaf(20)
infogain_best_min_samples_leaf_oversample = ParameterTuning.get_best_result(infogain_best_min_samples_leaf_oversample_1, infogain_best_min_samples_leaf_oversample_2)

In [None]:
# Per-parameter exploration, gini criterion, trained on the over-sampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_Gini_oversample = ParameterTuning(decisionTreeCriterion="gini",
                                            X_train=X_oversample_train,
                                            X_test=X_test,
                                            y_train=y_oversample_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
gini_best_max_depth_oversample = parameterTuning_Gini_oversample.get_best_max_depth(30)
gini_best_max_depth_oversample = ParameterTuning.list_only_parameters(gini_best_max_depth_oversample)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
gini_best_min_samples_split_oversample_1 = parameterTuning_Gini_oversample.get_best_min_samples_split()
gini_best_min_samples_split_oversample_2 = parameterTuning_Gini_oversample.get_best_min_samples_split(20)
gini_best_min_samples_split_oversample = ParameterTuning.get_best_result(gini_best_min_samples_split_oversample_1, gini_best_min_samples_split_oversample_2)

# min_samples_leaf: same two-run scheme.
gini_best_min_samples_leaf_oversample_1 = parameterTuning_Gini_oversample.get_best_min_samples_leaf()
gini_best_min_samples_leaf_oversample_2 = parameterTuning_Gini_oversample.get_best_min_samples_leaf(20)
gini_best_min_samples_leaf_oversample = ParameterTuning.get_best_result(gini_best_min_samples_leaf_oversample_1, gini_best_min_samples_leaf_oversample_2)


### smote

In [None]:
# Per-parameter exploration, entropy criterion, trained on the SMOTE-resampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_InfoGain_smote = ParameterTuning(decisionTreeCriterion="entropy",
                                            X_train=X_smote_train,
                                            X_test=X_test,
                                            y_train=y_smote_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
infogain_best_max_depth_smote = parameterTuning_InfoGain_smote.get_best_max_depth(30)
infogain_best_max_depth_smote = ParameterTuning.list_only_parameters(infogain_best_max_depth_smote)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
infogain_best_min_samples_split_smote_1 = parameterTuning_InfoGain_smote.get_best_min_samples_split()
infogain_best_min_samples_split_smote_2 = parameterTuning_InfoGain_smote.get_best_min_samples_split(20)
infogain_best_min_samples_split_smote = ParameterTuning.get_best_result(infogain_best_min_samples_split_smote_1, infogain_best_min_samples_split_smote_2)

# min_samples_leaf: same two-run scheme.
infogain_best_min_samples_leaf_smote_1 = parameterTuning_InfoGain_smote.get_best_min_samples_leaf()
infogain_best_min_samples_leaf_smote_2 = parameterTuning_InfoGain_smote.get_best_min_samples_leaf(20)
infogain_best_min_samples_leaf_smote = ParameterTuning.get_best_result(infogain_best_min_samples_leaf_smote_1, infogain_best_min_samples_leaf_smote_2)

In [None]:
# Per-parameter exploration, gini criterion, trained on the SMOTE-resampled data
# and given the original test split.
# NOTE(review): if ParameterTuning ranks candidates by their score on X_test, the
# test set is steering model selection — confirm.
parameterTuning_Gini_smote = ParameterTuning(decisionTreeCriterion="gini",
                                            X_train=X_smote_train,
                                            X_test=X_test,
                                            y_train=y_smote_train,
                                            y_test=y_test)

# max_depth candidates, reduced to the bare parameter values.
gini_best_max_depth_smote = parameterTuning_Gini_smote.get_best_max_depth(30)
gini_best_max_depth_smote = ParameterTuning.list_only_parameters(gini_best_max_depth_smote)

# min_samples_split: two runs (default argument and 20) merged by get_best_result.
gini_best_min_samples_split_smote_1 = parameterTuning_Gini_smote.get_best_min_samples_split()
gini_best_min_samples_split_smote_2 = parameterTuning_Gini_smote.get_best_min_samples_split(20)
gini_best_min_samples_split_smote = ParameterTuning.get_best_result(gini_best_min_samples_split_smote_1, gini_best_min_samples_split_smote_2)

# min_samples_leaf: same two-run scheme.
gini_best_min_samples_leaf_smote_1 = parameterTuning_Gini_smote.get_best_min_samples_leaf()
gini_best_min_samples_leaf_smote_2 = parameterTuning_Gini_smote.get_best_min_samples_leaf(20)
gini_best_min_samples_leaf_smote = ParameterTuning.get_best_result(gini_best_min_samples_leaf_smote_1, gini_best_min_samples_leaf_smote_2)

# GridSearchCV para DecisionTree

#### Tomamos los mejores 5 resultados para cada parámetro

In [None]:
# GridSearchCV parameter grids for the unbalanced (baseline) data, one per criterion.
# Each list is cut to its first five entries.
# NOTE(review): assumes the tuning helpers return candidates ordered best-first — confirm.
param_grid_baseline_gini = {'criterion': ['gini'],
              'max_depth': gini_best_max_depth_baseline[:5],
              'min_samples_split': gini_best_min_samples_split_baseline[:5],
              'min_samples_leaf': gini_best_min_samples_leaf_baseline[:5]}

param_grid_baseline_infogain = {'criterion': ['entropy'],
              'max_depth': infogain_best_max_depth_baseline[:5],
              'min_samples_split': infogain_best_min_samples_split_baseline[:5],
              'min_samples_leaf': infogain_best_min_samples_leaf_baseline[:5]}

In [None]:
# GridSearchCV parameter grids for the under-sampled data, one per criterion.
# Each list is cut to its first five entries.
# NOTE(review): assumes the tuning helpers return candidates ordered best-first — confirm.
param_grid_undersample_gini = {'criterion': ['gini'],
              'max_depth': gini_best_max_depth_undersample[:5],
              'min_samples_split': gini_best_min_samples_split_undersample[:5],
              'min_samples_leaf': gini_best_min_samples_leaf_undersample[:5]}

param_grid_undersample_infogain = {'criterion': ['entropy'],
              'max_depth': infogain_best_max_depth_undersample[:5],
              'min_samples_split': infogain_best_min_samples_split_undersample[:5],
              'min_samples_leaf': infogain_best_min_samples_leaf_undersample[:5]}

In [None]:
# GridSearchCV parameter grids for the over-sampled data, one per criterion.
# Each list is cut to its first five entries.
# NOTE(review): assumes the tuning helpers return candidates ordered best-first — confirm.
param_grid_oversample_gini = {'criterion': ['gini'],
              'max_depth': gini_best_max_depth_oversample[:5],
              'min_samples_split': gini_best_min_samples_split_oversample[:5],
              'min_samples_leaf': gini_best_min_samples_leaf_oversample[:5]}

param_grid_oversample_infogain = {'criterion': ['entropy'],
              'max_depth': infogain_best_max_depth_oversample[:5],
              'min_samples_split': infogain_best_min_samples_split_oversample[:5],
              'min_samples_leaf': infogain_best_min_samples_leaf_oversample[:5]}

In [None]:
# GridSearchCV parameter grids for the SMOTE-resampled data, one per criterion.
# Each list is cut to its first five entries.
# NOTE(review): assumes the tuning helpers return candidates ordered best-first — confirm.
param_grid_smote_gini = {'criterion': ['gini'],
              'max_depth': gini_best_max_depth_smote[:5],
              'min_samples_split': gini_best_min_samples_split_smote[:5],
              'min_samples_leaf': gini_best_min_samples_leaf_smote[:5]}

param_grid_smote_infogain = {'criterion': ['entropy'],
              'max_depth': infogain_best_max_depth_smote[:5],
              'min_samples_split': infogain_best_min_samples_split_smote[:5],
              'min_samples_leaf': infogain_best_min_samples_leaf_smote[:5]}

In [None]:
def TrainGridSearchDTC(param_grid, X_train, y_train, cv=2, random_state=None):
    """Grid-search a DecisionTreeClassifier and report how long the search took.

    Args:
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        X_train: Training features used to fit the search.
        y_train: Training target used to fit the search.
        cv: Number of cross-validation folds. Defaults to 2, the value the
            search has always run with (the old comment wrongly said 3).
        random_state: Seed forwarded to ``DecisionTreeClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            reproducible searches.

    Returns:
        The fitted ``GridSearchCV`` object, scored with ROC AUC.
    """
    gridsearch_dt = GridSearchCV(estimator=DecisionTreeClassifier(random_state=random_state),
                                 param_grid=param_grid,
                                 cv=cv,
                                 n_jobs=-1,  # use every available core
                                 scoring='roc_auc',
                                 verbose=2)
    # perf_counter is monotonic, so the elapsed time is immune to clock adjustments.
    inicio = time.perf_counter()
    gridsearch_dt.fit(X_train, y_train)
    fin = time.perf_counter()
    print("Tiempo total (min): {}".format((fin - inicio) / 60))
    return gridsearch_dt

### Best Params Baseline

In [None]:
gridsearch_gini_baseline = TrainGridSearchDTC(param_grid_baseline_gini, X_train, y_train)
print(gridsearch_gini_baseline.best_params_)
print(gridsearch_gini_baseline.best_score_)

In [None]:
gridsearch_infogain_baseline = TrainGridSearchDTC(param_grid_baseline_infogain, X_train, y_train)
print(gridsearch_infogain_baseline.best_params_)
print(gridsearch_infogain_baseline.best_score_)

### Best Params undersample

In [None]:
gridsearch_gini_undersample = TrainGridSearchDTC(param_grid_undersample_gini, X_undersample_train, y_undersample_train)
print("Best params: ", gridsearch_gini_undersample.best_params_)
print("score: ",gridsearch_gini_undersample.best_score_)

In [None]:
gridsearch_infogain_undersample = TrainGridSearchDTC(param_grid_undersample_infogain, X_undersample_train, y_undersample_train)
print(gridsearch_infogain_undersample.best_params_)
print(gridsearch_infogain_undersample.best_score_)

### Best Params oversample

In [None]:
gridsearch_gini_oversample = TrainGridSearchDTC(param_grid_oversample_gini, X_oversample_train, y_oversample_train)
print(gridsearch_gini_oversample.best_params_)
print(gridsearch_gini_oversample.best_score_)

In [None]:
gridsearch_infogain_oversample = TrainGridSearchDTC(param_grid_oversample_infogain, X_oversample_train, y_oversample_train)
print(gridsearch_infogain_oversample.best_params_)
print(gridsearch_infogain_oversample.best_score_)

### Best Params smote

In [None]:
gridsearch_gini_smote = TrainGridSearchDTC(param_grid_smote_gini, X_smote_train, y_smote_train)
print(gridsearch_gini_smote.best_params_)
print(gridsearch_gini_smote.best_score_)

In [None]:
gridsearch_infogain_smote = TrainGridSearchDTC(param_grid_smote_infogain, X_smote_train, y_smote_train)
print(gridsearch_infogain_smote.best_params_)
print(gridsearch_infogain_smote.best_score_)

# Comparador de modelos DT (Gini)

In [None]:
# Build and train every Gini model that the comparator cells use.
# For each data variant (unbalanced, under-sampled, over-sampled, SMOTE) there is a
# default-parameter model and a "-bp" model that takes max_depth, min_samples_split
# and min_samples_leaf from the matching grid search. All models are evaluated on
# X_test/y_test from the original split.
# dtGini_undersample, dtGini_oversample and dtGini_smote are re-created here with a
# desc label, replacing the instances built in the earlier cells.

# --- Unbalanced data ---
dtGini = GiniDecisionTree(df,
                          X=X, 
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'],
                          desc="desbalanceado")
dtGini.train()

dtGini_bestparams = GiniDecisionTree(df,
                          X=X, 
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'],
                          max_depth=gridsearch_gini_baseline.best_params_['max_depth'],
                          min_samples_split=gridsearch_gini_baseline.best_params_['min_samples_split'],
                          min_samples_leaf=gridsearch_gini_baseline.best_params_['min_samples_leaf'],
                          desc="desbalanceado-bp")
dtGini_bestparams.train()

# --- Random under-sampling ---
dtGini_undersample = GiniDecisionTree(test_undersample,
                                      X=X_undersample, 
                                      X_train=X_undersample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_undersample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_undersample['isFraud'],
                                      desc="undersample")
dtGini_undersample.train()

dtGini_undersample_bestparams = GiniDecisionTree(test_undersample,
                                      X=X_undersample, 
                                      X_train=X_undersample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_undersample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_undersample['isFraud'],
                                      max_depth=gridsearch_gini_undersample.best_params_['max_depth'],
                                      min_samples_split=gridsearch_gini_undersample.best_params_['min_samples_split'],
                                      min_samples_leaf=gridsearch_gini_undersample.best_params_['min_samples_leaf'],
                                      desc="undersample-bp")
dtGini_undersample_bestparams.train()

# --- Random over-sampling ---
dtGini_oversample = GiniDecisionTree(test_oversample,
                                      X=X_oversample, 
                                      X_train=X_oversample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_oversample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_oversample['isFraud'],
                                      desc="oversample")
dtGini_oversample.train()

dtGini_oversample_bestparams = GiniDecisionTree(test_oversample,
                                      X=X_oversample, 
                                      X_train=X_oversample_train, 
                                      X_test=X_test, # testing must use the original dataset
                                      y_train=y_oversample_train, 
                                      y_test=y_test, # testing must use the original dataset
                                      target=test_oversample['isFraud'],
                                      max_depth=gridsearch_gini_oversample.best_params_['max_depth'],
                                      min_samples_split=gridsearch_gini_oversample.best_params_['min_samples_split'],
                                      min_samples_leaf=gridsearch_gini_oversample.best_params_['min_samples_leaf'],
                                      desc="oversample-bp")
dtGini_oversample_bestparams.train()

# --- SMOTE ---
dtGini_smote = GiniDecisionTree(df,
                                  X=X, 
                                  X_train=X_smote_train, 
                                  X_test=X_test, # testing must use the original dataset
                                  y_train=y_smote_train, 
                                  y_test=y_test, # testing must use the original dataset
                                  target=df['isFraud'],
                                  desc="smote")
dtGini_smote.train()

dtGini_smote_bestparams = GiniDecisionTree(df,
                                  X=X, 
                                  X_train=X_smote_train, 
                                  X_test=X_test, # testing must use the original dataset
                                  y_train=y_smote_train, 
                                  y_test=y_test, # testing must use the original dataset
                                  target=df['isFraud'],
                                  max_depth=gridsearch_gini_smote.best_params_['max_depth'],
                                  min_samples_split=gridsearch_gini_smote.best_params_['min_samples_split'],
                                  min_samples_leaf=gridsearch_gini_smote.best_params_['min_samples_leaf'],
                                  desc="smote-bp")
dtGini_smote_bestparams.train()

In [None]:
models = [
    dtGini, dtGini_bestparams, dtGini_undersample, dtGini_undersample_bestparams
]

comparator = Comparator(models)

In [None]:
comparator.auc()
comparator.roc_curve()

In [None]:
models = [
    dtGini_oversample, dtGini_oversample_bestparams,
    dtGini_smote, dtGini_smote_bestparams
]

comparator = Comparator(models)

In [None]:
comparator.auc()
comparator.roc_curve()

## Mejor modelo para GINI

In [None]:
# TODO: add the selection of the best Gini model.
# models = [
#     dtGini_oversample, dtGini_oversample_bestparams,
#     dtGini_smote, dtGini_smote_bestparams, (include every Gini model?)
# ]

# comparator = Comparator(models)
# comparator.best() -> intended to print the best model's AUC and which model it is

# Comparador de modelos DT (Information Gain)

In [None]:
# Information Gain tree trained and evaluated on the original train/test split;
# no explicit hyperparameters are passed.
dtInfoGain = InformationGainDecisionTree(
    df,
    X=X,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    target=df['isFraud'],
    desc="desbalanceado",
)
dtInfoGain.train()

# Information Gain trees fitted on the undersample training split
# (X_undersample_train / y_undersample_train). Both are evaluated on the
# original test split (X_test / y_test).
dtInfoGain_undersample = InformationGainDecisionTree(
    test_undersample,
    X=X_undersample,
    X_train=X_undersample_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_undersample_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=test_undersample['isFraud'],
    desc="undersample",
)
dtInfoGain_undersample.train()

# Same data with the hyperparameters chosen by gridsearch_infogain_undersample.
# The "-bp" suffix keeps this model's description distinct from the untuned
# model above, following the tuned Gini trees ("oversample-bp", "smote-bp").
dtInfoGain_undersample_bestparams = InformationGainDecisionTree(
    test_undersample,
    X=X_undersample,
    X_train=X_undersample_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_undersample_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=test_undersample['isFraud'],
    max_depth=gridsearch_infogain_undersample.best_params_['max_depth'],
    min_samples_split=gridsearch_infogain_undersample.best_params_['min_samples_split'],
    min_samples_leaf=gridsearch_infogain_undersample.best_params_['min_samples_leaf'],
    desc="undersample-bp",
)
dtInfoGain_undersample_bestparams.train()

# Information Gain trees fitted on the oversample training split
# (X_oversample_train / y_oversample_train). Both are evaluated on the
# original test split (X_test / y_test).
dtInfoGain_oversample = InformationGainDecisionTree(
    test_oversample,
    X=X_oversample,
    X_train=X_oversample_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_oversample_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=test_oversample['isFraud'],
    desc="oversample",
)
dtInfoGain_oversample.train()

# Same data with the hyperparameters chosen by gridsearch_infogain_oversample.
# The "-bp" suffix keeps this model's description distinct from the untuned
# model above, following the tuned Gini trees ("oversample-bp", "smote-bp").
dtInfoGain_oversample_bestparams = InformationGainDecisionTree(
    test_oversample,
    X=X_oversample,
    X_train=X_oversample_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_oversample_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=test_oversample['isFraud'],
    max_depth=gridsearch_infogain_oversample.best_params_['max_depth'],
    min_samples_split=gridsearch_infogain_oversample.best_params_['min_samples_split'],
    min_samples_leaf=gridsearch_infogain_oversample.best_params_['min_samples_leaf'],
    desc="oversample-bp",
)
dtInfoGain_oversample_bestparams.train()

# Information Gain trees fitted on the SMOTE training split
# (X_smote_train / y_smote_train). Both are evaluated on the original test
# split (X_test / y_test).
dtInfoGain_smote = InformationGainDecisionTree(
    df,
    X=X,
    X_train=X_smote_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_smote_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=df['isFraud'],
    desc="smote",
)
dtInfoGain_smote.train()

# Same data with the hyperparameters chosen by gridsearch_infogain_smote.
# The "-bp" suffix keeps this model's description distinct from the untuned
# model above, following the tuned Gini trees ("oversample-bp", "smote-bp").
dtInfoGain_smote_bestparams = InformationGainDecisionTree(
    df,
    X=X,
    X_train=X_smote_train,
    X_test=X_test,  # evaluation must use the original dataset
    y_train=y_smote_train,
    y_test=y_test,  # evaluation must use the original dataset
    target=df['isFraud'],
    max_depth=gridsearch_infogain_smote.best_params_['max_depth'],
    min_samples_split=gridsearch_infogain_smote.best_params_['min_samples_split'],
    min_samples_leaf=gridsearch_infogain_smote.best_params_['min_samples_leaf'],
    desc="smote-bp",
)
dtInfoGain_smote_bestparams.train()

In [None]:
# First Information Gain comparison group, handed to a new Comparator.
# NOTE(review): dtInfoGain_bestparams is not created in the Information Gain
# training cell of this notebook section; confirm it is defined elsewhere,
# otherwise this cell raises NameError.
models = [
    dtInfoGain, dtInfoGain_undersample, 
    dtInfoGain_bestparams, dtInfoGain_undersample_bestparams
]

comparator = Comparator(models)

In [None]:
# Run the comparator's AUC and ROC-curve reports for the current `comparator`.
comparator.auc()
comparator.roc_curve()

In [None]:
# Second Information Gain comparison group: the oversample and SMOTE trees,
# each with and without tuned hyperparameters (going by the variable names).
models = [
    dtInfoGain_oversample,
    dtInfoGain_smote,
    dtInfoGain_oversample_bestparams,
    dtInfoGain_smote_bestparams,
]
comparator = Comparator(models)

In [None]:
# Run the comparator's AUC and ROC-curve reports for the current `comparator`.
comparator.auc()
comparator.roc_curve()

## Mejor modelo para Information Gain

In [None]:
# TODO: add the selection of the best Information Gain model.

In [None]:
# Minutes elapsed since `inicio_ppio` was recorded at the top of the notebook,
# rounded to two decimals.
fin_final_dt = time.time()
print(f"Tiempo total (min): {round((fin_final_dt - inicio_ppio) / 60, 2)}\n")

# Redes Neuronales

## Entreno Red Neuronal

In [None]:
# Neural network on the original train/test split: one hidden layer of 15
# units, alpha=1e-5 and max_iter=200.
nn = LBFGSNeuralNetwork(
    X=X,
    y=y,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    hidden_layer_sizes=(15,),
    alpha=1e-5,
    max_iter=200,
)
nn.train()

### Medidas de Performance

#### Matriz de Confusión

In [None]:
# Confusion matrix of the network trained above (presumably computed on
# X_test / y_test inside the wrapper; verify in LBFGSNeuralNetwork).
nn.show_confusion_matrix()

#### Reporte

In [None]:
# Classification report of the network trained above.
nn.show_classification_report()

#### Accuracy

In [None]:
# Accuracy of the network trained above; as the cell's last expression, its
# return value is what the notebook displays.
nn.accuracy()

#### Recall

In [None]:
# Recall of the network trained above; as the cell's last expression, its
# return value is what the notebook displays.
nn.recall()

#### Precision

In [None]:
# Precision of the network trained above; as the cell's last expression, its
# return value is what the notebook displays.
nn.precision_score()

# GridSearchCV para NN

In [None]:
# Hyperparameter grid explored by the neural-network grid search.
param_grid = dict(
    hidden_layer_sizes=[(20,), (20, 20), (40,)],
    activation=['tanh', 'relu'],
    solver=['adam'],
    alpha=[0.0001, 0.05, 0.5],
)

# Base estimator handed to the grid search (max_iter=200).
mlp = MLPClassifier(max_iter=200)

In [None]:
# Grid search over `param_grid` with 2-fold cross-validation (cv=2), scored by
# ROC AUC and run on all available processors (n_jobs=-1).
gridsearchNN = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=2,
    n_jobs=-1,
    verbose=2,
)

# Time only the fit; the target is flattened to a 1-D array for the estimator.
inicio = time.time()
gridsearchNN.fit(X_train, y_train.to_numpy().ravel())
fin = time.time()

# Save the fitted search to disk.
joblib.dump(gridsearchNN, 'gridsearchNN.joblib')

In [None]:
# Minutes spent fitting the neural-network grid search (unrounded).
print(f"Tiempo total (min): {(fin - inicio) / 60}")