In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score, precision_score
from sklearn import preprocessing


import graphviz
from IPython.display import Image  
import pydotplus
from io import StringIO

import plotly_express as px

from collections import Counter

import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
class AbstractClassificationProblem:
    """Common train/evaluate workflow shared by the classifier wrappers.

    The methods read ``clf_model``, ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` from the instance; this class does not set them itself, so
    subclasses must assign them before ``train`` is called.
    """

    # Display names used for the two classes in plots and reports.
    labels = ['No Fraude', 'Fraude']

    def train(self):
        """Fit ``clf_model`` on the training split and store the test-split
        predictions in ``self.y_predict``."""
        print("Comienza Entrenamiento")
        self.print_self()
        model = self.clf_model
        model.fit(self.X_train, self.y_train)
        self.y_predict = model.predict(self.X_test)
        print("Entrenado")

    def show_confusion_matrix(self):
        """Plot the confusion matrix of ``clf_model`` on the test split."""
        self.print_self()
        ConfusionMatrixDisplay.from_estimator(
            self.clf_model,
            self.X_test,
            self.y_test,
            display_labels=self.labels,
        )
        plt.show()

    def show_classification_report(self):
        """Print the classification report for the stored predictions."""
        self.print_self()
        report = classification_report(self.y_test,
                                       self.y_predict,
                                       target_names=self.labels)
        print(report)

    def accuracy(self):
        """Return the accuracy of ``y_predict`` against ``y_test``."""
        return accuracy_score(self.y_test, self.y_predict)

    def recall(self):
        """Return the recall of ``y_predict`` against ``y_test``."""
        return recall_score(self.y_test, self.y_predict)

    def precision_score(self):
        """Return the precision of ``y_predict`` against ``y_test``."""
        return precision_score(self.y_test, self.y_predict)

    def print_self(self):
        """Hook for subclasses to print a description of the model; no-op here."""
        return None

class AbstractDecisionTree(AbstractClassificationProblem):
    """Wrapper around ``DecisionTreeClassifier`` with a fixed split criterion.

    Subclasses set ``criterion`` (passed to scikit-learn) and ``tipo`` (the
    label printed by ``print_self``).
    """
    criterion = ""
    tipo = ""

    def __init__(self, X, y, X_train, X_test, y_train, y_test, target, max_depth=5, min_samples_leaf=5):
        """Store the data splits and build the (unfitted) tree.

        Args:
            X: Feature frame; only its column names are read (for plotting).
            y: Accepted for signature symmetry with the other wrappers; unused.
            X_train, X_test, y_train, y_test: Train/test splits.
            target: Target values; their distinct values are kept in
                ``self.target`` in order of first appearance.
            max_depth: Maximum depth passed to the tree.
            min_samples_leaf: Minimum samples per leaf passed to the tree.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_test = y_test
        self.y_train = y_train
        self.clf_model = DecisionTreeClassifier(criterion=self.criterion,
                                                random_state=42,
                                                max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf)

        # Derive the class values from the `target` argument instead of a
        # notebook-global DataFrame, so the class works with any dataset.
        self.target = list(pd.unique(np.asarray(target).ravel()))
        self.feature_names = list(X.columns)

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

    def show_matrix(self):
        """Render the fitted tree with Graphviz and return it as a PNG ``Image``."""
        # With out_file=None, export_graphviz returns the DOT source as a string.
        dot_source = export_graphviz(self.clf_model,
                                     out_file=None,
                                     filled=True,
                                     rounded=True,
                                     special_characters=True,
                                     feature_names=self.feature_names,
                                     class_names=self.labels)
        graph = pydotplus.graph_from_dot_data(dot_source)
        return Image(graph.create_png())

    def print_self(self):
        """Print the tree type and its hyper-parameters."""
        print("**", "Arbol de decision - " + self.tipo, "**")
        print('**', "max_depth=" + str(self.max_depth) + ",", "min_samples_leaf=" + str(self.min_samples_leaf), "**")


class GiniDecisionTree(AbstractDecisionTree):
    """Decision tree built with ``criterion="gini"``."""
    criterion = "gini"
    tipo = "Gini Index"

class InformationGainDecisionTree(AbstractDecisionTree):
    """Decision tree built with ``criterion="entropy"``."""
    criterion = "entropy"
    tipo = "Information Gain"

# https://scikit-learn.org/stable/modules/neural_networks_supervised.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
class AbstractNeuralNetwork(AbstractClassificationProblem):
    """Wrapper around ``MLPClassifier`` with standardised inputs.

    Subclasses set ``solver`` (passed to scikit-learn) and ``tipo`` (the label
    printed by ``print_self``).
    """
    solver = ''
    tipo = ''

    def __init__(self, X, y, X_train, X_test, y_train, y_test, alpha=1e-5, hidden_layer_sizes=(15,), max_iter=5000):
        """Scale the splits and build the (unfitted) network.

        Args:
            X, y: Full feature/target data, stored unchanged on the instance.
            X_train, X_test: Feature splits; both are standardised with the
                statistics of ``X_train``.
            y_train: Training targets; flattened to a 1-D array.
            y_test: Test targets, stored unchanged.
            alpha, hidden_layer_sizes, max_iter: Passed to ``MLPClassifier``.
        """
        self.X = X
        self.y = y

        # https://scikit-learn.org/stable/modules/preprocessing.html
        # Fit the scaler on the training split only and reuse it for the test
        # split, so both are expressed in the same units and no test-set
        # statistics are used during preprocessing.
        self.scaler = preprocessing.StandardScaler().fit(X_train)
        self.X_train = self._scale(X_train)
        self.X_test = self._scale(X_test)

        # https://numpy.org/doc/stable/reference/generated/numpy.ravel.html
        # np.ravel accepts DataFrames, Series and arrays alike.
        self.y_train = np.ravel(y_train)

        self.y_test = y_test

        self.alpha = alpha
        self.hidden_layer_sizes = hidden_layer_sizes
        self.max_iter = max_iter

        self.clf_model = MLPClassifier(solver=self.solver,
                                       alpha=alpha,
                                       hidden_layer_sizes=hidden_layer_sizes,
                                       random_state=42,
                                       max_iter=max_iter)

    def print_self(self):
        """Print the network type and its hyper-parameters."""
        print("**", "Red Neuronal - " + self.tipo, "**")
        print("**",
              "alpha=" + str(self.alpha),
              "hidden_layer_sizes=" + str(self.hidden_layer_sizes),
              "max_iter=" + str(self.max_iter),
              "**")

    def _scale(self, X):
        """Standardise ``X`` with the scaler fitted on the training split."""
        return self.scaler.transform(X)


class LBFGSNeuralNetwork(AbstractNeuralNetwork):
    """Neural network trained with ``solver='lbfgs'``."""
    solver = 'lbfgs'
    tipo = "LBFGS"

class SGDNeuralNetwork(AbstractNeuralNetwork):
    """Neural network trained with ``solver='sgd'``."""
    solver = 'sgd'
    tipo = "SGD"

class AdamNeuralNetwork(AbstractNeuralNetwork):
    """Neural network trained with ``solver='adam'``."""
    solver = 'adam'
    tipo = 'ADAM'

class Comparator:
    """Runs the same reporting call on every model of a list, in order."""
    models = []

    def __init__(self, models):
        """Keep a reference to the list of models to compare."""
        self.models = models

    def show_confusion_matrix(self):
        """Show each model's confusion matrix, followed by a blank separator."""
        for clf in self.models:
            clf.show_confusion_matrix()
            print("\n")

    def show_classification_report(self):
        """Show each model's classification report, followed by a blank separator."""
        for clf in self.models:
            clf.show_classification_report()
            print("\n")

    def accuracy(self):
        """Print the accuracy of each model, one per line."""
        self._print_each("accuracy")

    def recall(self):
        """Print the recall of each model, one per line."""
        self._print_each("recall")

    def precision_score(self):
        """Print the precision of each model, one per line."""
        self._print_each("precision_score")

    def _print_each(self, metric_name):
        """Call the zero-argument method ``metric_name`` on every model and print its result."""
        for clf in self.models:
            print(getattr(clf, metric_name)())

# ANÁLISIS DE DATOS

## Levantamos los datos

In [None]:
# Load the dataset from a CSV file located next to the notebook.
df = pd.read_csv("./Fraud.csv")

# Preview the first rows as the cell output.
df.head()

## Tamaño del dataset

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## Tipos de los datos

In [None]:
# Data type of each column.
df.dtypes

## Estadísticas descriptivas de los datos

In [None]:
# Summary statistics of the frame's columns.
df.describe()

## Se borran las columnas 'nameOrig' y 'nameDest'

In [None]:
# Drop the two name columns. Reassigning (instead of inplace=True) together
# with errors="ignore" keeps the cell idempotent: re-running it no longer
# raises KeyError once the columns are gone.
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

## Revisamos si hay valores perdidos (None, NaN) en el resto del dataset

In [None]:
# Number of missing values per column.
df.isnull().sum()

## Correlación de los datos

### Correlación de los datos contra la variable 'isFraud'

In [None]:
# 'type' is still a string column at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so restrict to the numeric columns.
df.select_dtypes(include="number").corr()["isFraud"].sort_values()

### Correlación de los datos entre sí

In [None]:
plt.figure(figsize=(12,8))
# Only numeric columns can be correlated; the string column 'type' would make
# DataFrame.corr() raise in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

## Cantidad de fraudes y no fraudes que hay en el dataset

In [None]:
# Absolute number of rows per value of 'isFraud'.
df["isFraud"].value_counts()

## Cantidad de fraudes y no fraudes que hay en el dataset (Normalizado)

In [None]:
# Same counts expressed as fractions of the total.
df["isFraud"].value_counts(normalize=True)

## Cantidad de tipos de transacciones que hay en el dataset

In [None]:
# Number of rows per transaction type.
df['type'].value_counts()

## Transacciones fraudulentas y no fraudulentas diferenciadas por su tipo

In [None]:
plt.figure(figsize=(15, 8))
# One bar per transaction type, split by the value of 'isFraud'.
sns.countplot(x="type", data=df, hue="isFraud")

## Porcentajes de transacciones fraudulentas de cada tipo de transacción

In [None]:
# Count only the fraudulent rows of each type. The previous version counted
# every transaction per type, which did not match the chart title or the
# 'IsFraud' value column.
fraud_counts_by_type = df.loc[df['isFraud'] == 1, 'type'].value_counts()
df_type_fraud = fraud_counts_by_type.rename_axis('type').reset_index(name='IsFraud')

pie_porcentaje_transacciones_fraudulentas = px.pie(df_type_fraud, values="IsFraud", names='type', title='Transacciones Fraudulentas', color_discrete_sequence=px.colors.sequential.RdBu)
pie_porcentaje_transacciones_fraudulentas.show()

## Mapeo de la columna `type` a números

In [None]:
# Integer code assigned to each transaction type.
mapping_type = {
    'CASH_IN': 0,
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'TRANSFER': 3,
    'DEBIT': 4,
}
# Add the numeric encoding, then remove the original string column.
df['type_numeric'] = df['type'].map(mapping_type)
df.drop(columns='type', inplace=True)

In [None]:
# Features: every column except the target.
X = df.drop(columns='isFraud')
# Target kept as a single-column DataFrame.
y = df[['isFraud']]

# Primeras pruebas con datos desbalanceados

## Separación de los datos de entrenamiento (80%) y datos para testing (20%)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Entreno el Árbol de Decisión (Gini Impurity)

In [None]:
# Build the Gini decision tree on the unbalanced split and fit it.
dtGini = GiniDecisionTree(X=X, 
                          y=y, 
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'],
                          max_depth=5, 
                          min_samples_leaf=5)
dtGini.train()

### Medidas de performance

#### Matriz de Confusión

In [None]:
# Confusion matrix of the Gini tree on the test split.
dtGini.show_confusion_matrix()

#### Reporte

In [None]:
# Per-class precision/recall/F1 report for the Gini tree.
dtGini.show_classification_report()

#### Accuracy

In [None]:
# Accuracy of the Gini tree's test-split predictions.
dtGini.accuracy()

#### Recall

In [None]:
# Recall of the Gini tree's test-split predictions.
dtGini.recall()

#### Precision Score

In [None]:
# Precision of the Gini tree's test-split predictions.
dtGini.precision_score()

### Impresión del Árbol

In [None]:
# Render the fitted Gini tree as a PNG image.
dtGini.show_matrix()

## Entreno el Árbol de Decisión (Information Gain)

In [None]:
# Build the entropy-based decision tree on the unbalanced split and fit it.
dtInfoGain = InformationGainDecisionTree(X=X, 
                                         y=y, 
                                         X_train=X_train, 
                                         X_test=X_test, 
                                         y_train=y_train, 
                                         y_test=y_test, 
                                         target=df['isFraud'],
                                         max_depth=5, 
                                         min_samples_leaf=5)
dtInfoGain.train()

### Medidas de Performance

#### Matriz de Confusión

In [None]:
# Confusion matrix of the entropy tree on the test split.
dtInfoGain.show_confusion_matrix()

#### Reporte

In [None]:
# Per-class precision/recall/F1 report for the entropy tree.
dtInfoGain.show_classification_report()

#### Accuracy

In [None]:
# Accuracy of the entropy tree's test-split predictions.
dtInfoGain.accuracy()

#### Recall

In [None]:
# Recall of the entropy tree's test-split predictions.
dtInfoGain.recall()

#### Precision

In [None]:
# Precision of the entropy tree's test-split predictions.
dtInfoGain.precision_score()

### Impresión del Árbol

In [None]:
# Render the fitted entropy tree as a PNG image.
dtInfoGain.show_matrix()

## Entreno Red Neuronal

In [None]:
# Build the LBFGS neural network (one hidden layer of 15 units) and fit it.
nn = LBFGSNeuralNetwork(X=X,
                        y=y,
                        X_train=X_train, 
                        X_test=X_test, 
                        y_train=y_train, 
                        y_test=y_test, 
                        alpha=1e-5, 
                        hidden_layer_sizes=(15,), 
                        max_iter=200)
nn.train()

### Medidas de Performance

#### Matriz de Confusión

In [None]:
# Confusion matrix of the neural network on the test split.
nn.show_confusion_matrix()

#### Reporte

In [None]:
# Per-class precision/recall/F1 report for the neural network.
nn.show_classification_report()

#### Accuracy

In [None]:
# Accuracy of the neural network's test-split predictions.
nn.accuracy()

#### Recall

In [None]:
# Recall of the neural network's test-split predictions.
nn.recall()

#### Precision

In [None]:
# Precision of the neural network's test-split predictions.
nn.precision_score()

# Agrego un Comparador de modelos

In [None]:
# NOTE(review): the three models below are constructed with the same arguments
# as in the earlier training cells and are trained again here; if those cells
# have already run, this cell repeats the work. Consider reusing the existing
# objects.

# Gini decision tree.
dtGini = GiniDecisionTree(X=X, 
                          y=y, 
                          X_train=X_train, 
                          X_test=X_test, 
                          y_train=y_train, 
                          y_test=y_test, 
                          target=df['isFraud'],
                          max_depth=5, 
                          min_samples_leaf=5)
dtGini.train()

# Entropy (information gain) decision tree.
dtInfoGain = InformationGainDecisionTree(X=X, 
                                         y=y, 
                                         X_train=X_train, 
                                         X_test=X_test, 
                                         y_train=y_train, 
                                         y_test=y_test, 
                                         target=df['isFraud'],
                                         max_depth=5, 
                                         min_samples_leaf=5)
dtInfoGain.train()

# LBFGS neural network.
nn = LBFGSNeuralNetwork(X=X,
                        y=y,
                        X_train=X_train, 
                        X_test=X_test, 
                        y_train=y_train, 
                        y_test=y_test, 
                        alpha=1e-5, 
                        hidden_layer_sizes=(15,), 
                        max_iter=200)
nn.train()

In [None]:
# Trained models to compare, in display order.
models = [dtGini, dtInfoGain, nn]

comparator = Comparator(models)

In [None]:
# Confusion matrix of every model, in list order.
comparator.show_confusion_matrix()

In [None]:
# Classification report of every model, in list order.
comparator.show_classification_report()

In [None]:
# Prints one accuracy value per model, in list order.
comparator.accuracy()

In [None]:
# Prints one recall value per model, in list order.
comparator.recall()

In [None]:
# Prints one precision value per model, in list order.
comparator.precision_score()

# Primeras pruebas con datos Balanceados

In [None]:
# Seed SMOTE like every other stochastic step in this notebook so the
# synthetic samples (and all downstream metrics) are reproducible.
# NOTE(review): the whole dataset is resampled here, before it is split into
# train and test, so synthetic rows can end up in the test set. Consider
# resampling only the training split.
oversample = SMOTE(random_state=42)
X_balanced, y_balanced = oversample.fit_resample(X, y)

In [None]:
# Number of rows before resampling.
len(X)

In [None]:
# Number of rows after resampling.
len(X_balanced)

In [None]:
# Class counts before resampling.
y.value_counts()

In [None]:
# Class counts after resampling.
y_balanced.value_counts()

In [None]:
# 80/20 split of the resampled data, with a fixed seed for reproducibility.
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [None]:
# Gini decision tree trained and evaluated on the resampled split. X is passed
# only so the wrapper can read the feature names from its columns.
dtGiniBalanced = GiniDecisionTree(X=X, 
                                  y=y, 
                                  X_train=X_balanced_train, 
                                  X_test=X_balanced_test, 
                                  y_train=y_balanced_train, 
                                  y_test=y_balanced_test, 
                                  target=df['isFraud'],
                                  max_depth=5,
                                  min_samples_leaf=5)
dtGiniBalanced.train()