# Detección de fraude - Experimentación

In [None]:
# Cargamos las librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
import sklearn.metrics

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC 
from sklearn.calibration import CalibratedClassifierCV

from imblearn.over_sampling import RandomOverSampler

import pickle

from google.colab import drive

In [None]:
# Mount Google Drive so that files under /content/drive/ can be read and written
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load the dataset CSV from Google Drive into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/Aprendizaje de máquina 1/TP 1/data/PS_20174392719_1491204439457_log.csv')

In [None]:
# Preview the first rows to check that the data was loaded correctly
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### Pre-procesamiento de los datos

#### 1. Renombrar columnas

In [None]:
# Map every original header to its snake_case name (unchanged names map to
# themselves) and apply the renaming to the DataFrame in place.
columns = dict(
    step='step',
    type='type',
    amount='amount',
    nameOrig='name_orig',
    oldbalanceOrg='old_balance_org',
    newbalanceOrig='new_balance_orig',
    nameDest='name_dest',
    oldbalanceDest='old_balance_dest',
    newbalanceDest='new_balance_dest',
    isFraud='is_fraud',
    isFlaggedFraud='is_flagged_fraud',
)

df.rename(columns=columns, inplace=True)

#### 2. Eliminar columnas innecesarias

In [None]:
# Drop the columns that are not useful (none of them is used as a predictor below)
df.drop(columns=['name_orig', 'name_dest', 'is_flagged_fraud'], inplace=True)

#### 3. División de las variables predictoras (X) y variable objetivo (y)

In [None]:
# Define the predictors (X) and the target (y)
X = df[['type', 'step', 'amount', 'old_balance_org', 'new_balance_orig', 'old_balance_dest', 'new_balance_dest']]
# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect a 1-D y and warn (column_or_1d /
# DataConversionWarning) when they receive a column vector.
y = df['is_fraud']

#### 4. Transformación de datos y pipeline de pre-procesamiento

In [None]:
# Column groups routed to each branch of the preprocessing step
categorical_features = ['type']
numeric_features = [
    'step',
    'amount',
    'old_balance_org',
    'new_balance_orig',
    'old_balance_dest',
    'new_balance_dest',
]

# Numeric branch: median imputation followed by standard scaling
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: one-hot encoding configured with handle_unknown='ignore'
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine both branches, each applied only to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

#### 5. Separación en datos de entrenamiento y testeo

In [None]:
# Hold out 20% of the rows as the test set, stratifying on y and using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Preparación de la experimentación

#### 1. Definición de los modelos

In [None]:
# Each model is a Pipeline that applies `preprocessor` and then a classifier.
# random_state is fixed on every classifier (same seed as the train/test split)
# so that re-running the notebook yields the same fitted models and metrics.
decision_tree_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='entropy', max_depth=15, random_state=42)),
])

random_forest_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(criterion='entropy', max_depth=15, random_state=42)),
])

linear_svc_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(tol=1e-5, random_state=42)),
])

#### 2. Oversampling de los datos

In [None]:
# Oversample the minority class of the training split only; the test split is
# left untouched so its class counts are printed as-is for comparison.
# random_state is fixed so the resampled training set is reproducible.
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

X_train_os, y_train_os = oversampler.fit_resample(X_train, y_train)

print('Composición del training set:')
print(y_train_os.value_counts())

print('\nComposición del test set:')
print(y_test.value_counts())

Composición del training set:
is_fraud
0           5083526
1           5083526
dtype: int64

Composición del test set:
is_fraud
0           1270881
1              1643
dtype: int64


#### 3. Definición de las métricas

In [None]:
def metric_report(y_test, y_pred, y_proba=None):
    """Print a classification report and, when scores are given, ROC and PR AUC.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, passed to ``classification_report``.
        y_proba: Optional scores. Either a 2-D array whose column 1 is used as
            the positive-class score, or a 1-D array of scores used directly.
            When ``None``, only the classification report is printed.

    Returns:
        None. Everything is written to stdout.
    """
    print(classification_report(y_test, y_pred))
    if y_proba is None:
        return

    # Accept both a 2-D score matrix (column 1 is taken) and 1-D scores.
    scores = np.asarray(y_proba)
    if scores.ndim > 1:
        scores = scores[:, 1]

    print('Area bajo la curva ROC:', np.round(roc_auc_score(y_test, scores), 4))
    # The thresholds returned by precision_recall_curve are not needed here.
    precision, recall, _ = precision_recall_curve(y_test, scores)
    print('Area bajo la curva Precision-Recall:', np.round(auc(recall, precision), 4))

## Experimentación

### 1. Decision Tree

In [None]:
# Fit the decision-tree pipeline on the oversampled training set
decision_tree_model.fit(X_train_os, y_train_os)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'old_balance_org',
                                                   'new_balance_orig',
                                                   'old_balance_dest',
                                                   'new_balance_dest']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['type'])])),


In [None]:
# Predict labels for the test set with the decision-tree pipeline
y_pred = decision_tree_model.predict(X_test)

In [None]:
# Predict class probabilities for the test set (metric_report uses them for the AUC values)
y_proba = decision_tree_model.predict_proba(X_test)

In [None]:
# Print the classification report plus ROC and Precision-Recall AUC for the decision tree
metric_report(y_test, y_pred, y_proba) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.51      0.95      0.66      1643

    accuracy                           1.00   1272524
   macro avg       0.75      0.98      0.83   1272524
weighted avg       1.00      1.00      1.00   1272524

Area bajo la curva ROC: 0.9759
Area bajo la curva Precision-Recall: 0.9222


In [None]:
# Save the fitted decision-tree pipeline to Google Drive
filename = '/content/drive/MyDrive/Aprendizaje de máquina 1/TP 1/models/decision_tree_1.sav'
# A context manager guarantees the file is flushed and closed, even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(decision_tree_model, model_file)

### 2. Random Forest

In [None]:
# Fit the random-forest pipeline on the oversampled training set
random_forest_model.fit(X_train_os, y_train_os)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'old_balance_org',
                                                   'new_balance_orig',
                                                   'old_balance_dest',
                                                   'new_balance_dest']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['type'])])),


In [None]:
# Predict labels for the test set with the random-forest pipeline
y_pred = random_forest_model.predict(X_test)

In [None]:
# Predict class probabilities for the test set (metric_report uses them for the AUC values)
y_proba = random_forest_model.predict_proba(X_test)

In [None]:
# Print the classification report plus ROC and Precision-Recall AUC for the random forest
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1270881
           1       0.20      0.98      0.33      1643

    accuracy                           0.99   1272524
   macro avg       0.60      0.99      0.66   1272524
weighted avg       1.00      0.99      1.00   1272524

Area bajo la curva ROC: 0.9991
Area bajo la curva Precision-Recall: 0.9251


In [None]:
# Save the fitted random-forest pipeline to Google Drive
filename = '/content/drive/MyDrive/Aprendizaje de máquina 1/TP 1/models/random_forest_1.sav'
# A context manager guarantees the file is flushed and closed, even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest_model, model_file)

### 3. Linear SVC

In [None]:
# Fit the linear SVC pipeline on the oversampled training set
linear_svc_model.fit(X_train_os, y_train_os)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'old_balance_org',
                                                   'new_balance_orig',
                                                   'old_balance_dest',
                                                   'new_balance_dest']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['type'])])),


In [None]:
# Predict labels for the test set with the linear SVC pipeline
y_pred = linear_svc_model.predict(X_test)

In [None]:
# Print only the classification report: no scores are passed, so metric_report skips the AUC values
metric_report(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.95      0.97   1270881
           1       0.02      0.94      0.04      1643

    accuracy                           0.95   1272524
   macro avg       0.51      0.94      0.51   1272524
weighted avg       1.00      0.95      0.97   1272524



In [None]:
# Save the fitted linear SVC pipeline to Google Drive
filename = '/content/drive/MyDrive/Aprendizaje de máquina 1/TP 1/models/linear_svc_1.sav'
# A context manager guarantees the file is flushed and closed, even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(linear_svc_model, model_file)

### 4. SVC

Se intentó entrenar un modelo de SVC con los siguientes parámetros:
- C=50, 
- kernel='rbf'

Sin embargo, el entrenamiento se interrumpió debido a que no se logró la convergencia luego de 5 horas de entrenamiento, tanto en un entorno local como en Google Colaboratory.

## Análisis de resultados

|     Metric/Model     |      Decision Tree     |    Random Forest   |     Linear SVC     |
|:--------------------:|:----------------------:|:------------------:|:------------------:|
| precision            | **0: 1.00 \| 1: 0.51** | 0: 1.00 \| 1: 0.20 | 0: 1.00 \| 1: 0.02 |
| recall               |   0: 1.00 \| 1: 0.95   | 0: 0.99 \| 1: 0.98 | 0: 0.95 \| 1: 0.94 |
| f1-score             | **0: 1.00 \| 1: 0.66** | 0: 1.00 \| 1: 0.33 | 0: 0.97 \| 1: 0.04 |
| accuracy             |          1.00          | 0.99               | 0.95               |
| AUCROC               |         0.9759         |       0.9991       | -                  |
| AUC Precision-Recall |         0.9222         |       0.9251       | -                  |

### Conclusiones

- Luego de experimentar con los siguientes modelos: Decision Tree, Random Forest y Linear SVC (Support Vector Machine), se llegó a la conclusión de que **el modelo que arroja mejores resultados para la métrica F1-Score es el Decision Tree**. 

- Se optó por el F1-Score como métrica debido a que, al ser la media armónica de precision y recall, permite mantener información conjunta de ambas métricas, las cuales resultan muy importantes en problemas de clasificación binaria con datos desbalanceados, ya que miden la precisión y la exhaustividad, respectivamente.

- Próximos pasos: experimentar con diferentes parámetros sobre el modelo de Decision Tree con el objetivo de incrementar la performance del mismo.