In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Configuración de estilo para las gráficas
sns.set_style('darkgrid')
plt.style.use('ggplot')

In [2]:
# Se carga archivo CSV creado enla etapa de EDA y se imprime la información del dataframe para corroborar.
df_model = pd.read_csv('./data/creditcard_model.csv', sep=',', engine='python')
print(df_model.info())
df_model.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Amount  284807 non-null  float64
 1   V10     284807 non-null  float64
 2   V11     284807 non-null  float64
 3   V12     284807 non-null  float64
 4   V14     284807 non-null  float64
 5   V16     284807 non-null  float64
 6   V17     284807 non-null  float64
 7   V18     284807 non-null  float64
 8   V19     284807 non-null  float64
 9   V2      284807 non-null  float64
 10  V20     284807 non-null  float64
 11  V21     284807 non-null  float64
 12  V3      284807 non-null  float64
 13  V4      284807 non-null  float64
 14  V7      284807 non-null  float64
 15  V8      284807 non-null  float64
 16  V9      284807 non-null  float64
 17  Class   284807 non-null  int64  
dtypes: float64(17), int64(1)
memory usage: 39.1 MB
None


Unnamed: 0,Amount,V10,V11,V12,V14,V16,V17,V18,V19,V2,V20,V21,V3,V4,V7,V8,V9,Class
0,149.62,0.090794,-0.5516,-0.617801,-0.311169,-0.470401,0.207971,0.025791,0.403993,-0.072781,0.251412,-0.018307,2.536347,1.378155,0.239599,0.098698,0.363787,0
1,2.69,-0.166974,1.612727,1.065235,-0.143772,0.463917,-0.114805,-0.183361,-0.145783,0.266151,-0.069083,-0.225775,0.16648,0.448154,-0.078803,0.085102,-0.255425,0
2,378.66,0.207643,0.624501,0.066084,-0.165946,-2.890083,1.109969,-0.121359,-2.261857,-1.340163,0.52498,0.247998,1.773209,0.37978,0.791461,0.247676,-1.514654,0
3,123.5,-0.054952,-0.226487,0.178228,-0.287924,-1.059647,-0.684093,1.965775,-1.232622,-0.185226,-0.208038,-0.1083,1.792993,-0.863291,0.237609,0.377436,-1.387024,0
4,69.99,0.753074,-0.822843,0.538196,-1.11967,-0.451449,-0.237033,-0.038195,0.803487,0.877737,0.408542,-0.009431,1.548718,0.403034,0.592941,-0.270533,0.817739,0


In [3]:
# Se dividen las variables independientes y dependientes en dos dataframe correr las pruebas.
X = df_model.drop('Class', axis=1)
y = df_model['Class']

In [4]:
# Se crea una funcion para entrenar y evaluar el modelo Isolation Forest
def train_and_evaluate_IF(X, y):
    '''Función para entrenar y evaluar preliminarmente el modelo Isolation Forest de detección de anomalías.'''

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_normal = X_train_scaled[y_train == 0]

    if_model = IsolationForest(n_estimators=300, contamination='auto', random_state=42, n_jobs=-1)
    if_model.fit(X_train_normal)

    y_pred_test = if_model.predict(X_test_scaled)
    y_pred_test = np.where(y_pred_test == 1, 0, 1)
    
    print(classification_report(y_test, y_pred_test))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_test))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_test):.4f}")

In [5]:
# Se entrena y evalua el modelo Isolation Forest para el dataset original
train_and_evaluate_IF(X, y)    

              precision    recall  f1-score   support

           0       1.00      0.95      0.98     85307
           1       0.03      0.92      0.06       136

    accuracy                           0.95     85443
   macro avg       0.52      0.94      0.52     85443
weighted avg       1.00      0.95      0.98     85443

Confusion Matrix:
[[81444  3863]
 [   11   125]]
ROC AUC Score: 0.9369


In [6]:
# Se entrena y evalua el modelo Isolation Forest para el dataset balanceado con Under Sampling.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

train_and_evaluate_IF(X_rus, y_rus)

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       150
           1       0.92      0.88      0.90       146

    accuracy                           0.91       296
   macro avg       0.91      0.91      0.91       296
weighted avg       0.91      0.91      0.91       296

Confusion Matrix:
[[139  11]
 [ 17 129]]
ROC AUC Score: 0.9051


In [7]:
# Se entrena y evalua el modelo Isolation Forest para el dataset balanceado con Over Sampling (SMOTE). 
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

train_and_evaluate_IF(X_smote, y_smote)

              precision    recall  f1-score   support

           0       0.90      0.95      0.92     85149
           1       0.95      0.89      0.92     85440

    accuracy                           0.92    170589
   macro avg       0.92      0.92      0.92    170589
weighted avg       0.92      0.92      0.92    170589

Confusion Matrix:
[[81247  3902]
 [ 9337 76103]]
ROC AUC Score: 0.9224


In [8]:
# Se declara una función para entrenar y evaluar el modelo One-Class SVM
def train_and_evaluate_OCSVM(X, y):
    '''Función para entrenar y evaluar preliminarmente el modelo One-Class SVM de detección de anomalías.'''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_normal = X_train_scaled[y_train == 0]

    nu = sum(y_train) / len(y_train)
    print(f'Contaminación: {nu}')
    
    ocsvm_model = OneClassSVM(kernel='rbf', nu=nu, gamma='scale')
    ocsvm_model .fit(X_train_normal)

    y_pred_test = ocsvm_model.predict(X_test_scaled)
    y_pred_test = np.where(y_pred_test == -1, 1, 0)

    print("Classification Report:")
    print(classification_report(y_test, y_pred_test))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_test))
    roc_auc = roc_auc_score(y_test, y_pred_test)
    print(f"\nROC AUC Score: {roc_auc:.4f}")


In [9]:
# Se entrena y evalua el modelo One-Class SVM para el dataset original
train_and_evaluate_OCSVM(X, y)

Contaminación: 0.0017856784574948336
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85307
           1       0.13      0.83      0.23       136

    accuracy                           0.99     85443
   macro avg       0.57      0.91      0.61     85443
weighted avg       1.00      0.99      0.99     85443


Confusion Matrix:
[[84570   737]
 [   23   113]]

ROC AUC Score: 0.9111


In [10]:
# Se entrena y evalua el modelo One-Class SVM para el dataset balanceado con Under Sampling.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

train_and_evaluate_OCSVM(X_rus, y_rus)

Contaminación: 0.502906976744186
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.50      0.65       150
           1       0.65      0.97      0.78       146

    accuracy                           0.73       296
   macro avg       0.80      0.73      0.72       296
weighted avg       0.80      0.73      0.71       296


Confusion Matrix:
[[ 75  75]
 [  5 141]]

ROC AUC Score: 0.7329


In [11]:
# Se entrena y evalua el modelo One-Class SVM para el dataset balanceado con Over Sampling (SMOTE).
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

train_and_evaluate_OCSVM(X_smote, y_smote)

Contaminación: 0.49963445976670745
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.50      0.65     85149
           1       0.66      0.96      0.78     85440

    accuracy                           0.73    170589
   macro avg       0.80      0.73      0.72    170589
weighted avg       0.80      0.73      0.72    170589


Confusion Matrix:
[[42535 42614]
 [ 3023 82417]]

ROC AUC Score: 0.7321
