# Detección de fraudes

In [None]:
# Librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    average_precision_score, precision_recall_curve,
    precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import mutual_info_classif
import itertools

In [None]:
# Optional gradient-boosting backends. Each flag records whether the import
# succeeded so the corresponding model can be skipped when it is unavailable.
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:  # any import-time failure simply disables the backend
    HAS_XGB = False

try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:  # any import-time failure simply disables the backend
    HAS_LGB = False

# Análisis Exploratorio de Datos

In [None]:
# Load the training CSV from its hard-coded path and preview the first rows.
df = pd.read_csv('/content/creditcard-train-in-.csv')
display(df.head())

In [None]:
# Print the structural summary of the training frame.
df.info()

### Análisis exploratorio de train

In [None]:
# Bar chart of the number of transactions in each class
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='Class', data=df, palette=['skyblue', 'red'])
plt.title('Distribución de la clase de transacción', fontsize=16)
plt.xlabel('Clase de transacción (0: No fraude, 1: Fraude)', fontsize=12)
plt.ylabel('Número de transacciones', fontsize=12)
plt.xticks(ticks=[0, 1], labels=['No Fraude', 'Fraude'])

# Label every bar with its count. Bar heights come back as floats, so they are
# formatted as whole numbers, and the text is anchored by its bottom edge so it
# sits above the bar instead of straddling its top.
for p in ax.patches:
    ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom', xytext=(0, 5), textcoords='offset points')
plt.show()

In [None]:
# Summary statistics per class (one row per column): class 0 first, then class 1.
for class_label in (0, 1):
    class_rows = df[df['Class'] == class_label]
    display(class_rows.describe().T)

In [None]:
# Histograms of the training frame's columns on a single large figure.
df.hist(figsize=(20, 15))
plt.tight_layout()
plt.show()

In [None]:
# Relative frequency of each class.
df['Class'].value_counts(normalize=True)

In [None]:
# Absolute number of rows in each class.
df['Class'].value_counts()

In [None]:
variables = df.columns.drop('Class')

# Box plots of every feature split by class. The grid's row count is derived
# from the number of features, so the layout never runs out of axes and does
# not leave an entirely empty row.
n_cols = 4
n_rows = max(1, -(-len(variables) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows))
axes = axes.flatten()

for ax, var in zip(axes, variables):
    sns.boxplot(data=df, x='Class', y=var, palette=['skyblue', 'red'], ax=ax)
    ax.set_xlabel('Clase')
    ax.set_ylabel(var)
    ax.set_xticks(ticks=[0, 1], labels=['No Fraude', 'Fraude'])

# Remove the unused axes left at the end of the grid
for ax in axes[len(variables):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise Pearson correlations between all columns
plt.figure(figsize=(15, 12))
correlation_matrix = df.corr(method='pearson')
sns.heatmap(data=correlation_matrix, annot=False, cmap='coolwarm')
plt.show()

In [None]:
# Show the pairwise correlation matrix as a table.
df.corr()

In [None]:
# Bivariate scatter plots: one figure per pair of the hand-picked variables,
# with points coloured by class
selected_variables = ["Amount", "Time", "V17", "V14", "V12", "V10"]
for first_var, second_var in itertools.combinations(selected_variables, 2):
    pair_fig, pair_ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        data=df,
        x=first_var,
        y=second_var,
        hue='Class',
        alpha=0.5,
        palette=['skyblue', 'red'],
        ax=pair_ax,
    )
    pair_ax.set_title(f'{first_var} vs. {second_var} por Clase')
    pair_ax.set_xlabel(first_var)
    pair_ax.set_ylabel(second_var)
    plt.show()

In [None]:
# Stand-alone box plot for every column of the training frame
for column in df.columns:
    box_fig, box_ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=df, y=column, ax=box_ax)
    box_ax.set_title(f'Box Plot of {column}')
    box_ax.set_ylabel(column)
    plt.show()

In [None]:
# Histograms (30 bins each) of the Amount and Time columns.
df[['Amount','Time']].hist(bins=30, figsize=(10,4))

In [None]:
# Assuming the transaction time is expressed in seconds, convert it to hours
hours_elapsed = df['Time'] / 3600
plt.figure(figsize=(8,4))
plt.hist(hours_elapsed, bins=24)
plt.title("Distribución temporal (en horas)")

### Análisis exploratorio del test

In [None]:
# Load the test CSV from its hard-coded path and preview the first rows.
df_test = pd.read_csv('/content/creditcard-test-in-.csv')
display(df_test.head())

In [None]:
# Print the structural summary of the test frame.
df_test.info()

In [None]:
# Heatmap of the pairwise correlations between the test-set columns
plt.figure(figsize=(15, 12))
correlation_matrix = df_test.corr()
sns.heatmap(data=correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix of Variables')
plt.show()

In [None]:
# Stand-alone box plot for every column of the test set. The data source must
# be df_test: plotting the training frame here would show the wrong dataset.
for var in df_test.columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df_test, y=var)
    plt.title(f'Box Plot of {var}')
    plt.ylabel(var)
    plt.show()

# Selección de Características

In [None]:
# Split the training frame into the target vector and the feature matrix
y = df['Class']
X = df.drop(columns='Class')

#### Método filtro

In [None]:
# Filter method: mutual information between each feature and the target,
# sorted from most to least informative
mi = mutual_info_classif(X, y, random_state=42)
mi_scores = pd.Series(data=mi, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores.head(10)

#### Método integrado

In [None]:
# Embedded method: impurity-based importances of a class-balanced random forest
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
).fit(X, y)
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
importances.head(10)

#### Método wrapper


In [None]:
# Wrapper method: recursive feature elimination with logistic regression.
# RFE ranks features by the magnitude of the fitted coefficients, which is only
# comparable across features when they share a common scale, so the features
# are standardized before running the elimination.
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)
rfe = RFE(LogisticRegression(max_iter=1000, solver='liblinear'), n_features_to_select=10)
rfe.fit(X_scaled, y)
selected_rfe = X.columns[rfe.support_]

In [None]:
# Show the column names kept by the recursive feature elimination.
selected_rfe

# Modelación

In [None]:
# Global run configuration
RANDOM_STATE = 42
quick_run = False  # True -> fewer folds and smaller ensembles for a fast pass

if quick_run:
    n_splits = 3
    rf_n_estimators = xgb_n_estimators = lgb_n_estimators = 50
else:
    n_splits = 5
    rf_n_estimators = xgb_n_estimators = lgb_n_estimators = 200

In [None]:
# Metric computation
def calc_metrics(y_true, y_scores, threshold=0.5, top_k=None):
    """
    Compute classification metrics for scored binary predictions.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0/1). Any array-like is accepted, not only a
        pandas Series.
    y_scores : array-like
        Scores / probabilities of the positive class, aligned with ``y_true``.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted positive.
    top_k : int or None, default None
        When given, also report the precision among the ``top_k`` highest
        scored samples (positives found in the top k, divided by ``top_k``).

    Returns
    -------
    dict
        Keys ``"AUPRC"``, ``"threshold"``, ``"precision"``, ``"recall"``,
        ``"f1"`` and ``"confusion_matrix"``. When ``top_k`` is given the
        result also holds ``"precision@{top_k}"``; the same value is kept
        under the legacy key ``"precision{top_k}"`` for backward compatibility.
    """
    # Work on plain arrays so indexing is positional whatever the input type.
    labels = np.asarray(y_true)
    scores = np.asarray(y_scores)

    ap = average_precision_score(labels, scores)
    y_pred = (scores >= threshold).astype(int)
    prec = precision_score(labels, y_pred, zero_division=0)
    rec = recall_score(labels, y_pred, zero_division=0)
    f1 = f1_score(labels, y_pred, zero_division=0)
    cm = confusion_matrix(labels, y_pred)
    res = {
        "AUPRC": ap,
        "threshold": threshold,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "confusion_matrix": cm
    }
    if top_k is not None:
        # Positions of the top_k highest scores, best first
        idx = np.argsort(scores)[::-1][:top_k]
        prec_at_k = labels[idx].sum() / float(top_k)
        # The "@" spelling is the key the cross-validation code looks up
        res[f"precision@{top_k}"] = prec_at_k
        res[f"precision{top_k}"] = prec_at_k  # legacy spelling
    return res

In [None]:
# Threshold search: operating point with precision >= target_precision
def threshold_for_precision(y_true, y_scores, target_precision=0.90):
    """
    Pick the decision threshold that reaches ``target_precision``.

    Among the points of the precision-recall curve whose precision is at least
    ``target_precision``, the threshold of the point with the highest recall is
    returned (the first such point on ties). If no point reaches the target,
    1.0 is returned.
    """
    prec_curve, rec_curve, thr_curve = precision_recall_curve(y_true, y_scores)
    # The curves carry one more entry than the thresholds; the last entry is
    # dropped so that all three arrays line up.
    meets_target = prec_curve[:-1] >= target_precision
    if not meets_target.any():
        return 1.0
    eligible_thresholds = thr_curve[meets_target]
    eligible_recalls = rec_curve[:-1][meets_target]
    return eligible_thresholds[np.argmax(eligible_recalls)]

In [None]:
# Model factory
def get_models():
    """
    Build the candidate classifiers, keyed by display name.

    Reads the module-level settings RANDOM_STATE, rf_n_estimators,
    xgb_n_estimators and lgb_n_estimators; XGBoost and LightGBM are included
    only when HAS_XGB / HAS_LGB are true. Returns a dict of unfitted
    estimators in insertion order.
    """
    def scaled(estimator):
        # Standardize the features before handing them to the estimator
        return Pipeline([
            ('scaler', StandardScaler()),
            ('clf', estimator)
        ])

    models = {
        # Logistic Regression
        'Logistic': scaled(
            LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=500, random_state=RANDOM_STATE)
        ),
        # SVM
        'SVM': scaled(
            SVC(probability=True, class_weight='balanced', random_state=RANDOM_STATE)
        ),
        # Decision Tree
        'DecisionTree': DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE),
        # Random Forest
        'RandomForest': RandomForestClassifier(
            n_estimators=rf_n_estimators, class_weight='balanced', n_jobs=-1, random_state=RANDOM_STATE
        ),
    }
    # XGBoost (optional dependency)
    if HAS_XGB:
        models['XGBoost'] = xgb.XGBClassifier(
            n_estimators=xgb_n_estimators,
            use_label_encoder=False,
            eval_metric='logloss',
            scale_pos_weight=1.0,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
    # LightGBM (optional dependency)
    if HAS_LGB:
        models['LightGBM'] = lgb.LGBMClassifier(
            n_estimators=lgb_n_estimators,
            class_weight='balanced',
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
    # MLP
    models['MLP'] = scaled(
        MLPClassifier(hidden_layer_sizes=(64,32), max_iter=200, random_state=RANDOM_STATE)
    )
    return models

In [None]:
def _positive_class_scores(model, X_val):
    """Return one score per row of X_val; higher means more likely positive."""
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_val)[:, 1]
    if hasattr(model, "decision_function"):
        # Squash the unbounded decision values into (0, 1) with a sigmoid
        return 1 / (1 + np.exp(-model.decision_function(X_val)))
    # Last resort: hard labels used as scores
    return model.predict(X_val)


def evaluate_models_cv(X, y, model_dict, n_splits=5):
    """
    Run stratified K-fold cross-validation for every model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Binary target aligned with ``X``.
    model_dict : dict
        Maps a display name to an estimator; each estimator is refitted on
        every training fold.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per model (index ``model``) holding the fold-averaged metrics.
    model_scores : dict
        Maps each model name to a list with one ``(y_val, scores)`` tuple per
        fold, for later analysis.
    """
    top_k = 100
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    results = []
    model_scores = {}
    for name, model in model_dict.items():
        print(f"\nEvaluando modelo: {name}")
        fold_metrics = []
        fold_scores = []
        for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
            X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr = y.iloc[tr_idx]
            # Positional index so labels and scores line up row by row
            y_val = y.iloc[val_idx].reset_index(drop=True)
            model.fit(X_tr, y_tr)
            probs = _positive_class_scores(model, X_val)
            probs_series = pd.Series(probs)
            # Threshold reaching precision >= 0.90.
            # NOTE(review): it is chosen on the same fold it is evaluated on,
            # so the "@thr90" metrics are optimistic.
            thr_90 = threshold_for_precision(y_val, probs_series, target_precision=0.90)
            m_default = calc_metrics(y_val, probs_series, threshold=0.5, top_k=top_k)
            m_thr90 = calc_metrics(y_val, probs_series, threshold=thr_90, top_k=top_k)
            # Accept both the "@" and the un-separated spelling of the top-k
            # key, so the column is populated instead of silently being NaN.
            prec_at_k = m_default.get(
                f"precision@{top_k}", m_default.get(f"precision{top_k}", np.nan)
            )
            fold_metrics.append({
                "model": name,
                "fold": fold,
                "AUPRC": m_default["AUPRC"],
                "precision@0.5": m_default['precision'],
                "recall@0.5": m_default['recall'],
                "f1@0.5": m_default['f1'],
                "precision@100@0.5": prec_at_k,
                "threshold_for_prec90": thr_90,
                "precision@thr90": m_thr90['precision'],
                "recall@thr90": m_thr90['recall'],
                "f1@thr90": m_thr90['f1']
            })
            fold_scores.append((y_val, probs))
        # Average the metrics over the folds
        dfm = pd.DataFrame(fold_metrics)
        mean_metrics = dfm.drop(columns=['model','fold']).mean().to_dict()
        mean_metrics.update({"model": name})
        results.append(mean_metrics)
        model_scores[name] = fold_scores
    results_df = pd.DataFrame(results).set_index('model')
    return results_df, model_scores

In [None]:
# Target vector and full feature matrix
y = df['Class']
X_full = df.drop(columns='Class').copy()

# Selected variables
selected_vars = [
    'V14', 'V10', 'V16', 'V4', 'V12',
    'V17', 'V11', 'V3', 'V9',
]

In [None]:
# Keep only the selected variables that actually exist in the feature matrix
selected_vars = [name for name in selected_vars if name in X_full.columns]
X_reduced = X_full.loc[:, selected_vars].copy()

print("Tamaño features full:", X_full.shape)
print("Tamaño features reduced:", X_reduced.shape)

# Instantiate the candidate models to evaluate
models = get_models()


In [None]:
# Cross-validate every model on the reduced feature set, print the models
# ranked by AUPRC (best first) and save the table to CSV
print("\n--- EVALUACIÓN con FEATURES REDUCIDAS ---")
results_reduced, scores_reduced = evaluate_models_cv(
    X=X_reduced, y=y, model_dict=models, n_splits=n_splits
)
print(results_reduced.sort_values(by='AUPRC', ascending=False))

results_reduced.to_csv("results_reduced_features.csv")


# Predicción

In [None]:
# Final model: class-balanced random forest fitted on the reduced training features
best_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced', n_jobs=-1)
best_model.fit(X_reduced, y)

In [None]:
# Same columns, in the same order, as the training features
X_test = df_test.loc[:, X_reduced.columns].copy()

# Predictions
THRESHOLD_FINAL = 0.29  # chosen for precision >= 90%
probs_test = best_model.predict_proba(X_test)[:, 1]
flagged = probs_test >= THRESHOLD_FINAL
preds_test = flagged.astype(int)


In [None]:
# Check for missing values. The probabilities are validated rather than the
# thresholded labels: the labels are integers and can never be NaN, so checking
# them would always pass. An explicit raise is used because assert statements
# are skipped when Python runs with -O.
if np.isnan(probs_test).any():
    raise ValueError("Error: hay NaN en las predicciones")

# Build the final DataFrame: the test rows plus predicted label and probability
output = df_test.copy()
output['prediction'] = preds_test
output['probability'] = probs_test

In [None]:
# Write the scored test set to CSV (without the index), then report the number
# of rows and how many rows received each predicted label.
output.to_csv("creditcard_test_evaluate.csv", index=False)

print("creditcard_test_evaluate.csv generado correctamente.")
print("Registros:", output.shape[0])
print(output['prediction'].value_counts())

In [None]:
# Preview the first rows of the output frame.
output.head()

In [None]:
# Transaction filter: keep rows whose probability lies in [0.1, 0.9]
probability = output['probability']
in_range = (probability >= 0.1) & (probability <= 0.9)
filtered_output = output[in_range]
display(filtered_output)