In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from collections import Counter
from sklearn import metrics

## 1.- Análisis de datos

#### Exploración

Dataset sacado de: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud#creditcard.csv

In [None]:
# Load the credit-card transactions CSV (path is relative to the working directory)
df = pd.read_csv('../Datasets/creditcard.csv')

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

#### Visualizaciones

In [None]:
# Reproducible 20% random sample of the data, used by the plots below
df_sample = df.sample(frac=0.2, random_state=42)
# Select seaborn's "dark" colour palette for the plots that follow
sns.set_palette("dark")

In [None]:
# Every column except the target gets its own histogram panel
features = df_sample.drop("Class", axis=1)

# Split the sample by class once, instead of re-filtering it twice per feature
fraud_sample = df_sample[df_sample["Class"] == 1]
legit_sample = df_sample[df_sample["Class"] == 0]

# Grid layout: num_cols panels per row, with enough rows to hold every feature
num_cols = 2
num_plots = len(features.columns)
num_rows = -(-num_plots // num_cols)  # ceiling division

plt.figure(figsize=(12, 32))

for i, f in enumerate(features.columns):
    ax = plt.subplot(num_rows, num_cols, i + 1)
    # Each class is drawn by its own call with stat="density", so the two
    # histograms are normalised independently of the class sizes.
    sns.histplot(data=fraud_sample, x=f, kde=True, color="red", stat="density", label="Fraud", alpha=0.5, ax=ax)
    sns.histplot(data=legit_sample, x=f, kde=True, color="blue", stat="density", label="Legit", alpha=0.5, ax=ax)
    # The panel title already names the feature, so the x label is blanked
    ax.set_xlabel('')
    ax.set_title(f"Feature: {f}")
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter of V14 against V10 on the sampled rows, coloured by class
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df_sample, x="V10", y="V14", hue="Class", marker=".", ax=ax)

# Axis labels at font size 14
ax.set_xlabel("V10", fontsize=14)
ax.set_ylabel("V14", fontsize=14)

# Render the figure
plt.show()

## 2.- Preprocesamiento de datos

In [None]:
# Re-display the first rows before building the feature matrix
df.head()

Eliminamos las columnas `Amount` y `Time` porque no son muy importantes y no están escaladas.

In [None]:
# Feature matrix: drop the target together with Time and Amount, which the
# accompanying note describes as unimportant and unscaled (unscaled columns
# would dominate the distance-based clustering further down).
x = df.drop(columns=['Class', 'Time', 'Amount'])

# Target vector (1 = fraud, 0 = legitimate)
y = df['Class']

In [None]:
# Train a 50-tree random forest on all candidate features; fit() returns the
# fitted estimator, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
).fit(x, y)

# Last expression of the cell: display the per-feature importance scores
rf_model.feature_importances_

In [None]:
# Pair every column name of x with the importance the forest assigned to it
feature_importances = dict(zip(x.columns, rf_model.feature_importances_))

# Two-column table (Feature, Importance), ordered from most to least important
feature_importances_df = pd.DataFrame(
    {
        'Feature': list(feature_importances.keys()),
        'Importance': list(feature_importances.values()),
    }
).sort_values(by='Importance', ascending=False)

In [None]:
# Bar chart of the importance scores, one bar per feature
ax = sns.barplot(data=feature_importances_df, x='Importance', y='Feature')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Keep the 7 highest-ranked features (the table is already sorted by
# descending importance) and narrow x down to those columns.
selected_columns = feature_importances_df['Feature'].head(7).tolist()
x = x.loc[:, selected_columns].copy()

In [None]:
# Preview the reduced feature matrix
x.head()

## 3.- Entrenamiento

In [None]:
# Cluster the selected features into 5 groups. n_init is pinned explicitly:
# its default changed from 10 to 'auto' in scikit-learn 1.4 (with a
# FutureWarning in 1.2-1.3), so leaving it implicit makes the resulting
# clustering depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Fit the model and get the cluster index assigned to every row of x
clusters = kmeans.fit_predict(x)

In [None]:
# Number of rows assigned to each cluster
counter = Counter(clusters.tolist())

# Fraud rows (Class == 1) per cluster; a Counter yields 0 for absent keys
fraud_mask = df['Class'] == 1
bad_counter = Counter(clusters[fraud_mask].tolist())

# One summary line per cluster, in ascending cluster order
summary_template = "cluster {0}, {1} ejemplos - {2} malicioso"
for key in sorted(counter):
    print(summary_template.format(key, counter[key], bad_counter[key]))

In [None]:
# Number of rows for each value of the Class label
df['Class'].value_counts()

In [None]:
def purity_score(y, y_predict):
    """Return the purity of a clustering with respect to reference labels.

    Purity is the fraction of samples that carry the most frequent reference
    label of the cluster they were assigned to.

    Args:
        y: Reference (true) label of every sample.
        y_predict: Cluster assignment of every sample, aligned with ``y``.

    Returns:
        The purity as a float: sum of the per-cluster maxima of the
        contingency table divided by the total number of samples.
    """
    # Cross-tabulate reference labels against cluster assignments
    table = metrics.cluster.contingency_matrix(y, y_predict)
    # Column-wise maximum = size of the dominant label inside each cluster
    dominant_per_cluster = table.max(axis=0)
    return dominant_per_cluster.sum() / table.sum()

In [None]:
 purity_score(y, clusters)

In [None]:
# Silhouette coefficient estimated on a 10,000-row subsample; random_state
# fixes which rows are drawn so the score is identical on every run.
metrics.silhouette_score(x, clusters, sample_size=10000, random_state=42)

In [None]:
# Calinski-Harabasz index of the clustering, computed on all rows of x
metrics.calinski_harabasz_score(x, clusters)