In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.gridspec as gridspec
from collections import Counter
from sklearn import metrics
import numpy as np

In [None]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='.*X does not have valid feature names.*')

In [None]:
def purity_score(y_true, y_pred):
    """Return the clustering purity of ``y_pred`` measured against ``y_true``.

    Each cluster is credited with the size of the single true class that
    occurs most often inside it; purity is the sum of those credits divided
    by the total number of samples.

    Parameters
    ----------
    y_true : array-like
        Reference class label of every sample.
    y_pred : array-like
        Cluster label assigned to every sample.

    Returns
    -------
    float
        Share of samples that belong to the majority class of their cluster.
    """
    # Per the scikit-learn docs, rows are true classes and columns are clusters,
    # so reducing over axis 0 yields one majority count per cluster.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    majority_per_cluster = table.max(axis=0)
    return majority_per_cluster.sum() / table.sum()

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
csv_path = "datasets/creditcard.csv"
df = pd.read_csv(csv_path)

In [None]:
# Show the first rows of the raw frame as a quick sanity check of the load.
preview_rows = 80
df.head(preview_rows)

In [None]:
# Discard the two columns that are not used as features, then separate the
# remaining predictors (X) from the "Class" target (y).
df = df.drop(columns=["Time", "Amount"])
X = df.drop(columns="Class")
y = df["Class"].copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The supervised forest is fitted on every feature; its feature_importances_
# are read in a later cell to rank the columns.
rf_params = dict(n_estimators=80, random_state=42, n_jobs=-1)
clf_rnd = RandomForestClassifier(**rf_params)
clf_rnd.fit(X, y)

In [None]:
# Pair each importance with the column it was computed for. The names are
# taken from X (the frame the forest was fitted on) rather than df: df still
# carries the "Class" target column, so it has one more name than there are
# importances and only lined up because zip() silently truncates.
feature_names = list(X.columns)
assert len(feature_names) == len(clf_rnd.feature_importances_), \
    "feature names and importances are misaligned"
feature_importances = dict(zip(feature_names, clf_rnd.feature_importances_))

# Most important features first.
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)

In [None]:
# Keep only the ten highest-ranked features; copy so that later changes to
# X_reduced cannot touch X.
top_features = feature_importances_sorted.head(10).index.tolist()
X_reduced = X[top_features].copy()

In [None]:
from sklearn.cluster import KMeans

# Partition the reduced feature set into five clusters; the fixed seed makes
# the centroid initialisation repeatable.
kmeans = KMeans(
    n_clusters=5,
    random_state=42,
)
# One cluster label per row of X_reduced.
clusters = kmeans.fit(X_reduced).labels_

In [None]:
# Tally the size of every cluster and, separately, how many samples with
# y == 1 fell into each one.
cluster_counts = Counter(clusters.tolist())
bad_cluster_counts = Counter(clusters[y == 1].tolist())

cluster_labels = sorted(cluster_counts)
counts = [cluster_counts[label] for label in cluster_labels]
# Counter yields 0 for clusters that received no y == 1 samples.
bad_counts = [bad_cluster_counts[label] for label in cluster_labels]

for label, total, bad in zip(cluster_labels, counts, bad_counts):
    print("Label {0} has {1} samples - {2} are malicious samples".format(
        label, total, bad))

fig, ax = plt.subplots()

# The malicious bars are drawn second so they sit on top of the totals.
ax.bar(cluster_labels, counts, label='Total Samples', color='b')
ax.bar(cluster_labels, bad_counts, label='Malicious Samples', color='r')

# One tick per cluster id, so the axis never shows fractional labels.
ax.set_xticks(cluster_labels)
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Sample Count')
ax.set_title('Sample Distribution in Clusters')
ax.legend()

plt.show()


In [None]:
# Purity scores the clusters against the known labels; the other two metrics
# are computed from the features and the cluster assignment alone.
print("Purity Score:", purity_score(y, clusters))
# With sample_size set, silhouette_score evaluates a random subsample; the
# seed keeps the reported value identical across runs.
print("Silhouette: ", metrics.silhouette_score(X_reduced, clusters, sample_size=10000, random_state=42))
print("Calinski harabasz: ", metrics.calinski_harabasz_score(X_reduced, clusters))