# Project: Anomaly Detection in Banking Transactions Using K-means


## Objective

Use K-means to automatically identify unusual behaviors in a set of synthetic transactions. The clusters model different behavior patterns, and anomalies can be detected by observing points that are too far from the cluster centers.

In [186]:
# Importing the modules needed for data processing, clustering, and visualization

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [187]:
# This function is responsible for loading a dataset from a CSV file

def load_data(file_path: str) -> pd.DataFrame:
    """
    Read the transaction dataset stored as a CSV file.

    :param file_path: Location of the CSV file to read
    :return: Pandas DataFrame holding the parsed contents of the file
    """

    # pandas does all the parsing; the result is handed back untouched
    return pd.read_csv(file_path)

In [188]:
# This function selects the variables that are useful for the program.
# I chose to remove global_id, sender_id, receiver_id, and date because they only identify a transaction
# and do not help describe its behavior.
# The date could matter if specific days (e.g., Fridays) were associated with fraud, but we won't analyze that here
# since we are not treating this as a time series problem.

def select_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the columns used for clustering by discarding the identifier
    and timestamp columns.

    :param df: Full transaction DataFrame
    :return: New DataFrame without 'global_id', 'sender_id', 'receiver_id' and 'date'
    """

    # Columns that only identify a transaction and are not used as features
    excluded_columns = ['global_id', 'sender_id', 'receiver_id', 'date']
    remaining = df.drop(columns=excluded_columns)
    return remaining.copy()

In [189]:
# This function is used during the preprocessing step to standardize datasets. 
# Its goal is to mitigate bias that could arise from large variations in feature magnitudes within the dataset.

def normalize_features(df: pd.DataFrame) -> np.ndarray:
    """
    Standardize the selected features with a freshly fitted StandardScaler.

    :param df: DataFrame of selected features
    :return: NumPy array of the scaled features, as produced by fit_transform
    """

    # Fit the scaler on the data and transform it in a single step
    return StandardScaler().fit_transform(df)


In [190]:
# The function below is used to apply K-Means clustering on the data with 6 clusters
#help(KMeans)

def apply_kmeans(data: np.ndarray, n_clusters: int = 6) -> tuple:
    """
    Fit a KMeans model on the normalized transaction data.

    :param data: Normalized feature array
    :param n_clusters: Number of clusters to fit (6 by default)
    :return: Tuple (fitted model, cluster label of each sample)
    """

    # random_state is fixed so repeated runs use the same seed
    model = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    model.fit(data)
    return model, model.labels_

#type(apply_kmeans(normalize_select_features_df))

In [191]:
# This function adds the following metrics to the data:
# - The distance from each point to its cluster centroid
# - The distance as a percentage of the largest distance in the cluster, which shows how "far" a point is compared to the others; this is especially useful when we are not domain experts
# I also implemented the Modified Z-Score, which takes into account the density of the cluster.
#
# The Modified Z-Score is useful for detecting outliers. It tells us how far a point is from the median,
# measuring the spread with the Median Absolute Deviation (MAD) instead of the standard deviation.
# It works better than the standard Z-Score when the data is not normally distributed or already contains outliers, which is the case here.
# I was inspired by this article: https://www.kaggle.com/code/praxitelisk/anomaly-detection-techniques-summary


def add_distance_centroid_zscore(data_kmeans: "KMeans", data: np.ndarray) -> pd.DataFrame:
    """
    Adds to each point:
    - Euclidean distance to the centroid of its cluster,
    - that distance as a percentage of the largest distance observed in the same cluster,
    - a modified z-score of that distance, computed within each cluster as
      0.6745 * (distance - median) / MAD.

    Clusters without any member are skipped, and a cluster whose maximum distance
    or MAD is zero gets 0 for the corresponding metric.

    :param data_kmeans: Trained KMeans model (only labels_, cluster_centers_ and n_clusters are read)
    :param data: Input data (n_samples, n_features) used for clustering
    :return: DataFrame with the features (data_col_0, data_col_1, ...) plus the columns
             cluster_label, distance_to_centroid, distance_percent and zscore
    """

    # Work on plain arrays so that array-likes (e.g. a DataFrame) behave like ndarrays
    data = np.asarray(data)
    cluster_labels = np.asarray(data_kmeans.labels_)
    centroids = np.asarray(data_kmeans.cluster_centers_)
    n_clusters = data_kmeans.n_clusters

    # Compute Euclidean distances from each point to its cluster's centroid
    distances_to_centroid = np.linalg.norm(data - centroids[cluster_labels], axis=1)

    # Both metrics default to 0, which is also the value used for degenerate clusters
    percent_distances = np.zeros(len(data))
    zscores = np.zeros(len(data))

    for cluster_id in range(n_clusters):
        # Mask for points in the current cluster
        mask = cluster_labels == cluster_id
        if not mask.any():
            # A cluster can be empty (e.g. fewer distinct points than clusters);
            # max()/median() of an empty array would fail, so there is nothing to score
            continue
        cluster_distances = distances_to_centroid[mask]

        # Distance as a percentage of the farthest point of the cluster
        max_dist = cluster_distances.max()
        if max_dist > 0:
            percent_distances[mask] = (cluster_distances / max_dist) * 100

        # Modified z-score: z = 0.6745 * (distance - median) / MAD (only if MAD > 0)
        median_d = np.median(cluster_distances)
        mad = np.median(np.abs(cluster_distances - median_d))
        if mad > 0:
            zscores[mask] = 0.6745 * (cluster_distances - median_d) / mad

    # Build the enriched DataFrame
    df = pd.DataFrame(data, columns=[f"data_col_{i}" for i in range(data.shape[1])])
    df['cluster_label'] = cluster_labels
    df['distance_to_centroid'] = distances_to_centroid
    df['distance_percent'] = percent_distances
    df['zscore'] = zscores

    return df

In [192]:
# Based on the previous function, I can now specify a threshold in the form of a maximum distance, 
# as well as percentage and z-score limits, to identify an element as an anomaly.

def detect_anomalies(df, methods=('distance_percent', 'zscore'), thresholds=None):
    """
    Detect anomalies using simple thresholding on:
    - distance_percent: relative distance to cluster center
    - zscore: modified z-score within cluster
    - distance_to_centroid: raw Euclidean distance

    A point is flagged as soon as at least one of the requested methods exceeds
    its threshold (strict ">").

    Parameters:
        df (DataFrame): Must contain one column per requested method.
        methods (list): Methods to use; a single method name is also accepted.
        thresholds (dict): Thresholds per method. Missing entries fall back to the
            defaults (distance_percent: 95, zscore: 4). 'distance_to_centroid' has
            no default and must be given explicitly when that method is requested.

    Returns:
        np.ndarray: Boolean mask of anomalies.

    Raises:
        ValueError: If a method is unknown or has no threshold available.
    """
    supported_methods = ('distance_percent', 'zscore', 'distance_to_centroid')
    default_thresholds = {
        'distance_percent': 95,
        'zscore': 4,  # Points with a zscore above 4 considered anomalous
    }

    # User-supplied thresholds override the defaults instead of replacing them,
    # so a partial dictionary no longer breaks the methods left at their default
    effective_thresholds = {**default_thresholds, **(thresholds or {})}

    # A bare string would otherwise be iterated character by character
    if isinstance(methods, str):
        methods = [methods]

    anomalies = np.zeros(len(df), dtype=bool)

    for method in methods:
        if method not in supported_methods:
            raise ValueError(
                f"Unknown anomaly detection method {method!r}; "
                f"expected one of {supported_methods}"
            )
        if method not in effective_thresholds:
            raise ValueError(f"No threshold provided for method {method!r}")

        # Positional comparison: the result stays a plain NumPy mask
        anomalies |= (df[method] > effective_thresholds[method]).to_numpy()

    return anomalies

In [193]:
# Written with the assistance of Copilot but implementation has been reviewed and understood.

def plot_clusters(
    normalized_data: np.ndarray,
    labels: np.ndarray,
    original_df: pd.DataFrame,
    anomalies_mask: np.ndarray = None,
    sample_size: int = 5000
):
    """
    Plot 2D PCA clusters, highlight anomalies by outlining their markers, and show original (non-normalized) values in hover.

    Every cluster keeps one fixed color: the same cluster-to-color mapping is used for the
    base scatter and for the anomaly overlay.

    Parameters:
      - normalized_data: standardized feature array (n_samples, n_features)
      - labels: cluster labels (n_samples,)
      - original_df: DataFrame (n_samples, m) with original columns to display in hover
      - anomalies_mask: boolean array (n_samples,) marking anomalies; None disables the overlay
      - sample_size: maximum points to plot (a random sample is drawn above this size)
    """

    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    # PCA → 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(normalized_data)

    # Build DataFrame for plotting
    df = pd.DataFrame(reduced, columns=["PC1", "PC2"])
    df["cluster"] = labels.astype(str)
    if anomalies_mask is None:
        df["anomaly"] = False
    else:
        # Convert to a plain array so the mask is matched by position, even when a
        # Series with a non-default index is passed
        df["anomaly"] = np.asarray(anomalies_mask, dtype=bool)
    # Attach original (non-normalized) columns
    original_cols = list(original_df.columns)
    df = pd.concat([df, original_df.reset_index(drop=True)], axis=1)

    # One explicit cluster -> color mapping, built before sampling so it does not depend
    # on which points are drawn. Plotly Express otherwise assigns colors in order of
    # appearance, which would not match the sorted mapping used for the anomaly overlay.
    clusters = sorted(df["cluster"].unique())
    palette = px.colors.qualitative.Set1
    color_map = {clust: palette[i % len(palette)] for i, clust in enumerate(clusters)}

    # Sampling if too many points
    if len(df) > sample_size:
        df = df.sample(sample_size, random_state=42)

    # Base scatter: display all points colored by cluster, with hover showing the original columns, cluster, anomaly, PC1, and PC2.
    hover_cols = original_cols + ["cluster", "anomaly"]
    fig = px.scatter(
        df,
        x="PC1",
        y="PC2",
        color="cluster",
        symbol="anomaly",
        symbol_map={False: "circle", True: "circle"},
        hover_data=hover_cols,
        opacity=0.6,
        title=f"PCA Clusters (n={len(df)})",
        color_discrete_map=color_map,
        category_orders={"cluster": clusters}
    )
    fig.update_traces(marker=dict(size=6))

    # Overlay anomalies: add an extra trace for points identified as anomalies
    # These anomalies use the same color as their cluster but with a black border to distinguish them
    if anomalies_mask is not None:
        anom_df = df[df["anomaly"]]
        if not anom_df.empty:
            # Hover text is the same for every overlay trace, so build it once
            hover_lines = [f"{col}: %{{customdata[{i}]}}" for i, col in enumerate(original_cols)]
            hover_lines += ["cluster: %{text}", "anomaly: True", "PC1: %{x}", "PC2: %{y}"]
            hovertemplate = "<br>".join(hover_lines) + "<extra></extra>"

            # For each cluster in the anomalies, add a trace with a border
            for clust in anom_df["cluster"].unique():
                sub = anom_df[anom_df["cluster"] == clust]

                fig.add_trace(go.Scatter(
                    x=sub["PC1"],
                    y=sub["PC2"],
                    mode="markers",
                    marker=dict(
                        symbol="circle",
                        size=6,  # same size as regular points; the border is what highlights the anomaly
                        color=color_map[clust],
                        line=dict(width=3, color="black")  # black border to outline
                    ),
                    name=f"Anomalies Cluster {clust}",
                    text=[clust] * len(sub),
                    customdata=sub[original_cols].to_numpy(),
                    hovertemplate=hovertemplate
                ))

    # Centroids: computed as the mean of each cluster in normalized space, then projected into PCA space
    centroids = np.array([normalized_data[labels == k].mean(axis=0) for k in unique_labels])
    centroids_2d = pca.transform(centroids)
    fig.add_trace(go.Scatter(
        x=centroids_2d[:, 0],
        y=centroids_2d[:, 1],
        mode="markers+text",
        marker=dict(symbol="diamond", size=12, color="black", line=dict(color="white", width=1)),
        text=[f"C{k}" for k in unique_labels],
        textposition="top center",
        name="Centroids"
    ))

    fig.show()


### Step 1: Loading the Data  
Objective: Load the transactions.csv file containing 20,000 transactions.

In [194]:
# Read the raw transactions from disk and preview the first four rows
file_path = "transactions.csv"
data_as_dataframe = load_data(file_path)
data_as_dataframe.head(4)

Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date
0,1,417,583,102.332066,11.004279,11.004279,91.327787,2021-10-07 12:16:55.521706
1,2,68,277,83.130697,6.325441,6.325441,76.805256,2023-08-16 00:26:54.521999
2,3,621,546,96.091707,2.435575,2.435575,93.656132,2024-03-26 11:50:51.518583
3,4,783,337,82.476615,13.056381,13.056381,69.420234,2025-01-30 13:57:10.355813


Data Exploration

In [195]:
#help(data_as_dataframe)
# Dimensions and column names of the dataset, followed by summary statistics
print(data_as_dataframe.shape)
print(data_as_dataframe.columns)
data_as_dataframe.describe()

(20000, 8)
Index(['global_id', 'sender_id', 'receiver_id', 'initial_amount',
       'transfer_amount', 'amount_received', 'final_amount', 'date'],
      dtype='object')


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,10000.5,570.37855,490.8705,35458.260148,6553.204543,6540.375758,28925.153352
std,5773.647028,339.093935,289.824756,78367.259754,11994.880623,12001.848158,67993.367313
min,1.0,1.0,1.0,22.459093,-7110.587943,-7110.587943,-21844.15792
25%,5000.75,275.75,241.0,92.533501,57.031151,6.165345,36.79025
50%,10000.5,547.0,487.0,160.364822,101.641698,101.641698,74.531949
75%,15000.25,880.0,741.0,2015.712188,6959.0,6959.0,121.562098
max,20000.0,1098.0,999.0,375075.838829,64525.60939,64525.60939,353664.953038


In [196]:
# Number of missing values in each column
print(data_as_dataframe.isnull().sum())

global_id          0
sender_id          0
receiver_id        0
initial_amount     0
transfer_amount    0
amount_received    0
final_amount       0
date               0
dtype: int64


#### Step 2: Selecting Relevant Features  
Objective: Choose the useful columns for anomaly detection (e.g., initial_amount, transfer_amount, amount_received, final_amount).

In [197]:
# Drop the identifier and date columns, then preview the remaining features
select_features_df = select_features(data_as_dataframe)
select_features_df.head()

Unnamed: 0,initial_amount,transfer_amount,amount_received,final_amount
0,102.332066,11.004279,11.004279,91.327787
1,83.130697,6.325441,6.325441,76.805256
2,96.091707,2.435575,2.435575,93.656132
3,82.476615,13.056381,13.056381,69.420234
4,92.090839,19.036857,19.036857,73.053982


#### Step 3: Data Preparation and Normalization  
Objective: Normalize the data before applying K-means.

In [198]:
#data_as_dataframe.describe()
# Standardize the selected features; the result is the array used for clustering below
normalize_select_features_df = normalize_features(select_features_df)

#### Step 4: Clustering with K-means  
Objective: Group the transactions into 6 clusters.

In [199]:
# from sklearn.metrics import silhouette_score
# scores = [silhouette_score(normalize_select_features_df, KMeans(n_clusters=k).fit_predict(normalize_select_features_df)) 
#           for k in range(2,11)]

Note: I compared the silhouette score for several numbers of K-Means clusters, and it appears that 2 clusters score better than 6. I will nevertheless continue with 6 clusters because the exercise requires it.

In [200]:
# scores

In [201]:
# Fit K-means with 6 clusters; keep both the fitted model and the per-sample labels
data_kmean, data_kmean_labels = apply_kmeans(data=normalize_select_features_df, n_clusters=6)

# Quick inspection of the result: labels, their count, the centroids and the model type
print("data_kmean_labels  :  ",data_kmean_labels,"\n")
print("len(data_kmean_labels) : ",len(data_kmean_labels),"\n")
print("data_kmean.cluster_centers_ : ",data_kmean.cluster_centers_,"\n")
print("type(data_kmean) : ",type(data_kmean))

data_kmean_labels  :   [0 0 0 ... 2 2 4] 

len(data_kmean_labels) :  20000 

data_kmean.cluster_centers_ :  [[-0.44749743 -0.51556647 -0.5157902  -0.42467605]
 [ 1.85699545  3.08434778  3.08362613  1.59590806]
 [ 1.50652988  1.79474812  1.79477513  1.41947273]
 [ 2.68246817  2.25366302  2.25342361  2.69386809]
 [ 2.31805393  0.97771242  0.97821375  2.49894805]
 [-0.4241223  -0.0103489  -0.00927396 -0.48730145]] 

type(data_kmean) :  <class 'sklearn.cluster._kmeans.KMeans'>


#### Step 5: Anomaly Detection  
Objective: Identify abnormal transactions based on their distance from the cluster centroid.

In [202]:
# Enrich every point with its distance to the centroid, its relative distance and its z-score
analysis_kmeans = add_distance_centroid_zscore(data_kmean, normalize_select_features_df)

# Technical prints: print the highest and lowest zscore values for inspection
print("Highest zscore:", analysis_kmeans['zscore'].max())
print("Lowest zscore:", analysis_kmeans['zscore'].min())

#print(analysis_kmeans.head(20))

Highest zscore: 587.2402723261116
Lowest zscore: -3.3055770340839827


I display the analysis results and the points closest to and farthest from each centroid.
This helps me understand the characteristics that make each cluster distinct.


In [203]:
# Flag anomalies using only the z-score criterion (threshold of 4)
mask = detect_anomalies(analysis_kmeans, methods=['zscore'], thresholds={'zscore':4})
analysis_kmeans['anomaly'] = mask

# Merge the metrics back onto the original data to recover the real columns
metric_columns = ['cluster_label','distance_to_centroid','distance_percent','zscore','anomaly']
df = data_as_dataframe.copy()
df = df.join(analysis_kmeans[metric_columns], how='left')

# Total and per-cluster anomaly counts (for information only)
flagged = df[df['anomaly']]
print("Total anomalies :", df['anomaly'].sum())
print("Anomalies per cluster:")
print(flagged.groupby('cluster_label').size(), "\n")

# For each cluster: the 4 points closest to and the 4 points farthest from the centroid
centroids = data_kmean.cluster_centers_

for k, _ in enumerate(centroids):
    sub = df[df['cluster_label']==k]
    if sub.empty:
        print(f"Cluster {k} empty\n")
        continue

    closest4 = sub.nsmallest(4, 'distance_to_centroid')   # 4 closest
    farthest4 = sub.nlargest(4, 'distance_to_centroid')   # 4 farthest

    print(f"Cluster {k}:")
    for caption, rows in (
        ("  4 closest to centroid:", closest4),
        ("  4 farthest from centroid:", farthest4),
    ):
        print(caption)
        display(rows)      # renders the DataFrame in the notebook
    print("\n")


Total anomalies : 1849
Anomalies per cluster:
cluster_label
0    1783
1      13
2      12
3      15
4      15
5      11
dtype: int64 

Cluster 0:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
3500,3501,356,751,45.144614,134.271529,134.271529,89.126915,2021-05-11 00:32:07.300663,0,0.026957,1.202087,-3.268438,False
4483,4484,632,784,50.110204,134.148657,134.148657,84.038452,2022-12-21 01:41:24.383284,0,0.02696,1.2022,-3.26776,False
7019,7020,163,341,50.065762,132.943028,132.943028,82.877266,2023-08-11 06:46:51.645093,0,0.0271,1.208439,-3.230472,False
7003,7004,765,197,49.380733,132.673452,132.673452,83.292718,2024-05-12 13:48:17.351610,0,0.027133,1.209903,-3.221719,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
17327,17328,1001,189,113027.154873,-2220.211278,-2220.211278,115247.366151,2023-11-14 18:50:31.734950,0,2.242537,100.0,587.240272,True
19129,19130,1049,559,115075.034063,3837.467771,3837.467771,111237.566293,2024-07-24 00:43:03.669678,0,2.232485,99.551753,584.561131,True
17922,17923,1064,121,100474.280252,-7110.587943,-7110.587943,107584.868196,2024-02-17 19:47:03.428350,0,2.21536,98.788081,579.996709,True
18517,18518,1058,116,100299.44252,3813.095646,3813.095646,96486.346874,2024-09-25 04:40:32.942217,0,1.950082,86.958725,509.293417,True




Cluster 1:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
16864,16865,1005,304,181137.396115,43397.867916,43397.867916,137739.528199,2023-06-07 12:14:44.971258,1,0.018441,0.679192,-2.172653,False
17423,17424,1063,960,179632.337585,43209.622664,43209.622664,136422.71492,2025-03-24 00:59:30.179658,1,0.045987,1.693688,-2.093476,False
18681,18682,1058,220,184322.755693,43654.270028,43654.270028,140668.485665,2024-05-02 06:12:34.163061,1,0.065082,2.396943,-2.038589,False
18975,18976,1058,661,180537.811579,42845.927606,42845.927606,137691.883973,2024-05-30 03:27:03.353513,1,0.083112,3.060997,-1.986762,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
18457,18458,1006,188,40941.277224,41061.715343,41061.715343,-120.438119,2025-03-03 09:37:18.493469,1,2.715205,100.0,5.578989,True
18903,18904,1058,305,153729.362939,64525.60939,64525.60939,89203.753549,2023-09-15 13:08:19.905754,1,2.595699,95.598629,5.235478,True
16627,16628,1026,652,51734.838156,45124.176799,45124.176799,6610.661357,2023-04-22 10:16:57.200680,1,2.541029,93.585164,5.078334,True
17955,17956,1064,993,247296.443593,62055.78303,62055.78303,185240.660563,2023-08-07 16:19:54.630592,1,2.443184,89.981562,4.797085,True




Cluster 2:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
16499,16500,1043,393,152630.531488,27988.912902,27988.912902,124641.618586,2024-12-09 07:16:36.747435,2,0.019536,0.660906,-2.410788,False
19023,19024,1058,532,151777.323326,27669.708044,27669.708044,124107.615282,2024-01-09 10:48:27.861036,2,0.056744,1.919634,-2.286721,False
19057,19058,1049,843,149381.219912,27907.695278,27907.695278,121473.524635,2024-08-24 13:41:19.092484,2,0.081241,2.748356,-2.205038,False
17289,17290,1076,65,150296.071639,27552.27406,27552.27406,122743.797579,2024-11-28 11:01:40.174497,2,0.084474,2.857707,-2.19426,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
19282,19283,1045,20,14222.027222,36066.185142,36066.185142,-21844.15792,2024-11-25 23:26:46.890285,2,2.955991,100.0,7.380601,True
17561,17562,1034,936,26498.004545,31782.965599,31782.965599,-5284.961055,2023-08-05 22:28:53.792761,2,2.552285,86.34279,6.034474,True
19611,19612,1072,718,35013.647346,32549.567949,32549.567949,2464.079397,2024-07-20 20:40:34.320907,2,2.415674,81.721277,5.578953,True
19847,19848,1082,190,53087.905669,36620.357412,36620.357412,16467.548256,2023-12-28 04:49:57.818379,2,2.285668,77.323223,5.145457,True




Cluster 3:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
16684,16685,1081,568,244520.170697,33324.163275,33324.163275,211196.007422,2023-07-08 18:46:48.904386,3,0.036492,1.251165,-2.433904,False
19083,19084,1049,101,246847.098619,33858.497769,33858.497769,212988.600851,2024-09-19 19:31:38.895939,3,0.037965,1.30167,-2.428784,False
17191,17192,1020,488,245962.663693,33231.722716,33231.722716,212730.940977,2023-06-28 19:43:17.660318,3,0.042865,1.469666,-2.411751,False
17444,17445,1063,884,247898.29708,33660.419166,33660.419166,214237.877914,2024-07-29 12:13:20.915074,3,0.043464,1.490203,-2.409669,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
19460,19461,1014,578,325924.23092,55584.045679,55584.045679,270340.185241,2024-08-15 18:47:12.555973,3,2.916625,100.0,7.578141,True
17517,17518,1063,200,375075.838829,44804.11504,44804.11504,330271.723789,2024-09-12 21:42:03.520029,3,2.738075,93.878216,6.95746,True
16882,16883,1005,771,352101.996981,47600.003567,47600.003567,304501.993414,2024-06-23 00:19:03.699867,3,2.533973,86.880325,6.24795,True
18908,18909,1058,487,317417.363185,49300.909682,49300.909682,268116.453503,2024-10-07 16:47:07.717520,3,2.224606,76.273309,5.172516,True




Cluster 4:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
19897,19898,1082,809,218409.896242,18164.363987,18164.363987,200245.532254,2024-09-30 00:27:34.489896,4,0.029879,0.934849,-2.263266,False
19205,19206,1098,667,219672.615679,18502.808239,18502.808239,201169.80744,2025-02-18 06:55:24.003127,4,0.054175,1.695038,-2.189993,False
18664,18665,1058,833,218221.291371,18787.482697,18787.482697,199433.808674,2023-08-22 03:23:50.265159,4,0.062046,1.941299,-2.166256,False
16939,16940,1068,947,220591.725274,18360.980488,18360.980488,202230.744786,2024-01-02 20:06:53.521325,4,0.067513,2.112337,-2.14977,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
17340,17341,1001,778,361024.586584,7359.633545,7359.633545,353664.953038,2024-04-01 09:32:00.228594,4,3.196115,100.0,7.285467,True
17969,17970,1064,546,249034.641204,-3973.09131,-3973.09131,253007.732514,2024-06-22 14:02:04.172401,4,2.771476,86.713894,6.00484,True
19445,19446,1014,698,141150.75003,-2398.430825,-2398.430825,143549.180855,2024-04-09 14:09:08.647445,4,2.746229,85.923962,5.9287,True
19908,19909,1082,580,325268.59237,6749.803221,6749.803221,318518.789149,2024-09-09 19:23:23.383061,4,2.617379,81.892503,5.540114,True




Cluster 5:
  4 closest to centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
12726,12727,621,667,2031.242128,6687.0,6687.0,-4655.757872,2021-12-08 00:00:00.000000,5,0.031204,1.503473,-3.305577,False
12713,12714,621,667,2021.790341,6687.0,6687.0,-4665.209659,2021-10-30 00:00:00.000000,5,0.031243,1.505367,-3.304891,False
12719,12720,621,667,2016.552492,6687.0,6687.0,-4670.447508,2021-11-17 00:00:00.000000,5,0.031266,1.506438,-3.304503,False
12723,12724,621,667,2015.390544,6687.0,6687.0,-4671.609456,2021-11-29 00:00:00.000000,5,0.03127,1.506678,-3.304417,False


  4 farthest from centroid:


Unnamed: 0,global_id,sender_id,receiver_id,initial_amount,transfer_amount,amount_received,final_amount,date,cluster_label,distance_to_centroid,distance_percent,zscore,anomaly
17785,17786,1034,960,107006.581583,11616.219889,11616.219889,95390.361694,2024-06-16 21:39:11.670894,5,2.07546,100.0,32.367301,True
19402,19403,1014,241,36770.390102,22837.782002,22837.782002,13932.6081,2024-07-17 06:47:19.452483,5,2.001568,96.439738,31.077867,True
17106,17107,1069,155,92658.029236,13613.660805,13613.660805,79044.368431,2024-12-13 09:17:47.178542,5,1.883666,90.758985,29.020446,True
18617,18618,1058,303,80574.514053,16310.999563,16310.999563,64263.51449,2023-06-21 13:53:10.897560,5,1.835904,88.457719,28.186987,True






In [204]:
# ## Distribution of Anomaly Metrics
# Here we plot for each metric:
# 1. A histogram of all values  
# 2. A box‑and‑whiskers diagram (boxplot) broken out by cluster_label  
# This helps see the global shape of the distribution and compare clusters.

# select only the columns we need
# metrics = ['distance_to_centroid', 'distance_percent', 'zscore']

# for metric in metrics:
#     # Histogram of the metric
#     fig_hist = px.histogram(
#         df,
#         x=metric,
#         nbins=50,
#         title=f"Histogram of {metric}",
#         marginal="rug",              # add rug plot
#         opacity=0.7
#     )
#     fig_hist.update_layout(xaxis_title=metric, yaxis_title="Count")
#     fig_hist.show()

#     # Box‑and‑whiskers by cluster
#     fig_box = px.box(
#         df,
#         x='cluster_label',
#         y=metric,
#         points="all",                # show all points
#         title=f"Boxplot of {metric} by Cluster",
#     )
#     fig_box.update_layout(xaxis_title="Cluster Label", yaxis_title=metric)
#     fig_box.show()


In [205]:
# Plot the clusters in PCA space with the flagged anomalies outlined.
# sample_size is set to the dataset size (20,000 rows) so every point is drawn.
anomalies = analysis_kmeans['anomaly'].values
plot_clusters(
  normalized_data=normalize_select_features_df,
  labels=data_kmean.labels_,
  original_df=select_features_df,
  anomalies_mask=anomalies,
  sample_size=20000
)