In [1]:
import numpy as np
import pandas as pd

from bitcoin_app.data_load import load_dataset
from bitcoin_app.data_processing import data_processing
from bitcoin_app.clustering import find_n_clusters, clustering
from bitcoin_app.save_dataset import save_dataset
from bitcoin_app.settings import Settings

In [2]:
# Build the run configuration once and read the dataset options from it.
settings = Settings()
dataset_cfg = settings.dataset

# load_dataset returns three objects; they are bound here as the index
# values, the sample matrix and the full DataFrame.
idx, X, df = load_dataset(
    path=dataset_cfg.dataset_path,
    dtype=dataset_cfg.dtype,
    drop_na=dataset_cfg.drop_na,
)

2024-11-17 22:22:00,576 - bitcoin_app.data_load - INFO - Dataset loading has been started.
2024-11-17 22:22:12,159 - bitcoin_app.data_load - INFO - Dataset has been loaded. DF shape: (7373206, 7)
2024-11-17 22:22:12,294 - bitcoin_app.data_load - INFO - NA values has been dropped. DF shape: (7373205, 7)
2024-11-17 22:22:12,715 - bitcoin_app.data_load - INFO - Dataset split into indexes and samples. Index shape: (7373205,). Samples shape: (7373205, 6)


# Principal Component Analysis

In [3]:
# Scale the samples and project them with PCA, using the preprocessing
# options from the settings object.
preprocessing_cfg = settings.preprocessing
scaler, pca, X_pca = data_processing(
    X,
    pca_n_components=preprocessing_cfg.pca_n_components,
    pca_random_state=preprocessing_cfg.pca_random_state,
)

2024-11-17 22:22:12,717 - bitcoin_app.data_processing - INFO - Data preprocessing has been started.
2024-11-17 22:22:14,536 - bitcoin_app.data_processing - INFO - Data scaled.
2024-11-17 22:22:14,859 - bitcoin_app.data_processing - INFO - PCA performed.
2024-11-17 22:22:14,860 - bitcoin_app.data_processing - INFO - PCA Components: [[ 0.50000367  0.5060643   0.08769862  0.50140492  0.47681457  0.08628545]
 [-0.04144324 -0.05299161  0.70021842 -0.0414018  -0.11233192  0.70059685]
 [-0.22355685  0.20486164  0.6480565  -0.21670278  0.23789888 -0.62009745]]
2024-11-17 22:22:14,866 - bitcoin_app.data_processing - INFO - PCA Explained Variance: [0.6153446  0.27594962 0.05494881]


In [4]:
# Per-component share of variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Human-readable component labels: PC1, PC2, ...
pc_labels = [f'PC{k}' for k in range(1, len(explained_variance) + 1)]

variance_df = pd.DataFrame({
    'Principal Component': pc_labels,
    'Explained Variance Ratio': explained_variance,
    'Cumulative Explained Variance': cumulative_variance,
})

variance_df

Unnamed: 0,Principal Component,Explained Variance Ratio,Cumulative Explained Variance
0,PC1,0.615345,0.615345
1,PC2,0.27595,0.891294
2,PC3,0.054949,0.946243


In [5]:
# Convert principal components to DataFrame.
# Reuse df's index so every projected row stays attached to its source row.
# The load log reports NA rows being dropped, so df's index is not guaranteed
# to be a clean 0..N-1 range; a fresh RangeIndex here would make the
# column-wise concat below align on mismatched labels and pad with NaN rows.
df_pca = pd.DataFrame(
    X_pca,
    index=df.index,
    columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
)

# Combine with ENTITY_ID (both operands share the same index, so this is a
# plain side-by-side join with no reindexing).
df_pca = pd.concat(
    [df.loc[:, 'ENTITY_ID'], df_pca],
    axis=1,
)
print(df_pca.shape)
df_pca.head(10)

(7373205, 4)


Unnamed: 0,ENTITY_ID,PC1,PC2,PC3
0,997040325,-0.013396,-0.0022,0.003873
1,998226697,-0.001921,-0.002518,-0.000608
2,995117731,-0.013396,-0.0022,0.003873
3,991378712,-0.002011,-0.003245,-0.000456
4,995865856,-0.013396,-0.0022,0.003873
5,991031161,-0.008088,-0.00264,0.0015
6,991436968,-0.008088,-0.00264,0.0015
7,992911793,-0.013396,-0.0022,0.003873
8,992649174,-0.012904,-0.002252,0.004072
9,991676657,-0.002012,-0.00325,-0.000455


In [6]:
# Prepare the PCA components (loadings): one row per input feature, one
# column per principal component.
# Feature columns are selected by name, not by position, so this cell gives
# the same result when re-run after extra columns (e.g. 'Cluster') have been
# added to df. errors='ignore' makes the drop a no-op for absent labels.
features = df.columns.drop(['ENTITY_ID', 'Cluster'], errors='ignore')
pca_components = pca.components_.T  # Transpose for easier interpretation
df_pca_components = pd.DataFrame(
    pca_components,
    index=features,
    # Take the component count from the loadings matrix itself so the cell
    # does not depend on variables created in another cell.
    columns=[f'PC{i+1}' for i in range(pca_components.shape[1])],
)

df_pca_components

Unnamed: 0,PC1,PC2,PC3
TOTAL_RECIEVE_ADDRESSES,0.500004,-0.041443,-0.223557
TOTAL_RECIEVE_TRANSACTIONS,0.506064,-0.052992,0.204862
TOTAL_BTC_RECEIVED,0.087699,0.700218,0.648056
TOTAL_SPEND_ADDRESSES,0.501405,-0.041402,-0.216703
TOTAL_SPEND_TRANSACTIONS,0.476815,-0.112332,0.237899
TOTAL_BTC_SPENT,0.086285,0.700597,-0.620097


# Cluster Analysis

In [7]:
# Cluster the PCA-projected samples with the component count and seed taken
# from the settings object.
clustering_cfg = settings.clustering
clusters, clusters_proba = clustering(
    X=X_pca,
    n_components=clustering_cfg.n_components,
    random_state=clustering_cfg.random_state,
)

2024-11-17 22:22:14,985 - bitcoin_app.clustering - INFO - GMM for 12 components started


Initialization 0
  Iteration 1
  Iteration 2
  Iteration 3
  Iteration 4
  Iteration 5
  Iteration 6
  Iteration 7
  Iteration 8
  Iteration 9
  Iteration 10
  Iteration 11
  Iteration 12
  Iteration 13
  Iteration 14
  Iteration 15
  Iteration 16
  Iteration 17
  Iteration 18
  Iteration 19
  Iteration 20
  Iteration 21
  Iteration 22
  Iteration 23
  Iteration 24
Initialization converged.


2024-11-17 22:24:52,638 - bitcoin_app.clustering - INFO - GMM fitted
2024-11-17 22:24:59,986 - bitcoin_app.clustering - INFO - Clusters for 12 components: [ 0  1  2  3  4  5  6  7  8  9 10 11]
2024-11-17 22:24:59,987 - bitcoin_app.clustering - INFO - Clusters for 12 components counts: [7015511       1       1       3       1       3       1    2851       1
   84377   20258  250197]


In [8]:
# Attach the labels returned by clustering() to the original frame.
# NOTE: this adds the 'Cluster' column to df in place, so df carries one more
# column from this point on (positional column slices of df will include it).
df['Cluster'] = clusters
df

Unnamed: 0,ENTITY_ID,TOTAL_RECIEVE_ADDRESSES,TOTAL_RECIEVE_TRANSACTIONS,TOTAL_BTC_RECEIVED,TOTAL_SPEND_ADDRESSES,TOTAL_SPEND_TRANSACTIONS,TOTAL_BTC_SPENT,Cluster
0,997040325,1,1,0.000000,0,0,0.000000,0
1,998226697,2,1,1.369956,1,1,2.739536,0
2,995117731,1,1,0.000000,0,0,0.000000,0
3,991378712,2,1,0.013344,1,1,0.026688,0
4,995865856,1,1,0.000000,0,0,0.000000,0
...,...,...,...,...,...,...,...,...
7373200,992095090,1,1,0.000000,0,0,0.000000,0
7373201,993970279,1,1,0.000000,0,0,0.000000,0
7373202,996138165,1,1,0.000080,1,1,0.000160,0
7373203,993736393,1,1,0.000000,0,0,0.000000,0


In [9]:
# Distinct cluster labels and how many samples received each of them.
clusters_count = np.unique(clusters, return_counts=True)
component_ids, component_counts = clusters_count
print('Clusters components:\n', component_ids)
print('Clusters components counts:\n', component_counts)

Clusters components:
 [ 0  1  2  3  4  5  6  7  8  9 10 11]
Clusters components counts:
 [7015511       1       1       3       1       3       1    2851       1
   84377   20258  250197]


In [10]:
# Tally the number of rows assigned to each cluster label.
print("Cluster Sizes:")
cluster_counts = df.loc[:, 'Cluster'].value_counts()
cluster_counts

Cluster Sizes:


Cluster
0     7015511
11     250197
9       84377
10      20258
7        2851
5           3
3           3
4           1
6           1
8           1
1           1
2           1
Name: count, dtype: int64

In [16]:
# Drop small clusters: keep only rows whose cluster has more than
# `min_elements` members.
# NOTE(review): with min_elements = 0 every cluster passes the filter (any
# label present in value_counts has at least one member), so nothing is
# dropped yet. Raise the threshold to actually exclude tiny clusters.
min_elements = 0

# Count the cluster sizes once and reuse the result for both the mask and
# the lookup, instead of scanning the full column twice.
cluster_sizes = df.loc[:, 'Cluster'].value_counts()
big_clusters = cluster_sizes.loc[cluster_sizes > min_elements].index
print('Big clusters:', big_clusters)

df_filtered = df.loc[df.loc[:, 'Cluster'].isin(big_clusters), : ]

Big clusters: Index([0, 11, 9, 10, 7, 5, 3, 4, 6, 8, 1, 2], dtype='int64', name='Cluster')


In [17]:
# Mean and standard deviation of every feature column within each cluster.
per_cluster_features = df_filtered.groupby('Cluster')[features]
cluster_stats = per_cluster_features.agg(['mean', 'std'])
cluster_stats

Unnamed: 0_level_0,TOTAL_RECIEVE_ADDRESSES,TOTAL_RECIEVE_ADDRESSES,TOTAL_RECIEVE_TRANSACTIONS,TOTAL_RECIEVE_TRANSACTIONS,TOTAL_BTC_RECEIVED,TOTAL_BTC_RECEIVED,TOTAL_SPEND_ADDRESSES,TOTAL_SPEND_ADDRESSES,TOTAL_SPEND_TRANSACTIONS,TOTAL_SPEND_TRANSACTIONS,TOTAL_BTC_SPENT,TOTAL_BTC_SPENT
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,1.465684,0.581301,1.162862,0.884902,0.05891185,0.451272,0.484893,0.552122,0.479837,0.554087,0.08926303,0.73071
1,203739.0,,2638014.0,,219807.9,,203739.0,,1783339.0,,329700.9,
2,17153.0,,98529.0,,3030462.0,,17153.0,,1294.0,,2372420.0,
3,71712.666667,23385.1931,305278.666667,166355.383022,427286.3,435277.166667,71712.666667,23385.1931,3856.0,991.146306,683695.2,689853.628747
4,1.0,,3750.0,,1094921.0,,1.0,,125.0,,6445774.0,
5,1.0,0.0,850.0,969.1909,69682.71,62571.677431,1.0,0.0,653.666667,1074.319475,2897526.0,471971.414895
6,1.0,,2772.0,,6072788.0,,1.0,,487.0,,5237346.0,
7,539.568222,1446.213921,2695.086636,12839.742804,13587.85,75334.346071,397.533146,1422.722535,275.220975,3471.866653,14396.66,74462.753676
8,1.0,,132018.0,,5057136.0,,1.0,,11037.0,,3007270.0,
9,13.881176,19.846696,30.867511,45.990517,11.85719,21.163922,6.745333,9.304371,6.271519,13.736663,23.57719,46.880855
