# CNAK

In [91]:
# import libraries
import os
import glob
import sys
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings(action='once')


Import the CNAK module from the project source tree.

In [2]:
# Put the directory two levels above the current working directory on the
# import path so that the project's `src` package can be found.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local import: only resolvable once module_path is on sys.path.
from src.cnak import cnak

## Load datasets

Load every processed CSV file found in `data/processed`.

In [18]:
# Read each processed CSV into a DataFrame, using its first column as the index.
# NOTE: later cells pick datasets out of df_lst by position, so they rely on the
# order in which glob returns the files (printed below for reference).
df_lst = []
for csv_path in glob.glob("../../data/processed/*.csv"):
    print(csv_path)
    df_lst.append(pd.read_csv(csv_path, index_col=[0]))

../../data/processed/mall_customers_data.csv
../../data/processed/country_data.csv
../../data/processed/customer_data.csv


In [23]:
# Feature matrices (X) for each dataset and, where a ground-truth column
# exists, its labels (y). Positions in df_lst follow the order in which the
# CSV files were loaded.
X_mall_customers = df_lst[0].reset_index(drop=True)

X_country = df_lst[1].reset_index(drop=True)

# Split the customer data by column name rather than by position, so the
# 'Region' label is removed from the features wherever it sits in the frame,
# and X and y share the same fresh 0..n-1 index.
customer_df = df_lst[2].reset_index(drop=True)
X_customer = customer_df.drop(columns=['Region'])
y_customer = customer_df['Region']

## Train CNAK to obtain optimal cluster number, K
CNAK can return a different cluster number on each run, so we run it 50 times per dataset and take the most frequent result as the optimal cluster number (k) for K-means.

In [31]:
X_dict = {
    'mall_customers': X_mall_customers,
    'country': X_country,
    'customer': X_customer
}

N_RUNS = 50  # number of CNAK runs per dataset

# For each dataset, run CNAK N_RUNS times and keep the cluster count that
# occurs most often.
# NOTE(review): no random seed is set in this cell, so the selected values may
# differ between executions -- confirm whether cnak.CNAK can be seeded.
data_cluster_dict = {}
for dataset_name, X in X_dict.items():
    cluster_lst = []  # number of distinct labels found in each run
    for _ in range(N_RUNS):
        # Only the labels are needed; the other two return values are unused.
        labels, _scores, _centers = cnak.CNAK(np.array(X), k_min=2)
        cluster_lst.append(len(set(labels)))

    # Series.mode() returns the most frequent value(s) in ascending order, so
    # a tie resolves to the smallest cluster count. Cast to a plain int so the
    # dictionary holds ordinary Python integers.
    data_cluster_dict[dataset_name] = int(pd.Series(cluster_lst).mode().iloc[0])

 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 8
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 7
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 7
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 3
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 8
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 7
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 9
 gamma: 0.7  K_min: 2  K_max: 21
K_hat: 1
 gamma: 0.7  K_min: 2  K_max: 21
K

Optimal cluster number for each dataset (the most frequent value across the 50 CNAK runs):

In [32]:
# Show the cluster count selected for each dataset.
data_cluster_dict

{'mall_customers': 10, 'country': 2, 'customer': 2}

The dictionary below is a manual collation of the optimal cluster number for K-means obtained with other methods, such as the elbow method or the silhouette score.

In [33]:
# Hand-collated number of clusters for each dataset.
norm_cluster_dict = dict(
    mall_customers=5,
    country=3,
    customer=2,
)

## Evaluation

In [34]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import mutual_info_score, adjusted_rand_score

In [80]:
def evaluate_cnak(dataset_name: str, dataset_clusters: dict, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
    """Fit KMeans once per requested cluster count and tabulate clustering metrics.

    Parameters
    ----------
    dataset_name : str
        Label written to the ``dataset_name`` column of every row.
    dataset_clusters : dict
        Maps a row label (for example ``'cnak'``) to the number of clusters
        KMeans should be fitted with.
    X : pd.DataFrame
        Feature matrix passed to KMeans and to the silhouette,
        Davies-Bouldin and Calinski-Harabasz scores.
    y : pd.Series, optional
        Ground-truth labels. When given, the adjusted Rand index and the
        mutual information between ``y`` and the KMeans labels are computed.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``dataset_clusters``, indexed by its key, with
        columns ``dataset_name``, ``clusters``, ``silhouette``, ``db_index``,
        ``ch_index``, ``ari`` and ``mi``. ``ari`` and ``mi`` are NaN when
        ``y`` is None. An empty ``dataset_clusters`` yields an empty frame
        with the same columns.
    """
    columns = ['dataset_name', 'clusters', 'silhouette', 'db_index', 'ch_index', 'ari', 'mi']

    rows = []
    for clusters in dataset_clusters.values():
        # random_state is fixed so repeated calls give the same partition.
        labels = KMeans(n_clusters=clusters, random_state=0, n_init="auto").fit(X).labels_

        # Internal metrics: need only the features and the predicted labels.
        silhouette = silhouette_score(X, labels)
        db_index = davies_bouldin_score(X, labels)
        ch_index = calinski_harabasz_score(X, labels)

        # External metrics: need ground truth. NaN (rather than None) keeps
        # the columns numeric when no ground truth is supplied.
        ari = np.nan
        mi = np.nan
        if y is not None:
            ari = adjusted_rand_score(y, labels)
            mi = mutual_info_score(y, labels)

        rows.append([dataset_name, clusters, silhouette, db_index, ch_index, ari, mi])

    # Build the table once; this also handles an empty dataset_clusters,
    # where concatenating an empty list of frames would raise ValueError.
    return pd.DataFrame(rows, columns=columns, index=list(dataset_clusters))

In [43]:
# Cluster counts to compare for each dataset: the value selected by CNAK
# against the hand-collated one. They are read from data_cluster_dict and
# norm_cluster_dict instead of being retyped, so this cell cannot drift out of
# step when the CNAK runs are repeated.
mall_customers_clusters = {
    'cnak': data_cluster_dict['mall_customers'],
    'without_cnak': norm_cluster_dict['mall_customers'],
}

customer_clusters = {
    'cnak': data_cluster_dict['customer'],
    'without_cnak': norm_cluster_dict['customer'],
}

country_clusters = {
    'cnak': data_cluster_dict['country'],
    'without_cnak': norm_cluster_dict['country'],
}

In [81]:
# Score each dataset for both cluster counts. Only the customer data has
# ground-truth labels, so it is the only call that passes y.
mall_customers_metrics = evaluate_cnak('mall_customers', mall_customers_clusters, X_mall_customers)

# The 'customers' label here is only a display name in the results table; it
# differs from the 'customer' key used in X_dict.
customer_metrics = evaluate_cnak('customers', customer_clusters, X_customer, y=y_customer)

country_metrics = evaluate_cnak('country', country_clusters, X_country)

In [90]:
# Stack the per-dataset metric tables into a single comparison table.
pd.concat([mall_customers_metrics,customer_metrics, country_metrics])

  pd.concat([mall_customers_metrics,customer_metrics, country_metrics])


Unnamed: 0,dataset_name,clusters,silhouette,db_index,ch_index,ari,mi
cnak,mall_customers,10,0.404981,0.88544,85.125508,,
without_cnak,mall_customers,5,0.317807,1.156509,68.980518,,
cnak,customers,2,0.406829,1.123881,195.070388,-0.017812,0.003447
without_cnak,customers,2,0.406829,1.123881,195.070388,-0.017812,0.003447
cnak,country,2,0.572717,0.647933,32.064753,,
without_cnak,country,3,0.368971,0.839504,88.591953,,
