In [1]:
import numpy as np 
import pandas as pd
import networkx as nx
from plotly import io as pio 
# Make "plotly_white" the default Plotly template for figures created after this point.
pio.templates.default = "plotly_white"

from utils import plot_cluster, graph_sum
from make_synthetic_data import generate_synthetic_data_with_hierarchy
from make_synthetic_data import create_synthetic_paritally_known_label
from graph import knn_graph, partition
from chameleon_cluster import partition_phase, merge_phase

## Generate synthetic data

In [2]:
# Generate the synthetic dataset with the project helper, passing cluster_std=2.5.
# NOTE(review): no random seed is set anywhere in this notebook — confirm the
# generator is seeded internally if the outputs below must be reproducible.
data = generate_synthetic_data_with_hierarchy(cluster_std=2.5)

In [3]:
# Rebind `data` to the helper's result (presumably this adds the known_tag_l1/l2/l3
# columns read by later cells — confirm against make_synthetic_data).
# NOTE(review): because `data` is overwritten, re-running this cell on its own
# feeds already-processed data back into the helper.
data = create_synthetic_paritally_known_label(data)

In [4]:
# Plot the data using the 'true_clst_l3' column as the cluster column.
plot_cluster(data, cluster_col='true_clst_l3')

### Cannot-link constraints

Create a cannot-link matrix based on the known tags. If two instances both have known tags and those tags differ, the pair is marked as cannot-link.  
The cannot-link matrix is passed to the merge phase to avoid merging clusters that contain cannot-link instances.

In [5]:
def build_cannot_link_matrix(frame, tag_cols):
    """Return a boolean cannot-link matrix for the rows of ``frame``.

    Entry ``[i, j]`` is True when, for at least one column in ``tag_cols``,
    rows ``i`` and ``j`` both have a non-null tag and the two tags differ.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding one partially known tag column per hierarchy level.
    tag_cols : iterable of str
        Names of the tag columns to compare.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(len(frame), len(frame))``.
    """
    n_rows = len(frame)
    cannot_link = np.zeros((n_rows, n_rows), dtype=bool)
    for col in tag_cols:
        tags = frame[col].values
        known = frame[col].notna().values
        # Pairwise "tags are not the same": a column vector broadcast against a row vector.
        differs = ~(tags.reshape(-1, 1) == tags.reshape(1, -1))
        # A pair only counts when the tag is known on both sides.
        differs &= known.reshape(-1, 1)
        differs &= known.reshape(1, -1)
        # Accumulate in place so only one per-level matrix is alive at a time.
        cannot_link |= differs
    return cannot_link


# cannot-link matrix: two instances whose known tags differ at any level
cl_mat = build_cannot_link_matrix(data, ['known_tag_l3', 'known_tag_l2', 'known_tag_l1'])

In [6]:
# Print the matrix shape, then show the matrix itself as the cell's output
# (NumPy abbreviates the repr of an array this large).
print(cl_mat.shape)
cl_mat

(10000, 10000)


array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

## Generate graph from data

In [7]:
# Build the graph from the first five columns of `data`
# (presumably the feature columns — confirm against the generator's output schema).
graph = knn_graph(data.iloc[:, :5])

In [8]:
# Plot the data together with the freshly built graph (no cluster labels assigned yet).
plot_cluster(data, graph)

## Pre-label cluster according to partially known label

Pre-label clusters according to the partially known labels.  
If an instance's cluster is known at the finest level (l3 in this case), then that cluster is excluded from the partition phase, i.e., it won't be cut into smaller subclusters.

In [9]:
# Pre-label graph nodes from the partially known tags, finest level first.
# The index labels of `data` are used as the graph's node ids.

# Level l3: every node whose finest-level tag is known gets that tag as its
# 'cluster' attribute; the distinct l3 tags are the clusters handed to the
# partition phase as `exclude_cluster`.
known_tag_l3_dict = data['known_tag_l3'].dropna().to_dict()
nx.set_node_attributes(graph, known_tag_l3_dict, 'cluster')
exclude_cluster = np.unique(list(known_tag_l3_dict.values())).tolist()

# Coarser levels (l2, then l1): only nodes that still have no 'cluster'
# attribute fall back to their coarser known tag, so a finer label is never
# overwritten by a coarser one.
for coarse_col in ('known_tag_l2', 'known_tag_l1'):
    # The keys of the attribute dict are the ids of the already-labelled nodes.
    labeled_nodes = list(nx.get_node_attributes(graph, 'cluster'))
    coarse_tags = data[coarse_col].drop(labeled_nodes).dropna().to_dict()
    nx.set_node_attributes(graph, coarse_tags, 'cluster')

In [10]:
# Show the distinct finest-level tags collected in `exclude_cluster`.
print(f'Excluded clusters: {exclude_cluster}')

Excluded clusters: ['0', '1', '2', '3', '4', '5']


In [11]:
# Summarise the graph after pre-labelling (the displayed table lists a count per cluster).
graph_sum(graph)

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
"(0, 1)",350
"(0, 1, 2, 3)",1273
"(2, 3)",360
"(4, 5)",991
0,186
1,169
2,169
3,153
4,146
5,173


In [12]:
# Plot the data with the graph after the pre-labelling step.
plot_cluster(data, graph)

## Partition Phase

In [13]:
# Run the partition phase on the graph, passing the l3-known clusters as `exclude_cluster`.
# NOTE(review): n_cluster_final=50 is a magic number — consider a named constant.
partition_phase(graph, n_cluster_final=50, exclude_cluster=exclude_cluster)

Eixst nodes without cluster. Initialize 6030 nodes to cluster -1


In [14]:
# Plot the data with the graph after the partition phase.
plot_cluster(data, graph)

In [15]:
# Summarise the graph after the partition phase (the displayed table lists a count per cluster).
graph_sum(graph)

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
-1,190
0,190
1,189
2,190
3,188
4,189
5,189
6,188
7,318
8,247


## Merge Phase

In [16]:
# Run the merge phase on the graph, supplying the cannot-link matrix as `cl_mat`.
# NOTE(review): n_cluster_final=6 presumably matches the number of l3 clusters — confirm.
merge_phase(graph, n_cluster_final=6, cl_mat=cl_mat)

Output()

In [17]:
# Plot the data with the graph after the merge phase.
plot_cluster(data, graph)

In [18]:
# accuracy with constraints
# Attach each node's final cluster id to its row (aligned on the index of `data`).
data['cluster'] = pd.Series(nx.get_node_attributes(graph, 'cluster'))

# Map every cluster id to the most frequent known l3 tag among its members.
replace_dict = {}
# sort=False visits clusters in order of first appearance, as .unique() would;
# rows whose cluster id is missing are skipped by groupby.
for cluster_id, known_tags in data.groupby('cluster', sort=False)['known_tag_l3']:
    tag_counts = known_tags.value_counts()  # missing tags are not counted
    if tag_counts.empty:
        # Without any known tag there is no majority label to vote for; fail
        # with a message that names the cluster instead of a bare IndexError.
        raise ValueError(
            f'cluster {cluster_id!r} has no member with a known l3 tag; '
            'cannot map it to a label'
        )
    replace_dict[cluster_id] = int(tag_counts.index[0])

In [19]:
# Translate cluster ids into their mapped l3 labels, then score them against the true labels.
predicted_l3 = data['cluster'].replace(replace_dict)
n_correct = sum(predicted_l3 == data['true_clst_l3'])
acc_wi_constraints = n_correct / len(data)
print('accuracy with constraints: ', acc_wi_constraints)

accuracy with constraints:  0.9687
