# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all packages before executing a call
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

import tailor
from tailor.clustering import *

In [3]:
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
129434,904435,Fall,Hercules Deusoniensis,mittelgrau,Abteilung002,WHG010,WUG035,Jan,19,49.95,46.071667,3.878333,0.0,4.333333,7625.0,49.412022,198.54
163719,905611,Fall,Snotra,schwarz,Abteilung003,WHG016,WUG057,Feb,15,49.95,49.95,0.0,0.0,1.0,945.0,34.920635,49.95
94272,903234,Winter,Snotra,hellgrau,Abteilung005,WHG021,WUG072,Jun,15,89.95,56.616667,0.0,33.333333,1.666667,258.0,26.098191,96.583333
244593,908381,Winter,Mercurius Arvernus,hellgrau,Abteilung002,WHG006,WUG015,Aug,24,49.95,49.95,0.0,0.0,2.0,234.0,69.65812,99.9
127068,904354,Summer,Mannus,pink,Abteilung004,WHG028,WUG102,Dec,18,12.95,6.92,3.03,3.0,20.857143,5628.0,24.253731,144.35
47697,901660,Fall,Heimdall,bordeauxrot,Abteilung002,WHG012,WUG045,Dec,14,79.95,41.955,8.995,29.0,2.0,879.0,10.608646,80.92
44063,901532,Summer,Skadi,dunkelbraun,Abteilung004,WHG026,WUG093,Nov,19,12.95,10.713333,2.236667,0.0,24.166667,6901.0,26.133894,254.071667
145700,905009,Winter,Gautr,beige,Abteilung002,WHG010,WUG034,May,19,49.95,39.903333,0.046667,10.0,8.666667,2712.0,38.95649,345.991667
121162,904155,Summer,Travalaha,schwarz,Abteilung002,WHG015,WUG053,Dec,19,59.95,49.6475,0.3025,10.0,15.0,2295.0,45.631808,744.2925
195788,906708,Summer,Tyr,mittelgrau,Abteilung007,WHG042,WUG139,Sep,4,64.95,60.812857,4.137143,0.0,8.0,1403.0,9.703696,483.742857


# Code

In [5]:
min_cluster_size = 50
max_cluster_count = 10
clustering_feature = 'article_count'

In [6]:
%%time

split_results = cluster.multi_feature_split(data, distance.euclidean, min_cluster_size)

CPU times: user 2min 55s, sys: 2.09 s, total: 2min 57s
Wall time: 3min 2s


In [7]:
# show the available split layers/depth
split_results['Clusters'].index

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [8]:
# showcase how to retrieve the cluster features of the first cluster of layer '5'
split_results['Clusters']['5'][0]['Features']

Abteilung    Abteilung005
WHG                WHG021
brand           Fimmilena
WUG                WUG073
season             Spring
dtype: object

In [9]:
# Container for the merge hierarchy. Each entry is itself a Series keyed by the
# merge layer label ('0', '1', ...):
#   'Indexes'    -> per layer, the list of index groups that were merged
#   'Groups'     -> per layer, the list of cluster lists behind those indexes
#   'DataFrames' -> per layer, the concatenated DataFrame of every group
# The dtype is spelled out because an empty Series without dtype raises a
# deprecation warning on pandas 1.x and switched its default from float64 to
# object in pandas 2.0; all entries stored here are Python objects anyway.
merge_results = pd.Series(dtype='object')
merge_results['Groups'] = pd.Series(dtype='object')
merge_results['Indexes'] = pd.Series(dtype='object')
merge_results['DataFrames'] = pd.Series(dtype='object')

In [10]:
def get_cluster_parent_name(cluster):
    '''derives the name of the parent cluster from the given cluster's name

    The parent name is everything in front of the last underscore of
    cluster['Name']. A name without any underscore yields an empty string.
    '''
    # rpartition splits at the LAST underscore; when there is none the head is ''
    parent_name, _separator, _own_part = cluster['Name'].rpartition("_")
    return parent_name

In [11]:
def get_leaves(split_results):
    '''retrieves all the unsplit clusters / the leaves of the split tree

    Walks through every layer of split_results['Clusters'] in index order.
    Each visited cluster is recorded as a leaf; if a cluster carrying the
    name of its parent (see get_cluster_parent_name) is currently recorded,
    the oldest such entry is removed because it has been split.

    The result keeps the order in which the surviving clusters were visited.
    Look-ups go through dictionaries instead of scanning the leaf list for
    every cluster, so the cost grows linearly with the number of clusters
    rather than quadratically.
    '''
    # serial number -> cluster; dicts keep insertion order, so the values
    # come out in visiting order once split parents have been deleted
    alive = dict()
    # cluster name -> serial numbers of still-alive clusters with that name,
    # oldest first (names are expected to be unique, but duplicates are
    # handled exactly like a front-to-back list scan would handle them)
    serials_by_name = dict()
    serial = 0
    # iterate through all layers of the clustering
    for layer in split_results['Clusters'].index:
        # add all layer leaves and remove leaf parents
        for add_cluster in split_results['Clusters'][layer]:
            parent_serials = serials_by_name.get(get_cluster_parent_name(add_cluster))
            if parent_serials:
                # parent cluster found, it is no leaf anymore
                del alive[parent_serials.pop(0)]
            alive[serial] = add_cluster
            serials_by_name.setdefault(add_cluster['Name'], list()).append(serial)
            serial += 1
    return list(alive.values())

In [12]:
%%time

clusters = get_leaves(split_results)
print(len(clusters))

2280
CPU times: user 37.6 s, sys: 31.2 ms, total: 37.7 s
Wall time: 38.6 s


In [13]:
def get_cluster_names(clusters, reversed_sort = False):
    '''collects the 'Name' entry of all given clusters

    The names are returned sorted by the number of underscores they contain
    (ascending, or descending when reversed_sort is True). Names with the
    same underscore count keep their input order.
    '''
    def underscore_count(name):
        return name.count("_")

    collected = (entry['Name'] for entry in clusters)
    return sorted(collected, key=underscore_count, reverse=reversed_sort)

In [None]:
get_cluster_names(clusters)

In [14]:
def get_distance_matrix(targets):
    '''calculates the symmetric matrix of pairwise distances between all targets

    Parameters
    ----------
    targets : sequence
        Indexable collection of objects exposing ``.values`` (the notebook
        passes pandas Series). The values of every pair are handed to
        ``distance.euclidean`` (presumably provided by the star import from
        tailor.clustering - confirm).

    Returns
    -------
    pandas.DataFrame
        Square frame labelled 0..len(targets)-1 on both axes, where
        ``distances[i][j]`` is the distance between target i and target j.
        The diagonal is NaN, so a nan-aware argmin over a column never
        selects the target itself. Pairs whose distance could not be
        computed stay NaN as well. The frame is float64 (it used to be an
        object frame holding the same numbers).
    '''
    length = len(targets)
    # filled through numpy: element-wise chained assignment on the frame
    # (frame[i][j] = d) is slow and has no effect under pandas Copy-on-Write
    matrix = np.full((length, length), np.nan)
    for i in range(length):
        # the matrix is symmetric, so only the upper triangle is computed
        for j in range(i + 1, length):
            try:
                d = distance.euclidean(targets[i].values, targets[j].values)
                matrix[i, j] = d
                matrix[j, i] = d
            except Exception as error:
                # best effort: report the failing pair and leave it as NaN
                print(str(i) + " " + str(j) + " " + repr(error))
    return pd.DataFrame(matrix, index=range(length), columns=range(length))

In [15]:
%%time

targets = list()

# dress the clusters for better distance performance
# NOTE: the loop variable is deliberately not called `cluster`, that name is the
# module used for cluster.multi_feature_split and must stay intact for re-runs
for leaf in clusters:
    # only select the distance relevant slice of the Dataframe:
    # the mean of the clustering feature per time_on_sale value.
    # The column is picked BEFORE aggregating, so non-numeric columns are never
    # averaged (pandas >= 2.0 raises on them) and only one column is computed.
    target = leaf['DataFrame'].groupby(['time_on_sale'])[clustering_feature].mean()
    if (len(target) < 26):
        # fill with 0 until index 25 so all comparison arrays are the same length
        # this improves performance dramatically
        target = target.reindex(pd.RangeIndex(26)).fillna(0)
    targets.append(target)

distances = get_distance_matrix(targets)

CPU times: user 10min 45s, sys: 2.16 s, total: 10min 47s
Wall time: 11min 20s


In [None]:
min_index = np.nanargmin(distances[0])
min_value = np.nanmin(distances[0])
print(str(min_index) + " " + str(min_value))

In [None]:
print(distances[0][41])
print(distances[41][0])

In [16]:
def merge_all(distances):
    '''merges every cluster with its closest cluster and returns the resulting groups

    Parameters
    ----------
    distances : pandas.DataFrame
        Square distance matrix as produced by get_distance_matrix: labelled
        0..n-1 on both axes with NaN on the diagonal.

    Returns
    -------
    list of list of int
        Sorted list of groups, each group being a sorted list of cluster
        indexes that were connected through "closest cluster" pointers.

    Two numbers are printed before returning: the count of distinct indexes
    covered by the groups and the count of groups.

    np.nanargmin raises ValueError for a column that contains only NaN
    (e.g. when the matrix holds a single cluster).
    '''
    # get the closest cluster for each cluster
    # generates a Series with pointer lists
    closest_clusters = pd.Series(index=range(len(distances)), dtype='object')
    for i in distances.index:
        target_index = np.nanargmin(distances[i]).item()
        # only one value now, but we will add values later
        closest_clusters[i] = list()
        closest_clusters[i].append(target_index)

    cluster_groups = closest_clusters

    # generate initial groups by adding the index to the target
    # (Series.items replaces Series.iteritems, which pandas 2.0 removed)
    for i, group in cluster_groups.items():
        # first value is the initial closest cluster
        target = group[0]
        cluster_groups[target].append(i)

    # merge until there are only loners and groups with a pointer loop
    # a pointer loop is when two cluster point towards each other, even over multiple cluster between
    finished = False
    while not finished:
        finished = True

        # merge dependencies
        for i, group in cluster_groups.items():
            # loner check
            if len(group) > 1:
                # first value is the initial closest cluster
                target = group[0]
                # rest of the values are pointers added by dependent groups
                # (a slice of a list is always a list, so it can be looped directly)
                pointers = group[1:]
                try:
                    # check whether this is a dependent group without a pointer loop
                    if (target not in pointers):
                        # still dependent groups left, we need to iterate at least one more time
                        finished = False
                        # add own index to target
                        cluster_groups[target].append(i)
                        # hand all own pointers over to the target
                        for x in pointers:
                            if (x not in cluster_groups[target]):
                                cluster_groups[target].append(x)
                        # dependent group is spent, create loner
                        cluster_groups[i] = list()
                        cluster_groups[i].append(target)
                except Exception:
                    # not a bare except: Ctrl-C must still be able to stop this loop
                    print("shit's on fire, yo")
                    print(str(i) + " " + str(group) + " " + str(target) + " " + str(pointers))

    # clear loners
    # the loop runs over the Series as it was when the loop started, while
    # `cluster_groups` is rebound to the shrunken copy after every drop
    for i, group in cluster_groups.items():
        if (len(group) <= 1):
            target = group[0]
            if target in cluster_groups.index:
                cluster_groups[target].append(i)
                cluster_groups = cluster_groups.drop(i)

    # dress up the group list
    merged_groups = list()
    for i, group in cluster_groups.items():
        # add own index, then drop the duplicates
        temp = group
        temp.append(i)
        temp = sorted(list(set(temp)))
        merged_groups.append(temp)
    merged_groups = sorted(merged_groups)

    # merge connected groups and remove duplicates
    for i, group_a in enumerate(merged_groups):
        for k, group_b in enumerate(merged_groups):
            # compare positions by value; identity (`is`) is not reliable for ints
            if k != i:
                for x in group_a:
                    if x in set(group_b):
                        merged_groups[i] = list(set(group_a).union(set(group_b)))
                        # both will point to the same list
                        merged_groups[k] = merged_groups[i]

    clean = list()
    for group in merged_groups:
        sgroup = sorted(group)
        if sgroup not in clean:
            clean.append(sgroup)
    clean = sorted(clean)

    print(len(list(set(list(itertools.chain.from_iterable(clean))))))
    print(len(clean))

    return clean

In [17]:
grouped_clusters = merge_all(distances)

2280
362


In [18]:
merge_results['Indexes']['0'] = grouped_clusters

In [19]:
# Materialise merge layer '0': for every index group found by merge_all collect
# the member clusters and the rows of the original data that belong to them.
merge_results['Groups']['0'] = list()
merge_results['DataFrames']['0'] = list()
for pointers in merge_results['Indexes']['0']:
    group = list()
    dfs = list()
    for pointer in pointers:
        # not named `cluster`: that name is the module used for
        # cluster.multi_feature_split and must survive for re-runs
        member = clusters[pointer]
        group.append(member)
        # retrieving the relevant part of the original dataframe since the cluster dataframe has missing columns
        # building the query string
        # e.g. '(Abteilung == "Abteilung001") & (WHG == "WHG003")'
        # (Series.items replaces Series.iteritems, which pandas 2.0 removed)
        query_string = " & ".join(
            "(" + column + " == " + '"' + characteristic + '"' + ")"
            for column, characteristic in member['Features'].items()
        )
        # select the dataframe part
        df_temp = data.query(query_string)
        dfs.append(df_temp)
    merge_results['Groups']['0'].append(group)
    # merge the clusters' dataframes to one and add it
    merge_results['DataFrames']['0'].append(pd.concat(dfs, sort=True))

In [None]:
merge_results['DataFrames']['0'][0].sample(5)

In [None]:
merge_results['Groups']['0'][0]

In [None]:
merge_results['Indexes']['0'][0]

In [60]:
merge_number = 1
above_min_size = False
clustering_feature = 'article_count'

In [61]:
# Keep adding merge layers until every group of the newest layer holds at least
# min_cluster_size distinct articles. In each round only the groups that are too
# small are merged into their closest group; groups that are already big enough
# are carried over unchanged unless a too-small group attaches to them.
# Each round prints: the number of too-small groups, the number of distinct
# indexes in the merged too-small groups, and the number of distinct indexes in
# the new layer.
while not above_min_size:
    above_min_size = True
    # check whether all clusters are above min_cluster_size
    too_small = list()
    for i, group in enumerate(merge_results['Groups'][str(merge_number - 1)]):
        group_size = merge_results['DataFrames'][str(merge_number - 1)][i]['article_id'].nunique()
        if group_size < min_cluster_size:
            above_min_size = False
            too_small.append(i)
    print(len(too_small))
    
    if not above_min_size:
        # distance matrix generation
        length = len(merge_results['Groups'][str(merge_number - 1)])
        targets = list()
        # dress the clusters for better distance performance
        for i, group in enumerate(merge_results['Groups'][str(merge_number - 1)]):
            # only select the distance relevant slice of the Dataframe
            # the column is picked before aggregating so that non-numeric columns
            # are never averaged (pandas >= 2.0 raises on them)
            target = merge_results['DataFrames'][str(merge_number - 1)][i].groupby(['time_on_sale'])[clustering_feature].mean()
            if (len(target) < 26):
                # fill with 0 until index 25 so all comparison arrays are the same length
                # this improves performance dramatically
                target = target.reindex(pd.RangeIndex(26)).fillna(0)
            targets.append(target)
        distances = get_distance_matrix(targets)
        
        
        # get the closest group for each group that is too small
        # generates a Series with pointer lists
        closest_groups = pd.Series(index=range(length), dtype='object')
        for i in too_small:
            target_index = np.nanargmin(distances[i]).item()
            # only one value now, but we will add values later
            closest_groups[i] = list()
            closest_groups[i].append(target_index)
        
        # entries of groups that are big enough were never set, drop them
        relevant_groups = closest_groups
        relevant_groups = relevant_groups.dropna()
        
        # generate initial groups by adding the index to the target
        # (Series.items replaces Series.iteritems, which pandas 2.0 removed)
        for i, group in relevant_groups.items():
            if group is not np.nan:
                # first value is the initial closest group
                target = group[0]
                # sanity check
                if target in relevant_groups.index:
                    relevant_groups[target].append(i)
                else:
                    # targeting group outside of too_small
                    # add own index to own group to not be a loner
                    group.append(i)
                
        # merge until there are only loners and groups with a pointer loop  
        # a pointer loop is when two groups point towards each other, even over multiple groups in between
        finished = False 
        while not finished:
            finished = True
            
            # merge dependencies
            for i, group in relevant_groups.items():
                # ignore loners
                if len(group) > 1:
                    # first value is the initial closest cluster
                    target = group[0]
                    # sanity check
                    if target in relevant_groups.index:
                        # rest of the values are pointers added by dependent groups
                        # (a slice of a list is always a list, so it can be looped directly)
                        pointers = group[1:]
                        try:
                            # check whether this is a dependent group without a pointer loop
                            if (target not in pointers):
                                # still dependent groups left, we need to iterate at least one more time
                                finished = False
                                # add own index to target
                                relevant_groups[target].append(i)
                                # hand all own pointers over to the target
                                for x in pointers:
                                    if (x not in relevant_groups[target]):
                                        relevant_groups[target].append(x)
                                # dependent group is spent, create loner
                                relevant_groups[i] = list()
                                relevant_groups[i].append(target)
                        except Exception:
                            # not a bare except: Ctrl-C must still be able to stop this loop
                            print("shit's on fire, yo")
                            print(str(i) + " " + str(group) + " " + str(target) + " " + str(pointers))
        
        # clear loners
        # the loop runs over the Series as it was when the loop started, while
        # `relevant_groups` is rebound to the shrunken copy after every drop
        for i, group in relevant_groups.items():
            if (len(group) <= 1):
                target = group[0]
                if target in relevant_groups.index:
                    relevant_groups[target].append(i)
                    relevant_groups = relevant_groups.drop(i)         
        
        # dress up the group list        
        sorted_groups = list()
        for i, group in relevant_groups.items():
            # add own index, then drop the duplicates
            temp = group
            temp.append(i)
            temp = sorted(list(set(temp)))
            sorted_groups.append(temp)
        sorted_groups = sorted(sorted_groups)
        
        # merge connected groups and remove duplicates
        for i, group_a in enumerate(sorted_groups):
            for k, group_b in enumerate(sorted_groups):
                # compare positions by value; identity (`is`) is not reliable for ints
                if k != i:
                    for x in group_a:
                        if x in set(group_b):
                            sorted_groups[i] = list(set(group_a).union(set(group_b)))
                            # both will point to the same list
                            sorted_groups[k] = sorted_groups[i]              
        clean = list()
        for group in sorted_groups:
            sgroup = sorted(group)
            if sgroup not in clean:
                clean.append(sgroup)
        clean = sorted(clean)
        
        print(len(list(set(list(itertools.chain.from_iterable(clean))))))
        
        new_groups = pd.Series(index=range(length), dtype='object')
        
        # initialize with own index
        for i in new_groups.index:
            if i not in too_small:
                new_groups[i] = list()
                new_groups[i].append(i)
        
        # include the newly generated groups
        for i, group in enumerate(clean):
            found = False
            for x in group:
                if x not in too_small:
                    # found target group that already was big enough
                    found = True
                    try:
                        # merge groups
                        temp = list()
                        temp.extend(group)
                        temp.extend(new_groups[x])
                        temp = sorted(list(set(temp)))
                        new_groups[x] = temp
                    except Exception:
                        print(x)
                        print(new_groups[x])
                        print(group)
                    break
            if not found:
                # add new group only made of merged too_small groups
                new_groups[group[0]] = group
        
        # positions that were absorbed by another group were never set, drop them
        new_groups = new_groups.dropna()
        
        clean = list()
        for i, group in new_groups.items():
            sgroup = sorted(group)
            if sgroup not in clean:
                clean.append(sgroup)
        clean = sorted(clean)
        
        print(len(list(set(list(itertools.chain.from_iterable(clean))))))
        
        # materialise the new layer from the groups and frames of the previous one
        merge_results['Indexes'][str(merge_number)] = clean
        merge_results['Groups'][str(merge_number)] = list()
        merge_results['DataFrames'][str(merge_number)] = list()
        for i, pointers in enumerate(merge_results['Indexes'][str(merge_number)]):
            group = list()
            dfs = list()
            for pointer in pointers:
                df_temp = merge_results['DataFrames'][str(merge_number-1)][pointer]
                dfs.append(df_temp)
                # not named `cluster`: that name is the module used for
                # cluster.multi_feature_split and must survive for re-runs
                for member in merge_results['Groups'][str(merge_number-1)][pointer]:
                    group.append(member)
            merge_results['Groups'][str(merge_number)].append(group)
            # merge the clusters' dataframes to one and add it
            merge_results['DataFrames'][str(merge_number)].append(pd.concat(dfs, sort=True))
        merge_number += 1

240
280
362
9
11
148
0


In [62]:
print(merge_results['Indexes'].index)
print(merge_number)

Index(['0', '1', '2'], dtype='object')
3


In [70]:
# sanity check for layer '1': total number of clusters spread over all groups,
# followed by the number of groups
count = sum(len(members) for members in merge_results['Groups']['1'])
print(count)
print(len(merge_results['Groups']['1']))

2280
148


In [72]:
# Keep adding merge layers until the newest layer holds at most
# max_cluster_count groups. Every round merges each group with its closest
# group (merge_all), so the resulting count can end up well below the limit.
while len(merge_results['Groups'][str(merge_number - 1)]) > max_cluster_count:
    # distance matrix generation
    length = len(merge_results['Groups'][str(merge_number - 1)])
    targets = list()
    # dress the clusters for better distance performance
    for i, group in enumerate(merge_results['Groups'][str(merge_number - 1)]):
        # only select the distance relevant slice of the Dataframe
        # the column is picked before aggregating so that non-numeric columns
        # are never averaged (pandas >= 2.0 raises on them)
        target = merge_results['DataFrames'][str(merge_number - 1)][i].groupby(['time_on_sale'])[clustering_feature].mean()
        if (len(target) < 26):
            # fill with 0 until index 25 so all comparison arrays are the same length
            # this improves performance dramatically
            target = target.reindex(pd.RangeIndex(26)).fillna(0)
        targets.append(target)
    distances = get_distance_matrix(targets)
    clean = merge_all(distances)
    # materialise the new layer from the groups and frames of the previous one
    merge_results['Indexes'][str(merge_number)] = clean
    merge_results['Groups'][str(merge_number)] = list()
    merge_results['DataFrames'][str(merge_number)] = list()
    for i, pointers in enumerate(merge_results['Indexes'][str(merge_number)]):
        group = list()
        dfs = list()
        for pointer in pointers:
            df_temp = merge_results['DataFrames'][str(merge_number-1)][pointer]
            dfs.append(df_temp)
            # not named `cluster`: that name is the module used for
            # cluster.multi_feature_split and must survive for re-runs
            for member in merge_results['Groups'][str(merge_number-1)][pointer]:
                group.append(member)
        merge_results['Groups'][str(merge_number)].append(group)
        # merge the clusters' dataframes to one and add it
        merge_results['DataFrames'][str(merge_number)].append(pd.concat(dfs, sort=True))
    merge_number += 1

140
22
22
2


In [73]:
print(merge_results['Indexes'].index)
print(merge_number)

Index(['0', '1', '2', '3', '4'], dtype='object')
5


In [77]:
# sanity check for layer '3': total number of clusters spread over all groups,
# followed by the number of groups
count = sum(len(members) for members in merge_results['Groups']['3'])
print(count)
print(len(merge_results['Groups']['3']))

2280
22


In [108]:
def show_cluster_characteristics(layer, threshold=0.0):
    '''prints which characteristics are concentrated in each cluster of a merge layer

    For every DataFrame stored in merge_results['DataFrames'][str(layer)] and
    every categorical column (columns whose name contains "article_id" are
    skipped), each value present in the cluster is reported as

        <cluster position>: <value>: <articles in cluster> <share>

    where the share is the number of distinct article_ids with that value
    inside the cluster divided by the number of distinct article_ids with
    that value in the complete `data` frame.

    Parameters
    ----------
    layer : int or str
        Label of the merge layer to inspect.
    threshold : float, default 0.0
        Only shares strictly above this value are printed. A cluster with
        no printed line is reported as "outlier collection cluster".
    '''
    for i, df in enumerate(merge_results['DataFrames'][str(layer)]):
        found_something = False
        for col in df.select_dtypes(include=['category']):
            if "article_id" in col:
                continue
            for characteristic in df[col].unique():
                # boolean masks instead of a string-built query: values with
                # quotes or column names that are no identifiers cannot break it
                temp_nunique = df.loc[df[col] == characteristic, 'article_id'].nunique()
                total_nunique = data.loc[data[col] == characteristic, 'article_id'].nunique()
                if total_nunique == 0:
                    # nothing matches (e.g. a missing value, which never
                    # compares equal); skip instead of dividing by zero
                    continue
                temp_percentage = temp_nunique / total_nunique
                if temp_percentage > threshold:
                    found_something = True
                    print(str(i) + ": " + str(characteristic) + ": " + str(temp_nunique) + " " + "{0:.0%}".format(temp_percentage))
        if not found_something:
            print(str(i) + ": " + "outlier collection cluster")
        print("")

In [100]:
show_cluster_characteristics(4)

0: Abteilung003: 27 100%
0: Abteilung005: 301 21%
0: Abteilung002: 2514 65%
0: Abteilung006: 1401 94%
0: Abteilung001: 172 87%
0: Abteilung007: 681 49%
0: Abteilung004: 229 87%
0: WHG017: 19 100%
0: WHG016: 8 100%
0: WHG018: 22 100%
0: WHG021: 131 17%
0: WHG010: 342 70%
0: WHG008: 47 33%
0: WHG012: 407 66%
0: WHG015: 645 68%
0: WHG007: 465 77%
0: WHG034: 457 89%
0: WHG003: 45 100%
0: WHG004: 9 90%
0: WHG009: 257 54%
0: WHG042: 218 46%
0: WHG043: 178 56%
0: WHG001: 77 75%
0: WHG013: 15 38%
0: WHG019: 28 54%
0: WHG022: 36 13%
0: WHG041: 285 54%
0: WHG006: 193 63%
0: WHG005: 94 66%
0: WHG030: 25 100%
0: WHG023: 40 17%
0: WHG031: 32 100%
0: WHG014: 35 47%
0: WHG038: 252 94%
0: WHG020: 44 85%
0: WHG011: 5 29%
0: WHG035: 188 95%
0: WHG032: 200 98%
0: WHG036: 132 99%
0: WHG028: 112 89%
0: WHG027: 42 100%
0: WHG002: 50 98%
0: WHG029: 4 100%
0: WHG026: 71 77%
0: WHG033: 48 100%
0: WHG037: 67 99%
0: WUG058: 19 100%
0: WUG057: 8 100%
0: WUG059: 11 100%
0: WUG060: 7 100%
0: WUG061: 4 100%
0: WUG07

1: WUG054: 14 21%
1: WUG056: 4 8%
1: WUG148: 3 38%
1: WUG119: 41 100%
1: WUG120: 47 100%
1: WUG121: 34 100%
1: WUG122: 4 100%
1: WUG113: 23 100%
1: WUG110: 33 92%
1: WUG133: 1 100%
1: WUG004: 1 100%
1: WUG067: 5 16%
1: WUG066: 3 14%
1: WUG124: 6 100%
1: WUG043: 3 15%
1: WUG041: 12 40%
1: WUG036: 2 6%
1: WUG047: 2 100%
1: WUG065: 3 100%
1: WUG009: 1 25%
1: WUG094: 2 12%
1: WUG103: 4 57%
1: WUG024: 1 50%
1: WUG034: 3 9%
1: WUG026: 3 30%
1: WUG123: 1 100%
1: Hermodr: 26 87%
1: Fimmilena: 224 38%
1: Lodur: 198 98%
1: Alaisiagae: 256 100%
1: Freyr: 357 94%
1: Baduhenna: 15 100%
1: Travalaha: 249 82%
1: Gebrinius: 33 100%
1: Gausus: 3 100%
1: Baudihillia: 106 56%
1: Odin: 181 39%
1: Mercurius Arvernus: 274 61%
1: Beda: 170 99%
1: Turstuahenae: 132 40%
1: Aviaitinehae: 14 42%
1: Þorgerðr Holgabrúðr: 22 100%
1: Burorina: 130 73%
1: Almaviahenae: 125 98%
1: Loki: 100 62%
1: Siofna: 99 100%
1: Snotra: 229 92%
1: Hel: 21 40%
1: Uller: 33 37%
1: Hymir: 128 91%
1: Tuisto: 76 44%
1: Gna: 304 100%
1:

In [101]:
show_cluster_characteristics(4, 0.75)

0: Abteilung003: 27 100%
0: Abteilung006: 1401 94%
0: Abteilung001: 172 87%
0: Abteilung004: 229 87%
0: WHG017: 19 100%
0: WHG016: 8 100%
0: WHG018: 22 100%
0: WHG007: 465 77%
0: WHG034: 457 89%
0: WHG003: 45 100%
0: WHG004: 9 90%
0: WHG001: 77 75%
0: WHG030: 25 100%
0: WHG031: 32 100%
0: WHG038: 252 94%
0: WHG020: 44 85%
0: WHG035: 188 95%
0: WHG032: 200 98%
0: WHG036: 132 99%
0: WHG028: 112 89%
0: WHG027: 42 100%
0: WHG002: 50 98%
0: WHG029: 4 100%
0: WHG026: 71 77%
0: WHG033: 48 100%
0: WHG037: 67 99%
0: WUG058: 19 100%
0: WUG057: 8 100%
0: WUG059: 11 100%
0: WUG060: 7 100%
0: WUG061: 4 100%
0: WUG036: 29 94%
0: WUG034: 31 91%
0: WUG020: 326 85%
0: WUG117: 93 92%
0: WUG115: 113 82%
0: WUG114: 81 88%
0: WUG116: 75 95%
0: WUG006: 8 100%
0: WUG056: 48 92%
0: WUG054: 52 79%
0: WUG032: 3 100%
0: WUG043: 17 85%
0: WUG030: 46 82%
0: WUG001: 77 80%
0: WUG063: 14 100%
0: WUG017: 19 86%
0: WUG104: 25 100%
0: WUG108: 32 100%
0: WUG084: 28 100%
0: WUG125: 34 100%
0: WUG067: 26 84%
0: WUG066: 18

In [109]:
show_cluster_characteristics(3, 0.75)

0: Abteilung003: 27 100%
0: WHG017: 19 100%
0: WHG016: 8 100%
0: WHG018: 22 100%
0: WHG030: 25 100%
0: WHG031: 32 100%
0: WHG020: 44 85%
0: WUG058: 19 100%
0: WUG057: 8 100%
0: WUG059: 11 100%
0: WUG060: 7 100%
0: WUG061: 4 100%
0: WUG006: 8 100%
0: WUG032: 3 100%
0: WUG063: 14 100%
0: WUG017: 19 86%
0: WUG104: 25 100%
0: WUG108: 32 100%
0: WUG084: 28 100%
0: WUG125: 34 100%
0: WUG067: 26 84%
0: WUG066: 18 86%
0: WUG028: 1 100%
0: WUG111: 37 97%
0: WUG119: 40 98%
0: WUG121: 34 100%

1: WHG024: 5 100%
1: WHG039: 48 100%
1: WHG031: 32 100%
1: WHG033: 37 77%
1: WUG087: 5 100%
1: WUG152: 16 100%
1: WUG151: 23 100%
1: WUG150: 9 100%
1: WUG074: 53 84%
1: WUG085: 14 100%
1: WUG002: 6 100%
1: WUG064: 21 100%
1: WUG108: 32 100%
1: WUG083: 8 100%
1: WUG075: 1 100%
1: WUG080: 14 82%
1: WUG082: 21 100%
1: WUG044: 3 100%
1: WUG130: 15 100%
1: WUG128: 25 100%
1: WUG125: 33 97%
1: WUG118: 69 80%
1: WUG131: 44 100%
1: Almaviahenae: 111 87%
1: Sunuxsal: 12 100%
1: Thor: 5 100%

2: WHG040: 28 100%
2: WU