# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all package before excecuting a call
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import multiprocessing as mp

import tailor
from tailor.clustering import *

In [3]:
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
107143,903684,Summer,Heimdall,rosa,Abteilung006,WHG035,WUG118,Sep,8,54.95,48.54,6.41,0.0,2.5,524.0,15.314885,120.8925
241401,908273,Winter,Fimmilena,schwarz,Abteilung005,WHG021,WUG073,Jun,22,69.95,68.69,1.26,0.0,4.2,3669.0,29.844644,287.49
196873,906744,Summer,Mani,schwarz,Abteilung002,WHG010,WUG033,Sep,3,79.95,78.458571,1.491429,0.0,3.714286,2808.0,2.126577,295.025714
94673,903250,Spring,Mercurius Arvernus,dunkelblau,Abteilung002,WHG012,WUG045,Mar,0,39.95,37.286667,2.663333,0.0,1.333333,262.0,0.890585,50.603333
185005,906337,Spring,Mercurius Arvernus,gelb,Abteilung002,WHG007,WUG020,May,4,39.95,39.9225,0.0275,0.0,1.25,428.0,7.885514,49.8825
134431,904613,Fall,Baudihillia,hellbraun,Abteilung006,WHG038,WUG132,Jan,19,59.95,44.195,5.755,10.0,5.666667,3176.0,60.201511,256.315
144682,904975,Fall,Tuisto,dunkelblau,Abteilung002,WHG006,WUG015,Nov,4,39.95,39.131667,0.818333,0.0,2.333333,1006.0,3.628231,91.276667
26061,900908,Summer,Freyr,schwarz,Abteilung002,WHG008,WUG022,Sep,0,99.99,99.99,0.0,0.0,3.5,2468.0,1.094003,334.965
140047,904810,Summer,Mercurius Arvernus,dunkelblau,Abteilung002,WHG006,WUG015,Dec,18,39.95,38.95,1.0,0.0,3.0,3819.0,36.842105,117.85
125672,904306,Summer,Tyr,schwarz / kombiniert,Abteilung007,WHG042,WUG138,Oct,10,49.95,49.7,0.25,0.0,5.166667,1939.0,27.737665,257.575


# Code

In [5]:
%%time

split_results = cluster.multi_feature_split(data, distance.euclidean, 50)

CPU times: user 2min 38s, sys: 1.45 s, total: 2min 40s
Wall time: 2min 42s


In [6]:
split_results['Clusters'].index

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [7]:
split_results['Clusters']['5'][0]['Features']

Abteilung    Abteilung005
WHG                WHG021
brand           Fimmilena
WUG                WUG073
season             Spring
dtype: object

In [8]:
def get_cluster_parent_name(cluster):
    name = cluster['Name']
    # remove last character until name is the parent cluster's name
    terminate = False
    while not terminate:
        character = name[-1:]
        if ((character == "_") or (character == "")):
            terminate = True
        name = name[:-1]
    return name

In [9]:
%%time

# get all clusters that remained unsplit
leafs = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for add_cluster in split_results['Clusters'][layer]:
        check_name = get_cluster_parent_name(add_cluster)
        # iterate until parent cluster is found then remove it
        for index, check_cluster in enumerate(leafs):
            if check_cluster['Name'] == check_name:
                # parent cluster found, remove it
                del leafs[index]
                # no more than one parent cluster, therefore exit second for loop
                break
        leafs.append(add_cluster)

CPU times: user 34.4 s, sys: 15.6 ms, total: 34.4 s
Wall time: 34.7 s


In [10]:
len(leafs)

2280

In [11]:
%%time

names = list()

for cluster in leafs:
    name = cluster['Name']
    names.append(name)
# sort by underscore count
names.sort(key = lambda s: s.count("_"), reverse=True)

CPU times: user 46.9 ms, sys: 0 ns, total: 46.9 ms
Wall time: 32 ms


In [12]:
names

['0_4_5_1_2_1_1_1',
 '0_4_5_1_2_1_1_2',
 '0_4_5_1_2_1_1_3',
 '0_4_5_1_2_1_1_4',
 '0_4_5_1_2_1_1_5',
 '0_4_5_1_2_1_1_6',
 '0_4_5_1_2_1_1_7',
 '0_4_5_1_2_1_1_8',
 '0_4_5_1_2_1_1_9',
 '0_4_5_1_2_1_1_10',
 '0_4_5_1_2_1_1_11',
 '0_4_5_1_2_1_1_12',
 '0_4_5_1_2_1_1_13',
 '0_4_5_1_2_1_1_14',
 '0_4_5_1_2_1_1_15',
 '0_4_5_1_2_1_1_16',
 '0_4_5_1_2_2_1_1',
 '0_4_5_1_2_2_1_2',
 '0_4_5_1_2_2_1_3',
 '0_4_5_1_2_2_1_4',
 '0_4_5_1_2_2_1_5',
 '0_4_5_1_2_2_1_6',
 '0_4_5_1_2_2_1_7',
 '0_4_5_1_2_2_1_8',
 '0_4_5_1_2_2_1_9',
 '0_4_5_1_2_2_1_10',
 '0_4_5_1_2_2_1_11',
 '0_4_5_1_2_2_1_12',
 '0_4_5_1_2_2_1_13',
 '0_4_5_1_2_2_1_14',
 '0_4_5_1_2_2_1_15',
 '0_4_5_1_2_2_1_16',
 '0_4_5_1_2_2_1_17',
 '0_4_5_1_2_3_1_1',
 '0_4_5_1_2_3_1_2',
 '0_4_5_1_2_3_1_3',
 '0_4_5_1_2_3_1_4',
 '0_4_5_1_2_3_1_5',
 '0_4_5_1_2_3_1_6',
 '0_4_5_1_2_3_1_7',
 '0_4_5_1_2_3_1_8',
 '0_4_5_1_2_3_1_9',
 '0_4_5_1_2_3_1_10',
 '0_4_5_1_2_3_1_11',
 '0_4_5_1_2_3_1_12',
 '0_4_5_1_2_3_1_13',
 '0_4_5_1_2_3_1_14',
 '0_4_5_1_2_3_1_15',
 '0_4_5_1_2_3_1_16'

In [13]:
len(names)

2280

In [14]:
%%time

# get all clusters based on the name
clusters = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for cluster in split_results['Clusters'][layer]:
        if cluster['Name'] in names:
            clusters.append(cluster)

CPU times: user 109 ms, sys: 0 ns, total: 109 ms
Wall time: 97 ms


In [15]:
len(clusters)

2280

In [101]:
%%time

length = len(clusters)
distances = pd.DataFrame(index=range(length),columns=range(length))
targets = list()

# dress the clusters for better distance performance
for i, cluster in enumerate(clusters):
    # only select the distance relevant slice of the Dataframe
    target = cluster['DataFrame'].groupby(['time_on_sale']).mean()['article_count']
    if (len(target) < 26):
        # fill with 0 until index 25 so all comparison arrays are the same length
        # this improves performance dramatically
        target = target.reindex(pd.RangeIndex(26)).fillna(0)
    targets.append(target)

CPU times: user 4.67 s, sys: 31.2 ms, total: 4.7 s
Wall time: 4.72 s


In [103]:
%%time
length = len(targets)
for i, a in enumerate(targets):
    for k, b in enumerate(reversed(targets)):
        j = length - 1 - k
        if j <= i:
            break
        else:
            try:
                d = distance.euclidean(a.values,b.values)
                distances[i][j] = d
                distances[j][i] = d
            except:
                print(str(i) + " " + str(k))

CPU times: user 9min 50s, sys: 5.66 s, total: 9min 55s
Wall time: 9min 52s


In [104]:
distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2270,2271,2272,2273,2274,2275,2276,2277,2278,2279
0,,2.03832,6.69048,7.1303,6.14795,3.85276,3.10528,6.54338,4.18375,18.8841,...,9.90423,7.21291,8.23462,9.92501,7.651,9.41321,8.67895,8.38961,9.8068,9.70231
1,2.03832,,6.76835,6.9644,5.98206,4.62162,3.93246,7.31497,3.5728,17.8055,...,9.73834,6.96664,8.02103,8.33874,7.37754,9.24731,8.38641,8.35181,9.6409,9.53641
2,6.69048,6.76835,,2.42506,2.46595,7.86267,8.93264,12.6533,4.5055,20.723,...,3.62926,5.00225,4.92029,7.2767,4.78076,3.8767,4.89144,4.34355,3.53182,3.48503
3,7.1303,6.9644,2.42506,,1.22592,8.35691,9.49053,13.5086,3.79811,18.3857,...,2.77394,3.75345,2.99566,5.60067,2.98934,2.36543,3.48024,3.33064,2.6765,2.63568
4,6.14795,5.98206,2.46595,1.22592,,7.243,8.50819,12.5263,2.80772,18.2722,...,3.75628,3.55647,3.17076,5.86361,3.3549,3.26526,3.8534,3.67272,3.65884,3.55436
5,3.85276,4.62162,7.86267,8.35691,7.243,,3.48907,5.56525,6.53169,21.0343,...,10.9547,8.50882,9.70692,11.6166,9.04192,10.4637,9.88445,9.07803,10.8573,10.7528
6,3.10528,3.93246,8.93264,9.49053,8.50819,3.48907,,4.02372,6.30318,20.637,...,12.2645,8.92729,10.5472,11.8127,10.0809,11.7734,10.9125,10.5388,12.167,12.0625
7,6.54338,7.31497,12.6533,13.5086,12.5263,5.56525,4.02372,,10.2824,23.0869,...,16.2826,12.9454,14.5652,15.5175,13.9195,15.7915,14.9306,14.4058,16.1851,16.0806
8,4.18375,3.5728,4.5055,3.79811,2.80772,6.53169,6.30318,10.2824,,17.1023,...,6.564,4.9423,4.8467,6.27832,4.8096,6.07298,5.30076,6.07719,6.46657,6.36208
9,18.8841,17.8055,20.723,18.3857,18.2722,21.0343,20.637,23.0869,17.1023,,...,19.9865,18.9134,18.2692,15.7782,17.7453,19.4955,18.6346,19.3406,19.8891,19.7846


In [112]:
min_index = np.nanargmin(distances[0])
min_value = np.nanmin(distances[0])
print(str(min_index) + " " + str(min_value))

41 1.5473443449213393


In [114]:
distances[0][41]

1.5473443449213393

In [339]:
# get the closest cluster for each cluster
# generates a Series with pointer lists
closest_clusters = pd.Series(index=range(length), dtype='object')
for i in distances.index:
    target_index = np.nanargmin(distances[i]).item()
    # only one value now, but we will add values later
    closest_clusters[i] = list()
    closest_clusters[i].append(target_index)

    
cluster_groups = closest_clusters
    
# generate initial groups by adding the index to the target
for i, group in cluster_groups.iteritems():
    # first value is the initial closest cluster
    target = group[0]
    cluster_groups[target].append(i)

# merge until there are only loners and groups with a pointer loop  
# a pointer loop is when two cluster point towards each other, even over multiple cluster between
finished = False 
while not finished:
    finished = True
    
    # merge dependencies
    for i, group in cluster_groups.iteritems():
        # ignore loners
        if len(group) > 1:
            # first value is the initial closest cluster
            target = group[0]
            # rest of the values are pointers added by dependent groups
            pointers = group[1:]
            try:
                # check whether this is a dependent group without a pointer loop
                if (target not in pointers):
                    # still dependent groups left, we need to iterate at least one more time
                    finished = False
                    # sanity check whether looping is required
                    if ((pointers is list) or (pointers is tuple)):
                        # multiple entries we can loop
                        for x in pointers:
                            if (x not in cluster_groups[target]):
                                cluster_groups[target].append(x)
                    elif len(pointers) > 0:
                        cluster_groups[target].append(pointers[0])
                    # dependent group is spent, create loner
                    cluster_groups[i] = list()
                    cluster_groups[i].append(target)
            except:
                print("shit's on fire, yo")
                print(str(i) + " " + str(group) + " " + str(target) + " " + str(pointers))

# clear loners
for i, group in cluster_groups.iteritems():
    if (len(group) <= 1):
        cluster_groups = cluster_groups.drop(i) 

# dress up the group list        
merged_groups = list()
for i, group in cluster_groups.iteritems():
    # replace target with own index
    temp = group
    temp[0] = i
    temp = sorted(temp)
    merged_groups.append(temp)
merged_groups = sorted(merged_groups)

# merge connected groups and remove duplicates
for i, group_a in enumerate(merged_groups):
    if group_a is not None:
        for k, group_b in enumerate(merged_groups):
            if k != i:
                for x in group_a:
                    if group_b is not None:
                        if x in set(group_b):
                            group_a = sorted(list(set(group_a).union(set(group_b))))
                            merged_groups[k] = None
clean = list(filter(lambda x: x is not None, merged_groups))

In [340]:
len(clean)

362

In [341]:
clean

[[0, 1, 41, 170, 180, 412, 1020, 1592],
 [2, 743, 1431],
 [3, 20, 84, 136, 137, 270, 283, 1149, 1390],
 [4, 69, 71, 120, 311, 359, 1501],
 [5, 101, 110, 379, 1228, 1333],
 [6, 103, 208, 210, 331, 634, 1237],
 [7, 29, 32, 33, 154, 165, 209, 982],
 [8, 104, 109, 115, 140, 168, 291, 355, 1209],
 [10, 126, 155, 156, 157, 158, 595],
 [11, 49, 322, 329, 348, 390, 456, 509, 632, 1235],
 [12, 148, 288, 297],
 [13, 14, 26, 45, 53, 54, 93, 199, 313, 316, 618, 1236],
 [15, 62, 102],
 [16, 50],
 [17, 627, 1385, 1692, 2078],
 [18, 48, 65, 159, 171, 176, 189, 432, 446, 1223],
 [19, 24, 42, 55, 141, 195, 197, 262, 275, 276, 354, 712, 1605],
 [21, 603],
 [22, 162, 203, 328, 334, 447, 608, 630, 1292],
 [25, 427, 619, 878, 1081],
 [27, 362, 599, 1026, 1347],
 [30, 307, 321, 431, 641],
 [31, 43, 79, 86, 268, 303, 314, 436, 1289],
 [34, 372, 398],
 [35, 47, 85, 284, 341, 458],
 [36, 88, 134, 263, 271, 286, 310, 332],
 [37, 207, 339, 344],
 [38, 884, 1698, 1908],
 [39, 254, 1099, 1135, 1358, 1386, 1623, 16