# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all package before excecuting a call
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import multiprocessing as mp

import tailor
from tailor.clustering import *

In [3]:
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
107143,903684,Summer,Heimdall,rosa,Abteilung006,WHG035,WUG118,Sep,8,54.95,48.54,6.41,0.0,2.5,524.0,15.314885,120.8925
241401,908273,Winter,Fimmilena,schwarz,Abteilung005,WHG021,WUG073,Jun,22,69.95,68.69,1.26,0.0,4.2,3669.0,29.844644,287.49
196873,906744,Summer,Mani,schwarz,Abteilung002,WHG010,WUG033,Sep,3,79.95,78.458571,1.491429,0.0,3.714286,2808.0,2.126577,295.025714
94673,903250,Spring,Mercurius Arvernus,dunkelblau,Abteilung002,WHG012,WUG045,Mar,0,39.95,37.286667,2.663333,0.0,1.333333,262.0,0.890585,50.603333
185005,906337,Spring,Mercurius Arvernus,gelb,Abteilung002,WHG007,WUG020,May,4,39.95,39.9225,0.0275,0.0,1.25,428.0,7.885514,49.8825
134431,904613,Fall,Baudihillia,hellbraun,Abteilung006,WHG038,WUG132,Jan,19,59.95,44.195,5.755,10.0,5.666667,3176.0,60.201511,256.315
144682,904975,Fall,Tuisto,dunkelblau,Abteilung002,WHG006,WUG015,Nov,4,39.95,39.131667,0.818333,0.0,2.333333,1006.0,3.628231,91.276667
26061,900908,Summer,Freyr,schwarz,Abteilung002,WHG008,WUG022,Sep,0,99.99,99.99,0.0,0.0,3.5,2468.0,1.094003,334.965
140047,904810,Summer,Mercurius Arvernus,dunkelblau,Abteilung002,WHG006,WUG015,Dec,18,39.95,38.95,1.0,0.0,3.0,3819.0,36.842105,117.85
125672,904306,Summer,Tyr,schwarz / kombiniert,Abteilung007,WHG042,WUG138,Oct,10,49.95,49.7,0.25,0.0,5.166667,1939.0,27.737665,257.575


# Code

In [5]:
%%time

split_results = cluster.multi_feature_split(data, distance.euclidean, 50)

CPU times: user 2min 38s, sys: 1.45 s, total: 2min 40s
Wall time: 2min 42s


In [6]:
split_results['Clusters'].index

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [7]:
split_results['Clusters']['5'][0]['Features']

Abteilung    Abteilung005
WHG                WHG021
brand           Fimmilena
WUG                WUG073
season             Spring
dtype: object

In [8]:
def get_cluster_parent_name(cluster):
    name = cluster['Name']
    # remove last character until name is the parent cluster's name
    terminate = False
    while not terminate:
        character = name[-1:]
        if ((character == "_") or (character == "")):
            terminate = True
        name = name[:-1]
    return name

In [9]:
%%time

# get all clusters that remained unsplit
leafs = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for add_cluster in split_results['Clusters'][layer]:
        check_name = get_cluster_parent_name(add_cluster)
        # iterate until parent cluster is found then remove it
        for index, check_cluster in enumerate(leafs):
            if check_cluster['Name'] == check_name:
                # parent cluster found, remove it
                del leafs[index]
                # no more than one parent cluster, therefore exit second for loop
                break
        leafs.append(add_cluster)

CPU times: user 34.4 s, sys: 15.6 ms, total: 34.4 s
Wall time: 34.7 s


In [10]:
len(leafs)

2280

In [11]:
%%time

names = list()

for cluster in leafs:
    name = cluster['Name']
    names.append(name)
# sort by underscore count
names.sort(key = lambda s: s.count("_"), reverse=True)

CPU times: user 46.9 ms, sys: 0 ns, total: 46.9 ms
Wall time: 32 ms


In [12]:
names

['0_4_5_1_2_1_1_1',
 '0_4_5_1_2_1_1_2',
 '0_4_5_1_2_1_1_3',
 '0_4_5_1_2_1_1_4',
 '0_4_5_1_2_1_1_5',
 '0_4_5_1_2_1_1_6',
 '0_4_5_1_2_1_1_7',
 '0_4_5_1_2_1_1_8',
 '0_4_5_1_2_1_1_9',
 '0_4_5_1_2_1_1_10',
 '0_4_5_1_2_1_1_11',
 '0_4_5_1_2_1_1_12',
 '0_4_5_1_2_1_1_13',
 '0_4_5_1_2_1_1_14',
 '0_4_5_1_2_1_1_15',
 '0_4_5_1_2_1_1_16',
 '0_4_5_1_2_2_1_1',
 '0_4_5_1_2_2_1_2',
 '0_4_5_1_2_2_1_3',
 '0_4_5_1_2_2_1_4',
 '0_4_5_1_2_2_1_5',
 '0_4_5_1_2_2_1_6',
 '0_4_5_1_2_2_1_7',
 '0_4_5_1_2_2_1_8',
 '0_4_5_1_2_2_1_9',
 '0_4_5_1_2_2_1_10',
 '0_4_5_1_2_2_1_11',
 '0_4_5_1_2_2_1_12',
 '0_4_5_1_2_2_1_13',
 '0_4_5_1_2_2_1_14',
 '0_4_5_1_2_2_1_15',
 '0_4_5_1_2_2_1_16',
 '0_4_5_1_2_2_1_17',
 '0_4_5_1_2_3_1_1',
 '0_4_5_1_2_3_1_2',
 '0_4_5_1_2_3_1_3',
 '0_4_5_1_2_3_1_4',
 '0_4_5_1_2_3_1_5',
 '0_4_5_1_2_3_1_6',
 '0_4_5_1_2_3_1_7',
 '0_4_5_1_2_3_1_8',
 '0_4_5_1_2_3_1_9',
 '0_4_5_1_2_3_1_10',
 '0_4_5_1_2_3_1_11',
 '0_4_5_1_2_3_1_12',
 '0_4_5_1_2_3_1_13',
 '0_4_5_1_2_3_1_14',
 '0_4_5_1_2_3_1_15',
 '0_4_5_1_2_3_1_16'

In [13]:
len(names)

2280

In [14]:
%%time

# get all clusters that are above min_cluster_size
parents = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for cluster in split_results['Clusters'][layer]:
        if cluster['Name'] in names:
            parents.append(cluster)

CPU times: user 109 ms, sys: 0 ns, total: 109 ms
Wall time: 97 ms


In [15]:
len(parents)

2280

In [86]:
%%time

length = len(parents)
distances = pd.DataFrame(index=range(length),columns=range(length))
targets = list()

# dress the clusters for better distance performance
for i, cluster in enumerate(parents):
    # only select the distance relevant slice of the Dataframe
    target = cluster['DataFrame'].groupby(['time_on_sale']).mean()['article_count']
    if (len(target) < 26):
        # fill with 0 for better performance later on
        target = target.reindex(pd.RangeIndex(26)).fillna(0)
    targets.append(target)

CPU times: user 4.72 s, sys: 46.9 ms, total: 4.77 s
Wall time: 4.77 s


In [98]:
%%time
length = len(targets)
for i, a in enumerate(targets):
    for k, b in enumerate(reversed(targets)):
        j = length - 1 - k
        if j <= i:
            break
        else:
            try:
                d = distance.euclidean(a.values,b.values)
                distances[i][j] = d
                distances[j][i] = d
            except:
                print(str(i) + " " + str(k))

CPU times: user 8min 45s, sys: 1.27 s, total: 8min 46s
Wall time: 8min 47s


In [99]:
distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2270,2271,2272,2273,2274,2275,2276,2277,2278,2279
0,,9.53641,3.48503,2.63568,3.55436,10.7528,12.0625,16.0806,6.36208,19.7846,...,9.90423,7.21291,8.23462,9.92501,7.651,9.41321,8.67895,8.38961,9.8068,9.70231
1,9.53641,,3.53182,2.6765,3.65884,10.8573,12.167,16.1851,6.46657,19.8891,...,9.73834,6.96664,8.02103,8.33874,7.37754,9.24731,8.38641,8.35181,9.6409,9.53641
2,3.48503,3.53182,,3.33064,3.67272,9.07803,10.5388,14.4058,6.07719,19.3406,...,3.62926,5.00225,4.92029,7.2767,4.78076,3.8767,4.89144,4.34355,3.53182,3.48503
3,2.63568,2.6765,3.33064,,3.8534,9.88445,10.9125,14.9306,5.30076,18.6346,...,2.77394,3.75345,2.99566,5.60067,2.98934,2.36543,3.48024,3.33064,2.6765,2.63568
4,3.55436,3.65884,3.67272,3.8534,,10.4637,11.7734,15.7915,6.07298,19.4955,...,3.75628,3.55647,3.17076,5.86361,3.3549,3.26526,3.8534,3.67272,3.65884,3.55436
5,10.7528,10.8573,9.07803,9.88445,10.4637,,10.0809,13.9195,4.8096,17.7453,...,10.9547,8.50882,9.70692,11.6166,9.04192,10.4637,9.88445,9.07803,10.8573,10.7528
6,12.0625,12.167,10.5388,10.9125,11.7734,10.0809,,15.5175,6.27832,15.7782,...,12.2645,8.92729,10.5472,11.8127,10.0809,11.7734,10.9125,10.5388,12.167,12.0625
7,16.0806,16.1851,14.4058,14.9306,15.7915,13.9195,15.5175,,4.8467,18.2692,...,16.2826,12.9454,14.5652,15.5175,13.9195,15.7915,14.9306,14.4058,16.1851,16.0806
8,6.36208,6.46657,6.07719,5.30076,6.07298,4.8096,6.27832,4.8467,,18.9134,...,6.564,4.9423,4.8467,6.27832,4.8096,6.07298,5.30076,6.07719,6.46657,6.36208
9,19.7846,19.8891,19.3406,18.6346,19.4955,17.7453,15.7782,18.2692,18.9134,,...,19.9865,18.9134,18.2692,15.7782,17.7453,19.4955,18.6346,19.3406,19.8891,19.7846
