# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all package before excecuting a call
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import tailor
from tailor.clustering import *

In [3]:
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
246518,908447,Winter,Uller,dunkelgrau,Abteilung005,WHG023,WUG086,Mar,9,29.95,29.631667,0.318333,0.0,13.166667,2591.0,18.339123,389.26
152893,905249,Winter,Almaviahenae,mittelbraun,Abteilung005,WHG021,WUG069,Apr,9,69.95,69.95,0.0,0.0,3.6,629.0,16.248013,251.82
236842,908120,Spring,Beda,pink,Abteilung006,WHG032,WUG112,Aug,21,19.95,18.245,1.705,0.0,6.333333,3489.0,73.105952,116.203333
231475,907939,Winter,Almaviahenae,mittelgrau,Abteilung005,WHG021,WUG069,Feb,5,59.95,59.95,0.0,0.0,2.8,1189.0,5.399495,167.86
138096,904742,Summer,Fimmilena,schwarz / kombiniert,Abteilung007,WHG039,WUG150,Dec,22,89.95,89.95,0.0,0.0,2.0,1472.0,21.824049,179.9
12574,900451,Spring,Loki,schwarz / kombiniert,Abteilung001,WHG002,WUG003,Sep,20,15.95,15.95,0.0,0.0,2.0,3868.0,71.923475,31.9
43464,901511,Fall,Friagabis,bordeauxrot,Abteilung002,WHG015,WUG051,Dec,9,79.95,50.07,9.88,20.0,2.333333,578.0,28.085352,118.236667
211653,907241,Fall,Sigyn,mittelgrau,Abteilung006,WHG036,WUG119,Dec,7,29.95,29.58,0.37,0.0,12.142857,1792.0,35.419324,357.832857
154274,905296,Winter,Almaviahenae,dunkelblau,Abteilung005,WHG021,WUG072,May,11,59.95,39.686667,10.263333,10.0,20.666667,2781.0,19.585281,819.52
160420,905501,Spring,Kolga,schwarz / kombiniert,Abteilung006,WHG032,WUG109,Jun,10,19.95,19.562857,0.387143,0.0,9.0,3440.0,41.100498,176.901429


# Code

In [5]:
%%time

split_results = cluster.multi_feature_split(data, distance.euclidean, 50)

CPU times: user 2min 48s, sys: 2.36 s, total: 2min 50s
Wall time: 2min 54s


In [6]:
split_results['Clusters'].index

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [7]:
split_results['Clusters']['5'][0]['Features']

Abteilung    Abteilung005
WHG                WHG021
brand           Fimmilena
WUG                WUG073
season             Spring
dtype: object

In [37]:
def get_cluster_parent_name(cluster):
    name = cluster['Name']
    # remove last character until name is the parent cluster's name
    terminate = False
    while not terminate:
        character = name[-1:]
        if ((character == "_") or (character == "0") or (character == "")):
            terminate = True
        name = name[:-1]
    return name

In [38]:
%%time

# get all clusters that remained unsplit
leafs = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for add_cluster in split_results['Clusters'][layer]:
        check_name = get_cluster_parent_name(add_cluster)
        # iterate until parent cluster is found then remove it
        for index, check_cluster in enumerate(leafs):
            if check_cluster['Name'] == check_name:
                # parent cluster found, remove it
                del leafs[index]
                # no more than one parent cluster, therefore exit second for loop
                break
        leafs.append(add_cluster)

CPU times: user 35.2 s, sys: 31.2 ms, total: 35.3 s
Wall time: 35.8 s


In [39]:
len(leafs)

2152

In [29]:
%%time

names = list()

for cluster in leafs:
    name = cluster['Name']
    # remove last character until name is the parent cluster's name
    found_underscore = False
    while not found_underscore:
        character = name[-1:]
        if character == "_":
            found_underscore = True
        name = name[:-1]
    names.append(name)
# only save uniques by converting to set 
names = set(names)

CPU times: user 46.9 ms, sys: 0 ns, total: 46.9 ms
Wall time: 33.7 ms


In [40]:
names

{'0_1_1',
 '0_1_1_1',
 '0_1_1_10',
 '0_1_1_10_2',
 '0_1_1_1_1',
 '0_1_1_2',
 '0_1_1_4',
 '0_1_1_5',
 '0_1_1_8',
 '0_1_1_8_1',
 '0_1_2',
 '0_1_2_2',
 '0_1_2_3',
 '0_1_3',
 '0_1_5',
 '0_1_6',
 '0_1_6_2',
 '0_1_6_3',
 '0_1_7',
 '0_2_1_1',
 '0_2_1_2',
 '0_2_1_4',
 '0_2_1_4_4',
 '0_2_2',
 '0_2_2_1',
 '0_2_2_1_2',
 '0_2_2_3',
 '0_2_2_3_1',
 '0_2_3',
 '0_2_3_1',
 '0_2_3_2',
 '0_2_3_3',
 '0_2_3_4',
 '0_2_3_5',
 '0_2_3_5_3',
 '0_2_3_5_8',
 '0_2_3_5_9',
 '0_3_3',
 '0_3_3_1',
 '0_4',
 '0_4_10_1',
 '0_4_10_2',
 '0_4_10_3',
 '0_4_10_4',
 '0_4_11',
 '0_4_11_3',
 '0_4_12',
 '0_4_12_2',
 '0_4_12_4',
 '0_4_13',
 '0_4_14_1',
 '0_4_14_10',
 '0_4_14_11',
 '0_4_14_12',
 '0_4_14_2',
 '0_4_14_3',
 '0_4_14_4',
 '0_4_14_5',
 '0_4_14_6',
 '0_4_14_7',
 '0_4_14_8',
 '0_4_14_9',
 '0_4_15',
 '0_4_2',
 '0_4_2_1',
 '0_4_2_3',
 '0_4_2_4',
 '0_4_3',
 '0_4_3_2',
 '0_4_3_4',
 '0_4_3_4_1',
 '0_4_3_4_1_1',
 '0_4_3_4_1_2',
 '0_4_3_4_1_3',
 '0_4_3_4_1_4',
 '0_4_3_4_1_5',
 '0_4_3_4_1_7',
 '0_4_4',
 '0_4_4_2',
 '0_4_5_1',
 '0_

In [41]:
len(names)

246

In [42]:
%%time

# get all clusters that are above min_cluster_size
parents = list()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    # add all layer leaves and remove leaf parents
    for cluster in split_results['Clusters'][layer]:
        if cluster['Name'] in names:
            parents.append(cluster)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 36.3 ms


In [43]:
len(parents)

246

In [45]:
%%time

length = len(parents)
distances = np.ndarray(shape=(length,length))

for i, a in enumerate(parents):
    a_curve = a['DataFrame'].set_index('time_on_sale')
    for k, b in enumerate(parents):
        if k <= i:
            continue
        b_curve = b['DataFrame'].set_index('time_on_sale')
        d = distance.euclidean(a_curve['article_count'], b_curve['article_count'])
        distances[i][k] = d
        distances[k][i] = d 

CPU times: user 2min 43s, sys: 2min 23s, total: 5min 6s
Wall time: 5min 11s


In [46]:
distances

array([[6.93846042e-310, 1.02448231e+001, 6.80939050e+000, ...,
        7.98798962e+000, 9.48311156e+000, 8.79075366e+000],
       [1.02448231e+001, 2.33419537e-313, 9.63942160e+000, ...,
        1.09801628e+001, 1.12266351e+001, 9.72084027e+000],
       [6.80939050e+000, 9.63942160e+000, 0.00000000e+000, ...,
        6.07047095e+000, 7.41716720e+000, 6.71621855e+000],
       ...,
       [7.98798962e+000, 1.09801628e+001, 6.07047095e+000, ...,
        5.92878775e-323, 8.56393049e+000, 9.61645217e+000],
       [9.48311156e+000, 1.12266351e+001, 7.41716720e+000, ...,
        8.56393049e+000, 9.88131292e-323, 9.71919102e+000],
       [8.79075366e+000, 9.72084027e+000, 6.71621855e+000, ...,
        9.61645217e+000, 9.71919102e+000, 4.94065646e-324]])