# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all package before excecuting a call
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import tailor
from tailor.clustering import *
from tailor.visualization import *

In [3]:
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
188163,906451,Winter,Gautr,schwarz,Abteilung002,WHG006,WUG016,May,17,49.95,49.815,0.135,0.0,17.666667,3731.0,45.376575,881.156667
40177,901395,Summer,Sunuxsal,pink,Abteilung004,WHG028,WUG102,Nov,13,19.95,19.314,0.636,0.0,2.2,1117.0,16.204118,42.532
197371,906760,Summer,Mercurius Arvernus,dunkelbraun,Abteilung002,WHG015,WUG051,Nov,14,79.95,78.43,1.52,0.0,9.5,2649.0,37.013968,748.06
66276,902290,Winter,Hel,schwarz,Abteilung005,WHG021,WUG069,Apr,15,39.95,39.55,0.4,0.0,19.25,4685.0,16.195304,760.905
70176,902420,Winter,Nersihenae,mittelblau,Abteilung007,WHG040,WUG135,Jun,17,29.95,28.328333,1.621667,0.0,4.0,1603.0,34.63298,113.171667
79738,902743,Spring,Heimdall,dunkelblau,Abteilung002,WHG012,WUG040,Jul,17,79.95,59.95,0.0,20.0,3.0,1098.0,33.934426,179.85
108392,903727,Summer,Alaisiagae,hellgrau,Abteilung007,WHG041,WUG136,Jul,0,49.95,49.721667,0.228333,0.0,8.0,4963.0,0.433206,397.196667
165675,905676,Winter,Hel,schwarz,Abteilung005,WHG021,WUG069,Jan,2,29.95,29.95,0.0,0.0,1.75,979.0,1.966292,52.4125
79439,902733,Winter,Heimdall,weiss,Abteilung002,WHG012,WUG045,Jun,15,89.95,84.0825,0.8675,5.0,1.25,561.0,29.099822,106.57
3338,900113,Spring,Gersimi,schwarz / kombiniert,Abteilung001,WHG002,WUG003,Sep,21,9.95,9.95,0.0,0.0,2.8,4870.0,90.948665,27.86


# Code

In [5]:
feats = ['color', 'brand', 'Abteilung', 'WHG', 'WUG', 'season']
ranking.rank_features(data, distance.euclidean , feats, 'article_count').index[0]

'Abteilung'

In [6]:
min_cluster_size = 10

split_number = 0
split_possible = True

# this will contain the whole hierarchical top-down clustering
split_results = pd.Series()
# this will contain all the clusters in an array with the split_number as index
split_results['Clusters'] = pd.Series()
# this will contain all the clusters' split features in an array with the split_number as index
split_results['Features'] = pd.Series()

# this is the data structure used for all clusters
first_cluster = pd.Series()
# this only contains the cluster's articles, all split clusteres will use splines of this
first_cluster['DataFrame'] = data.copy()
# this contains the features and characteristics used for the cluster
first_cluster['Features'] = pd.Series()
# the name will be defined in a manner that the hierarchy of the clustering will become clear
first_cluster['Name'] = "0"


In [7]:
# initializing the 0 split
# adding the base cluster
split_results['Clusters'][str(split_number)] = list()
split_results['Clusters'][str(split_number)].append(first_cluster)

# determining the feature the cluster should be split by
split_feature = ranking.rank_features(first_cluster['DataFrame'], distance.euclidean , feats, 'article_count').index[0]
# the split_feature is entered in the unsplit layer
split_results['Features'][str(split_number)] = list()
split_results['Features'][str(split_number)].append(split_feature)

In [8]:
%%time

while (split_possible):
    split_possible = False
    for position, cluster in enumerate(split_results['Clusters'][str(split_number)]):
        if (cluster['DataFrame']['article_id'].nunique() > min_cluster_size):
            if (split_possible == False):
                split_possible = True
            # retrieving the feature to split the cluster
            split_feature = split_results['Features'][str(split_number)][position]
            # retrieving the values the cluster will be split into
            feature_uniques = cluster['DataFrame'][split_feature].unique()
            
            df = cluster['DataFrame']
            # generating the new split layer
            new_layer = split_number + 1
            split_results['Clusters'][str(new_layer)] = list()
            split_results['Features'][str(new_layer)] = list()
            
            for position, characteristic in enumerate(feature_uniques):
                # create new cluster
                new_cluster = pd.Series()
                # select the relevant part of the dataframe
                new_cluster['DataFrame'] = df[df[split_feature] == characteristic].drop(columns=[split_feature])
                # copy the features from the parent cluster
                new_cluster['Features'] = cluster['Features'].copy()
                # add the split feature to it
                new_cluster['Features'][split_feature] = characteristic
                # name the cluster
                new_cluster['Name'] = cluster['Name'] + "_" + str(position + 1)
                
                # retrieve the features relevant for clustering
                rank_features = new_cluster['DataFrame'].select_dtypes(include=['category']).drop(columns=['article_id']).columns.values
                # determine the feature the new cluster will be split by
                new_split_feature = ranking.rank_features(new_cluster['DataFrame'], distance.euclidean , rank_features, 'article_count').index[0]
                
                # add the cluster to the split_results
                split_results['Clusters'][str(new_layer)].append(new_cluster)
                split_results['Features'][str(new_layer)].append(new_split_feature)
            
            
    split_number += 1

CPU times: user 33.1 s, sys: 203 ms, total: 33.3 s
Wall time: 34 s


In [9]:
print(split_results['Clusters']['0'])

[DataFrame           article_id  season         brand       ...
Features                            Series([], dtype: float64)
Name                                                         0
dtype: object]


In [10]:
print(split_results['Features']['1'])

['WHG', 'WHG', 'WUG', 'brand', 'WUG', 'brand', 'month']


In [11]:
print(split_results['Clusters'].index)

Index(['0', '1', '2', '3', '4'], dtype='object')


In [12]:
print(split_results['Clusters']['4'])

[DataFrame           article_id  season       color     WUG ...
Features     Abteilung    Abteilung006
month               ...
Name                                                0_7_12_7_1
dtype: object, DataFrame           article_id  season       color     WUG ...
Features     Abteilung    Abteilung006
month               ...
Name                                                0_7_12_7_2
dtype: object]


In [13]:
print(split_results['Clusters']['4'][0]['Features'])

Abteilung    Abteilung006
month                 Jan
WHG                WHG030
brand           Travalaha
dtype: object


In [14]:
split_results['Clusters']['3'][0]['DataFrame']

Unnamed: 0,article_id,season,brand,color,WUG,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
18380,900645,Summer,Mercurius Arvernus,mittelbraun,WUG132,19,59.95,38.415000,21.535000,0.000000,2.250000,1554.0,82.898970,91.667500
18381,900645,Summer,Mercurius Arvernus,mittelbraun,WUG132,20,59.95,35.667500,15.282500,9.000000,4.500000,1554.0,83.735521,160.792500
18382,900645,Summer,Mercurius Arvernus,mittelbraun,WUG132,21,59.95,45.552000,2.398000,12.000000,2.000000,1554.0,84.478764,91.104000
18383,900645,Summer,Mercurius Arvernus,mittelbraun,WUG132,22,59.95,41.522000,6.428000,12.000000,1.400000,1554.0,84.980695,60.702000
18384,900645,Summer,Mercurius Arvernus,mittelbraun,WUG132,23,59.95,38.343333,7.606667,14.000000,2.666667,1554.0,85.435435,107.130000
18441,900647,Summer,Mercurius Arvernus,schwarz,WUG132,18,65.95,55.020000,4.930000,6.000000,8.000000,1524.0,60.301837,440.160000
18442,900647,Summer,Mercurius Arvernus,schwarz,WUG132,19,65.95,41.650000,14.300000,10.000000,4.500000,1524.0,61.472003,193.795000
18443,900647,Summer,Mercurius Arvernus,schwarz,WUG132,20,65.95,43.660000,4.290000,18.000000,4.500000,1524.0,62.959318,188.540000
18444,900647,Summer,Mercurius Arvernus,schwarz,WUG132,21,65.95,48.763333,-0.813333,18.000000,2.000000,1524.0,63.801400,93.526667
18445,900647,Summer,Mercurius Arvernus,schwarz,WUG132,22,65.95,45.954000,1.196000,18.800000,4.400000,1524.0,64.737533,198.662000
