# Initialization

In [1]:
# Display plots inline
%matplotlib inline

# Autoreload all packages before executing a call
%load_ext autoreload
%autoreload 2

In [22]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import itertools

import tailor
from tailor.clustering import *

In [3]:
# Load the data set through the project's `tailor` package
# (where the data comes from is decided inside tailor.load_data, not visible here).
data = tailor.load_data()

In [4]:
data.sample(10)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
201217,906890,Winter,Siofna,dunkelbraun,Abteilung005,WHG021,WUG072,Feb,5,74.95,74.95,0.0,0.0,2.333333,243.0,11.38546,174.883333
49787,901734,Fall,Snotra,mittelbraun,Abteilung005,WHG025,WUG089,Oct,0,119.95,118.075,1.875,0.0,1.5,207.0,1.449275,177.05
56644,901967,Winter,Friagabis,schwarz,Abteilung002,WHG012,WUG040,Apr,11,39.95,39.74,0.21,0.0,15.4,3393.0,21.349838,612.326
133758,904589,Fall,Freyr,schwarz,Abteilung002,WHG009,WUG029,Jan,17,59.99,48.284,1.666,10.04,1.6,564.0,43.156028,74.922
67731,902338,Spring,Heimdall,weiss,Abteilung006,WHG038,WUG125,Jun,14,49.95,49.95,0.0,0.0,1.0,614.0,42.42671,49.95
101778,903498,Spring,Friagabis,schwarz,Abteilung002,WHG009,WUG027,Jun,13,69.95,69.95,0.0,0.0,1.0,164.0,54.268293,69.95
22194,900774,Summer,Snotra,schwarz,Abteilung003,WHG016,WUG057,Sep,2,69.95,69.45,0.5,0.0,3.285714,1624.0,5.902533,227.835714
23063,900804,Summer,Burorina,dunkelbraun,Abteilung006,WHG038,WUG132,Dec,18,59.95,46.258333,3.691667,10.0,3.666667,1915.0,54.534378,173.483333
5403,900183,Spring,Freyr,weiss,Abteilung002,WHG009,WUG029,Jun,11,69.99,49.95,0.0,20.04,2.333333,224.0,32.440476,116.55
158070,905422,Winter,Friagabis,schwarz,Abteilung002,WHG012,WUG045,Jul,21,79.95,67.955,11.995,0.0,1.0,535.0,72.056075,67.955


# Code

In [5]:
%%time

# Run tailor's multi-feature split on `data`, using the Euclidean distance.
# `cluster` and `distance` are not defined in this notebook; presumably they are
# provided by `from tailor.clustering import *`.
# NOTE(review): 50 presumably is the minimum cluster size (the same threshold is
# used further down when groups are flagged as too small) — confirm against tailor.
split_results = cluster.multi_feature_split(data, distance.euclidean, 50)

CPU times: user 2min 37s, sys: 1.38 s, total: 2min 39s
Wall time: 2min 39s


In [6]:
split_results['Clusters'].index

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [7]:
split_results['Clusters']['5'][0]['Features']

Abteilung    Abteilung005
WHG                WHG021
brand           Fimmilena
WUG                WUG073
season             Spring
dtype: object

In [8]:
def get_cluster_parent_name(cluster):
    """Return the name of the parent of ``cluster``.

    The name stored under ``cluster['Name']`` is an underscore-separated path
    (e.g. ``'0_4_5'``); the parent name is everything in front of the last
    underscore (``'0_4'``). A name that contains no underscore yields ``''``.
    """
    # rpartition splits at the LAST underscore; the head is the parent's name
    parent_name, _separator, _own_part = cluster['Name'].rpartition("_")
    return parent_name

In [9]:
%%time

# get all clusters that remained unsplit (the leaves of the clustering)
#
# The leaves are tracked in a dict keyed by cluster name, so the parent of a
# cluster is found with one lookup instead of a scan over all leaves found so far.
# Dicts keep insertion order, so the resulting list has the same order as a
# list that is appended to and deleted from.
# Assumes that cluster names are unique across all layers.
leaf_by_name = dict()

# iterate through all layers of the clustering
for layer in split_results['Clusters'].index:
    for add_cluster in split_results['Clusters'][layer]:
        # a cluster that has a child is no leaf: remove the parent if it is still listed
        leaf_by_name.pop(get_cluster_parent_name(add_cluster), None)
        # the cluster itself is a leaf until one of its own children shows up
        leaf_by_name[add_cluster['Name']] = add_cluster

leafs = list(leaf_by_name.values())

CPU times: user 34.2 s, sys: 0 ns, total: 34.2 s
Wall time: 34.2 s


In [10]:
len(leafs)

2280

In [11]:
%%time

# Names of all leaf clusters, sorted by underscore count (most underscores first).
# The sort is stable, so names with the same count keep the order of `leafs`.
#
# The iteration variable is deliberately not called `cluster`: that name is used
# as `cluster.multi_feature_split` earlier in the notebook, and rebinding it at
# module level would break re-running that cell.
names = sorted(
    (leaf['Name'] for leaf in leafs),
    key=lambda leaf_name: leaf_name.count("_"),
    reverse=True,
)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 36.5 ms


In [12]:
names

['0_4_5_1_2_1_1_1',
 '0_4_5_1_2_1_1_2',
 '0_4_5_1_2_1_1_3',
 '0_4_5_1_2_1_1_4',
 '0_4_5_1_2_1_1_5',
 '0_4_5_1_2_1_1_6',
 '0_4_5_1_2_1_1_7',
 '0_4_5_1_2_1_1_8',
 '0_4_5_1_2_1_1_9',
 '0_4_5_1_2_1_1_10',
 '0_4_5_1_2_1_1_11',
 '0_4_5_1_2_1_1_12',
 '0_4_5_1_2_1_1_13',
 '0_4_5_1_2_1_1_14',
 '0_4_5_1_2_1_1_15',
 '0_4_5_1_2_1_1_16',
 '0_4_5_1_2_2_1_1',
 '0_4_5_1_2_2_1_2',
 '0_4_5_1_2_2_1_3',
 '0_4_5_1_2_2_1_4',
 '0_4_5_1_2_2_1_5',
 '0_4_5_1_2_2_1_6',
 '0_4_5_1_2_2_1_7',
 '0_4_5_1_2_2_1_8',
 '0_4_5_1_2_2_1_9',
 '0_4_5_1_2_2_1_10',
 '0_4_5_1_2_2_1_11',
 '0_4_5_1_2_2_1_12',
 '0_4_5_1_2_2_1_13',
 '0_4_5_1_2_2_1_14',
 '0_4_5_1_2_2_1_15',
 '0_4_5_1_2_2_1_16',
 '0_4_5_1_2_2_1_17',
 '0_4_5_1_2_3_1_1',
 '0_4_5_1_2_3_1_2',
 '0_4_5_1_2_3_1_3',
 '0_4_5_1_2_3_1_4',
 '0_4_5_1_2_3_1_5',
 '0_4_5_1_2_3_1_6',
 '0_4_5_1_2_3_1_7',
 '0_4_5_1_2_3_1_8',
 '0_4_5_1_2_3_1_9',
 '0_4_5_1_2_3_1_10',
 '0_4_5_1_2_3_1_11',
 '0_4_5_1_2_3_1_12',
 '0_4_5_1_2_3_1_13',
 '0_4_5_1_2_3_1_14',
 '0_4_5_1_2_3_1_15',
 '0_4_5_1_2_3_1_16'

In [13]:
len(names)

2280

In [14]:
%%time

# get all clusters based on the name
# a set makes every membership test a single hash lookup
leaf_names = set(names)

# Walk through all layers of the clustering and keep the clusters whose name is
# listed in `names`. The iteration variable is not called `cluster` so that the
# `cluster` name used by the split cell is not rebound.
clusters = [
    candidate
    for layer in split_results['Clusters'].index
    for candidate in split_results['Clusters'][layer]
    if candidate['Name'] in leaf_names
]

CPU times: user 109 ms, sys: 0 ns, total: 109 ms
Wall time: 98.8 ms


In [15]:
len(clusters)

2280

In [16]:
%%time

# number of time_on_sale periods every curve is aligned to (index 0..25)
N_PERIODS = 26

length = len(clusters)
distances = pd.DataFrame(index=range(length), columns=range(length))
targets = list()

# dress the clusters for better distance performance
for leaf in clusters:
    # Mean article_count per time_on_sale. The column is selected BEFORE
    # aggregating, so the other columns are neither averaged nor able to make
    # the aggregation fail.
    target = leaf['DataFrame'].groupby('time_on_sale')['article_count'].mean()
    # Always align to the index 0..N_PERIODS-1 and fill missing periods with 0.
    # This guarantees that all comparison arrays have the same length AND that
    # position k always means time_on_sale == k, even for a curve that has
    # N_PERIODS entries but a gap in between. Periods >= N_PERIODS are dropped.
    target = target.reindex(pd.RangeIndex(N_PERIODS)).fillna(0)
    targets.append(target)

CPU times: user 4.75 s, sys: 62.5 ms, total: 4.81 s
Wall time: 4.83 s


In [17]:
%%time
# Pairwise distances between all curves in `targets`.
# The values are collected in a NumPy matrix and wrapped in a DataFrame once at
# the end; writing single cells through chained indexing (`distances[i][j] = d`)
# is slow and is not guaranteed to write into the frame.
length = len(targets)
curves = [target.values for target in targets]
distance_matrix = np.full((length, length), np.nan)

# the distance is stored symmetrically, so every unordered pair is computed once
for i, j in itertools.combinations(range(length), 2):
    try:
        d = distance.euclidean(curves[i], curves[j])
        distance_matrix[i, j] = d
        distance_matrix[j, i] = d
    except Exception as error:
        # best effort: the entry stays NaN, but the failing pair and the reason
        # are reported; KeyboardInterrupt is not swallowed, so the cell can
        # still be interrupted
        print(str(i) + " " + str(j) + " " + repr(error))

# the diagonal stays NaN, so np.nanargmin on a column never selects the curve itself
distances = pd.DataFrame(distance_matrix, index=range(length), columns=range(length))

CPU times: user 9min 52s, sys: 11.6 s, total: 10min 3s
Wall time: 10min 1s


In [18]:
distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2270,2271,2272,2273,2274,2275,2276,2277,2278,2279
0,,2.03832,6.69048,7.1303,6.14795,3.85276,3.10528,6.54338,4.18375,18.8841,...,9.90423,7.21291,8.23462,9.92501,7.651,9.41321,8.67895,8.38961,9.8068,9.70231
1,2.03832,,6.76835,6.9644,5.98206,4.62162,3.93246,7.31497,3.5728,17.8055,...,9.73834,6.96664,8.02103,8.33874,7.37754,9.24731,8.38641,8.35181,9.6409,9.53641
2,6.69048,6.76835,,2.42506,2.46595,7.86267,8.93264,12.6533,4.5055,20.723,...,3.62926,5.00225,4.92029,7.2767,4.78076,3.8767,4.89144,4.34355,3.53182,3.48503
3,7.1303,6.9644,2.42506,,1.22592,8.35691,9.49053,13.5086,3.79811,18.3857,...,2.77394,3.75345,2.99566,5.60067,2.98934,2.36543,3.48024,3.33064,2.6765,2.63568
4,6.14795,5.98206,2.46595,1.22592,,7.243,8.50819,12.5263,2.80772,18.2722,...,3.75628,3.55647,3.17076,5.86361,3.3549,3.26526,3.8534,3.67272,3.65884,3.55436
5,3.85276,4.62162,7.86267,8.35691,7.243,,3.48907,5.56525,6.53169,21.0343,...,10.9547,8.50882,9.70692,11.6166,9.04192,10.4637,9.88445,9.07803,10.8573,10.7528
6,3.10528,3.93246,8.93264,9.49053,8.50819,3.48907,,4.02372,6.30318,20.637,...,12.2645,8.92729,10.5472,11.8127,10.0809,11.7734,10.9125,10.5388,12.167,12.0625
7,6.54338,7.31497,12.6533,13.5086,12.5263,5.56525,4.02372,,10.2824,23.0869,...,16.2826,12.9454,14.5652,15.5175,13.9195,15.7915,14.9306,14.4058,16.1851,16.0806
8,4.18375,3.5728,4.5055,3.79811,2.80772,6.53169,6.30318,10.2824,,17.1023,...,6.564,4.9423,4.8467,6.27832,4.8096,6.07298,5.30076,6.07719,6.46657,6.36208
9,18.8841,17.8055,20.723,18.3857,18.2722,21.0343,20.637,23.0869,17.1023,,...,19.9865,18.9134,18.2692,15.7782,17.7453,19.4955,18.6346,19.3406,19.8891,19.7846


In [19]:
# Position and value of the smallest non-NaN distance in column 0 of `distances`
# (np.nanargmin / np.nanmin skip the NaN entries).
min_index = np.nanargmin(distances[0])
min_value = np.nanmin(distances[0])
print(str(min_index) + " " + str(min_value))

41 1.5473443449213393


In [20]:
distances[0][41]

1.5473443449213393

In [23]:
def connected_groups(edges):
    """Group the nodes of an undirected graph into its connected components.

    Parameters
    ----------
    edges : iterable of (node, node) pairs
        Links between hashable, mutually sortable node labels.

    Returns
    -------
    list of list
        One sorted list of nodes per component; the outer list is sorted too.
        Every node that occurs in ``edges`` is part of exactly one list.
    """
    # union-find forest: maps every node to its parent, roots map to themselves
    parent = dict()

    def find(node):
        parent.setdefault(node, node)
        # walk up to the representative of the node's component ...
        root = node
        while parent[root] != root:
            root = parent[root]
        # ... and let every node on the way point directly at it
        while parent[node] != root:
            next_node = parent[node]
            parent[node] = root
            node = next_node
        return root

    for node_a, node_b in edges:
        root_a, root_b = find(node_a), find(node_b)
        if root_a != root_b:
            parent[root_b] = root_a

    components = dict()
    for node in list(parent):
        components.setdefault(find(node), []).append(node)
    return sorted(sorted(members) for members in components.values())


# get the closest cluster for each cluster
# `distances` is labelled 0..n-1 on both axes, so the position returned by
# np.nanargmin is also the label of the closest cluster; the NaN diagonal keeps
# a cluster from selecting itself
nearest = {
    int(i): int(np.nanargmin(distances[i].values.astype(float)))
    for i in distances.index
}
closest_clusters = pd.Series(nearest, dtype='int64')

# Clusters that are linked by closest-cluster pointers, directly or through
# other clusters in between, belong to the same group. These groups are the
# connected components of the graph whose edges are (cluster, closest cluster).
merged_groups = connected_groups(nearest.items())

# sanity check: number of distinct clusters covered by the groups
print(len(set(itertools.chain.from_iterable(merged_groups))))

# sorted list of sorted, pairwise disjoint index groups
clean = [list(group) for group in merged_groups]

2280


In [24]:
len(clean)

362

In [25]:
len(list(set(list(itertools.chain.from_iterable(clean)))))

2280

In [26]:
clean

[[0, 1, 41, 163, 170, 173, 180, 412, 429, 633, 1020, 1592],
 [2, 743, 750, 758, 1431],
 [3,
  20,
  73,
  84,
  122,
  136,
  137,
  266,
  270,
  283,
  318,
  358,
  385,
  441,
  443,
  454,
  612,
  617,
  1092,
  1149,
  1150,
  1238,
  1390],
 [4, 69, 71, 120, 311, 359, 435, 1214, 1240, 1501],
 [5, 101, 110, 379, 419, 1228, 1333],
 [6, 94, 103, 208, 210, 331, 388, 413, 414, 623, 634, 1237],
 [7, 29, 32, 33, 96, 154, 165, 209, 387, 426, 982],
 [8,
  23,
  63,
  100,
  104,
  109,
  115,
  129,
  140,
  168,
  228,
  229,
  265,
  277,
  291,
  330,
  355,
  410,
  784,
  1209,
  1217,
  1224,
  1595],
 [9,
  61,
  396,
  449,
  977,
  1326,
  1344,
  1356,
  1504,
  1534,
  1585,
  1615,
  1645,
  1744,
  1850,
  1858,
  1970,
  2178,
  2265,
  2273],
 [10, 126, 152, 155, 156, 157, 158, 296, 594, 595, 597],
 [11, 49, 90, 322, 329, 348, 349, 390, 456, 509, 632, 1235, 1245],
 [12, 148, 288, 297],
 [13,
  14,
  26,
  45,
  52,
  53,
  54,
  87,
  93,
  198,
  199,
  267,
  282,
  295

In [27]:
data.query('Abteilung == "Abteilung001"')

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,month,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
94,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Mar,0,19.95,18.765000,1.185000,0.0,4.500000,2062.0,0.363725,87.390000
95,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Mar,1,19.95,18.817143,1.132857,0.0,8.857143,2062.0,2.604961,171.200000
96,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Mar,2,19.95,19.678000,0.272000,0.0,8.600000,2062.0,4.966052,168.610000
97,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Mar,3,19.95,19.950000,0.000000,0.0,10.000000,2062.0,6.401552,199.500000
98,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Apr,3,19.95,19.611667,0.338333,0.0,10.833333,2062.0,8.470740,211.610000
99,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Apr,4,19.95,19.251429,0.698571,0.0,12.142857,2062.0,12.047942,233.978571
100,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Apr,5,19.95,19.320000,0.630000,0.0,10.000000,2062.0,14.966052,194.122000
101,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Apr,6,19.95,19.404000,0.546000,0.0,16.000000,2062.0,18.195926,309.560000
102,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,Apr,7,19.95,19.450000,0.500000,0.0,15.000000,2062.0,21.467831,291.983333
103,900004,Spring,Turstuahenae,mittelblau,Abteilung001,WHG003,WUG006,May,7,19.95,19.590000,0.360000,0.0,13.000000,2062.0,23.844164,250.856667


In [28]:
# Container for the merge results: three Series ('Groups', 'Indexes',
# 'DataFrames') that are filled per merge level further down.
# The dtype is passed explicitly: creating an empty Series without a dtype is
# deprecated in pandas 1.x (it warns and defaults to float64), and the values
# stored here are arbitrary Python objects.
merge_results = pd.Series(dtype='object')
merge_results['Groups'] = pd.Series(dtype='object')
merge_results['Indexes'] = pd.Series(dtype='object')
merge_results['DataFrames'] = pd.Series(dtype='object')

In [29]:
# Store the index groups held in `clean` as merge level '0'.
merge_results['Indexes']['0'] = clean

In [30]:
# Build merge level '0': for every index group collect the member clusters and
# the rows of the original data that belong to them.
merge_results['Groups']['0'] = list()
merge_results['DataFrames']['0'] = list()
for pointers in merge_results['Indexes']['0']:
    group = list()
    dfs = list()
    for pointer in pointers:
        # not named `cluster`, so the `cluster` name used by the split cell is not rebound
        member = clusters[pointer]
        group.append(member)
        # Retrieve the relevant part of the original dataframe since the cluster
        # dataframe has missing columns. The rows are selected with a boolean
        # mask (one equality test per feature, combined with AND) instead of a
        # hand-built query string, so feature values that contain quotes or are
        # not strings work as well; without any feature all rows are selected.
        mask = pd.Series(True, index=data.index)
        for feature, characteristic in member['Features'].items():
            mask &= data[feature] == characteristic
        dfs.append(data[mask])
    merge_results['Groups']['0'].append(group)
    # merge the clusters' dataframes to one and add it
    merge_results['DataFrames']['0'].append(pd.concat(dfs, sort=True))

In [31]:
merge_results['DataFrames']['0'][0]

Unnamed: 0,Abteilung,WHG,WUG,article_count,article_id,avq,brand,color,discount,markdown,month,original_price,revenue,season,sells_price,stock_total,time_on_sale
495,Abteilung003,WHG017,WUG058,9.285714,900018,1.242957,Uller,schwarz,0.932857,0.0,Jan,19.95,173.511429,Winter,19.017143,3448.0,0
496,Abteilung003,WHG017,WUG058,6.000000,900018,2.494200,Uller,schwarz,0.036667,0.0,Jan,19.95,117.373333,Winter,19.913333,3448.0,1
497,Abteilung003,WHG017,WUG058,10.333333,900018,3.949149,Uller,schwarz,2.980000,0.0,Jan,19.95,168.485000,Winter,16.970000,3448.0,2
498,Abteilung003,WHG017,WUG058,6.500000,900018,5.447602,Uller,schwarz,0.476667,0.0,Jan,19.95,126.056667,Winter,19.473333,3448.0,3
499,Abteilung003,WHG017,WUG058,5.000000,900018,6.133991,Uller,schwarz,2.300000,0.0,Jan,19.95,79.050000,Winter,17.650000,3448.0,4
500,Abteilung003,WHG017,WUG058,9.250000,900018,6.830046,Uller,schwarz,3.035000,0.0,Feb,19.95,156.860000,Winter,16.915000,3448.0,4
501,Abteilung003,WHG017,WUG058,8.166667,900018,8.188322,Uller,schwarz,0.075000,0.0,Feb,19.95,162.515000,Winter,19.875000,3448.0,5
502,Abteilung003,WHG017,WUG058,10.166667,900018,9.817285,Uller,schwarz,0.083333,0.0,Feb,19.95,201.973333,Winter,19.866667,3448.0,6
503,Abteilung003,WHG017,WUG058,10.500000,900018,11.470418,Uller,schwarz,0.121667,0.0,Feb,19.95,208.095000,Winter,19.828333,3448.0,7
504,Abteilung003,WHG017,WUG058,11.500000,900018,12.775522,Uller,schwarz,2.485000,0.0,Feb,19.95,206.535000,Winter,17.465000,3448.0,8


In [32]:
merge_results['Groups']['0'][0]

[DataFrame           article_id  season        brand        ...
 Features               Abteilung    Abteilung003
 dtype: object
 Name                                                       0_5
 dtype: object, DataFrame           article_id  season  brand      color   ...
 Features     Abteilung    Abteilung005
 WHG                W...
 Name                                                     0_1_4
 dtype: object, DataFrame           article_id  season        color     WUG...
 Features     Abteilung    Abteilung005
 WHG                W...
 Name                                                   0_1_1_3
 dtype: object, DataFrame           article_id  season                 colo...
 Features     Abteilung    Abteilung002
 brand         Aumena...
 Name                                                   0_4_9_4
 dtype: object, DataFrame           article_id  season        color     WHG...
 Features     Abteilung    Abteilung002
 brand               ...
 Name                                  

In [33]:
merge_results['Indexes']['0'][0]

[0, 1, 41, 163, 170, 173, 180, 412, 429, 633, 1020, 1592]

In [34]:
# Positions of all level-'0' groups whose merged DataFrame contains fewer than
# 50 distinct article ids.
level_zero_frames = merge_results['DataFrames']['0']
too_small = [
    position
    for position in range(len(merge_results['Groups']['0']))
    if level_zero_frames[position]['article_id'].nunique() < 50
]
print(len(too_small))
print(too_small)

240
[1, 11, 13, 15, 18, 22, 35, 37, 43, 44, 46, 54, 55, 58, 67, 68, 74, 76, 77, 78, 82, 84, 90, 91, 92, 93, 95, 96, 97, 98, 104, 108, 110, 112, 113, 115, 116, 117, 118, 120, 123, 124, 125, 127, 128, 129, 131, 133, 134, 136, 137, 138, 140, 141, 143, 144, 145, 147, 148, 149, 151, 152, 153, 156, 157, 158, 159, 160, 161, 163, 165, 166, 167, 168, 169, 171, 174, 175, 176, 177, 178, 179, 180, 181, 182, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 216, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 235, 237, 238, 240, 241, 242, 244, 245, 246, 249, 251, 252, 253, 254, 255, 259, 260, 261, 262, 263, 264, 267, 269, 270, 271, 273, 274, 275, 276, 277, 279, 280, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 295, 296, 297, 298, 299, 300, 301, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 3

In [35]:
%%time

# number of time_on_sale periods every curve is aligned to (index 0..25)
N_PERIODS = 26

length = len(merge_results['Groups']['0'])
distances = pd.DataFrame(index=range(length), columns=range(length))
targets = list()

# dress the groups for better distance performance
for position in range(length):
    group_frame = merge_results['DataFrames']['0'][position]
    # Mean article_count per time_on_sale. The column is selected BEFORE
    # aggregating, so the other columns are neither averaged nor able to make
    # the aggregation fail.
    target = group_frame.groupby('time_on_sale')['article_count'].mean()
    # Always align to the index 0..N_PERIODS-1 and fill missing periods with 0.
    # This guarantees that all comparison arrays have the same length AND that
    # position k always means time_on_sale == k, even for a curve that has
    # N_PERIODS entries but a gap in between. Periods >= N_PERIODS are dropped.
    target = target.reindex(pd.RangeIndex(N_PERIODS)).fillna(0)
    targets.append(target)

CPU times: user 1 s, sys: 0 ns, total: 1 s
Wall time: 998 ms


In [36]:
%%time
# Pairwise distances between all curves in `targets`.
# The values are collected in a NumPy matrix and wrapped in a DataFrame once at
# the end; writing single cells through chained indexing (`distances[i][j] = d`)
# is slow and is not guaranteed to write into the frame.
length = len(targets)
curves = [target.values for target in targets]
distance_matrix = np.full((length, length), np.nan)

# the distance is stored symmetrically, so every unordered pair is computed once
for i, j in itertools.combinations(range(length), 2):
    try:
        d = distance.euclidean(curves[i], curves[j])
        distance_matrix[i, j] = d
        distance_matrix[j, i] = d
    except Exception as error:
        # best effort: the entry stays NaN, but the failing pair and the reason
        # are reported; KeyboardInterrupt is not swallowed, so the cell can
        # still be interrupted
        print(str(i) + " " + str(j) + " " + repr(error))

# the diagonal stays NaN, so np.nanargmin on a column never selects the curve itself
distances = pd.DataFrame(distance_matrix, index=range(length), columns=range(length))

CPU times: user 13.3 s, sys: 93.8 ms, total: 13.4 s
Wall time: 13.3 s


In [37]:
distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,352,353,354,355,356,357,358,359,360,361
0,,5.19136,6.85186,5.95762,2.99075,2.6238,5.86226,3.51915,4.72029,7.70984,...,9.7458,9.0206,8.00579,10.2732,9.60221,9.64067,9.18816,9.82657,9.35958,7.39323
1,5.19136,,2.4742,2.11411,5.73922,7.35888,11.0536,2.71826,5.72944,12.9012,...,4.65572,4.25508,4.34568,8.18616,4.49419,4.44931,4.36218,4.71213,4.21087,3.64651
2,6.85186,2.4742,,1.08084,7.26037,9.01938,12.7141,3.38307,5.10071,14.5617,...,2.94041,2.92781,4.45689,7.66042,2.75035,2.78881,2.67764,2.97471,2.60395,3.37138
3,5.95762,2.11411,1.08084,,6.36613,8.12514,11.8199,2.43847,4.59654,13.6675,...,3.86899,3.85997,4.87565,7.61888,3.64459,3.68305,3.46211,3.86895,3.46049,3.78287
4,2.99075,5.73922,7.26037,6.36613,,1.75901,5.45376,4.96143,7.69434,7.32723,...,10.1543,9.14838,7.56636,10.3423,10.0107,10.0492,9.59666,10.2351,9.76809,7.31937
5,2.6238,7.35888,9.01938,8.12514,1.75901,,3.69475,5.68667,7.21016,5.79399,...,11.9133,10.9074,9.10984,11.6962,11.7697,11.8082,11.3557,11.9941,11.5271,9.07838
6,5.86226,11.0536,12.7141,11.8199,5.45376,3.69475,,9.38141,9.80889,3.92743,...,15.6081,14.6021,12.528,14.1096,15.4645,15.5029,15.0504,15.6888,15.2218,12.7731
7,3.51915,2.71826,3.38307,2.43847,4.96143,5.68667,9.38141,,3.30327,11.229,...,6.28046,6.06388,6.7377,8.63907,6.08306,6.12152,5.66901,6.30742,5.86682,5.72605
8,4.72029,5.72944,5.10071,4.59654,7.69434,7.21016,9.80889,3.30327,,10.708,...,6.86316,7.42799,8.73824,9.63715,6.60407,6.64253,6.19002,6.82843,6.74439,7.93664
9,7.70984,12.9012,14.5617,13.6675,7.32723,5.79399,3.92743,11.229,10.708,,...,17.4556,16.4497,14.3756,14.5556,17.3121,17.3505,16.898,17.5364,17.0694,14.6207


In [38]:
# get the closest group for each group that is too small
# generates a Series with pointer lists
closest_groups = pd.Series(index=range(length), dtype='object')
for i in too_small:
    target_index = np.nanargmin(distances[i]).item()
    # only one value now, but we will add values later
    closest_groups[i] = list()
    closest_groups[i].append(target_index)

# groups that are not in too_small never got a pointer list (NaN) and are dropped
relevant_groups = closest_groups.dropna()

print(len(relevant_groups))

# all groups that are involved: the too-small groups and the groups they point to
# (Series.items() is used because Series.iteritems() was removed in pandas 2.0)
check_temp = list()
for index, value in relevant_groups.items():
    check_temp.append(index)
    check_temp.extend(value)

print(len(set(check_temp)))

240
280


In [39]:
def connected_groups(edges):
    """Group the nodes of an undirected graph into its connected components.

    Parameters
    ----------
    edges : iterable of (node, node) pairs
        Links between hashable, mutually sortable node labels.

    Returns
    -------
    list of list
        One sorted list of nodes per component; the outer list is sorted too.
        Every node that occurs in ``edges`` is part of exactly one list.
    """
    # union-find forest: maps every node to its parent, roots map to themselves
    parent = dict()

    def find(node):
        parent.setdefault(node, node)
        # walk up to the representative of the node's component ...
        root = node
        while parent[root] != root:
            root = parent[root]
        # ... and let every node on the way point directly at it
        while parent[node] != root:
            next_node = parent[node]
            parent[node] = root
            node = next_node
        return root

    for node_a, node_b in edges:
        root_a, root_b = find(node_a), find(node_b)
        if root_a != root_b:
            parent[root_b] = root_a

    components = dict()
    for node in list(parent):
        components.setdefault(find(node), []).append(node)
    return sorted(sorted(members) for members in components.values())


# Every too-small group is linked to its closest group; the first entry of each
# pointer list in `relevant_groups` is that closest group. The target may be
# another too-small group or a group that is already big enough.
links = [(int(i), int(group[0])) for i, group in relevant_groups.items()]

# Groups that are linked directly or through other groups in between are merged:
# the merged groups are the connected components of the link graph.
# `relevant_groups` itself is not modified, so the cell can be re-run safely.
sorted_groups = connected_groups(links)

# sorted list of sorted, pairwise disjoint index groups
clean = [list(group) for group in sorted_groups]

print(len(set(itertools.chain.from_iterable(sorted_groups))))
print(len(set(itertools.chain.from_iterable(clean))))

280
280


In [41]:
# NOTE: this cell reads the linked groups from `clean` and then overwrites
# `clean` with the final grouping, so it must not be run twice in a row.
small = set(too_small)

# initialize with own index: every group that is big enough starts on its own
merged_by_anchor = {i: [i] for i in range(length) if i not in small}

# include the newly generated groups
for group in clean:
    # first member of the linked group that already was big enough, if any
    anchor = next((x for x in group if x not in small), None)
    if anchor is None:
        # new group only made of merged too_small groups, filed under its first member
        merged_by_anchor[group[0]] = list(group)
    else:
        # merge the linked group into the group that already was big enough
        merged_by_anchor[anchor] = sorted(set(group) | set(merged_by_anchor[anchor]))

anchors = sorted(merged_by_anchor)
new_groups = pd.Series(
    [merged_by_anchor[anchor] for anchor in anchors],
    index=anchors,
    dtype='object',
)

print(len(set(itertools.chain.from_iterable(new_groups.values))))

# sorted list of sorted index groups without duplicates
unique_groups = {tuple(sorted(group)) for group in new_groups.values}
clean = [list(group) for group in sorted(unique_groups)]

print(len(set(itertools.chain.from_iterable(clean))))

362
362


In [46]:
print(len(clean))
print(clean)

148
[[0], [1, 112], [2], [3, 35], [4, 98], [5], [6, 22], [7], [8, 193], [9, 13], [10], [11, 31], [12], [14], [15, 236], [16], [17], [18, 39], [19], [20], [21], [23], [24, 37], [25, 104], [26, 43], [27, 46, 95, 96], [28], [29], [30, 115, 253], [32, 58], [33], [34], [36, 44], [38], [40], [41], [42, 54], [45], [47], [48], [49, 117, 218], [50], [51], [52], [53, 55], [56], [57], [59], [60], [61], [62, 232], [63], [64], [65], [66], [67, 162, 163, 198, 346], [68, 70, 230, 320], [69], [71], [72], [73], [74, 120, 134, 167, 180, 188, 212, 213, 221, 227, 231, 307, 323, 331, 352, 359], [75], [76, 90, 92, 216, 235, 287, 288, 291, 298, 353], [77, 168, 186, 191, 206, 208, 241, 251, 252, 282, 310, 312, 347, 348, 351, 356], [78, 123, 159, 160, 161, 219, 240, 354], [79], [80, 82], [81], [83], [84, 149, 224, 311], [85], [86, 97, 129, 203], [87], [88, 144, 148, 270, 318, 325, 342], [89], [91, 194, 209, 273, 278, 341], [93, 214, 304], [94], [99], [100, 220, 313], [101, 301], [102], [103], [105], [106, 211,

In [42]:
# Store the index groups held in `clean` as merge level '1'.
merge_results['Indexes']['1'] = clean

In [43]:
# Build the DataFrames of merge level '1': every entry of the level-'1' index
# lists points at level-'0' DataFrames, which are concatenated into one frame.
# TODO: also build merge_results['Groups']['1'] (the member groups are not collected yet)
merge_results['DataFrames']['1'] = list()
for pointers in merge_results['Indexes']['1']:
    # merge the referenced level-'0' dataframes to one and add it
    level_one_frame = pd.concat(
        [merge_results['DataFrames']['0'][pointer] for pointer in pointers],
        sort=True,
    )
    merge_results['DataFrames']['1'].append(level_one_frame)

In [44]:
# Positions of all level-'1' groups whose merged DataFrame contains fewer than
# 50 distinct article ids.
level_one_frames = merge_results['DataFrames']['1']
too_small = [
    position
    for position in range(len(merge_results['Indexes']['1']))
    if level_one_frames[position]['article_id'].nunique() < 50
]
print(len(too_small))
print(too_small)

9
[77, 87, 98, 101, 106, 109, 125, 132, 144]


In [45]:
len(merge_results['Indexes']['1'])

148