# Reproducing Sankey Diagram Crashing Browser

To reproduce the issue, simply set `target_clusters` to a value greater than ~11.  

`total_users` can be set as well; it does not affect the crash behavior.

In [1]:
def generateRandomClusterFlow(cluster_count): # this is simulating clustering analysis for GA traffic
    """Build a shrinking sequence of cluster-id lists, keyed by their size.

    Starts from the ids ``0 .. cluster_count - 1`` and drops one randomly
    chosen id per step. The entry stored under key ``k`` is the list of the
    ``k`` ids still present at that step, so keys run from ``cluster_count``
    down to ``1``; key ``0`` is never produced.
    """
    remaining = set(range(cluster_count))
    flow = {}

    # Count down over the sizes the set will have; one id is removed per step.
    for size in range(len(remaining), 0, -1):
        snapshot = list(remaining)
        flow[size] = snapshot
        remaining.remove(random.choice(snapshot))

    return flow

def getUserAssignmentsDataFrame(user_count,cluster_count): # continue simulating randomized clustering data
    """Simulate a per-user cluster assignment for every entry of the cluster flow.

    The returned frame has a "user" column holding ``0 .. user_count - 1``
    and one extra column per entry returned by
    ``generateRandomClusterFlow(cluster_count)``, labelled with that entry's
    key. Each cell is a random pick from the cluster ids of that entry.
    """
    user_ids = range(user_count)
    assignments = pd.DataFrame()
    assignments["user"] = user_ids # simulated userId, replace with real userId assignment

    flow = generateRandomClusterFlow(cluster_count)
    for stage, available in flow.items():
        if stage == 0:
            continue # irrelevant clusterFlow index value
        # one independent random draw per user, in user order
        picks = {uid: random.choice(available) for uid in user_ids}
        assignments[stage] = picks.values()

    return assignments

def getClusterPopulation(user_assignments_df,cluster_count): # utility function to groupby and sum users per cluster
    """Count the users assigned to each cluster in one assignment column.

    Args:
        user_assignments_df: frame with a "user" column and an assignment
            column labelled ``cluster_count``.
        cluster_count: label of the assignment column to group by.

    Returns:
        DataFrame with columns "cluster_id" and "count", ordered by count
        descending (the order influences visualization rendering order).
    """
    # Keep the group keys as the index: they are the real cluster ids.
    # Resetting the index here would renumber them 0..k-1 and attribute
    # counts to the wrong cluster whenever some cluster has no users.
    cluster_population = (
        user_assignments_df.groupby(cluster_count)["user"]
        .count()
        .sort_values(ascending=False)
    )

    return pd.DataFrame({'cluster_id':cluster_population.index, 'count':cluster_population.values})

def sumUserMovementBetweenClusters(user_assignments_df,cluster_count,cluster_population):
    """Count how many users move between clusters of consecutive cluster groups.

    For every integer-labelled column ``g < cluster_count`` the assignments
    in column ``g`` ("from") are paired with those in column ``g + 1``
    ("to"), and the users making each (from, to) move are counted.

    Args:
        user_assignments_df: frame with a "user" column plus integer-labelled
            assignment columns; column ``g + 1`` must exist for every
            processed column ``g``.
        cluster_count: label of the last cluster group, which has no
            successor and is therefore skipped as a "from" group.
        cluster_population: frame with a "cluster_id" column; only these ids
            are considered as "from" clusters, in this order.

    Returns:
        DataFrame indexed by "group|from_cluster|to_cluster" with columns
        "from" and "to" (labels "<cluster id> - <cluster group>") and an
        integer "metric" (user count). It is empty, but keeps the same
        columns, when there are no movements.
    """
    columns = ["from", "to", "metric"]
    cluster_ids = list(cluster_population["cluster_id"])
    results = {}

    for cluster_group in user_assignments_df.columns:
        # skip the "user" column and the last cluster group
        if isinstance(cluster_group, str) or cluster_group >= cluster_count:
            continue

        _from = user_assignments_df[cluster_group]
        _to = user_assignments_df[cluster_group + 1] # add one to advance "to" the next cluster group

        for cluster_id in cluster_ids:
            # destinations of the users currently in cluster_id, most common first
            destinations = _to[_from == cluster_id].value_counts()

            for target, value in destinations.items():
                # unique key for every (cluster_group, from, to) combination;
                # value_counts already holds the full count, so assign it directly
                key = "{}|{}|{}".format(cluster_group, cluster_id, target)
                results[key] = {
                    "from": "{} - {}".format(cluster_id, cluster_group),
                    "to": "{} - {}".format(target, cluster_group + 1),
                    "metric": int(value),
                }

    # Passing the columns explicitly keeps the frame well-formed when empty.
    results_df = pd.DataFrame(list(results.values()), index=list(results), columns=columns)
    results_df["metric"] = results_df["metric"].astype("int") # set metric to int for Sankey

    return results_df

## Running the application
Set the number of `target_clusters` you'd like to generate and the `total_users` for the size of the population.

In [2]:
%load_ext google.datalab.kernel 
import pandas as pd
import random

# Simulation size: the number of clusters in the largest cluster group and
# the number of simulated users.
target_clusters = 10
total_users = 100000

# Random cluster assignment for every user in every cluster group.
user_assignments = getUserAssignmentsDataFrame(total_users,target_clusters)
# Users per cluster, counted on the column labelled target_clusters.
cluster_populations = getClusterPopulation(user_assignments, target_clusters)
# from/to/metric rows; the Sankey chart cell reads this variable by name.
results = sumUserMovementBetweenClusters(user_assignments, target_clusters, cluster_populations)

## Visualize the results  / Crash the browser!
This is where the browser may crash. If the `target_clusters` value is too high (roughly above 11), rendering the chart will likely crash the browser.

In [3]:
%%chart sankey -d results

The Sankey diagram labels are formatted as "`{cluster id} - {number of clusters}`", starting with 1 and ending with the number of `target_clusters` you configure when the application is run.