In [218]:
import sys     
import math
import datetime
import calendar
import os                            
import pandas as pd                  
import matplotlib.pyplot as plt      
import seaborn as sns     
import altair as alt
import numpy as np
%matplotlib inline
# jupyter notebook needs this option.
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

In [219]:
# Shelter animal records, downloaded as CSV from data.bloomington.in.gov.
dataurl = 'https://data.bloomington.in.gov/dataset/94d3f457-57b5-45be-bee0-a0106f59b7ed/resource/8854ce02-e8f5-44b9-b85f-17f002a7d023/download/8854ce02-e8f5-44b9-b85f-17f002a7d023.csv'
# Load the data and drop the columns that the state-transition analysis
# below never reads.
df = pd.read_csv(dataurl).drop(columns=[
    'intakedate',
    'sheltercode',
    'identichipnumber',
    'breedname',
    'basecolour',
    'animalage',
    'sexname',
    'location',
    'movementdate',
    'returndate',
    'deceasedreason',
    'diedoffshelter',
    'isdoa',
])
df.head()

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
0,15801,Moving,0,Jadzia,Cat,Adoption,0.0,Stray,,0
1,15932,Moving,0,Gonzo,Dog,Adoption,0.0,Stray,,0
2,28859,Abandoned,0,Maggie,Dog,Adoption,0.0,Stray,,0
3,30812,Abandoned,0,Pretty Girl,Cat,Foster,0.0,Stray,,0
4,31469,Incompatible,0,Bonnie,Dog,Adoption,0.0,Incompatible,,0


For the next step we need to map out each animal's movement through the shelter. Some animals will have multiple records corresponding to a sequence of movements through the shelter. For example, a cat called 'Independence' was brought to the shelter as a stray, fostered twice, adopted, subsequently returned because the owner couldn't afford her, and finally adopted once more.

In [220]:
# All shelter records for animal id 56475 (the cat 'Independence')
df[df.id==56475]

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
1430,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1431,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1432,56475,Stray,0,Independence,Cat,Adoption,0.0,Unable to Afford,,0
1433,56475,Stray,0,Independence,Cat,Adoption,0.0,Stray,,0


We can model Independence's movement as a collection of states:

```
[ Stray, Foster, Foster, Adoption, Unable to Afford, Adoption ]
```

Sankey charts work on the basis of state transitions. We can thus quantize Independence's movements as a collection of tuples, where the first item of each tuple represents the initial state, and the second item represents the final state.

```
[(Stray, Foster), 
 (Foster, Foster),
 (Foster, Adoption),
 (Adoption, Unable to Afford),
 (Unable to Afford, Adoption)]
```

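If we do this for each animal, we can account for the movements of all animals through the shelter over time. This is accomplished below; in the code, each transition is additionally tagged with the animal's species, so every tuple has three items: (species, initial state, final state).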

In [221]:
# Examine the group of records for each animal
# and return a sequence of state transitions.
def sequence_transitions(group):
    """Flatten one animal's records into an ordered list of states.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to a single animal. The columns read are
        ``speciesname``, ``istransfer``, ``intakereason``, ``movementtype``,
        ``returnedreason``, ``puttosleep`` and ``deceaseddate``. Rows are
        processed in the order they appear (assumed chronological --
        TODO confirm against the data source).

    Returns
    -------
    list
        ``[species, intake state, <movement / return-reason states...>]``,
        followed by ``"Euthanized"`` or ``"Died"`` when the last record
        says so.
    """
    # Return reasons that are not recorded as a state of their own.
    # 'Stray' is the default/empty value in this column; the other two are
    # skipped here as well so they never show up as intermediate states.
    ignored_return_reasons = ('Stray', 'DOA', 'Owner requested Euthanasia')

    # Accumulator for this animal's states.
    seq = []
    # Positional index of the last record in this group.
    last_idx = len(group) - 1

    for row_idx, (_, row) in enumerate(group.iterrows()):
        # The first record supplies the species and how the animal arrived.
        if row_idx == 0:
            seq.append(row.speciesname)
            seq.append("Transfer" if row.istransfer else row.intakereason)

        # Every record contributes its movement ...
        seq.append(row.movementtype)
        # ... and, when meaningful, the reason the animal came back.
        if row.returnedreason not in ignored_return_reasons:
            seq.append(row.returnedreason)

        # Only the last record decides whether the animal died.
        if row_idx == last_idx:
            if row.puttosleep:
                seq.append("Euthanized")
            # A missing date is read as NaN (a float), so a string value
            # means a death date was actually recorded.
            elif isinstance(row.deceaseddate, str):
                seq.append("Died")
    return seq

# Convert a state sequence into species-tagged transitions.
# The first item tags every consecutive pair of the remaining items.
# ex: [S, A, B, C] => [(S,A,B), (S,B,C)]
def tuple_pairwise(lst):
    """Return ``(lst[0], a, b)`` for each consecutive pair ``a, b`` in ``lst[1:]``.

    Sequences with fewer than three items yield an empty list.
    """
    return [(lst[0], start, end) for start, end in zip(lst[1:], lst[2:])]

# Summarise one animal's state sequence as a flat record.
def transform_sequences_to_dicts(seq):
    """Turn a state sequence into a dict describing the whole path.

    The first item becomes ``type``, the second ``source`` and the last
    ``target``; anything between the second and the last item is stored
    in order as ``waypoint0``, ``waypoint1``, ...
    """
    result = {"type": seq[0], "source": seq[1], "target": seq[-1]}
    for position, state in enumerate(seq[2:-1]):
        result["waypoint" + str(position)] = state
    return result
        
# Build both representations for every animal id: one summary record per
# animal, plus a flat list of species-tagged transitions across all animals.
sequences = []
pairwise_sequences = []
for _, animal_records in df.groupby('id'):
    states = sequence_transitions(animal_records)
    sequences.append(transform_sequences_to_dicts(states))
    pairwise_sequences.extend(tuple_pairwise(states))

In [222]:
# Summary records (type / source / target, plus any waypoints)
# for the first 10 animal ids
sequences[0:10]

[{'type': 'Cat', 'source': 'Moving', 'target': 'Adoption'},
 {'type': 'Dog', 'source': 'Moving', 'target': 'Adoption'},
 {'type': 'Dog', 'source': 'Abandoned', 'target': 'Adoption'},
 {'type': 'Cat', 'source': 'Abandoned', 'target': 'Foster'},
 {'type': 'Dog',
  'source': 'Incompatible',
  'target': 'Adoption',
  'waypoint0': 'Adoption',
  'waypoint1': 'Incompatible'},
 {'type': 'Rabbit', 'source': 'Abandoned', 'target': 'Transfer'},
 {'type': 'Rabbit', 'source': 'Abandoned', 'target': 'Transfer'},
 {'type': 'Dog',
  'source': 'Abandoned',
  'target': 'Adoption',
  'waypoint0': 'Foster',
  'waypoint1': 'Foster'},
 {'type': 'Cat',
  'source': 'Abandoned',
  'target': 'Euthanized',
  'waypoint0': 'Foster',
  'waypoint1': 'Adoption',
  'waypoint2': 'Incompatible'},
 {'type': 'Dog', 'source': 'Abandoned', 'target': 'Adoption'}]

In [223]:
# The first 10 transitions, each a (species, from state, to state) tuple
pairwise_sequences[0:10]

[('Cat', 'Moving', 'Adoption'),
 ('Dog', 'Moving', 'Adoption'),
 ('Dog', 'Abandoned', 'Adoption'),
 ('Cat', 'Abandoned', 'Foster'),
 ('Dog', 'Incompatible', 'Adoption'),
 ('Dog', 'Adoption', 'Incompatible'),
 ('Dog', 'Incompatible', 'Adoption'),
 ('Rabbit', 'Abandoned', 'Transfer'),
 ('Rabbit', 'Abandoned', 'Transfer'),
 ('Dog', 'Abandoned', 'Foster')]

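Next, we need to count how many instances of each type of transition we have. A `Counter` object will do this for us. Because each transition is tagged with the animal's species, the counts are per species: the most common transition is Foster->Adoption for cats, followed by Stray->Reclaimed for dogs and Stray->Adoption for cats. This is in line with intuition.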

In [224]:
from collections import Counter
# Tally identical (species, from, to) transitions and show the ten most frequent.
c = Counter(pairwise_sequences)
c.most_common(10)

[(('Cat', 'Foster', 'Adoption'), 487),
 (('Dog', 'Stray', 'Reclaimed'), 334),
 (('Cat', 'Stray', 'Adoption'), 328),
 (('Cat', 'Stray', 'Foster'), 286),
 (('Dog', 'Stray', 'Adoption'), 248),
 (('Dog', 'Incompatible', 'Adoption'), 236),
 (('Cat', 'Incompatible', 'Adoption'), 177),
 (('Dog', 'Foster', 'Adoption'), 137),
 (('Dog', 'Adoption', 'Incompatible'), 98),
 (('Cat', 'Incompatible', 'Foster'), 96)]

We need to reshape this slightly to fit with the Sankey widget's expected input.

In [225]:
def sankey_node(key):
    """Build the Sankey link dict for one (species, source, target) key.

    The link's ``value`` is looked up in the notebook-level Counter ``c``.
    Links for cats get the colour #D32F2F; every other species gets #FFC107.
    """
    species = key[0]
    if species == "Cat":
        link_colour = "#D32F2F"
    else:
        link_colour = "#FFC107"
    return {
        'source': key[1],
        'target': key[2],
        'value': c[key],
        'type': species,
        'color': link_colour,
    }
    
# One link per distinct transition; preview the ten with the largest counts.
links = [sankey_node(transition) for transition in c]
sorted(links, key=lambda link: link['value'], reverse=True)[:10]

[{'source': 'Foster',
  'target': 'Adoption',
  'value': 487,
  'type': 'Cat',
  'color': '#D32F2F'},
 {'source': 'Stray',
  'target': 'Reclaimed',
  'value': 334,
  'type': 'Dog',
  'color': '#FFC107'},
 {'source': 'Stray',
  'target': 'Adoption',
  'value': 328,
  'type': 'Cat',
  'color': '#D32F2F'},
 {'source': 'Stray',
  'target': 'Foster',
  'value': 286,
  'type': 'Cat',
  'color': '#D32F2F'},
 {'source': 'Stray',
  'target': 'Adoption',
  'value': 248,
  'type': 'Dog',
  'color': '#FFC107'},
 {'source': 'Incompatible',
  'target': 'Adoption',
  'value': 236,
  'type': 'Dog',
  'color': '#FFC107'},
 {'source': 'Incompatible',
  'target': 'Adoption',
  'value': 177,
  'type': 'Cat',
  'color': '#D32F2F'},
 {'source': 'Foster',
  'target': 'Adoption',
  'value': 137,
  'type': 'Dog',
  'color': '#FFC107'},
 {'source': 'Adoption',
  'target': 'Incompatible',
  'value': 98,
  'type': 'Dog',
  'color': '#FFC107'},
 {'source': 'Incompatible',
  'target': 'Foster',
  'value': 96,
  'ty

Let's plot a scaled back version of the chart to make sure the shape is correct.

In [245]:
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout

# To prevent the Sankey widget from becoming overloaded, keep only the
# transitions that occur more than 100 times.
top = [link for link in links if link['value'] > 100]

layout = Layout(width="800", height="600")
sankey = SankeyWidget(
    links=top,
    layout=layout,
    margins={'top': 0, 'bottom': 0, 'left': 100, 'right': 100},
)
sankey.auto_save_png('two_state_transitions_proto.png')

SankeyWidget(layout=Layout(height='600', width='800'), links=[{'source': 'Incompatible', 'target': 'Adoption',…

Finally, let's plot a fuller visualization with more transitions. Note the existence of cycles!

In [246]:
# Lower the cut-off: keep every transition that occurs more than 30 times.
top = [link for link in links if link['value'] > 30]

layout = Layout(width="800", height="600")
sankey = SankeyWidget(
    links=top,
    layout=layout,
    margins={'top': 0, 'bottom': 0, 'left': 130, 'right': 100},
)
sankey.auto_save_png('two_state_transitions_full.png')

SankeyWidget(layout=Layout(height='600', width='800'), links=[{'source': 'Moving', 'target': 'Adoption', 'valu…

In [213]:
# Construct a dataframe with one row per animal from the summary records.
# Rows lacking a source or target are dropped; the remaining missing values
# (animals with fewer waypoints than the longest path) become empty strings,
# presumably so the grouping below does not discard those rows.
sk = pd.DataFrame.from_dict(sequences).dropna(subset=['source', 'target']).replace(np.nan, '', regex=True)
# Collapse identical rows (same type, source, target and waypoints) into one
# and store how many animals followed that exact path in 'value'.
sk = sk.groupby(sk.columns.tolist()).size().reset_index().rename(columns={0:'value'})
# Sort by value, descending, so that head(n) yields the n most common paths.
sk = sk.sort_values("value", ascending=False)

In [240]:
from floweaver import *

def flow(df, filename):
    """Weave a floweaver Sankey diagram of whole animal paths and save it as PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow table with the columns ``type``, ``source``, ``target``,
        ``waypoint0``, ``waypoint1`` and ``waypoint2`` (the callers in this
        notebook also provide ``value``).
    filename : str
        File name handed to the widget's ``auto_save_png``.

    Returns
    -------
    The result of ``auto_save_png`` on the woven widget.
    """
    # Nodes: intake reasons on one side, outcomes on the other, and up to
    # three intermediate waypoint layers in between.
    nodes = {
        'incomingreason': ProcessGroup(df.source.unique().tolist()),
        'outcometype': ProcessGroup(df.target.unique().tolist()),
        'waypoint0': Waypoint(Partition.Simple('waypoint0', df.waypoint0.unique().tolist())),
        'waypoint1': Waypoint(Partition.Simple('waypoint1', df.waypoint1.unique().tolist())),
        'waypoint2': Waypoint(Partition.Simple('waypoint2', df.waypoint2.unique().tolist())),
    }

    # Partition both process groups so each distinct reason / outcome is
    # drawn as its own node.
    nodes['incomingreason'].partition = Partition.Simple('process', df.source.unique().tolist())
    nodes['outcometype'].partition = Partition.Simple('process', df.target.unique().tolist())
    # Split every flow by species. This is derived from the frame being
    # plotted rather than from a notebook-level variable, so the function
    # also works on a freshly started kernel.
    species_by_type = Partition.Simple('type', df['type'].unique().tolist())

    # Left-to-right layer order: the waypoints sit between intake and outcome.
    ordering = [
        ['incomingreason'],
        ['waypoint0'], ['waypoint1'], ['waypoint2'],
        ['outcometype'],
    ]

    # A single bundle routes all flows through the three waypoint layers.
    bundles = [
        Bundle('incomingreason', 'outcometype',
               waypoints=['waypoint0', 'waypoint1', 'waypoint2']),
    ]

    # NOTE(review): only Cat and Dog have explicit colours; confirm how
    # floweaver renders any other species present in df.
    palette = {'Cat': '#D32F2F', 'Dog': '#FFC107'}
    sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=species_by_type)
    size = dict(width=1000, height=600)  # still need to work with the width and height.
    return weave(sdd, df, palette=palette).to_widget(**size).auto_save_png(filename)

In [241]:
# Prototype: only the 10 most common complete paths (sk is sorted by value).
flow(sk.head(10), "sankey_flow_proto.png")

SankeyWidget(groups=[{'id': 'incomingreason', 'type': 'process', 'title': '', 'nodes': ['incomingreason^Stray'…

In [242]:
# Fuller view: the 50 most common complete paths.
flow(sk.head(50), "sankey_flow.png")

SankeyWidget(groups=[{'id': 'incomingreason', 'type': 'process', 'title': '', 'nodes': ['incomingreason^Stray'…

In [244]:
# Only paths long enough to have a third waypoint (waypoint2 is '' when
# absent), limited to the 10 most common of those.
flow(sk[sk['waypoint2'] != ''].head(10), "sankey_flow_complex.png")

SankeyWidget(groups=[{'id': 'incomingreason', 'type': 'process', 'title': '', 'nodes': ['incomingreason^Stray'…