In [1]:
import sys     
import math
import datetime
import calendar
import os                            
import pandas as pd                  
import matplotlib.pyplot as plt      
import seaborn as sns     
import altair as alt
import numpy as np
%matplotlib inline
# jupyter notebook needs this option.
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

In [2]:
# CSV resource hosted on data.bloomington.in.gov.
dataurl = 'https://data.bloomington.in.gov/dataset/94d3f457-57b5-45be-bee0-a0106f59b7ed/resource/8854ce02-e8f5-44b9-b85f-17f002a7d023/download/8854ce02-e8f5-44b9-b85f-17f002a7d023.csv'

# Columns that the movement analysis in this notebook does not use.
unused_columns = [
    'intakedate',
    'sheltercode',
    'identichipnumber',
    'breedname',
    'basecolour',
    'animalage',
    'sexname',
    'location',
    'movementdate',
    'returndate',
    'deceasedreason',
    'diedoffshelter',
    'isdoa',
]

# Load the data and discard the unused columns in one step.
df = pd.read_csv(dataurl).drop(columns=unused_columns)
df.head()

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
0,15801,Moving,0,Jadzia,Cat,Adoption,0.0,Stray,,0
1,15932,Moving,0,Gonzo,Dog,Adoption,0.0,Stray,,0
2,28859,Abandoned,0,Maggie,Dog,Adoption,0.0,Stray,,0
3,30812,Abandoned,0,Pretty Girl,Cat,Foster,0.0,Stray,,0
4,31469,Incompatible,0,Bonnie,Dog,Adoption,0.0,Incompatible,,0


For the next step we need to map out each animal's movement through the shelter. Some animals will have multiple records corresponding to a sequence of movements through the shelter. For example, a cat called 'Independence' was brought to the shelter as a stray, fostered twice, adopted, subsequently returned because the owner couldn't afford her, and finally adopted once more.

In [3]:
# Every record stored for animal id 56475, one row per movement record.
df[df.id==56475]

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
1430,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1431,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1432,56475,Stray,0,Independence,Cat,Adoption,0.0,Unable to Afford,,0
1433,56475,Stray,0,Independence,Cat,Adoption,0.0,Stray,,0


We can model Independence's movement as a collection of states:

```
[ Stray, Foster, Foster, Adoption, Unable to Afford, Adoption ]
```

Sankey charts work on the basis of state transitions. We can thus represent Independence's movements as a list of tuples, where the first item of each tuple is the initial state and the second item is the final state. (The implementation below also prepends the animal's species to each tuple, so every transition carries the species as its type.)

```
[(Stray, Foster), 
 (Foster, Foster),
 (Foster, Adoption),
 (Adoption, Unable to Afford),
 (Unable to Afford, Adoption)]
```

If we do this for each animal, we can account for the movements of all animals through the shelter over time. This is accomplished below.

In [4]:
# Examine the group of records for each animal
# and return its sequence of states plus a per-animal summary.
def sequence_transitions(group):
    """Replay one animal's records as an ordered list of states.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to a single animal, iterated in their existing
        order. The columns read are speciesname, istransfer, intakereason,
        movementtype, returnedreason, puttosleep and deceaseddate.

    Returns
    -------
    tuple (seq, seq_dict)
        seq is a list that starts with the species and the intake source
        ("Transfer" when istransfer is truthy, otherwise the intake reason),
        followed by each record's movement type and, where one is recorded,
        the reason the animal was returned. "Euthanized" or "Died" is
        appended when the last record says so.
        seq_dict summarises the same animal with the keys 'type', 'source',
        'waypoint<N>' (movement type of record N) and 'target'.
        Both are empty when group has no rows.
    """
    # accumulators for this animal's states and its summary
    seq = []
    seq_dict = {}
    # the index of the last record in this group
    last_idx = len(group) - 1
    # for each row in the group...
    for row_idx, (_, row) in enumerate(group.iterrows()):
        # The first row supplies the species and where the animal came from.
        if row_idx == 0:
            seq.append(row.speciesname)
            seq_dict["type"] = row.speciesname
            source = "Transfer" if row.istransfer else row.intakereason
            seq.append(source)
            seq_dict["source"] = source
        # Add intermediate movement step.
        seq.append(row.movementtype)
        seq_dict['waypoint' + str(row_idx)] = row.movementtype
        # Add reason for return.
        # 'Stray' is the default/empty value; 'DOA' and
        # 'Owner requested Euthanasia' are not treated as return states either.
        # Only seq records the return reason: the waypoint in seq_dict stays
        # the movement type (the original re-assigned it to the same value).
        if row.returnedreason not in ['Stray', 'DOA', 'Owner requested Euthanasia']:
            seq.append(row.returnedreason)
        # The last row decides the final outcome.
        if row_idx == last_idx:
            if row.puttosleep:
                seq.append("Euthanized")
                seq_dict['target'] = "Euthanized"
            elif type(row.deceaseddate) is str:
                # A recorded death date without euthanasia is a death,
                # so the summary must say "Died", matching seq.
                seq.append("Died")
                seq_dict['target'] = "Died"
            else:
                # The animal is alive: the last movement type is the outcome.
                seq_dict['target'] = row.movementtype
    return (seq, seq_dict)

# Expand a state list whose first item is a label (here: the species) into
# labelled transitions between consecutive states.
# ex: [S, A, B, C] => [(S,A,B), (S,B,C)]
def tuple_pairwise(lst):
    """Return ``(lst[0], state, next_state)`` for every consecutive pair after the first item.

    Lists with fewer than three items produce an empty result.
    """
    return [(lst[0], current, following)
            for current, following in zip(lst[1:], lst[2:])]

# Walk over the animals one id at a time, collecting
#   - sequences:          one summary dict per animal
#   - pairwise_sequences: every (species, from-state, to-state) transition
sequences = []
pairwise_sequences = []
for _, animal_records in df.groupby('id'):
    states, summary = sequence_transitions(animal_records)
    sequences.append(summary)
    pairwise_sequences.extend(tuple_pairwise(states))

In [12]:
# The first 10 per-animal summaries: species ('type'), intake 'source',
# one 'waypointN' per record, and the final 'target'.
sequences[0:10]

[{'source': 'Moving',
  'target': 'Adoption',
  'type': 'Cat',
  'waypoint0': 'Adoption'},
 {'source': 'Moving',
  'target': 'Adoption',
  'type': 'Dog',
  'waypoint0': 'Adoption'},
 {'source': 'Abandoned',
  'target': 'Adoption',
  'type': 'Dog',
  'waypoint0': 'Adoption'},
 {'source': 'Abandoned',
  'target': 'Foster',
  'type': 'Cat',
  'waypoint0': 'Foster'},
 {'source': 'Incompatible',
  'target': 'Adoption',
  'type': 'Dog',
  'waypoint0': 'Adoption',
  'waypoint1': 'Adoption'},
 {'source': 'Abandoned',
  'target': 'Transfer',
  'type': 'Rabbit',
  'waypoint0': 'Transfer'},
 {'source': 'Abandoned',
  'target': 'Transfer',
  'type': 'Rabbit',
  'waypoint0': 'Transfer'},
 {'source': 'Abandoned',
  'target': 'Adoption',
  'type': 'Dog',
  'waypoint0': 'Foster',
  'waypoint1': 'Foster',
  'waypoint2': 'Adoption'},
 {'source': 'Abandoned',
  'target': 'Euthanized',
  'type': 'Cat',
  'waypoint0': 'Foster',
  'waypoint1': 'Adoption'},
 {'source': 'Abandoned',
  'target': 'Adoption',
  

In [5]:
# The first 10 transitions, each a (species, from-state, to-state) triple.
pairwise_sequences[0:10]

[('Cat', 'Moving', 'Adoption'),
 ('Dog', 'Moving', 'Adoption'),
 ('Dog', 'Abandoned', 'Adoption'),
 ('Cat', 'Abandoned', 'Foster'),
 ('Dog', 'Incompatible', 'Adoption'),
 ('Dog', 'Adoption', 'Incompatible'),
 ('Dog', 'Incompatible', 'Adoption'),
 ('Rabbit', 'Abandoned', 'Transfer'),
 ('Rabbit', 'Abandoned', 'Transfer'),
 ('Dog', 'Abandoned', 'Foster')]

Next, we need to count how many instances of each type of transition we have. A `Counter` object will do this for us. Because each transition is counted per species, the most common one is Foster->Adoption for cats, followed by Stray->Reclaimed for dogs and Stray->Adoption for cats. This is in line with intuition.

In [6]:
from collections import Counter

# Tally identical (species, from-state, to-state) triples
# and show the ten most frequent ones.
c = Counter(pairwise_sequences)
c.most_common(10)

[(('Cat', 'Foster', 'Adoption'), 487),
 (('Dog', 'Stray', 'Reclaimed'), 334),
 (('Cat', 'Stray', 'Adoption'), 328),
 (('Cat', 'Stray', 'Foster'), 286),
 (('Dog', 'Stray', 'Adoption'), 248),
 (('Dog', 'Incompatible', 'Adoption'), 236),
 (('Cat', 'Incompatible', 'Adoption'), 177),
 (('Dog', 'Foster', 'Adoption'), 137),
 (('Dog', 'Adoption', 'Incompatible'), 98),
 (('Cat', 'Incompatible', 'Foster'), 96)]

We need to reshape this slightly to fit with the Sankey widget's expected input.

In [7]:
def sankey_node(key, counts=None):
    """Build one Sankey link dict from a transition key.

    Parameters
    ----------
    key : sequence
        ``(species, source_state, target_state)``.
    counts : mapping, optional
        Maps transition keys to how often they occur. When omitted, the
        notebook-level Counter ``c`` is used, so existing calls such as
        ``map(sankey_node, c.keys())`` keep working.

    Returns
    -------
    dict
        Keys 'source', 'target', 'value' (the count for ``key``) and
        'type' (the species).
    """
    if counts is None:
        # Fall back to the notebook's global tally of transitions.
        counts = c
    return {
        'source': key[1],
        'target': key[2],
        'value': counts[key],
        'type': key[0]
    }
    
# One link dict per distinct transition, then preview the ten heaviest links.
links = [sankey_node(transition) for transition in c]
sorted(links, key=lambda link: link['value'], reverse=True)[:10]

[{'source': 'Foster', 'target': 'Adoption', 'type': 'Cat', 'value': 487},
 {'source': 'Stray', 'target': 'Reclaimed', 'type': 'Dog', 'value': 334},
 {'source': 'Stray', 'target': 'Adoption', 'type': 'Cat', 'value': 328},
 {'source': 'Stray', 'target': 'Foster', 'type': 'Cat', 'value': 286},
 {'source': 'Stray', 'target': 'Adoption', 'type': 'Dog', 'value': 248},
 {'source': 'Incompatible', 'target': 'Adoption', 'type': 'Dog', 'value': 236},
 {'source': 'Incompatible', 'target': 'Adoption', 'type': 'Cat', 'value': 177},
 {'source': 'Foster', 'target': 'Adoption', 'type': 'Dog', 'value': 137},
 {'source': 'Adoption', 'target': 'Incompatible', 'type': 'Dog', 'value': 98},
 {'source': 'Incompatible', 'target': 'Foster', 'type': 'Cat', 'value': 96}]

To prevent the Sankey widget from becoming overloaded, we can limit our view to the more frequent transitions.

In [8]:
# Keep only the links that occur more than 20 times.
top = [link for link in links if link['value'] > 20]

Finally, we can draw the Sankey chart. Note the existence of cycles!

In [9]:
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout

# Fixed widget size; the values are handed to Layout exactly as written.
layout = Layout(width="1280", height="1024")
# Draw only the frequent links kept in `top`.
# NOTE(review): align_link_types presumably keeps links of the same 'type'
# (species) together — confirm against the ipysankeywidget documentation.
sankey = SankeyWidget(links=top, layout=layout, align_link_types=True)
sankey

A Jupyter Widget

In [10]:
# Largest number of keys found in any per-animal summary dict
# (type, source and target plus one waypoint per record).
max_length = max(len(summary) for summary in sequences)
max_length

8

In [11]:
# One row per animal; each key of the summary dicts becomes a column.
sankey_frame = pd.DataFrame(sequences)

In [12]:
# One row per animal; waypoint columns beyond an animal's last record are missing (NaN).
sankey_frame

Unnamed: 0,source,target,type,waypoint0,waypoint1,waypoint2,waypoint3,waypoint4
0,Moving,Adoption,Cat,Adoption,,,,
1,Moving,Adoption,Dog,Adoption,,,,
2,Abandoned,Adoption,Dog,Adoption,,,,
3,Abandoned,Foster,Cat,Foster,,,,
4,Incompatible,Adoption,Dog,Adoption,Adoption,,,
5,Abandoned,Transfer,Rabbit,Transfer,,,,
6,Abandoned,Transfer,Rabbit,Transfer,,,,
7,Abandoned,Adoption,Dog,Foster,Foster,Adoption,,
8,Abandoned,Euthanized,Cat,Foster,Adoption,,,
9,Abandoned,Adoption,Dog,Adoption,,,,


In [13]:
# Explicit imports instead of `from floweaver import *`, so it is clear
# where every name used below comes from.
from floweaver import (Bundle, Partition, ProcessGroup, SankeyDefinition,
                       Waypoint, weave)

# Every row is one animal, so each flow has weight 1. Waypoints an animal
# never reached are missing; turn them into empty strings.
sankey_frame = sankey_frame.assign(value=1).fillna('')

# The waypoint columns that are wired into the diagram, in display order.
# NOTE(review): sankey_frame can hold more waypoint columns than these three
# (the preview above shows waypoint3 and waypoint4); they are not drawn.
WAYPOINT_COLUMNS = ['waypoint0', 'waypoint1', 'waypoint2']

# The two ends of the diagram: why the animal came in and how it left.
nodes = {
    'incomingreason': ProcessGroup(sankey_frame['source'].unique().tolist()),
    'outcometype': ProcessGroup(sankey_frame['target'].unique().tolist()),
}

# Split each end into one group per distinct value.
incomingreason_by_type = Partition.Simple('process',
                                          sankey_frame['source'].unique().tolist())
outcometype = Partition.Simple('process',
                               sankey_frame['target'].unique().tolist())
nodes['incomingreason'].partition = incomingreason_by_type
nodes['outcometype'].partition = outcometype

# Flows are split by species.
species_by_type = Partition.Simple('type', sankey_frame['type'].unique().tolist())

# One waypoint node per movement column, partitioned by that column's values.
for column in WAYPOINT_COLUMNS:
    movement_by_type = Partition.Simple(column,
                                        sankey_frame[column].unique().tolist())
    nodes[column] = Waypoint(movement_by_type)

# Left-to-right layout: intake reason, the waypoints in order, then the outcome.
ordering = (
    [['incomingreason']]
    + [[column] for column in WAYPOINT_COLUMNS]
    + [['outcometype']]
)

# A single bundle routes every flow through all waypoints.
bundles = [
    Bundle('incomingreason', 'outcometype',
           waypoints=list(WAYPOINT_COLUMNS)),
]

sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=species_by_type)
size = dict(width=2000, height=2000)  # still need to work with the width and height.
weave(sdd, sankey_frame).to_widget(**size)


A Jupyter Widget