In [182]:
import sys     
import math
import datetime
import calendar
import os                            
import pandas as pd                  
import matplotlib.pyplot as plt      
import seaborn as sns     
import altair as alt
%matplotlib inline
# jupyter notebook needs this option.
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

In [183]:
# CSV download from the data.bloomington.in.gov open-data portal.
dataurl = 'https://data.bloomington.in.gov/dataset/94d3f457-57b5-45be-bee0-a0106f59b7ed/resource/8854ce02-e8f5-44b9-b85f-17f002a7d023/download/8854ce02-e8f5-44b9-b85f-17f002a7d023.csv'
# Read the records and drop the columns that the movement analysis
# below never looks at.
df = pd.read_csv(dataurl).drop(columns=[
    'intakedate',
    'sheltercode',
    'identichipnumber',
    'breedname',
    'basecolour',
    'animalage',
    'sexname',
    'location',
    'movementdate',
    'returndate',
    'deceasedreason',
    'diedoffshelter',
    'isdoa',
])
df.head()

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
0,15801,Moving,0,Jadzia,Cat,Adoption,0.0,Stray,,0
1,15932,Moving,0,Gonzo,Dog,Adoption,0.0,Stray,,0
2,28859,Abandoned,0,Maggie,Dog,Adoption,0.0,Stray,,0
3,30812,Abandoned,0,Pretty Girl,Cat,Foster,0.0,Stray,,0
4,31469,Incompatible,0,Bonnie,Dog,Adoption,0.0,Incompatible,,0


For the next step we need to map out each animal's movement through the shelter. Some animals will have multiple records corresponding to a sequence of movements through the shelter. For example, a cat called 'Independence' was brought to the shelter as a stray, fostered twice, adopted, subsequently returned because the owner couldn't afford her, and finally adopted once more.

In [184]:
# All records belonging to the animal with id 56475.
df[df['id'] == 56475]

Unnamed: 0,id,intakereason,istransfer,animalname,speciesname,movementtype,istrial,returnedreason,deceaseddate,puttosleep
1430,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1431,56475,Stray,0,Independence,Cat,Foster,0.0,Stray,,0
1432,56475,Stray,0,Independence,Cat,Adoption,0.0,Unable to Afford,,0
1433,56475,Stray,0,Independence,Cat,Adoption,0.0,Stray,,0


We can model Independence's movement as an ordered sequence of states:

```
[ Stray, Foster, Foster, Adoption, Unable to Afford, Adoption ]
```

Sankey charts work on the basis of state transitions. We can thus break Independence's movements down into a list of tuples, where the first item of each tuple is the state before a transition and the second item is the state after it.

```
[(Stray, Foster), 
 (Foster, Foster),
 (Foster, Adoption),
 (Adoption, Unable to Afford),
 (Unable to Afford, Adoption)]
```

If we do this for each animal, we can account for the movements of all animals through the shelter over time. This is accomplished below.

In [185]:
# Examine the group of records for each animal
# and return a sequence of state transitions.
def sequence_transitions(group):
    """Flatten one animal's records into an ordered list of states.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows for a single animal, in the order they should be read.
        The columns used are ``istransfer``, ``intakereason``,
        ``movementtype``, ``returnedreason``, ``puttosleep`` and
        ``deceaseddate``.

    Returns
    -------
    list
        The intake state (``"Transfer"`` or the intake reason), followed
        by each row's movement type and any meaningful return reason, and
        finally ``"Euthanized"`` or ``"Died"`` if the last row records a
        death. Missing (NaN/None) values never appear in the result.
    """
    def flag_is_set(flag):
        # NaN is truthy in Python, so a missing flag has to be rejected
        # explicitly or it would be read as "yes".
        return not pd.isna(flag) and bool(flag)

    # Return reasons that are not recorded as a state of their own:
    # 'Stray' is the default/empty value and 'DOA' is caught later;
    # 'Owner requested Euthanasia' is excluded as well.
    ignored_return_reasons = ('Stray', 'DOA', 'Owner requested Euthanasia')

    # an accumulator for this animal's transitions
    seq = []
    # the position of the last record in this group
    last_idx = len(group) - 1
    for row_idx, (_, row) in enumerate(group.iterrows()):
        # Add intake reason on first row
        if row_idx == 0:
            seq.append("Transfer" if flag_is_set(row.istransfer) else row.intakereason)
        # Add intermediate movement step.
        seq.append(row.movementtype)
        # Add reason for return, unless it is one of the ignored values.
        if row.returnedreason not in ignored_return_reasons:
            seq.append(row.returnedreason)
        # Check for animal death on last row
        if row_idx == last_idx:
            if flag_is_set(row.puttosleep):
                seq.append("Euthanized")
            elif isinstance(row.deceaseddate, str):
                seq.append("Died")
    # Missing cells would otherwise become a bogus `nan` state (and node).
    return [state for state in seq if not pd.isna(state)]

# Convert a list of items to a pairwise list of tuples.
# ex: [A, B, C, D] => [(A,B), (B,C), (C,D)]
def tuple_pairwise(lst):
    """Pair every item of `lst` with the item that follows it, in order."""
    # Zipping the list against itself shifted by one stops at the shorter
    # side, so the final item is never used as a left-hand element.
    return list(zip(lst, lst[1:]))

# Build one flat list holding the state transitions of every animal:
# group the records by animal id, turn each group into its state
# sequence, and split that sequence into (from, to) pairs.
sequences = [
    transition
    for _, animal_records in df.groupby('id')
    for transition in tuple_pairwise(sequence_transitions(animal_records))
]

In [186]:
# Peek at the first ten state transitions.
sequences[:10]

[('Moving', 'Adoption'),
 ('Moving', 'Adoption'),
 ('Abandoned', 'Adoption'),
 ('Abandoned', 'Foster'),
 ('Incompatible', 'Adoption'),
 ('Adoption', 'Incompatible'),
 ('Incompatible', 'Adoption'),
 ('Abandoned', 'Transfer'),
 ('Abandoned', 'Transfer'),
 ('Abandoned', 'Foster')]

Next, we need to count how many instances of each type of transition we have. A `Counter` object will do this for us. We can see that the most common transition is Foster->Adoption, followed by Stray->Adoption. This is in line with intuition.

In [187]:
from collections import Counter

# Tally how often each (from, to) transition occurs.
c = Counter(sequences)
# The ten most frequent transitions, highest count first.
c.most_common(10)

[(('Foster', 'Adoption'), 644),
 (('Stray', 'Adoption'), 598),
 (('Incompatible', 'Adoption'), 480),
 (('Stray', 'Reclaimed'), 421),
 (('Stray', 'Foster'), 331),
 (('Foster', 'Foster'), 142),
 (('Adoption', 'Incompatible'), 138),
 (('Moving', 'Adoption'), 137),
 (('Incompatible', 'Foster'), 125),
 (('Litter relinquishment', 'Adoption'), 115)]

We need to reshape this slightly to fit with the Sankey widget's expected input.

In [188]:
def sankey_node(key):
    """Build the link record for one transition.

    `key` is a (source, target) pair; the link's value is that pair's
    count in the notebook-level Counter `c`.
    """
    source, target = key[0], key[1]
    return dict(source=source, target=target, value=c[key])
    
# One link record per distinct transition seen in the data.
links = [sankey_node(transition) for transition in c]
# Preview the ten heaviest links.
sorted(links, key=lambda link: link['value'], reverse=True)[:10]

[{'source': 'Foster', 'target': 'Adoption', 'value': 644},
 {'source': 'Stray', 'target': 'Adoption', 'value': 598},
 {'source': 'Incompatible', 'target': 'Adoption', 'value': 480},
 {'source': 'Stray', 'target': 'Reclaimed', 'value': 421},
 {'source': 'Stray', 'target': 'Foster', 'value': 331},
 {'source': 'Foster', 'target': 'Foster', 'value': 142},
 {'source': 'Adoption', 'target': 'Incompatible', 'value': 138},
 {'source': 'Moving', 'target': 'Adoption', 'value': 137},
 {'source': 'Incompatible', 'target': 'Foster', 'value': 125},
 {'source': 'Litter relinquishment', 'target': 'Adoption', 'value': 115}]

To prevent the Sankey widget from becoming overloaded, we can limit our view to the more frequent transitions.

In [189]:
# Keep only the links whose transition occurred more than 30 times.
top = [link for link in links if link['value'] > 30]

Finally, we can draw the Sankey chart. Note the existence of cycles!

In [190]:
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout

# Size of the widget, given as plain number strings.
layout = Layout(height="1024", width="1280")
# Draw only the filtered, most frequent links.
sankey = SankeyWidget(layout=layout, links=top)
sankey

SankeyWidget(layout=Layout(height='1024', width='1280'), links=[{'source': 'Moving', 'target': 'Adoption', 'va…