# Generate Sankey Diagram
Generate a Sankey Diagram to better understand how the physical resources (the logbooks, the address book, and the cards) map onto the abstract resources.

Conclusion: A Sankey diagram is useful, but it oversimplifies the contribution of each resource. Because some events are documented by more than one resource (this happens for subscription events, but not for borrow events), each such event is counted once per resource, so this chart is misleading. The full breakdown is available below the generated image.

In [18]:
import pandas as pd
import plotly.graph_objects as go
import warnings
# Silence every warning category for the rest of the session so that cell
# outputs stay uncluttered. Note that this also hides warnings that may point
# at real problems; narrow the filter if a specific warning needs to surface.
warnings.filterwarnings('ignore')

In [19]:
# SCoData v1.1 (2021-01) CSV exports hosted on dataspace.princeton.edu,
# keyed by the table each file contains.
csv_urls = {
    'members': 'https://dataspace.princeton.edu/bitstream/88435/dsp01b5644v608/2/SCoData_members_v1.1_2021-01.csv',
    'books': 'https://dataspace.princeton.edu/bitstream/88435/dsp016d570067j/2/SCoData_books_v1.1_2021-01.csv',
    'events': 'https://dataspace.princeton.edu/bitstream/88435/dsp012n49t475g/2/SCoData_events_v1.1_2021-01.csv',
}

# Download each table into its own DataFrame (members, then books, then events).
members_df, books_df, events_df = (
    pd.read_csv(csv_urls[table]) for table in ('members', 'books', 'events')
)

In [20]:
# List the distinct event types present in the events table.
events_df['event_type'].unique()

array(['Subscription', 'Renewal', 'Reimbursement', 'Supplement',
       'Crossed out', 'Request', 'Generic', 'Gift', 'Loan', 'Purchase',
       'Borrow', 'Periodical Subscription', 'Separate Deposit'],
      dtype=object)

In [21]:
# Split the events into the two groups compared in the diagram.
# Subscriptions, renewals and supplements are all treated as subscription events.
# .copy() makes each subset an independent DataFrame, so columns added to it
# in later cells are not assignments on a slice of events_df (the pattern that
# triggers pandas' SettingWithCopyWarning).
subscription_events = events_df[
    events_df['event_type'].isin(['Subscription', 'Renewal', 'Supplement'])
].copy()
borrow_events = events_df[events_df['event_type'] == 'Borrow'].copy()

In [22]:
def get_semicolon_counts(series):
    """Count how often each value occurs in a series of ';'-separated strings.

    Every cell is split on ';' and each resulting piece is tallied, so a cell
    such as 'Logbook;Address Book' contributes one count to both values.
    Missing cells (NaN/None) are skipped instead of raising a TypeError.

    Parameters
    ----------
    series : pandas.Series
        Series of strings, each holding one or more ';'-separated values.

    Returns
    -------
    dict
        Maps each distinct value to its number of occurrences; keys appear in
        the order in which the values are first encountered.
    """
    from collections import Counter

    counts = Counter()
    # dropna() removes missing cells, which cannot be split into values.
    for cell in series.dropna():
        counts.update(cell.split(';'))
    # Return a plain dict, as callers only rely on the mapping interface.
    return dict(counts)

# Tally how many events each physical source documents, per event group.
subscription_counts = get_semicolon_counts(subscription_events['source_type'])
borrow_counts = get_semicolon_counts(borrow_events['source_type'])

# Node labels; a link refers to a node by its position in this list.
label = ['Logbook', 'Lending Library Card', 'Address Book',
         'Addresses', 'People', 'Subscription Events', 'Borrow Events']

# One [source label, target label, event count] triple per link,
# subscription links first, then borrow links.
pairs = []
for event_group, group_counts in (('Subscription Events', subscription_counts),
                                  ('Borrow Events', borrow_counts)):
    for physical_resource, value in group_counts.items():
        pairs.append([physical_resource, event_group, value])

# Translate the labels of each link into node indices.
source = [label.index(origin) for origin, _, _ in pairs]
target = [label.index(destination) for _, destination, _ in pairs]
value = [count for _, _, count in pairs]

link = dict(source=source, target=target, value=value)
node = dict(label=label, pad=50, thickness=5)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(title=dict(text='Mapping physical to abstract resources'))
fig.show()

In [23]:
# Save a static copy of the Sankey figure next to the notebook.
image_path = 'physical-to-abstract-resources.jpg'
fig.write_image(image_path)

# How much overlap are we simplifying with this diagram?

In [26]:
def set_transform(x):
    """Normalise a ';'-separated string: drop duplicate items and sort them.

    For example, 'Logbook;Address Book;Logbook' becomes 'Address Book;Logbook',
    so that strings naming the same combination of items compare equal.
    """
    unique_items = set(x.split(';'))
    return ';'.join(sorted(unique_items))

# Canonical (deduplicated, sorted) source combination for every subscription
# event, followed by how often each combination occurs.
subscription_events['source_type_s'] = subscription_events['source_type'].apply(set_transform)
subscription_events['source_type_s'].value_counts()

Logbook                              7689
Lending Library Card;Logbook         1141
Lending Library Card                  560
Address Book                          348
Address Book;Lending Library Card      21
Name: source_type_s, dtype: int64

In [27]:
# Canonical (deduplicated, sorted) source combination for every borrow event,
# followed by how often each combination occurs.
borrow_events['source_type_s'] = borrow_events['source_type'].apply(set_transform)
borrow_events['source_type_s'].value_counts()

Lending Library Card    21040
Logbook                    21
Name: source_type_s, dtype: int64