<a href="https://colab.research.google.com/github/jessiejxyu2/ist526/blob/main/Question_5__Sankey_diagram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Essential Libraries

In [1]:
# python visualization libraries
import pandas as pd
import numpy as np

import math
import json


import plotly.express as px
import plotly.graph_objects as go

# for hierarchical data
import networkx as nx

# Load Data from GitHub

In [2]:
# The hierarchical dataset is hosted as a raw CSV on GitHub; pandas can read
# a CSV straight from a URL, see
# https://stackoverflow.com/questions/32400867/pandas-read-csv-from-url
url = 'https://raw.githubusercontent.com/smbillah/ist526/main/hierarchical_data.csv'

# Download and parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Last expression of the cell: preview the first five rows
df.head(5)

Unnamed: 0,Indent Level,Item and Group,Weight,Parent
0,0,All items,100.0,
1,1,Food and beverages,15.157,All items
2,2,Food,14.119,Food and beverages
3,3,Food at home,7.772,Food
4,4,Cereals and bakery products,1.001,Food at home


In [3]:
# Look at both ends of the frame, then list its column names.
# A cell only auto-renders its last expression, so display(...) is needed
# whenever more than one output should be shown from the same cell;
# display accepts several objects and renders them in order.
display(df.head(), df.tail(), df.columns)

Unnamed: 0,Indent Level,Item and Group,Weight,Parent
0,0,All items,100.0,
1,1,Food and beverages,15.157,All items
2,2,Food,14.119,Food and beverages
3,3,Food at home,7.772,Food
4,4,Cereals and bakery products,1.001,Food at home


Unnamed: 0,Indent Level,Item and Group,Weight,Parent
289,4,Funeral expenses,0.14,Miscellaneous personal services
290,4,Laundry and dry cleaning services,0.22,Miscellaneous personal services
291,4,Apparel services other than laundry and dry cl...,0.03,Miscellaneous personal services
292,4,Financial services,0.229,Miscellaneous personal services
293,4,Unsampled items,0.111,Miscellaneous personal services


Index(['Indent Level', 'Item and Group', 'Weight', 'Parent'], dtype='object')

## Pre-processing

In [4]:
# Replace every missing value (NaN) with an empty string; per the original
# note, plotly does not cope well with NaN in this data.
# Rebinding `df` to the filled copy (instead of inplace=True) avoids mutating
# the frame object that earlier cells already displayed.
df = df.fillna('')

# Show the first rows so the filled values can be checked
display(df.head())


Unnamed: 0,Indent Level,Item and Group,Weight,Parent
0,0,All items,100.0,
1,1,Food and beverages,15.157,All items
2,2,Food,14.119,Food and beverages
3,3,Food at home,7.772,Food
4,4,Cereals and bakery products,1.001,Food at home


# Sankey Diagram (Edge/Flow visualization)
Up until now, we haven't paid attention to the edges of a tree. Enter the [Sankey](https://en.wikipedia.org/wiki/Sankey_diagram) diagram.

A Sankey diagram is a flow diagram, in which the width of arrows is proportional to the flow quantity.

[Ref](https://plotly.com/python/sankey-diagram/)


## Basic Sankey Diagram

`source` to represent the source node, 

`target` for the target node, 

`value` to set the flow volume, and 

`label` that shows the node name

In [22]:
# Outline drawn around every node rectangle
line = dict(color="black", width=0.5)
print(line)

# Node styling plus the node names. A label's position in the list is the
# integer id that the flows below use to refer to that node.
# NOTE(review): 'automatic leasing' probably should read 'automotive leasing' — confirm.
node = dict(
    pad=15,
    thickness=20,
    line=line,
    label=[
        'automotive sales',               # 0
        'automatic leasing',              # 1
        'automotive regulatory credits',  # 2
        'Energy Generation and Storage',  # 3
        'Service and Others',             # 4
        'Total Revenue',                  # 5
        'Gross Profit',                   # 6
        'Cost of Revenue',                # 7
        'Operating Income',               # 8
        'Operating Expenses',             # 9
        'Interest Income',                # 10
        'Net Income',                     # 11
        'Interest Expense',               # 12
        'Tax&Other Expense',              # 13
        'SG&A',                           # 14
        'R&D',                            # 15
    ],
    color="blue",
)
print(node)

# One (source index, target index, value) triple per flow.
# Labels 10-15 are not referenced by any flow yet. The original draft kept
# these extra flows commented out:
#   source 8, 8, 8, 9, 9, 10 -> target 11, 12, 13, 14, 15, 11
#   value  0.21, 2.5, 0.03, 0.31, 1.1, 0.77
flows = [
    (0, 5, 18.9),
    (1, 5, 0.56),
    (2, 5, 0.52),
    (3, 5, 1.5),
    (4, 5, 1.8),
    (5, 6, 4.5),
    (5, 7, 18.8),
    (6, 8, 2.7),
    (6, 9, 1.8),
]

# Plotly wants three parallel lists rather than a list of triples
link = {
    'source': [src for src, _, _ in flows],
    'target': [dst for _, dst, _ in flows],
    'value': [amount for _, _, amount in flows],
}
print(link)

{'color': 'black', 'width': 0.5}
{'pad': 15, 'thickness': 20, 'line': {'color': 'black', 'width': 0.5}, 'label': ['automotive sales', 'automatic leasing', 'automotive regulatory credits', 'Energy Generation and Storage', 'Service and Others', 'Total Revenue', 'Gross Profit', 'Cost of Revenue', 'Operating Income', 'Operating Expenses', 'Interest Income', 'Net Income', 'Interest Expense', 'Tax&Other Expense', 'SG&A', 'R&D'], 'color': 'blue'}
{'source': [0, 1, 2, 3, 4, 5, 5, 6, 6], 'target': [5, 5, 5, 5, 5, 6, 7, 8, 9], 'value': [18.9, 0.56, 0.52, 1.5, 1.8, 4.5, 18.8, 2.7, 1.8]}


In [24]:
# Wrap the node/link definitions from the previous cell in a single Sankey trace
fig = go.Figure(data=[go.Sankey(node=node, link=link)])

# Set the title and the base font size, then render the figure
fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

## Complex one

In [24]:
# Build the inputs Plotly's Sankey trace expects: nodes are addressed by their
# integer position in the label list, and links are given as parallel
# source / target / value lists.
labels = df["Item and Group"].to_list()

# Item name -> node index. Enumerating the values (rather than looking up
# df[col][i] for i in range(len(df))) stays correct even when df does not
# carry a default 0..n-1 index.
# Note: if a name occurs more than once, the last occurrence wins.
label_dict = {name: idx for idx, name in enumerate(labels)}

# Every row except the root ('All items') contributes one parent -> child link
children = df[df["Item and Group"] != 'All items']

# A parent name that is missing from label_dict raises KeyError, so a broken
# hierarchy fails loudly instead of producing a silently wrong diagram.
source = [label_dict[parent] for parent in children["Parent"]]
target = [label_dict[item] for item in children["Item and Group"]]
value = children["Weight"].to_list()


# Outline drawn around every node rectangle
line = {'color': "black", 'width': 0.5}

# Flows: source[i] -> target[i], with width given by value[i]
link = {
      'source': source,
      'target': target,
      'value' : value
    }

# Node styling; labels are in the same order used to build label_dict
node = {'pad': 15,
        'thickness': 20,
        'line': line,
        'label': labels,
        'color': "blue",
        'hovertemplate': '%{label} is %{value} of spending'
      }


fig = go.Figure(
  data = [go.Sankey(node = node, link = link)]
)

# Title typo fixed ("Complext" -> "Complex")
fig.update_layout(
  title_text="Complex Sankey Diagram",
  font_size=10
)
fig.show()