In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from timeit import default_timer as timer

import plotly.graph_objects as go
import plotly
import kaleido

---
---
---

##### Sankey diagram

In [3]:
# JavaScript snippet that this notebook hands to Plotly through
# `fig.show(post_script=[js])` in the Sankey cells.
#
# It defines `sankeyNodeLabelsAlign(position, forcePos)`, which visits every
# element with class `sankey-node` inside the plot div, resets the `x`
# attribute of its `node-label` to 0, and then (depending on `position`)
# writes a new `x` and `text-anchor`. The function is registered as a
# `plotly_afterplot` handler and the event is emitted once right away.
#
# With the settings at the bottom of the script (`position = 'right'`,
# `forcePos = true`) only nodes whose bound data has both `horizontal` and
# `left` set receive a new `x`; all other nodes are skipped.
#
# NOTE(review): `{plot_id}` is assumed to be substituted by Plotly with the id
# of the figure's div before the script runs — confirm against the
# `post_script` documentation. The JavaScript braces in this string are not
# escaped, so it must not be passed through `str.format()`.
js = """
        const TEXTPAD = 3; // constant used by Plotly.js

        function sankeyNodeLabelsAlign(position, forcePos) {
          const textAnchor = {left: 'end', right: 'start', center: 'middle'}[position];
          const nodes = gd.getElementsByClassName('sankey-node');

          for (const node of nodes) {
            const d = node.__data__;
            const label = node.getElementsByClassName('node-label').item(0);

            // Ensure to reset any previous modifications
            label.setAttribute('x', 0);

            if (!d.horizontal)
              continue;

            // This is how Plotly's default text positioning is computed (coordinates
            // are relative to that of the cooresponding node).
            const padX = d.nodeLineWidth / 2 + TEXTPAD;
            const posX = padX + d.visibleWidth;
            let x;

            switch (position) {
              case 'left':
                if (d.left || d.node.originalLayer === 0 && !forcePos)
                  continue;
                x = -posX - padX;
                break;

              case 'right':
                if (!d.left || !forcePos)
                  continue;
                x = posX + padX;
                break;

              case 'center':
                if (!forcePos && (d.left || d.node.originalLayer === 0))
                  continue;
                x = (d.nodeLineWidth + d.visibleWidth)/2 + (d.left ? padX : -posX);
                break;
            }

            label.setAttribute('x', x);
            label.setAttribute('text-anchor', textAnchor);
          }
        }

        const gd = document.getElementById('{plot_id}');
        const position = 'right';
        const forcePos = true;

        gd.on('plotly_afterplot', sankeyNodeLabelsAlign.bind(gd, position, forcePos));
        gd.emit('plotly_afterplot');
     """

In [4]:
# Record counts kept after each successive stage (hard-coded totals).
all_elastic = 2_287_019
all_relevant = 1_275_471
all_topic = 620_656
all_thresholding = 257_266

# Records lost between one stage and the next.
drop_relevant = all_elastic - all_relevant
drop_topic = all_relevant - all_topic
drop_thersholding = all_topic - all_thresholding  # (sic) spelling kept: it is the name used in this notebook

# One value per line: each stage total, followed by the drop that led to it.
print(
    all_elastic,
    all_relevant, drop_relevant,
    all_topic, drop_topic,
    all_thresholding, drop_thersholding,
    sep="\n",
)

2287019
1275471
1011548
620656
654815
257266
363390


In [5]:
# Build a 30-step seaborn light palette ending at `base_color`, convert it to
# hex strings, and keep every 6th entry starting at index 6 (indices 6, 12,
# 18, 24), i.e. four colours that the Sankey cells index from the end.
base_color = "#F8CBA6"
sankey_colors = sns.light_palette(base_color, n_colors=30).as_hex()[6::6]

In [7]:
# Sankey funnel with 7 nodes and 6 links, drawn with blank node labels.
# Node index legend (what each blank label stands for):
#   0  2,287,019
#   1  1,275,471        2  drop [1,011,548]
#   3  620,656          4  drop [654,815]
#   5  257,266          6  drop [363,390]
fig = go.Figure(data=[go.Sankey(
    arrangement='snap',
    node={
        "x": [0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4],
        "y": [0.1, 0.2, 0.3, 0.2, 0.3, 0.2, 0.3],
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": [""] * 7,  # one empty label per node
        "color": ["#F8CBA6", "#FFE7CC", "lightgray", "#FFFBEB", "lightgray", "lightblue", "lightgray"],
    },
    link={
        # Each kept node (0, 1, 3) splits into the next kept node and a "drop" node.
        "source": [0, 0, 1, 1, 3, 3],
        "target": [1, 2, 3, 4, 5, 6],
        "value": [1_275_471, 1_011_548, 620_656, 654_815, 257_266, 363_390],
        "color": ["#F8CBA6", "lightgray", "#FFE7CC", "lightgray", "#FFFBEB", 'lightgray'],
    },
)])

fig.update_layout(width=2400, height=400, font_size=10)

# Static exports first (PNG, then PDF), then the interactive view.
for sankey_out_path in ("sankey_without_labels.png", "sankey_without_labels.pdf"):
    fig.write_image(sankey_out_path)

# `js` is the label-alignment script defined earlier in this notebook.
fig.show(post_script=[js])

In [10]:
# Load the final table; column 0 becomes the index and `keep_default_na=False`
# turns off pandas' default NA-string detection.
# NOTE(review): this load emits a "mixed types" warning for columns
# (1,34,86,111,116,117,120). Equality filters on any of those columns may depend
# on how each value was inferred — consider passing an explicit `dtype=` and
# re-checking the downstream counts.
df = pd.read_csv(
    '../../../../ncs/PaperMaterials/Data/257k_final_relevants_with_all_fields.csv',
    index_col=0,
    keep_default_na=False,
)


Columns (1,34,86,111,116,117,120) have mixed types. Specify dtype option on import or set low_memory=False.



In [11]:
# Record counts kept after each successive stage (hard-coded totals) and the
# number of records lost between consecutive stages.
all_elastic = 2287019
all_relevant = 1275471
drop_relevant = all_elastic - all_relevant
all_topic = 620656
drop_topic = all_relevant - all_topic
all_thresholding = 257266
drop_thersholding = all_topic - all_thresholding  # (sic) spelling kept: it is the name used in this notebook

# One value per line: each stage total, followed by the drop that led to it.
print(
    all_elastic,
    all_relevant, drop_relevant,
    all_topic, drop_topic,
    all_thresholding, drop_thersholding,
    sep="\n",
)

# Row counts per annotation in `df`.
# Counting the True values of a boolean mask gives the same number as
# `df[mask].shape[0]` without copying the whole (wide) frame for every count.
# `int(...)` keeps the results plain Python ints, as `.shape[0]` returned.
is_geolocated = df['geolocation_status'] == 'found'
has_biodiv = df['biodiv_species_name'] != 'Not Found'

geolocated = int(is_geolocated.sum())
print('geolocated:', geolocated)
biodiv = int(has_biodiv.sum())
print('biodiv:', biodiv)
# `.eq(True)` (rather than using the column directly as a mask) is deliberate:
# it yields a clean boolean mask whatever the column's dtype is.
cost = int(df['contains_cost_layer1'].eq(True).sum())
print('cost:', cost)
esj = int(df['esj_s1'].eq(True).sum())
print('esj:', esj)
iplc = int(df['iplc_s2'].eq(True).sum())
print('iplc:', iplc)
# Rows that are both geolocated and matched to a species name; the two masks
# above are reused instead of being recomputed.
geoandbio = int((is_geolocated & has_biodiv).sum())
print('geoandbio:', geoandbio)

2287019
1275471
1011548
620656
654815
257266
363390
geolocated: 147696
biodiv: 107313
cost: 4992
esj: 2670
iplc: 3045
geoandbio: 64447


In [12]:
# Detailed Sankey with 13 nodes and 13 links, drawn with blank node labels.
# Node index legend (what each blank label stands for):
#    0  search      2,287,019
#    1  filter      1,275,471
#    2  drop        1,011,548
#    3  categorize    620,656
#    4  drop          654,815
#    5  relevant      257,266
#    6  drop          363,390
#    7  geolocated    147,696
#    8  biodiv        107,313
#    9  geo&bio        64,447
#   10  cost            4,992
#   11  equity          2,670
#   12  IPLC            3,045
fig = go.Figure(data=[go.Sankey(
    node=dict(
        x=[0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.5, 0.5, 0.5],
        y=[0.1, 0.2, 0.3, 0.2, 0.3, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7],
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=[""] * 13,  # one empty label per node
        # Exactly one colour per node (13 entries, matching x / y / label).
        color=[
            sankey_colors[-1],  # 0
            sankey_colors[-2],  # 1
            "lightgray",        # 2 (drop)
            sankey_colors[-3],  # 3
            "lightgray",        # 4 (drop)
            sankey_colors[-4],  # 5
            "lightgray",        # 6 (drop)
        ] + [sankey_colors[-4]] * 6,  # 7-12
    ),
    arrangement='snap',
    link=dict(
        # Links 0-5 form the funnel (kept / drop pairs); links 6-10 fan out
        # from node 5; the last two join nodes 7 and 8 into node 9.
        source=[0, 0, 1, 1, 3, 3, 5, 5, 5, 5, 5, 7, 8],
        target=[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 9, 9],
        value=[
            1275471,
            1011548,
            620656,
            654815,
            257266,
            363390,
            147696,
            107313,
            4992,
            2670,
            3045,
            64447,
            64447,
        ],
        # One colour per link (13 entries).
        color=[
            sankey_colors[-1],
            "lightgray",
            sankey_colors[-2],
            "lightgray",
            sankey_colors[-3],
            'lightgray',
        ] + [sankey_colors[-4]] * 7,
    ),
)])

fig.update_layout(font_size=10, width=2400, height=400)

# Static exports first (PNG, then PDF), then the interactive view.
# The file names are kept exactly as they were so existing references still resolve.
fig.write_image("sankey_detailted_no_label_at_all.png")
fig.write_image("sankey_detailted_no_label_at_all.pdf")

# `js` is the label-alignment script defined earlier in this notebook.
fig.show(post_script=[js])