In [8]:
import plotly.graph_objects as go

# Minimal Sankey example. Each flow is a (source, target, value) triple whose
# source/target numbers are positions in the node label list below.
demo_flows = [
    (0, 2, 8), (1, 3, 4), (0, 3, 2), (2, 4, 8), (3, 4, 4), (3, 5, 2),
    (0, 6, 3), (1, 6, 1), (6, 7, 4), (7, 8, 2), (7, 5, 2),
]

demo_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": ["A1", "A2", "B1", "B2", "C1", "C2", "R", "O", "B"],
        "color": "blue",
    },
    link={
        "source": [flow[0] for flow in demo_flows],
        "target": [flow[1] for flow in demo_flows],
        "value": [flow[2] for flow in demo_flows],
    },
)

fig = go.Figure(data=[demo_trace])
fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

In [18]:
import os
import pandas as pd
import itertools

In [12]:
# Load the tab-separated table of per-donor counts (one column per sample).
df = pd.read_csv("summaries.tsv", sep="\t")
df

Unnamed: 0,donor,101JH_week_04_S2.summary.txt,101JH_week_12_S3.summary.txt,101JH_week_16_S4.summary.txt,101JH_week_20_S5.summary.txt,101JH_week_24_S6.summary.txt,101JH_week_28_S7.summary.txt,101JH_week_32_S8.summary.txt,101JH_week_36_S9.summary.txt,101JH_week_40_S10.summary.txt,101JH_week_44_S11.summary.txt,101JH_week_48_S12.summary.txt,101JH_week_52_S13.summary.txt
0,AE05.,590402,2991236,3273873,183721,2100581,4441139,8076089,370927,515026,1161879,3398915,4019807
1,CB06.,3882383,900369,653589,258411,259115,857911,1379013,3165433,3351337,1425974,2028028,1416890
2,QC07.,1714151,7324052,4889184,1161170,8677766,7292872,6018808,3164595,4138392,2322309,1631404,2616304
3,ambiguous,4339521,6828467,4707413,495020,1480530,5530693,9913920,4760182,4528240,2213343,4642481,5791381
4,unassigned,17544084,14153743,14079757,7046736,17285125,16223652,10043103,15946383,17663326,15368519,10325633,11176456


In [13]:
# Donor labels, in the row order of the table.
donors = df['donor'].tolist()
donors

['AE05.', 'CB06.', 'QC07.', 'ambiguous', 'unassigned']

In [17]:
# Sample columns are every column except the donor label column. Selecting by
# name (rather than popping position 0) keeps this correct even if the column
# order of the input file changes.
samples = [column for column in df.columns if column != 'donor']
samples

['101JH_week_04_S2.summary.txt',
 '101JH_week_12_S3.summary.txt',
 '101JH_week_16_S4.summary.txt',
 '101JH_week_20_S5.summary.txt',
 '101JH_week_24_S6.summary.txt',
 '101JH_week_28_S7.summary.txt',
 '101JH_week_32_S8.summary.txt',
 '101JH_week_36_S9.summary.txt',
 '101JH_week_40_S10.summary.txt',
 '101JH_week_44_S11.summary.txt',
 '101JH_week_48_S12.summary.txt',
 '101JH_week_52_S13.summary.txt']

In [27]:
# Re-index the table by donor so counts can be looked up by (donor, sample).
dfr = df.set_index('donor')
dfr.index.tolist()

['AE05.', 'CB06.', 'QC07.', 'ambiguous', 'unassigned']

In [29]:
# Sanity check: look up one count with a single (row label, column label)
# .loc call instead of chained indexing.
dfr.loc['AE05.', '101JH_week_04_S2.summary.txt']

590402

In [30]:
# Preview the (index, (sample, donor)) numbering produced by the product.
[(idx, pair) for idx, pair in enumerate(itertools.product(samples, donors))]

[(0, ('101JH_week_04_S2.summary.txt', 'AE05.')),
 (1, ('101JH_week_04_S2.summary.txt', 'CB06.')),
 (2, ('101JH_week_04_S2.summary.txt', 'QC07.')),
 (3, ('101JH_week_04_S2.summary.txt', 'ambiguous')),
 (4, ('101JH_week_04_S2.summary.txt', 'unassigned')),
 (5, ('101JH_week_12_S3.summary.txt', 'AE05.')),
 (6, ('101JH_week_12_S3.summary.txt', 'CB06.')),
 (7, ('101JH_week_12_S3.summary.txt', 'QC07.')),
 (8, ('101JH_week_12_S3.summary.txt', 'ambiguous')),
 (9, ('101JH_week_12_S3.summary.txt', 'unassigned')),
 (10, ('101JH_week_16_S4.summary.txt', 'AE05.')),
 (11, ('101JH_week_16_S4.summary.txt', 'CB06.')),
 (12, ('101JH_week_16_S4.summary.txt', 'QC07.')),
 (13, ('101JH_week_16_S4.summary.txt', 'ambiguous')),
 (14, ('101JH_week_16_S4.summary.txt', 'unassigned')),
 (15, ('101JH_week_20_S5.summary.txt', 'AE05.')),
 (16, ('101JH_week_20_S5.summary.txt', 'CB06.')),
 (17, ('101JH_week_20_S5.summary.txt', 'QC07.')),
 (18, ('101JH_week_20_S5.summary.txt', 'ambiguous')),
 (19, ('101JH_week_20_S5.summ

### Generate the data

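Note that the `nodes` and `links` tables built below are laid out so that they could be used with the dataframe-based approach described in https://stackoverflow.com/questions/50486767/plotly-how-to-draw-a-sankey-diagram-from-a-dataframe

but in reality, I just reuse the basic Sankey diagram code from the top of this notebook, feeding it the `simple_*` lists.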

In [57]:
# Build the node/link tables plus the flat lists passed to go.Sankey.
#
# Node 0 is a shared root node. Every (sample, donor) pair, in the order
# produced by itertools.product(samples, donors), then becomes node 1, 2, ...
# The link into each node starts at the same donor's node from the previous
# sample (or at the root node for the first sample), so each donor forms a
# chain across the samples.
NODE_COLOUR = '#4994CE'
LINK_COLOUR = 'rgba(127, 194, 65, 0.2)'

# Most recent node id seen for each donor; all donors start at the root node.
sources = {x: 0 for x in donors}
# The root row carries a colour too, so every row matches the three-column header.
nodes = [["Node", "Donor", "Colour"], [0, 'unassigned', NODE_COLOUR]]
labels = []
# for a simple figure
simple_labels = ["unassigned"]
simple_source = []
simple_target = []
simple_value = []
links = [['Source', 'Target', 'Value', 'Link Colour']]
for node_id, (sample, donor) in enumerate(itertools.product(samples, donors), start=1):
    previous_node = sources[donor]
    # Single label-based lookup, reused for both the table and the flat list.
    count = dfr.loc[donor, sample]
    links.append([previous_node, node_id, count, LINK_COLOUR])
    simple_source.append(previous_node)
    simple_target.append(node_id)
    simple_value.append(count)
    # The next sample's link for this donor will start from this node.
    sources[donor] = node_id
    # Record each sample name once, keyed off the 'unassigned' donor row.
    if donor == 'unassigned':
        labels.append(sample.replace('.summary.txt', ''))
    nodes.append([node_id, donor, NODE_COLOUR])
    simple_labels.append(donor)


In [59]:
# Sankey of donor counts across samples. Source/target values are node ids,
# which are also the positions of the matching entries in simple_labels.
abundance_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": simple_labels,
        "color": "blue",
    },
    link={
        "source": simple_source,
        "target": simple_target,
        "value": simple_value,
    },
)

fig = go.Figure(data=[abundance_trace])
fig.update_layout(title_text="Donor abundances by week", font_size=10)
fig.show()

## TODO

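The raw counts need to be converted to relative abundances by normalizing each sample column (dividing every count by its column total) before plotting, so that samples with different totals are comparable.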