In [1]:
import pandas as pd
import json

## PBMC Celllayers

In [4]:
data = pd.read_csv('sankey_data/celllayers/PBMC_celllayers_fig1.csv')
data[['source_label', 'target_label', 'value']]

Unnamed: 0,source_label,target_label,value
0,res.0.1_3,res.0.2_4,99
1,res.0.1_1,res.0.2_5,94
2,res.0.1_2,res.0.2_3,101
3,res.0.1_0,res.0.2_0,203
4,res.0.1_1,res.0.2_2,107
5,res.0.1_0,res.0.2_1,110
6,res.0.1_4,res.0.2_6,32
7,res.0.2_4,res.0.3_3,99
8,res.0.2_5,res.0.3_5,94
9,res.0.2_3,res.0.3_2,101


In [5]:
d = {'nodes': []}
layer1 = []
for row in data[data['source_label'].str.startswith('res.0.1')].groupby('source_label').sum('value').reset_index().itertuples():
    layer1.append({
        'name': row.source_label,
        'id': int(row.source_label.split('_')[-1]),
        'size': row.value
    })
d['nodes'].append(layer1)

for i in range(2, 6):
    layer = []
    for row in data[data['target_label'].str.startswith(f'res.0.{i}')].groupby('target_label').sum('value').reset_index().itertuples():
        layer.append({
            'name': row.target_label,
            'id': int(row.target_label.split('_')[-1]),
            'size': row.value
        })
    d['nodes'].append(layer)

In [6]:
d.update({'links': []})
for i in range(1, 6):
    links = []
    for row in data[data['source_label'].str.startswith(f'res.0.{i}')].itertuples():
        links.append({
            'source': row.source_label,
            'sourceid': int(row.source_label.split('_')[-1]),
            'target': row.target_label,
            'targetid': int(row.target_label.split('_')[-1]),
            'value': row.value
        })
    d['links'].append(links)

In [7]:
d

{'nodes': [[{'name': 'res.0.1_0', 'id': 0, 'size': 313},
   {'name': 'res.0.1_1', 'id': 1, 'size': 201},
   {'name': 'res.0.1_2', 'id': 2, 'size': 101},
   {'name': 'res.0.1_3', 'id': 3, 'size': 99},
   {'name': 'res.0.1_4', 'id': 4, 'size': 32}],
  [{'name': 'res.0.2_0', 'id': 0, 'size': 203},
   {'name': 'res.0.2_1', 'id': 1, 'size': 110},
   {'name': 'res.0.2_2', 'id': 2, 'size': 107},
   {'name': 'res.0.2_3', 'id': 3, 'size': 101},
   {'name': 'res.0.2_4', 'id': 4, 'size': 99},
   {'name': 'res.0.2_5', 'id': 5, 'size': 94},
   {'name': 'res.0.2_6', 'id': 6, 'size': 32}],
  [{'name': 'res.0.3_0', 'id': 0, 'size': 203},
   {'name': 'res.0.3_1', 'id': 1, 'size': 107},
   {'name': 'res.0.3_2', 'id': 2, 'size': 101},
   {'name': 'res.0.3_3', 'id': 3, 'size': 99},
   {'name': 'res.0.3_4', 'id': 4, 'size': 97},
   {'name': 'res.0.3_5', 'id': 5, 'size': 94},
   {'name': 'res.0.3_6', 'id': 6, 'size': 32},
   {'name': 'res.0.3_7', 'id': 7, 'size': 13}],
  [{'name': 'res.0.4_0', 'id': 0, 'siz

In [8]:
with open('sankey_data/celllayers/PBMC_celllayers_fig1.json', 'w') as fout:
    json.dump(d, fout)

## Sequence Flow

In [48]:
for f in ['msl_ref7_fig3', 'mucin2_fig2_right', 'read_correction_fig2_left']:
    with open(f'sankey_data/sequece_flow/{f}.json') as fin:
        data = json.load(fin)
    
    df = []
    for link in data['links']:
        source_layer_id = link['from']['column']
        source_id = link['from']['node']
        target_layer_id = link['to']['column']
        target_id = link['to']['node']
        value = link['value']
        df.append((
            source_layer_id, source_id, data['nodes'][source_layer_id][source_id]['label'],
            target_layer_id, target_id, data['nodes'][target_layer_id][target_id]['label'],
            value
        ))
    df = pd.DataFrame(df, columns=[
        'source_layer_id', 'source_id', 'source_label',
        'target_layer_id', 'target_id', 'target_label', 'value'])
    break
    
    nodes = []
    for i in range(len(data['nodes'])):
        node = []
        if i == 0:
            for row in df[df['source_layer_id'] == i].groupby(['source_id', 'source_label']).sum('value').reset_index().itertuples():
                node.append({
                    'name': row.source_label,
                    'id': row.source_id,
                    'size': value
                })
        else:
            for row in df[df['target_layer_id'] == i].groupby(['target_id', 'target_label']).sum('value').reset_index().itertuples():
                node.append({
                    'name': row.target_label,
                    'id': row.target_id,
                    'size': value
                })
        nodes.append(node)
    
    links = []
    for i in range(len(nodes)):
        link = []
        for row in df[df['source_layer_id'] == i].itertuples():
            link.append({
                'source': row.source_label,
                'sourceid': row.source_id,
                'target': row.target_label,
                'targetid': row.target_id,
                'value': row.value
            })
        links.append(link)
    
    with open(f'sankey_data/sequece_flow/{f}_proc.json', 'w') as fout:
        json.dump({'nodes': nodes, 'links': links}, fout)

In [52]:
for i in range(len(data['nodes'])):
    print(df[df['source_layer_id'] == i].groupby('source_id').sum('value')['value'].min())

1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
13
5
2
13
2
1
5
14
14
4
1
14
2
14
3
5
1
6
1
1
1
1
14
2
1
1
2
1
2
6
4
1
1
1
1
1
1
1
2
1
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
2
1
1
1
1
5
14
1
4
1
2
1
3
3
4
3
1
6
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
14
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
1
nan


In [57]:
df[df['source_layer_id'] == 166]

Unnamed: 0,source_layer_id,source_id,source_label,target_layer_id,target_id,target_label,value
235,166,1,T,167,0,N,1
486,166,3,G,167,1,G,1
679,166,0,V,167,2,D,1


In [56]:
data['nodes'][167]

[{'label': 'N'}, {'label': 'G'}, {'label': 'D'}]

## BioSankey

In [61]:
links = [
    ["k__Bacteria", "p__Actinobacteria", 4575],
    ["k__Bacteria", "p__Bacteroidetes", 17491],
    ["k__Bacteria", "p__Firmicutes", 12331],
    ["k__Bacteria", "p__Fusobacteria", 4429],
    ["k__Bacteria", "p__Proteobacteria", 8561],
    ["k__Bacteria", "p__Spirochaetes", 45],
    ["k__Bacteria", "p__Tenericutes", 103],
    ["p__Actinobacteria", "c__Actinobacteria_(class)", 4575],
    ["p__Bacteroidetes", "c__Bacteroidia", 17057],
    ["p__Bacteroidetes", "c__Flavobacteria", 434],
    ["p__Firmicutes", "c__Bacilli", 5423],
    ["p__Firmicutes", "c__Clostridia", 6908],
    ["p__Fusobacteria", "c__Fusobacteria_(class)", 4429],
    ["p__Proteobacteria", "c__Betaproteobacteria", 1242],
    ["p__Proteobacteria", "c__Epsilonproteobacteria", 681],
    ["p__Proteobacteria", "c__Gammaproteobacteria", 6638],
    ["p__Spirochaetes", "c__Spirochaetes_(class)", 45],
    ["p__Tenericutes", "c__Erysipelotrichi", 60],
    ["p__Tenericutes", "c__Mollicutes", 43],
    ["c__Actinobacteria_(class)", "o__Actinomycetales", 4369],
    ["c__Actinobacteria_(class)", "o__Coriobacteriales", 206],
    ["c__Bacteroidia", "o__Bacteroidales", 17057],
    ["c__Flavobacteria", "o__Flavobacteriales", 434],
    ["c__Bacilli", "o__Bacillales", 633],
    ["c__Bacilli", "o__Lactobacillales", 4790],
    ["c__Clostridia", "o__Clostridiales", 6908],
    ["c__Fusobacteria_(class)", "o__Fusobacteriales", 4429],
    ["c__Betaproteobacteria", "o__Neisseriales", 1242],
    ["c__Epsilonproteobacteria", "o__Campylobacterales", 681],
    ["c__Gammaproteobacteria", "o__Pasteurellales", 6638],
    ["c__Spirochaetes_(class)", "o__Spirochaetales", 45],
    ["c__Erysipelotrichi", "o__Erysipelotrichales", 60],
    ["c__Mollicutes", "o__Mycoplasmatales", 43],
    ["o__Actinomycetales", "f__Actinomycetaceae", 2074],
    ["o__Actinomycetales", "f__Corynebacteriaceae", 19],
    ["o__Actinomycetales", "f__Micrococcaceae", 2276],
    ["o__Coriobacteriales", "f__Coriobacteriaceae", 206],
    ["o__Bacteroidales", "f__Porphyromonadaceae", 148],
    ["o__Bacteroidales", "f__Prevotellaceae", 16909],
    ["o__Flavobacteriales", "f__Flavobacteriaceae", 434],
    ["o__Clostridiales", "f__", 686],
    ["o__Lactobacillales", "f__Carnobacteriaceae", 397],
    ["o__Lactobacillales", "f__Streptococcaceae", 4393],
    ["o__Clostridiales", "f__Clostridiales_Family_XI,_Incertae_Sedis", 21],
    ["o__Clostridiales", "f__Clostridiales_Family_XIII,_Incertae_Sedis", 39],
    ["o__Clostridiales", "f__Lachnospiraceae", 471],
    ["o__Clostridiales", "f__Veillonellaceae", 6324],
    ["o__Fusobacteriales", "f__Fusobacteriaceae", 4429],
    ["o__Neisseriales", "f__Neisseriaceae", 1242],
    ["o__Campylobacterales", "f__Campylobacteraceae", 681],
    ["o__Pasteurellales", "f__Pasteurellaceae", 6638],
    ["o__Spirochaetales", "f__Spirochaetaceae", 45],
    ["o__Erysipelotrichales", "f__Erysipelotrichaceae", 60],
    ["o__Mycoplasmatales", "f__Mycoplasmataceae", 43],
    ["f__Actinomycetaceae", "g__Actinomyces", 2074],
    ["f__Corynebacteriaceae", "g__Corynebacterium", 19],
    ["f__Micrococcaceae", "g__Rothia", 2276],
    ["f__Coriobacteriaceae", "g__Atopobium", 206],
    ["f__Pasteurellaceae", "g__", 5029],
    ["f__Porphyromonadaceae", "g__Porphyromonas", 67],
    ["f__Porphyromonadaceae", "g__Tannerella", 37],
    ["f__Prevotellaceae", "g__Prevotella", 16909],
    ["f__Flavobacteriaceae", "g__Capnocytophaga", 434],
    ["f__", "g__Gemella", 633],
    ["f__Carnobacteriaceae", "g__Granulicatella", 397],
    ["f__Streptococcaceae", "g__Streptococcus", 4393],
    ["f__Clostridiales_Family_XIII,_Incertae_Sedis", "g__Eubacterium", 23],
    ["f__Clostridiales_Family_XIII,_Incertae_Sedis", "g__Mogibacterium", 16],
    ["f__Lachnospiraceae", "g__Catonella", 62],
    ["f__Lachnospiraceae", "g__Moryella", 11],
    ["f__Lachnospiraceae", "g__Oribacterium", 193],
    ["f__Veillonellaceae", "g__Dialister", 78],
    ["f__Veillonellaceae", "g__Selenomonas", 78],
    ["f__Veillonellaceae", "g__Veillonella", 5836],
    ["f__Fusobacteriaceae", "g__Fusobacterium", 2929],
    ["f__Fusobacteriaceae", "g__Leptotrichia", 1500],
    ["f__Neisseriaceae", "g__Neisseria", 1242],
    ["f__Campylobacteraceae", "g__Campylobacter", 681],
    ["f__Pasteurellaceae", "g__Haemophilus", 2264],
    ["f__Spirochaetaceae", "g__Treponema", 45],
    ["f__Erysipelotrichaceae", "g__Bulleidia", 60],
    ["f__Mycoplasmataceae", "g__Mycoplasma", 43],
]
links_df = pd.DataFrame(links, columns=["source", "target", "value"])

In [62]:
nodes_t = [
    ["k__Bacteria"],
    [
        "p__Bacteroidetes",
        "p__Firmicutes",
        "p__Actinobacteria",
        "p__Proteobacteria",
        "p__Fusobacteria",
        "p__Spirochaetes",
        "p__Tenericutes"
    ],
    [
        "c__Bacteroidia",
        "c__Bacilli",
        "c__Actinobacteria_(class)",
        "c__Clostridia",
        "c__Flavobacteria",
        "c__Fusobacteria_(class)",
        "c__Gammaproteobacteria",
        "c__Betaproteobacteria",
        "c__Epsilonproteobacteria",
        "c__Spirochaetes_(class)",
        "c__Erysipelotrichi",
        "c__Mollicutes",
    ],
    [
        "o__Bacteroidales",
        "o__Actinomycetales",
        "o__Lactobacillales",
        "o__Clostridiales",
        "o__Coriobacteriales",
        "o__Flavobacteriales",
        "o__Fusobacteriales",
        "o__Pasteurellales",
        "o__Neisseriales",
        "o__Campylobacterales",
        "o__Spirochaetales",
        "o__Erysipelotrichales",
        "o__Mycoplasmatales",
    ],
    [
        "f__Prevotellaceae",
        "f__Porphyromonadaceae",
        "f__Porphyromonadaceae",
        "f__Micrococcaceae",
        "f__Actinomycetaceae",
        "f__Corynebacteriaceae",
        "f__Streptococcaceae",
        "f__Carnobacteriaceae",
        "f__Veillonellaceae",
        "f__",
        "f__Lachnospiraceae",
        "f__Clostridiales_Family_XIII,_Incertae_Sedis",
        "f__Coriobacteriaceae",
        "f__Fusobacteriaceae",
        "f__Flavobacteriaceae",
        "f__Pasteurellaceae",
        "f__Neisseriaceae",
        "f__Campylobacteraceae",
        "f__Spirochaetaceae",
        "f__Erysipelotrichaceae",
        "f__Mycoplasmataceae",
    ], 
    [
        "g__Prevotella",
        "g__Porphyromonas",
        "g__Tannerella",
        "g__Rothia",
        "g__Actinomyces",
        "g__Corynebacterium",
        "g__Atopobium",
        "g__Streptococcus",
        "g__Granulicatella",
        "o__Bacillales",
        "g__Veillonella",
        "g__Dialister",
        "g__Selenomonas",
        "g__Gemella",
        "f__Clostridiales_Family_XI,_Incertae_Sedis",
        "g__Oribacterium",
        "g__Catonella",
        "g__Moryella",
        "g__Eubacterium",
        "g__Mogibacterium",
        "g__Capnocytophaga",
        "g__Fusobacterium",
        "g__Leptotrichia",
        "g__",
        "g__Haemophilus",
        "g__Neisseria",
        "g__Campylobacter",
        "g__Treponema",
        "g__Bulleidia",
        "g__Mycoplasma",
    ]
]

In [78]:
def find_node(node_list, name):
    for i, l in enumerate(node_list):
        try:
            return i, l.index(name)
        except:
            continue
    return None

In [79]:
find_node(nodes_t, 'g__Neisseria')

(5, 25)

In [80]:
new_df = []
for row in links_df.itertuples():
    source_layer_id, source_id = find_node(nodes_t, row.source)
    target_layer_id, target_id = find_node(nodes_t, row.target)
    new_df.append((
        source_layer_id, source_id, row.source, target_layer_id, target_id, row.target, row.value
    ))
new_df = pd.DataFrame(new_df, columns=['source_layer_id', 'source_id', 'source_label', 'target_layer_id', 'target_id', 'target_label', 'value'])

In [81]:
new_df

Unnamed: 0,source_layer_id,source_id,source_label,target_layer_id,target_id,target_label,value
0,0,0,k__Bacteria,1,2,p__Actinobacteria,4575
1,0,0,k__Bacteria,1,0,p__Bacteroidetes,17491
2,0,0,k__Bacteria,1,1,p__Firmicutes,12331
3,0,0,k__Bacteria,1,4,p__Fusobacteria,4429
4,0,0,k__Bacteria,1,3,p__Proteobacteria,8561
...,...,...,...,...,...,...,...
77,4,17,f__Campylobacteraceae,5,26,g__Campylobacter,681
78,4,15,f__Pasteurellaceae,5,24,g__Haemophilus,2264
79,4,18,f__Spirochaetaceae,5,27,g__Treponema,45
80,4,19,f__Erysipelotrichaceae,5,28,g__Bulleidia,60


In [82]:
nodes = []
for i in range(new_df.target_layer_id.max() + 1):
    node = []
    if i == 0:
        for row in new_df[new_df['source_layer_id'] == i].groupby(['source_id', 'source_label']).sum('value').reset_index().itertuples():
            node.append({
                'name': row.source_label,
                'id': row.source_id,
                'size': value
            })
    else:
        for row in new_df[new_df['target_layer_id'] == i].groupby(['target_id', 'target_label']).sum('value').reset_index().itertuples():
            node.append({
                'name': row.target_label,
                'id': row.target_id,
                'size': value
            })
    nodes.append(node)

links = []
for i in range(len(nodes)):
    link = []
    for row in new_df[new_df['source_layer_id'] == i].itertuples():
        link.append({
            'source': row.source_label,
            'sourceid': row.source_id,
            'target': row.target_label,
            'targetid': row.target_id,
            'value': row.value
        })
    links.append(link)

In [85]:
with open('biosankey_proc.json', 'w') as fout:
    json.dump({'nodes': nodes, 'links': links}, fout)