In [8]:
import pandas as pd
import json
import itertools as it

## PBMC Celllayers

In [9]:
# Load the PBMC link table (one row per source -> target flow) and preview
# the three columns the rest of this section relies on.
data = pd.read_csv('sankey_data/celllayers/PBMC_celllayers_fig1.csv')
data.loc[:, ['source_label', 'target_label', 'value']]

Unnamed: 0,source_label,target_label,value
0,res.0.1_3,res.0.2_4,99
1,res.0.1_1,res.0.2_5,94
2,res.0.1_2,res.0.2_3,101
3,res.0.1_0,res.0.2_0,203
4,res.0.1_1,res.0.2_2,107
5,res.0.1_0,res.0.2_1,110
6,res.0.1_4,res.0.2_6,32
7,res.0.2_4,res.0.3_3,99
8,res.0.2_5,res.0.3_5,94
9,res.0.2_3,res.0.3_2,101


In [11]:
# Display the first-layer node records in their unsorted (groupby) order.
# `layer1` is assigned by the node-building cell further down in this section,
# so that cell has to be executed before this one.
layer1

[{'name': 'res.0.1_0', 'id': 0, 'size': 313},
 {'name': 'res.0.1_1', 'id': 1, 'size': 201},
 {'name': 'res.0.1_2', 'id': 2, 'size': 101},
 {'name': 'res.0.1_3', 'id': 3, 'size': 99},
 {'name': 'res.0.1_4', 'id': 4, 'size': 32}]

In [13]:
# Show the reference ordering of the first layer.  `orig_orders` is a dict
# keyed by resolution level (1..5), so the first layer lives under key 1;
# `orig_orders[0]` raises KeyError and a bare `.index()` raises TypeError.
orig_orders[1]

['res.0.1_3', 'res.0.1_0', 'res.0.1_2', 'res.0.1_1', 'res.0.1_4']

In [14]:
# Preview the first layer re-ordered to follow the reference ordering.
# `orig_orders` is keyed by resolution level starting at 1, so the first
# layer's ordering is `orig_orders[1]` (key 0 does not exist).
sorted(layer1, key=lambda x: orig_orders[1].index(x["name"]))

[{'name': 'res.0.1_3', 'id': 3, 'size': 99},
 {'name': 'res.0.1_0', 'id': 0, 'size': 313},
 {'name': 'res.0.1_2', 'id': 2, 'size': 101},
 {'name': 'res.0.1_1', 'id': 1, 'size': 201},
 {'name': 'res.0.1_4', 'id': 4, 'size': 32}]

In [20]:
d = {'nodes': []}

# Desired display order of the clusters in each layer; the key is the N of the
# `res.0.N` resolution level.
orig_orders = {
    1: ["res.0.1_3", "res.0.1_0", "res.0.1_2", "res.0.1_1", "res.0.1_4"],
    2: ["res.0.2_4", "res.0.2_0", "res.0.2_1", "res.0.2_3", "res.0.2_2", "res.0.2_5", "res.0.2_6"],
    3: ["res.0.3_3", "res.0.3_0", "res.0.3_7", "res.0.3_4", "res.0.3_2", "res.0.3_1", "res.0.3_5", "res.0.3_6"],
    4: ["res.0.4_3", "res.0.4_0", "res.0.4_7", "res.0.4_2", "res.0.4_4", "res.0.4_1", "res.0.4_5", "res.0.4_6"],
    5: ["res.0.5_3", "res.0.5_0", "res.0.5_7", "res.0.5_2", "res.0.5_1", "res.0.5_4", "res.0.5_5", "res.0.5_6"]
}


def _collect_layer(label_col, level):
    """Build the node records of one resolution level.

    Parameters
    ----------
    label_col : str
        Column of `data` holding the cluster labels to aggregate on
        ('source_label' or 'target_label').
    level : int
        The N of the `res.0.N` resolution level to select.

    Returns
    -------
    list of dict
        One ``{'name', 'id', 'size'}`` record per cluster label, in the
        (sorted) order produced by the groupby.  ``id`` is the integer after
        the last underscore of the label, ``size`` the summed `value`.
    """
    # The trailing underscore keeps e.g. `res.0.1` from also matching a
    # hypothetical `res.0.10_*` label.
    in_level = data[label_col].str.startswith(f'res.0.{level}_')
    # Select `value` explicitly: `.sum('value')` would hand the string to the
    # `numeric_only` parameter instead of choosing a column.
    sizes = data[in_level].groupby(label_col)['value'].sum().reset_index()
    return [
        {
            'name': label,
            'id': int(label.split('_')[-1]),
            'size': size,
        }
        for label, size in zip(sizes[label_col], sizes['value'])
    ]


# The first layer has no incoming links, so it is sized by its outgoing flow.
layer1 = _collect_layer('source_label', 1)
d['nodes'].append(sorted(layer1, key=lambda x: orig_orders[1].index(x["name"])))

# Every later layer is sized by the flow arriving at it.
for i in range(2, 6):
    layer = _collect_layer('target_label', i)
    d['nodes'].append(sorted(layer, key=lambda x: orig_orders[i].index(x["name"])))

In [21]:
# Collect the links leaving each resolution level as one list per level.
d['links'] = []
for i in range(1, 6):
    # The trailing underscore keeps `res.0.1` from also matching a
    # hypothetical `res.0.10_*` label.
    from_level = data[data['source_label'].str.startswith(f'res.0.{i}_')]
    links = [
        {
            'source': row.source_label,
            # Cluster ids are the integer after the last underscore of a label.
            'sourceid': int(row.source_label.split('_')[-1]),
            'target': row.target_label,
            'targetid': int(row.target_label.split('_')[-1]),
            'value': row.value,
        }
        for row in from_level.itertuples()
    ]
    d['links'].append(links)

In [22]:
# Attach the per-level display order and write the nodes/links/level payload.
# The `json` module serialises the integer keys of `orig_orders` as strings.
d["level"] = orig_orders
with open('sankey_data/celllayers/PBMC_celllayers_fig1_proc.json', 'w') as fout:
    json.dump(d, fout)

## Sequence Flow

In [2]:
def load_data_to_df(f):
    """Read a sequence-flow JSON file and tabulate its links.

    The file must contain a ``'nodes'`` entry indexable as
    ``nodes[layer][node]['label']`` and a ``'links'`` list whose items carry
    ``'from'``/``'to'`` (each with ``'column'`` and ``'node'``) and ``'value'``.

    Parameters
    ----------
    f : path-like
        Location of the JSON file.

    Returns
    -------
    (pandas.DataFrame, dict)
        One row per link with source/target layer id, node id and label plus
        the link value, and the parsed JSON document itself.
    """
    with open(f) as fin:
        data = json.load(fin)

    layers = data['nodes']
    records = []
    for link in data['links']:
        src, dst = link['from'], link['to']
        src_layer, src_node = src['column'], src['node']
        dst_layer, dst_node = dst['column'], dst['node']
        flow = link['value']
        # Labels are looked up from the node table by (layer, node) position.
        records.append((
            src_layer, src_node, layers[src_layer][src_node]['label'],
            dst_layer, dst_node, layers[dst_layer][dst_node]['label'],
            flow,
        ))

    column_names = [
        'source_layer_id', 'source_id', 'source_label',
        'target_layer_id', 'target_id', 'target_label', 'value',
    ]
    return pd.DataFrame(records, columns=column_names), data

def _layer_node_records(df, layer, side):
    """Return the node records of `layer`, sized from the `side` columns.

    `side` is ``'source'`` or ``'target'`` and selects the
    ``{side}_layer_id`` / ``{side}_id`` / ``{side}_label`` columns of `df`.
    """
    layer_col, id_col, label_col = f'{side}_layer_id', f'{side}_id', f'{side}_label'
    # Select `value` explicitly: `.sum('value')` would hand the string to the
    # `numeric_only` parameter instead of choosing the column to aggregate.
    sizes = (
        df[df[layer_col] == layer]
        .groupby([id_col, label_col])['value']
        .sum()
        .reset_index()
    )
    return [
        {'name': name, 'id': node_id, 'size': size}
        for node_id, name, size in zip(sizes[id_col], sizes[label_col], sizes['value'])
    ]


def get_nodes_links(df, layer_start=None, layer_end=None):
    """Slice a link table into per-layer node and link lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Link table with the columns ``source_layer_id``, ``source_id``,
        ``source_label``, ``target_layer_id``, ``target_id``,
        ``target_label`` and ``value``.
    layer_start : int, optional
        First layer to include; defaults to 0.
    layer_end : int, optional
        Last layer to include (inclusive); defaults to the largest layer id
        found in `df`.

    Returns
    -------
    dict
        ``{'nodes': [...], 'links': [...]}``.  ``nodes`` holds one list of
        ``{'name', 'id', 'size'}`` records per layer from `layer_start` to
        `layer_end`; ``links`` holds one list of
        ``{'source', 'sourceid', 'target', 'targetid', 'value'}`` records per
        layer from `layer_start` to ``layer_end - 1``.
    """
    if layer_start is None:
        layer_start = 0
    if layer_end is None:
        layer_end = max(df.source_layer_id.max(), df.target_layer_id.max())

    # The first layer is sized by what leaves it; every later layer by what
    # arrives at it.
    nodes = [
        _layer_node_records(df, i, 'source' if i == layer_start else 'target')
        for i in range(layer_start, layer_end + 1)
    ]

    # The last layer is excluded here: only links that start inside the
    # requested window and before its final layer are emitted.
    links = []
    for i in range(layer_start, layer_end):
        links.append([
            {
                'source': row.source_label,
                'sourceid': row.source_id,
                'target': row.target_label,
                'targetid': row.target_id,
                'value': row.value,
            }
            for row in df[df['source_layer_id'] == i].itertuples()
        ])

    return {'nodes': nodes, 'links': links}

In [4]:
# Export layers 51-57 of the `msl_ref7_fig3` sequence flow as nodes/links JSON.
# NOTE(review): this cell uses the directory `sankey_data/sequece_flow/` while
# another cell in this notebook uses `sankey_data/sequence_flow/` - confirm
# which spelling exists on disk.
f = 'msl_ref7_fig3'
df, _ = load_data_to_df(f'sankey_data/sequece_flow/{f}.json')

d = get_nodes_links(df, layer_start=51, layer_end=57)
# Alternative layer windows, kept for reference:
# d = get_nodes_links(df, layer_start=20, layer_end=30)
# d = get_nodes_links(df, layer_start=88, layer_end=117)
# d = get_nodes_links(df, layer_start=144, layer_end=145)

# The payload is written as returned by `get_nodes_links` (per-layer link lists).
with open(f'sankey_data/sequece_flow/{f}_proc.json', 'w') as fout:
    json.dump(d, fout)

In [7]:
# Export layers 189-200 of a sequence-flow data set with flattened links.
# f = 'mucin2_fig2_right'
f = 'read_correction_fig2_left'
df, _ = load_data_to_df(f'sankey_data/sequence_flow/{f}.json')

d = get_nodes_links(df, layer_start=189, layer_end=200)
# Alternative layer windows, kept for reference:
# d = get_nodes_links(df, layer_start=20, layer_end=30)
# d = get_nodes_links(df, layer_start=88, layer_end=117)
# d = get_nodes_links(df, layer_start=144, layer_end=145)

# Finish the payload BEFORE opening the output file: opening with 'w'
# truncates it, so a failure while preparing the data would otherwise leave
# an empty file behind.
d["level"] = {i: nl for i, nl in enumerate(d["nodes"])}  # layer index -> node records
d["links"] = list(it.chain.from_iterable(d["links"]))    # per-layer lists -> one flat list

with open(f'sankey_data/sequence_flow/{f}_proc.json', 'w') as fout:
    json.dump(d, fout)

In [15]:
# Reload the msl_ref7 link table together with the raw JSON document for
# interactive inspection.  This rebinds the notebook-level names `df` and `data`.
df, data = load_data_to_df('sankey_data/sequece_flow/msl_ref7_fig3.json')

In [30]:
# Keep only the links that start at layer 20 or later and end at layer 57 or earlier.
t = df[(df['source_layer_id'] >= 20) & (df['target_layer_id'] <= 57)]

In [34]:
# Inspect the links leaving layer 21 within the selected window.
t.loc[t['source_layer_id'] == 21]

Unnamed: 0,source_layer_id,source_id,source_label,target_layer_id,target_id,target_label,value
30,21,0,R,22,0,G,9
31,21,1,K,22,0,G,5


In [33]:
# Preview the node/link structure for layers 20-57; the result is only
# displayed, not stored.
get_nodes_links(df, layer_start=20, layer_end=57)

{'nodes': [[{'name': 'A', 'id': 0, 'size': 2},
   {'name': 'V', 'id': 1, 'size': 1},
   {'name': 'L', 'id': 2, 'size': 3},
   {'name': 'M', 'id': 3, 'size': 7},
   {'name': 'S', 'id': 4, 'size': 1}],
  [{'name': 'R', 'id': 0, 'size': 9}, {'name': 'K', 'id': 1, 'size': 5}],
  [{'name': 'G', 'id': 0, 'size': 14}],
  [{'name': 'N', 'id': 0, 'size': 14}],
  [{'name': 'V', 'id': 0, 'size': 10}, {'name': 'I', 'id': 1, 'size': 4}],
  [{'name': 'V', 'id': 0, 'size': 12},
   {'name': 'I', 'id': 1, 'size': 1},
   {'name': 'L', 'id': 2, 'size': 1}],
  [{'name': 'D', 'id': 0, 'size': 14}],
  [{'name': 'L', 'id': 0, 'size': 12}, {'name': 'M', 'id': 1, 'size': 2}],
  [{'name': 'A', 'id': 0, 'size': 14}],
  [{'name': 'V', 'id': 0, 'size': 11}, {'name': 'I', 'id': 1, 'size': 3}],
  [{'name': 'A', 'id': 0, 'size': 5}, {'name': 'G', 'id': 1, 'size': 9}],
  [{'name': 'V', 'id': 0, 'size': 13}, {'name': 'I', 'id': 1, 'size': 1}],
  [{'name': 'V', 'id': 0, 'size': 8}, {'name': 'I', 'id': 1, 'size': 6}],
  

## BioSankey

In [47]:
# Taxa shown in the BioSankey figure, grouped by layer (keys 1..6) and listed
# in display order.  Entries that are commented out are excluded from the figure.
nodes_t = {
    1: ["k__Bacteria"],
    2: [
        "p__Bacteroidetes",
        "p__Proteobacteria",
        "p__Firmicutes",
        "p__Actinobacteria",
        "p__Fusobacteria",
        # "p__Spirochaetes",
        # "p__Tenericutes"
    ],
    3: [
        "c__Bacteroidia",
        "c__Gammaproteobacteria",
        "c__Betaproteobacteria",
        "c__Actinobacteria_(class)",
        "c__Bacilli",
        "c__Clostridia",
        "c__Fusobacteria_(class)",
        # "c__Flavobacteria",
        # "c__Epsilonproteobacteria",
        # "c__Spirochaetes_(class)",
        # "c__Erysipelotrichi",
        # "c__Mollicutes",
    ],
    4: [
        "o__Bacteroidales",
        "o__Pasteurellales",
        "o__Actinomycetales",
        "o__Lactobacillales",
        "o__Neisseriales",
        "o__Clostridiales",
        "o__Fusobacteriales",
        # "o__Coriobacteriales",
        # "o__Flavobacteriales",
        # "o__Campylobacterales",
        # "o__Spirochaetales",
        # "o__Erysipelotrichales",
        # "o__Mycoplasmatales",
    ],
    5: [
        "f__Prevotellaceae",
        "f__Porphyromonadaceae",
        "f__Pasteurellaceae",
        "f__Micrococcaceae",
        "f__Actinomycetaceae",
        "f__Streptococcaceae",
        "f__Carnobacteriaceae",
        "f__Neisseriaceae",
        "f__Veillonellaceae",
        "f__Fusobacteriaceae",
        # "f__Corynebacteriaceae",
        # "f__",
        # "f__Lachnospiraceae",
        # "f__Clostridiales_Family_XIII,_Incertae_Sedis",
        # "f__Coriobacteriaceae",
        # "f__Flavobacteriaceae",
        # "f__Campylobacteraceae",
        # "f__Spirochaetaceae",
        # "f__Erysipelotrichaceae",
        # "f__Mycoplasmataceae",
    ],
    6: [
        "g__Prevotella",
        "g__Porphyromonas",
        "g__",
        "g__Haemophilus",
        "g__Rothia",
        "g__Actinomyces",
        "g__Streptococcus",
        "g__Granulicatella",
        "g__Neisseria",
        "g__Veillonella",
        "g__Fusobacterium",
        "g__Leptotrichia",
        # "g__Tannerella",
        # "g__Corynebacterium",
        # "g__Atopobium",
        # "o__Bacillales",
        # "g__Dialister",
        # "g__Selenomonas",
        # "g__Gemella",
        # "f__Clostridiales_Family_XI,_Incertae_Sedis",
        # "g__Oribacterium",
        # "g__Catonella",
        # "g__Moryella",
        # "g__Eubacterium",
        # "g__Mogibacterium",
        # "g__Capnocytophaga",
        # "g__Campylobacter",
        # "g__Treponema",
        # "g__Bulleidia",
        # "g__Mycoplasma",
    ]
}
# Flat list of every selected taxon, layer by layer.
selected = [taxon for layer_taxa in nodes_t.values() for taxon in layer_taxa]

In [51]:
# Raw [source taxon, target taxon, value] links between consecutive layers.
links = [
    ["k__Bacteria", "p__Actinobacteria", 4575],
    ["k__Bacteria", "p__Bacteroidetes", 17491],
    ["k__Bacteria", "p__Firmicutes", 12331],
    ["k__Bacteria", "p__Fusobacteria", 4429],
    ["k__Bacteria", "p__Proteobacteria", 8561],
    ["k__Bacteria", "p__Spirochaetes", 45],
    ["k__Bacteria", "p__Tenericutes", 103],
    ["p__Actinobacteria", "c__Actinobacteria_(class)", 4575],
    ["p__Bacteroidetes", "c__Bacteroidia", 17057],
    ["p__Bacteroidetes", "c__Flavobacteria", 434],
    ["p__Firmicutes", "c__Bacilli", 5423],
    ["p__Firmicutes", "c__Clostridia", 6908],
    ["p__Fusobacteria", "c__Fusobacteria_(class)", 4429],
    ["p__Proteobacteria", "c__Betaproteobacteria", 1242],
    ["p__Proteobacteria", "c__Epsilonproteobacteria", 681],
    ["p__Proteobacteria", "c__Gammaproteobacteria", 6638],
    ["p__Spirochaetes", "c__Spirochaetes_(class)", 45],
    ["p__Tenericutes", "c__Erysipelotrichi", 60],
    ["p__Tenericutes", "c__Mollicutes", 43],
    ["c__Actinobacteria_(class)", "o__Actinomycetales", 4369],
    ["c__Actinobacteria_(class)", "o__Coriobacteriales", 206],
    ["c__Bacteroidia", "o__Bacteroidales", 17057],
    ["c__Flavobacteria", "o__Flavobacteriales", 434],
    ["c__Bacilli", "o__Bacillales", 633],
    ["c__Bacilli", "o__Lactobacillales", 4790],
    ["c__Clostridia", "o__Clostridiales", 6908],
    ["c__Fusobacteria_(class)", "o__Fusobacteriales", 4429],
    ["c__Betaproteobacteria", "o__Neisseriales", 1242],
    ["c__Epsilonproteobacteria", "o__Campylobacterales", 681],
    ["c__Gammaproteobacteria", "o__Pasteurellales", 6638],
    ["c__Spirochaetes_(class)", "o__Spirochaetales", 45],
    ["c__Erysipelotrichi", "o__Erysipelotrichales", 60],
    ["c__Mollicutes", "o__Mycoplasmatales", 43],
    ["o__Actinomycetales", "f__Actinomycetaceae", 2074],
    ["o__Actinomycetales", "f__Corynebacteriaceae", 19],
    ["o__Actinomycetales", "f__Micrococcaceae", 2276],
    ["o__Coriobacteriales", "f__Coriobacteriaceae", 206],
    ["o__Bacteroidales", "f__Porphyromonadaceae", 148],
    ["o__Bacteroidales", "f__Prevotellaceae", 16909],
    ["o__Flavobacteriales", "f__Flavobacteriaceae", 434],
    ["o__Clostridiales", "f__", 686],
    ["o__Lactobacillales", "f__Carnobacteriaceae", 397],
    ["o__Lactobacillales", "f__Streptococcaceae", 4393],
    ["o__Clostridiales", "f__Clostridiales_Family_XI,_Incertae_Sedis", 21],
    ["o__Clostridiales", "f__Clostridiales_Family_XIII,_Incertae_Sedis", 39],
    ["o__Clostridiales", "f__Lachnospiraceae", 471],
    ["o__Clostridiales", "f__Veillonellaceae", 6324],
    ["o__Fusobacteriales", "f__Fusobacteriaceae", 4429],
    ["o__Neisseriales", "f__Neisseriaceae", 1242],
    ["o__Campylobacterales", "f__Campylobacteraceae", 681],
    ["o__Pasteurellales", "f__Pasteurellaceae", 6638],
    ["o__Spirochaetales", "f__Spirochaetaceae", 45],
    ["o__Erysipelotrichales", "f__Erysipelotrichaceae", 60],
    ["o__Mycoplasmatales", "f__Mycoplasmataceae", 43],
    ["f__Actinomycetaceae", "g__Actinomyces", 2074],
    ["f__Corynebacteriaceae", "g__Corynebacterium", 19],
    ["f__Micrococcaceae", "g__Rothia", 2276],
    ["f__Coriobacteriaceae", "g__Atopobium", 206],
    ["f__Pasteurellaceae", "g__", 5029],
    ["f__Porphyromonadaceae", "g__Porphyromonas", 67],
    ["f__Porphyromonadaceae", "g__Tannerella", 37],
    ["f__Prevotellaceae", "g__Prevotella", 16909],
    ["f__Flavobacteriaceae", "g__Capnocytophaga", 434],
    ["f__", "g__Gemella", 633],
    ["f__Carnobacteriaceae", "g__Granulicatella", 397],
    ["f__Streptococcaceae", "g__Streptococcus", 4393],
    ["f__Clostridiales_Family_XIII,_Incertae_Sedis", "g__Eubacterium", 23],
    ["f__Clostridiales_Family_XIII,_Incertae_Sedis", "g__Mogibacterium", 16],
    ["f__Lachnospiraceae", "g__Catonella", 62],
    ["f__Lachnospiraceae", "g__Moryella", 11],
    ["f__Lachnospiraceae", "g__Oribacterium", 193],
    ["f__Veillonellaceae", "g__Dialister", 78],
    ["f__Veillonellaceae", "g__Selenomonas", 78],
    ["f__Veillonellaceae", "g__Veillonella", 5836],
    ["f__Fusobacteriaceae", "g__Fusobacterium", 2929],
    ["f__Fusobacteriaceae", "g__Leptotrichia", 1500],
    ["f__Neisseriaceae", "g__Neisseria", 1242],
    ["f__Campylobacteraceae", "g__Campylobacter", 681],
    ["f__Pasteurellaceae", "g__Haemophilus", 2264],
    ["f__Spirochaetaceae", "g__Treponema", 45],
    ["f__Erysipelotrichaceae", "g__Bulleidia", 60],
    ["f__Mycoplasmataceae", "g__Mycoplasma", 43],
]
# Keep a link only when both of its endpoints are among the selected taxa.
links = [link for link in links if link[0] in selected and link[1] in selected]
links_df = pd.DataFrame(links, columns=["source", "target", "value"])

In [39]:
def find_node(node_list, name):
    """Locate `name` inside a sequence of layers.

    Parameters
    ----------
    node_list : iterable of list
        Layers to search, in order; each layer must support ``.index``.
    name : object
        The entry to look for.

    Returns
    -------
    tuple of (int, int) or None
        ``(layer_position, position_in_layer)`` of the first layer containing
        `name`, or ``None`` when no layer contains it.
    """
    for layer_pos, layer in enumerate(node_list):
        try:
            return layer_pos, layer.index(name)
        except ValueError:
            # `list.index` signals "not in this layer" with ValueError; any
            # other exception is a real error and must not be swallowed.
            continue
    return None

In [56]:
# Annotate every selected link with the (layer position, position in layer)
# of both endpoints, as located in `nodes_t`.
new_df = pd.DataFrame(
    [
        (
            *find_node(nodes_t.values(), row.source), row.source,
            *find_node(nodes_t.values(), row.target), row.target,
            row.value,
        )
        for row in links_df.itertuples()
    ],
    columns=[
        'source_layer_id', 'source_id', 'source_label',
        'target_layer_id', 'target_id', 'target_label', 'value',
    ],
)

In [57]:
# Build one list of node records per layer.  Layer 0 has no incoming links and
# is sized by its outgoing flow; every other layer is sized by incoming flow.
nodes = []
for i in range(new_df.target_layer_id.max() + 1):
    side = 'source' if i == 0 else 'target'
    layer_col, id_col, label_col = f'{side}_layer_id', f'{side}_id', f'{side}_label'
    # Select `value` explicitly: `.sum('value')` would hand the string to the
    # `numeric_only` parameter instead of choosing the column to aggregate.
    sizes = (
        new_df[new_df[layer_col] == i]
        .groupby([id_col, label_col])['value']
        .sum()
        .reset_index()
    )
    node = [
        {'name': name, 'id': node_id, 'size': size}
        for node_id, name, size in zip(sizes[id_col], sizes[label_col], sizes['value'])
    ]
    # Sort in place: `sorted(...)` returns a new list, so calling it without
    # using the result leaves `node` untouched.  `nodes_t` is keyed from 1,
    # hence `i + 1`.
    node.sort(key=lambda x: nodes_t[i + 1].index(x["name"]))
    nodes.append(node)

# One list of link records per layer, grouped by the layer the link leaves
# (the final layer therefore contributes an empty list).
links = []
for i in range(len(nodes)):
    link = [
        {
            'source': row.source_label,
            'sourceid': row.source_id,
            'target': row.target_label,
            'targetid': row.target_id,
            'value': row.value,
        }
        for row in new_df[new_df['source_layer_id'] == i].itertuples()
    ]
    links.append(link)

In [59]:
# Write the per-layer nodes/links together with the layer -> taxa mapping.
# The `json` module serialises the integer keys of `nodes_t` as strings.
with open('sankey_data/biosankey_proc.json', 'w') as fout:
    json.dump({'nodes': nodes, 'links': links, "level": nodes_t}, fout)

In [58]:
# Flat variant of the BioSankey payload: a single node list, the filtered
# links as records, and the taxa of each layer keyed by 0-based layer index.
# `nodes_t` is a dict, so iterate `.values()`: iterating the dict itself
# yields its integer keys, which makes `chain.from_iterable` raise TypeError
# and makes `enumerate` map index -> key instead of index -> taxa list.
data = {
    "nodes": [{"name": v} for v in it.chain.from_iterable(nodes_t.values())],
    "links": links_df.to_dict(orient="records"),
    "level": {i: nl for i, nl in enumerate(nodes_t.values())}
}

In [59]:
# Write the flat payload held in `data`.
# NOTE(review): this path has no `sankey_data/` prefix, unlike the other
# output paths in this notebook, so the file lands in the current working
# directory - confirm that is intended.
with open('biosankey_proc.json', 'w') as fout:
    json.dump(data, fout)

In [61]:
# Scratch check: count the entries of a pasted list of node records.
# NOTE(review): one 'name' string below is split across two lines in this
# export, which is not valid Python as written - confirm the intended value
# against the original notebook before re-running this cell.
len([{'name': 'dummy f__Clostridiales_Family_XI,_Incertae_Sedis434', 'size': 21.0}, {'name': 'dummy o__Bacillales234', 'size': 633.0}, {'name': 'f__', 'size': 633.0}, {'name': 'f__Actinomycetaceae', 'size': 2074.0}, {'name': 'f__Campylobacteraceae', 'size': 681.0}, {'name': 'f__Carnobacteriaceae', 'size': 397.0}, {'name': 'f__Clostridiales_Family_XIII,_Incertae_Sedis', 'size': 39.0}, {'name': 'f__Coriobacteriaceae', 'size': 206.0}, {'name': 'f__Corynebacteriaceae', 'size': 19.0}, {'name': 'f__Erysipelotrichaceae', 'size': 60.0}, {'name': 'f__Flavobacteriaceae', 'size': 434.0}, {'name': 'f__Fusobacteriaceae', 'size': 4429.0}, {'name': 'f__Lachnospiraceae', 'size': 266.0}, {'name': 'f__Micrococcaceae', 'size': 2276.0}, {'name': 'f__Mycoplasmataceae', 'size': 43.0}, {'name': 'f__Neisseriaceae', 'size': 1242.0}, {'name': 'f__Pasteurellaceae', 'size': 7293.0}, {'name': 'f__Porphyromonadaceae', 'size': 104.0}, {'name': 'f__Prevotellaceae', 'size': 16909.0}, {'name': 'f__Spirochaetaceae', 'size': 45.0}, {'name': '
', 'size': 4393.0}, {'name': 'f__Veillonellaceae', 'size': 5992.0}])

22