In [None]:
#preparing file for visualization & KEGG Reconstruct

import pandas as pd

# Load the annotation table and keep only the columns used downstream.
annotation_columns = ["#query", "max_annot_lvl", "Description", "KEGG_ko", "KEGG_Pathway"]
df = pd.read_csv("found_ko_s_e10_p_e10_table.tsv", sep="\t")[annotation_columns]


def _keep_ko_pathways(pathway_cell):
    """Return the comma-separated entries of `pathway_cell` that start with "ko"."""
    kept = []
    for entry in pathway_cell.split(","):
        if entry.startswith("ko"):
            kept.append(entry)
    return ",".join(kept)


# One row per KO: split the comma-separated KO list and explode it.
df = df.assign(KEGG_ko=df["KEGG_ko"].str.split(",")).explode("KEGG_ko")

df = df.assign(
    # taxa column: keep only the part after the last "|"
    max_annot_lvl=df["max_annot_lvl"].str.split("|").str[-1],
    # KO numbers: drop the "ko:" prefix
    KEGG_ko=df["KEGG_ko"].str.replace("ko:", "", regex=True),
    # pathway lists: keep only "ko..." entries, then drop the "ko" text itself
    KEGG_Pathway=df["KEGG_Pathway"].apply(_keep_ko_pathways).str.replace("ko", "", regex=True),
)
#filtering general pathways
#(01100 - Metabolic pathways; 01110 - Biosynthesis of secondary metabolites; 01120 - Microbial metabolism in diverse environments) 01130?
def remove_values(cell, to_remove=None):
    """Drop unwanted pathway IDs from a comma-separated pathway string.

    Parameters
    ----------
    cell : str
        Comma-separated pathway IDs, e.g. "00010,01100".
        Non-string values (such as NaN) are returned unchanged.
    to_remove : collection of str, optional
        IDs to drop. Defaults to the three general pathways listed above,
        both with and without the "ko" prefix, so the filter works whether
        it runs before or after the prefix has been stripped.

    Returns
    -------
    str
        The remaining IDs joined by ",", in their original order.
    """
    if to_remove is None:
        to_remove = {"01100", "01110", "01120", "ko01100", "ko01110", "ko01120"}
    if not isinstance(cell, str):
        return cell
    return ",".join(part for part in cell.split(",") if part not in to_remove)
# Filter intentionally left disabled; uncomment to apply it.
#df["KEGG_Pathway"] = df["KEGG_Pathway"].apply(remove_values)
#explode Pathways column
df["KEGG_Pathway"] = df["KEGG_Pathway"].str.split(",")
df = df.explode("KEGG_Pathway")

print(df)
#3089 rows (count noted by the author for their input file)

#to input file for kegg reconstruct
# The frame now has one row per (query, KO, pathway), so a (query, KO) pair
# is repeated once per pathway. Export each pair only once.
df_kegg_input = df[["#query", "KEGG_ko"]].drop_duplicates()
df_kegg_input.to_csv("ko_kegg_input_table.tsv", sep="\t", index=False)

Intermediate step (manual) - KEGG Reconstruction
Submit ko_kegg_input_table.tsv to KEGG Reconstruct, then copy the results from Pathways into the file raw_kegg_reconstruct_results_ko_in_path_cat

In [None]:
#formatting KEGG_reconstruct results
import re

#open file copied from KEGG_reconstruct (pathways)
with open("raw_kegg_reconstruct_results_ko_in_path_cat", "r", encoding="utf-8") as f:
    content = f.read()

# Parser state: the most recently seen value of each field. A row is emitted
# every time a "query:" line is reached, using whatever state is current.
ko_line = path_id_line = path_name_line = category_line = query_line = ""
tsv_rows = ["ko_line\tpath_id_line\tpath_name_line\tcategory_line\tquery_line"]

#cleaning file format to tsv
lines = content.split("\n")
for line in lines:
    if re.match(r"^[0-9]{5}", line):
        # Pathway line: first token is the ID, the last token is dropped
        # (presumably a trailing count - confirm against the copied text).
        parts = line.split(" ")
        path_id_line = parts[0]
        path_name_line = " ".join(parts[1:-1])
    elif re.match(r"^K[0-9]{5}", line):
        ko_line = line
    elif re.match(r"^query:", line):
        queries = [x.strip() for x in line.replace("query: ", "").split(",")]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is identical from run to run (a set has no stable order).
        query_line = ", ".join(dict.fromkeys(queries))
        tsv_rows.append(
            "\t".join([ko_line, path_id_line, path_name_line, category_line, query_line])
        )
    elif re.match(r"^[a-zA-Z]", line):
        # Any other line starting with a letter is treated as a category header.
        category_line = line

kegg_reconstruct_to_tsv = "\n".join(tsv_rows) + "\n"

print(kegg_reconstruct_to_tsv)
#write to file
# Written as UTF-8 to match how the input was read and how pandas reads it back.
with open("kegg_reconstruct_result.tsv", "w", encoding="utf-8") as f:
    f.write(kegg_reconstruct_to_tsv)

In [None]:
#Mapping category & pathway names

# Both sides of the join are handled as strings.
df_kegg_reconstruct_to_tsv = pd.read_csv("kegg_reconstruct_result.tsv", sep="\t", dtype=str)
df = df.astype(str)

# Columns that only exist to perform the join and are dropped afterwards.
join_only_columns = ["ko_line", "path_id_line", "query_line"]

df_mapped = (
    df.merge(
        df_kegg_reconstruct_to_tsv,
        how="inner",
        left_on=["KEGG_ko", "KEGG_Pathway"],
        right_on=["ko_line", "path_id_line"],
    )
    .drop(columns=join_only_columns)
    .sort_values(by='category_line')
)

# Categories excluded from the plot table.
excluded_categories = [
    'Nervous system',
    'Neurodegenerative disease',
    'Cardiovascular disease',
    'Global and overview maps',
    'Immune system',
    'Cancer: overview',
    'Cancer: specific types',
    'Endocrine system',
    'Circulatory system',
    'Digestive system',
    'Infectious disease: bacterial',
    'Endocrine and metabolic disease',
    'Aging',
]
is_excluded = df_mapped['category_line'].isin(excluded_categories)
df_filtered = df_mapped[~is_excluded]
df_mapped = df_filtered
display(df_mapped)

df_mapped.to_csv("table_for_alluvial_plot.tsv", sep="\t", index=False)

Adding Kraken2 Results

In [None]:
#kraken2 to df

import pandas as pd

#reading report

cols = ["perc", "reads_clade", "reads_direct", "rank", "taxid", "name"]
df_kraken_report = pd.read_csv("kraken2.report", sep="\t", names=cols)

# Kraken2 reports indent the name column according to tree depth (see the
# Kraken2 manual), so the indentation must be measured before it is stripped.
raw_names = df_kraken_report["name"].fillna("").astype(str)
name_indent = raw_names.str.len() - raw_names.str.lstrip(" ").str.len()
df_kraken_report["name"] = df_kraken_report["name"].str.strip()

# Standard rank codes that become lineage columns.
lineage_ranks = ["D", "P", "C", "O", "F", "G", "S"]
current_lineage = dict.fromkeys(lineage_ranks)
records = []

# Stack of (indent, rank, name) holding the ancestors of the current row,
# from the root down to the row itself. Rebuilding the lineage from this
# stack keeps rows with non-standard rank codes (e.g. "K", "G1"), or rows
# whose lineage skips a rank, from inheriting ranks of the previous branch.
ancestors = []

for (_, row), indent in zip(df_kraken_report.iterrows(), name_indent):
    rank = row["rank"]

    # Leave every branch that is not an ancestor of this row.
    while ancestors and ancestors[-1][0] >= indent:
        ancestors.pop()
    ancestors.append((indent, rank, row["name"]))

    current_lineage = dict.fromkeys(lineage_ranks)
    for _, ancestor_rank, ancestor_name in ancestors:
        if ancestor_rank in current_lineage:
            current_lineage[ancestor_rank] = ancestor_name

    lineage_record = {
        "taxid": row["taxid"],
        "rank": rank,
        "name": row["name"],
        "reads_clade": row["reads_clade"],
        "perc": row["perc"],
        "k2_domain": current_lineage["D"],
        "k2_phylum": current_lineage["P"],
        "k2_class": current_lineage["C"],
        "k2_order": current_lineage["O"],
        "k2_family": current_lineage["F"],
        "k2_genus": current_lineage["G"],
        "k2_species": current_lineage["S"],
    }
    records.append(lineage_record)


species_df = pd.DataFrame(records)
species_df = species_df.drop(columns=["rank", "reads_clade", "perc"])

#reading kraken2 classification

cols2 = ["UC", "query", "taxo_name", "nr", "nr2"]
kraken2_clasif = pd.read_csv("kraken2_classification.tabular", sep="\t", names=cols2)
kraken2_clasif = kraken2_clasif.drop(columns=["nr", "nr2"])
# Keep classified reads only ("C" in the first column).
kraken2_clasif = kraken2_clasif[kraken2_clasif["UC"] == "C"].copy()

# The taxid is the trailing "(taxid N)" group. Anchoring on that exact form
# keeps names that contain their own parentheses from being mistaken for it.
taxo_raw = kraken2_clasif["taxo_name"]
kraken2_clasif["taxid"] = pd.to_numeric(
    taxo_raw.str.extract(r"\(taxid (\d+)\)\s*$", expand=False),
    errors="coerce",
)
kraken2_clasif["taxo_name"] = taxo_raw.str.replace(r"\s*\(taxid \d+\)\s*$", "", regex=True)

print(kraken2_clasif)

# Attach the lineage columns from the report to every classified query.
df_kraken = kraken2_clasif.merge(
    species_df,
    on="taxid",
    how="left"
)
df_kraken = df_kraken.drop(columns=["taxo_name", "UC"])

df_kraken.to_csv("df_plot_with_kraken.tsv", sep="\t", index=False)

#adding eggnog info table with kegg reconstruct
# Read every column as text: without dtype=str pandas parses KEGG_Pathway IDs
# such as "00010" as integers and the leading zeros are lost.
df_plot = pd.read_csv("table_for_alluvial_plot.tsv", sep="\t", dtype=str)
# Drop the last "_"-separated token of the query ID so it can be joined to the
# Kraken2 "query" column (presumably gene ID -> contig ID; verify against the data).
df_plot["#query"] = df_plot["#query"].str.split("_").str[:-1].str.join("_")


df_mixed_with_kraken = df_plot.merge(
    df_kraken,
    left_on="#query",
    right_on="query",
    how="left"
)
df_mixed_with_kraken = df_mixed_with_kraken.drop(columns=["#query", "max_annot_lvl", "name"])
# Rows without a Kraken2 match have empty lineage columns after the left join.
df_mixed_with_kraken = df_mixed_with_kraken.fillna("Unclassified")
print(df_mixed_with_kraken)


In [None]:
#variables for alluvial plot preparation

import pandas as pd


#Unclassified cleaning

# Walk down the ranks from domain to species. Wherever a rank is
# "Unclassified" but the rank above it is known, fill it from the rank above.
# Only the phylum gets the "Unspecified " prefix; deeper ranks copy the value
# above them unchanged.
rank_chain = ["k2_domain", "k2_phylum", "k2_class", "k2_order", "k2_family", "k2_genus", "k2_species"]

for parent_col, child_col in zip(rank_chain, rank_chain[1:]):
    child_unclassified = df_mixed_with_kraken[child_col] == "Unclassified"
    parent_known = df_mixed_with_kraken[parent_col] != "Unclassified"
    mask = child_unclassified & parent_known

    inherited = df_mixed_with_kraken.loc[mask, parent_col]
    if child_col == "k2_phylum":
        inherited = "Unspecified " + inherited
    df_mixed_with_kraken.loc[mask, child_col] = inherited


#orders
# Category order per taxonomic rank: unique values in order of appearance
# after sorting the frame by all ranks, descending.
tax_cols = ["k2_domain", "k2_phylum", "k2_class", "k2_order", "k2_family", "k2_genus", "k2_species"]
df_tax_sorted = df_mixed_with_kraken.sort_values(by=tax_cols, ascending=False)

(D_order, P_order, C_order, O_order, F_order, G_order, S_order) = (
    df_tax_sorted[rank_col].unique().tolist() for rank_col in tax_cols
)


#path_order
# Same idea for the functional side, sorted by category and then pathway name.
df_path_sorted = df_mixed_with_kraken.sort_values(by=['category_line', 'path_name_line'])
cat_order, path_order = (
    df_path_sorted[func_col].unique().tolist()
    for func_col in ("category_line", "path_name_line")
)


#color mapping

# Phylum -> ribbon colour.
coloring_dict = {
    'Thermodesulfobacteriota': '#00ffff',
    'Pseudomonadota': '#fa8072',
    'Myxococcota': '#ff0000',
    'Gemmatimonadota': '#ffff00',
    'Cyanobacteriota': '#ffc0cb',
    'Campylobacterota': '#800080',
    'Bacteroidota': '#f0e68c',
    'Bacillota': '#00ff00',
    'Actinomycetota': '#ff00ff',
    'Acidobacteriota': '#0000ff',
    'Thermoproteota': '#006400',
    'Nitrososphaerota': '#40e0d0',
}
phylum_list = list(coloring_dict)
color_list = list(coloring_dict.values())

# NOTE(review): phyla missing from the mapping get NaN here unless one of the
# overrides below applies - confirm every phylum in the data is covered.
df_mixed_with_kraken["color"] = df_mixed_with_kraken["k2_phylum"].map(coloring_dict)

# Overrides, applied in order; a later one wins over an earlier one and over
# the per-phylum colour (so every Archaea row ends up with the last colour).
color_overrides = (
    ("k2_domain", "Unclassified", "#808080"),
    ("k2_phylum", "Unspecified Bacteria", "#d3d3d3"),
    ("k2_domain", "Archaea", "#ffd700"),
)
for match_col, match_value, override_color in color_overrides:
    df_mixed_with_kraken.loc[df_mixed_with_kraken[match_col] == match_value, "color"] = override_color
#print(df_mixed_with_kraken)






In [None]:
#dimensions for plot
import pandas as pd
import plotly.graph_objects as go


def _parcats_dimension(column, ordering, label):
    """Build one parcats dimension from a column of df_mixed_with_kraken.

    The categories are laid out in the explicit order given by `ordering`.
    """
    return go.parcats.Dimension(
        values=df_mixed_with_kraken[column],
        categoryorder='array',
        categoryarray=ordering,
        label=label,
    )


# Taxonomic ranks
dim_D = _parcats_dimension('k2_domain', D_order, 'Domain')
dim_P = _parcats_dimension('k2_phylum', P_order, 'Phylum')
dim_C = _parcats_dimension('k2_class', C_order, 'Class')
dim_O = _parcats_dimension('k2_order', O_order, 'Order')
dim_F = _parcats_dimension('k2_family', F_order, 'Family')
dim_G = _parcats_dimension('k2_genus', G_order, 'Genus')
dim_S = _parcats_dimension('k2_species', S_order, 'Species')

# Functional annotation
dim_path = _parcats_dimension('path_name_line', path_order, 'Pathway')
dim_cat = _parcats_dimension('category_line', cat_order, 'Category')

In [None]:
#Alluvial plot

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

# Save the exact table the figure is drawn from.
df_mixed_with_kraken.to_csv("df_alluvial_plot.tsv", sep="\t", index=False)

# Left-to-right axes of the diagram: phylum down to genus, then pathway and category.
alluvial_dimensions = [dim_P, dim_C, dim_O, dim_F, dim_G, dim_path, dim_cat]

# Ribbons are coloured by the per-row "color" column.
ribbon_style = dict(color=df_mixed_with_kraken["color"])

alluvial_trace = go.Parcats(
    dimensions=alluvial_dimensions,
    line=ribbon_style,
    hoveron="color",
)
fig = go.Figure(data=[alluvial_trace])

layout_options = dict(
    title="Taxonomic contribution to biosurfactant metabolism within the soil microbiome",
    width=2000,
    height=3600,
    font=dict(size=12, family="Arial"),
    margin=dict(r=200),
)
fig.update_layout(**layout_options)

fig.show()
#write to files
fig.write_html("alluvial_results.html")