# Parsing the Esophageal Carcinoma screening data

In [1]:
import sys
# Put the parent directory first on the module search path so that packages imported
# in later cells can be resolved from there (presumably the repository root — TODO confirm).
sys.path.insert(0, '..')

### RIS file parsing

In [2]:
from pathlib import Path

# Guideline-specific settings: sub-directory name and GGPONC guideline identifier.
PATH_PREFIX = "esophagus"
GUIDELINE_ID = "oesophaguskarzinom"

# All screening inputs/outputs live below one directory; derive the paths from it.
SCREENING_DIR = Path("../data/screening")
PATH_GGPONC_LITERATURE = SCREENING_DIR / "ggponc_v2.1_2023_03_30_literature.csv"
PATH_RIS = SCREENING_DIR / PATH_PREFIX
PATH_MANUAL_REVIEW = PATH_RIS / "manual_review.csv"
PATH_OUTPUT = PATH_RIS / "ris_with_matched_ids.csv"

# Target directory for figures; created up front so later savefig calls cannot fail on it.
PATH_LATEX_PLOT_OUTPUT = Path("../thesis/from_python/figures/experiments") / PATH_PREFIX
PATH_LATEX_PLOT_OUTPUT.mkdir(parents=True, exist_ok=True)

In [3]:
from copy import deepcopy
import pandas as pd
import rispy
from evaluation.parsing import parse_kg_ris_file

# Work on a private copy of rispy's default tag mapping and register the two
# additional tags ("IP", "DP") under the column names used below.
mapping = deepcopy(rispy.TAG_KEY_MAPPING)
mapping.update({"IP": "ip", "DP": "date_published"})

# Parser for the screening exports; START_TAG is overridden to "ID"
# (presumably each record in these files begins with an ID line — TODO confirm).
kg_ris_parser = rispy.RisParser(mapping=mapping)
kg_ris_parser.START_TAG = "ID"

In [4]:
# Collect all RIS exports below PATH_RIS. The list is sorted because rglob yields files
# in filesystem order; a fixed order keeps the row order (and therefore tie-breaking in
# the later keep="last" de-duplication) reproducible across machines.
ris_files = sorted(PATH_RIS.rglob("*.txt"))
if not ris_files:
    # pd.concat([]) would only report "No objects to concatenate"; name the real problem.
    raise FileNotFoundError(f"No RIS export (*.txt) found below {PATH_RIS.resolve()}")

# One dataframe per file, stacked into a single frame (original per-file indices are kept).
df_ris = pd.concat([parse_kg_ris_file(f, kg_ris_parser) for f in ris_files])

### Metadata Cleaning

In [5]:
# The screening step / exclusion reason is encoded in the file name. Take the file stem,
# then strip the "ÖsoCa_AG<n>_" prefix, the "_<n>_RIS" suffix and any leftover ".txt".
file_stems = df_ris["path"].apply(lambda ris_path: ris_path.stem)
without_prefix = file_stems.str.replace(r"(ÖsoCa_AG\d{1,2}_)", "", regex=True)
without_suffix = without_prefix.str.replace(r"(_\d{1,4}_RIS)", "", regex=True)
df_ris["exclusion_reason"] = without_suffix.str.replace(".txt", "", regex=False)

In [6]:
# The research question label is the name of the directory that contains the RIS file.
df_ris["rq"] = df_ris["path"].apply(lambda ris_path: ris_path.parent.name)

In [7]:
# Harmonise file-name variants of the same screening step. Each pair is a *literal*
# substring replacement, applied in the listed order. `regex=False` is passed explicitly
# because the default of `Series.str.replace` differs between pandas 1.x and 2.x.
EXCLUSION_REASON_FIXES = [
    ("TIAB1_include_gesamtOhneDuplikate", "TIAB1_include"),
    ("FULLTEXT_exclude_kein Volltext", "FULLTEXT_exclude_Kein Volltext"),
    ("searchResult_Nachtrag Slavage 61_RIS", "searchResult"),
    ("TIAB1_Nachtrag Slavage exclude", "TIAB1_exclude"),
    ("TIAB1_Nachtrag Salvage_include", "TIAB1_include"),
    ("TIAB2_exclude_unclear", "TIAB2_exclude"),
]

normalized_reasons = df_ris["exclusion_reason"]
for variant, canonical in EXCLUSION_REASON_FIXES:
    normalized_reasons = normalized_reasons.str.replace(variant, canonical, regex=False)
df_ris["exclusion_reason"] = normalized_reasons

In [8]:
# Parse the date string from the RIS "DP" field; values that cannot be parsed become NaT
# instead of raising (errors="coerce").
df_ris["publication_date"] = pd.to_datetime(df_ris["date_published"], errors="coerce")

### Extracting Identifiers

In [9]:
from evaluation.parsing import REGEX

In [10]:
# These identifiers are not available from the RIS id; start with empty columns.
for empty_id_column in ("nct_id", "doi"):
    df_ris[empty_id_column] = pd.NA

In [11]:
# Pull the PubMed ID out of the RIS record id with the project regex and store it as a
# nullable integer; ids that do not match the pattern end up as <NA>.
df_ris["pm_id"] = df_ris["id"].str.extract(REGEX["pm_id_start_end"]).astype("Int64")

In [12]:
# Extract the "cn_id" part of the RIS record id (presumably the Cochrane Library
# identifier — TODO confirm against REGEX["cn_id"]); non-matching ids become missing.
df_ris["cn_id"] = df_ris["id"].str.extract(REGEX["cn_id"])

### Deduplication

For the final dataframe, we only need unique documents. For each document, we want to keep only the version with the very last screening step. For this, we need to set a hierarchy of exclusion steps:

In [13]:
# Screening steps from shallowest to deepest. The position in this list defines the sort
# order used by the de-duplication below (the last entry wins with keep="last").
exclusion_reason_order = [
    "searchResult",
    "TIAB1_exclude",
    "TIAB1_include",
    "TIAB2_exclude",
    "TIAB2_include",
    "FULLTEXT_exclude_all",
    "FULLTEXT_exclude_Abstract oder Protokoll",
    "FULLTEXT_exclude_Nicht die gesuchte Fragestellung",
    "FULLTEXT_exclude_Falsche Intervention_Comparison",
    "FULLTEXT_exclude_Falscher Literaturtyp",
    "FULLTEXT_exclude_Falscher Publikationszeitraum",
    "FULLTEXT_exclude_Zu kleine Studienpopulation",
    "FULLTEXT_exclude_Kein Volltext",
    "FULLTEXT_exclude_Falsche Outcomes",
    "FULLTEXT_exclude_Falsche Population",
    "FULLTEXT_include",
]

# pd.Categorical silently turns labels that are not in `categories` into NaN. NaN sorts
# last, so such a row would win the keep="last" de-duplication and hide the real
# screening step. Fail fast instead of corrupting the result.
unknown_reasons = sorted(
    set(df_ris["exclusion_reason"].dropna().astype(str)) - set(exclusion_reason_order)
)
if unknown_reasons:
    raise ValueError(
        f"Exclusion reasons missing from exclusion_reason_order: {unknown_reasons}"
    )

df_ris["exclusion_reason"] = pd.Categorical(
    df_ris["exclusion_reason"], categories=exclusion_reason_order
)

We can now de-duplicate the records by only keeping the reference which made it to the deepest screening step per research question:

In [14]:
# Sort so that, within each (id, rq) pair, the deepest screening step comes last,
# then keep exactly that last row per pair.
ris_sorted_by_step = df_ris.sort_values(["id", "exclusion_reason", "rq"])
df_ris = ris_sorted_by_step.drop_duplicates(subset=["id", "rq"], keep="last")

In [15]:
# Row count after keeping one row per (id, rq).
len(df_ris)

7054

For the analysis, we do not need the duplicates. I therefore fold the RQ / exclusion reason information into a JSON cell:

In [16]:
def _rq_reason_records(group):
    """Return the (rq, exclusion_reason) rows of one id as a list of dicts, skipping rows with missing values."""
    return group.dropna().to_dict("records")


# One list of {"rq": ..., "exclusion_reason": ...} dicts per record id ...
rq_reasons_by_id = df_ris.groupby("id")[["rq", "exclusion_reason"]].apply(
    _rq_reason_records
)
# ... attached to every row of that id.
df_ris["rqs_and_reasons"] = df_ris["id"].map(rq_reasons_by_id)

In [17]:
# Count rows per (exclusion_reason, rq), shorten each rq label to its first four
# characters, and show only the counts for the FULLTEXT_include step.
df_ris.value_counts(["exclusion_reason", "rq"]).sort_index().rename(
    index=lambda x: x[:4], level=1
).to_frame(name="Count").loc["FULLTEXT_include"]

Unnamed: 0_level_0,Count
rq,Unnamed: 1_level_1
AG1,3
AG11,7
AG12,5
AG3,20
AG4,3
AG5,2
AG6,10
AG7,12
AG8,10
AG9,3


We can further reduce the data by eliminating duplicates across research questions:

In [18]:
# Same idea as the per-RQ step, but now only one row per id survives:
# the one with the deepest screening step (ties resolved by rq sort order).
ris_sorted_by_step = df_ris.sort_values(["id", "exclusion_reason", "rq"])
df_ris = ris_sorted_by_step.drop_duplicates(subset=["id"], keep="last")

In [19]:
# Row count after keeping one row per id.
len(df_ris)

3147

In [20]:
# Same per-RQ count of FULLTEXT_include rows as before, now on the frame with one row per id
# (rq labels again shortened to their first four characters).
df_ris.value_counts(["exclusion_reason", "rq"]).sort_index().rename(
    index=lambda x: x[:4], level=1
).to_frame(name="Count").loc["FULLTEXT_include"]

Unnamed: 0_level_0,Count
rq,Unnamed: 1_level_1
AG1,3
AG11,6
AG12,5
AG3,20
AG4,3
AG5,2
AG6,9
AG7,11
AG8,10
AG9,3


As soon as a document is included after full text screening for any RQ, we consider it included:

In [21]:
# Each id now carries only its deepest screening step, and FULLTEXT_include is the last
# entry of exclusion_reason_order, so this flag is True iff the record was included for any RQ.
df_ris["is_included"] = df_ris["exclusion_reason"] == "FULLTEXT_include"

We should also rename the ID column to reflect that it comes from the RIS files:

In [22]:
# Make the origin of the identifier explicit: it is the id found in the RIS files.
df_ris = df_ris.rename(columns={"id": "ris_id"})

### Identifier lookups

In [23]:
from integration.config import load_config
from integration.db import get_engine
from sqlalchemy.orm import sessionmaker

# Read the DB connection URL from the versioned config file and open one session
# that the lookup cells below share.
cfg = load_config("../config_v2.1_2023_03_30.ini")
db_url = cfg['DB']['url']
engine = get_engine(db_url)
SessionFactory = sessionmaker(bind=engine)
session = SessionFactory()

In [24]:
from integration.citation_utils import (
    get_title_to_id_mapping_pubmed,
    get_title_to_id_mapping_clinicaltrials,
)

# Lookup tables fetched through the DB session; they are passed to the identifier
# retrieval below and provide the pm_id / nct_id sets for the "is in database" check.
df_pm = get_title_to_id_mapping_pubmed(session)
df_ct = get_title_to_id_mapping_clinicaltrials(session)

In [25]:
from tqdm.auto import tqdm
from integration.citation_utils import retrieve_all_identifiers
import os

tqdm.pandas(desc="Retrieving available identifiers")

# Arguments that are identical for every row: credentials from the environment,
# on-disk caches and the two DB lookup tables.
identifier_lookup_kwargs = {
    "entrez_email": os.environ.get("PUBMED_USER"),
    "entrez_api_key": os.environ.get("PUBMED_API_KEY"),
    "doi_pm_id_cache": "../data/literature/doi_to_pm_id.json",
    "cn_pm_id_cache": "../data/literature/cn_id_to_pm_id.json",
    "title_nct_id_cache": "../data/literature/fuzzy_title_to_nct_id.json",
    "title_pm_id_cache": "../data/literature/fuzzy_title_to_pm_id.json",
    "df_pm": df_pm,
    "df_ct": df_ct,
}


def _with_identifiers(row):
    """Return `row` as processed by retrieve_all_identifiers with the shared lookup arguments."""
    return retrieve_all_identifiers(row=row, **identifier_lookup_kwargs)


df_ris = df_ris.progress_apply(_with_identifiers, axis=1)  # type: ignore

# Every column whose name contains "pm_id" is stored as nullable integer.
pm_id_columns = [name for name in df_ris.columns if "pm_id" in name.lower()]
df_ris = df_ris.astype(dict.fromkeys(pm_id_columns, "Int64"))

Retrieving available identifiers:   0%|          | 0/3147 [00:00<?, ?it/s]

Potential mislabellings:

In [26]:
# Show the identifier columns for rows where both pm_id and pm_id_db are present but differ
# (pm_id_db is presumably the PubMed ID found via the database lookup — TODO confirm).
df_ris[
    [
        "title",
        "ris_id",
        "pm_id",
        "nct_id",
        "doi",
        "cn_id",
        "pm_id_doi",
        "pm_id_cn",
        "nct_id_cn",
        "pmc_id_cn",
        "nct_id_db",
        "pm_id_db",
    ]
].query(
    "pm_id_db.notnull() & pm_id.notnull() & pm_id != pm_id_db"
)  # .merge(df_pm, left_on="pm_id", right_on="pm_id", suffixes=("_ris", "_db"))  # to get correct title

Unnamed: 0,title,ris_id,pm_id,nct_id,doi,cn_id,pm_id_doi,pm_id_cn,nct_id_cn,pmc_id_cn,nct_id_db,pm_id_db
23,Postoperative Chemotherapy for Thoracic Pathol...,31974708,31974708,,,,,,,,,31848823


### Check if evidence is in the database

In [27]:
from evaluation.matching import is_in_db

# Progress-bar label (fixes the typo "Checking of" in the user-facing text).
tqdm.pandas(desc="Checking if evidence is in database")

# Identifier sets taken from the DB lookup tables, built once for fast membership tests.
db_pm_ids = set(df_pm["pm_id"])
db_nct_ids = set(df_ct["nct_id"])

# Row-wise check; is_in_db returns the (augmented) row.
df_ris = df_ris.progress_apply(
    lambda row: is_in_db(row=row, db_pm_ids=db_pm_ids, db_nct_ids=db_nct_ids), axis=1
)

Checking of evidence is in database:   0%|          | 0/3147 [00:00<?, ?it/s]

### Retrieving Publication Types & Date

In [28]:
from tqdm.auto import tqdm
import os
from integration.citation_utils import pm_id_to_publication_types

tqdm.pandas(desc="Fetching article types")


def _publication_types_for(pm_id):
    """Look up the publication types of one PubMed ID, using the Entrez credentials from the environment and the local XML cache."""
    return pm_id_to_publication_types(
        pm_id=pm_id,
        email=os.environ.get("PUBMED_USER"),
        api_key=os.environ.get("PUBMED_API_KEY"),
        cache_path="../data/literature/pm_id_to_entrez_xml.json",
    )


df_ris["publication_types"] = df_ris["pm_id"].progress_apply(_publication_types_for)

Fetching article types:   0%|          | 0/3147 [00:00<?, ?it/s]

Couldn't find document types for 34021328
Couldn't find document types for 34756850
Couldn't find document types for 34797478
Couldn't find document types for 32701242


In [40]:
from integration.citation_utils import set_rct_flag

# Apply set_rct_flag to every row (presumably derives `is_rct_api` from
# `publication_types` — TODO confirm in integration.citation_utils).
df_ris = df_ris.apply(set_rct_flag, axis=1)

In [30]:
from integration.citation_utils import pm_id_to_publication_date

tqdm.pandas(desc="Fetching article dates")


def _publication_date_for(pm_id):
    """Look up the publication date of one PubMed ID, using the Entrez credentials from the environment and the local XML cache."""
    return pm_id_to_publication_date(
        pm_id=pm_id,
        email=os.environ.get("PUBMED_USER"),
        api_key=os.environ.get("PUBMED_API_KEY"),
        cache_path="../data/literature/pm_id_to_entrez_xml.json",
    )


# Raw lookup results per row, then converted to pandas datetimes.
article_dates_raw = df_ris["pm_id"].progress_apply(_publication_date_for)
df_ris["article_date_api"] = pd.to_datetime(article_dates_raw)

Fetching article dates:   0%|          | 0/3147 [00:00<?, ?it/s]

Couldn't find article date for 29497928
Couldn't find article date for 29995688
Couldn't find article date for 30102633
Couldn't find article date for 30339626
Couldn't find article date for 30402826
Couldn't find article date for 30596899
Couldn't find article date for 30596900
Couldn't find article date for 30601260
Couldn't find article date for 30614938
Couldn't find article date for 30676381
Couldn't find article date for 30715226
Couldn't find article date for 30820525
Couldn't find article date for 30855081
Couldn't find article date for 30855089
Couldn't find article date for 30888418
Couldn't find article date for 30888419
Couldn't find article date for 30980079
Couldn't find article date for 31069391
Couldn't find article date for 31076753
Couldn't find article date for 31090563
Couldn't find article date for 31135037
Couldn't find article date for 31169876
Couldn't find article date for 31175353
Couldn't find article date for 31179509
Couldn't find article date for 31188215


### Presence in GGPONC

In [41]:
from evaluation.matching import is_in_ggponc
from tqdm.auto import tqdm

# GGPONC literature references, restricted to the guideline handled by this notebook.
df_ggponc = (
    pd.read_csv(PATH_GGPONC_LITERATURE)
    .astype({"pm_id": "Int64"})
    .query("guideline_id == @GUIDELINE_ID")
)

# Lookup sets for the membership test. Missing values are dropped from BOTH sets:
# previously only the PM IDs were cleaned, so a NaN title could end up in the title set
# and match records that have no title themselves.
ggponc_pm_ids = set(df_ggponc["pm_id"].dropna())
ggponc_titles = set(df_ggponc["title"].dropna())

tqdm.pandas(desc="Checking for presence in GGPONC")
# Row-wise check; is_in_ggponc returns the (augmented) row.
df_ris = df_ris.progress_apply(
    lambda row: is_in_ggponc(
        row=row, ggponc_pm_ids=ggponc_pm_ids, ggponc_titles=ggponc_titles
    ),
    axis=1,
)

Checking for presence in GGPONC:   0%|          | 0/3008 [00:00<?, ?it/s]

### Preparing final dataframe

In some cases, we have resolved Cochrane Library IDs to PubMed IDs which are already present in the dataset:

In [42]:
# A row is flagged when its PM ID occurs more than once (all occurrences are marked)
# and the PM ID is actually present.
shares_pm_id = df_ris["pm_id"].duplicated(keep=False)
has_pm_id = df_ris["pm_id"].notnull()
df_ris["is_pm_id_duplicate"] = shares_pm_id & has_pm_id

n_pm_id_duplicates = len(df_ris.query('is_pm_id_duplicate == True'))
print(f"There are {n_pm_id_duplicates} non-null PM ID duplicates in the dataset!")

There are 0 non-null PM ID duplicates in the dataset!


Some examples:

In [43]:
# First ten flagged rows, ordered so that rows sharing a PM ID appear next to each other.
df_ris.query("is_pm_id_duplicate == True").sort_values(["pm_id", "exclusion_reason"])[
    ["ris_id", "pm_id", "exclusion_reason", "is_included"]
].head(10)

Unnamed: 0,ris_id,pm_id,exclusion_reason,is_included


Let's remove them:

In [44]:
# The boolean mask below is computed on a *sorted* copy and aligned back by index label,
# so the index must be unique first.
df_ris.reset_index(drop=True, inplace=True)
# Re-apply the ordered category list so that sorting follows the screening-step order
# (presumably the dtype was lost in the row-wise apply steps above — TODO confirm).
df_ris["exclusion_reason"] = pd.Categorical(
    df_ris["exclusion_reason"], categories=exclusion_reason_order
)
# Keep a row if it is the first of its PM ID in descending (pm_id, exclusion_reason) order,
# i.e. the deepest screening step per PM ID, or if it has no PM ID at all.
# The helper flag column is no longer needed afterwards.
df_ris = df_ris[
    (
        ~df_ris.sort_values(["pm_id", "exclusion_reason"], ascending=False).duplicated(
            subset="pm_id", keep="first"
        )
    )
    | (df_ris["pm_id"].isnull())
].drop(columns=["is_pm_id_duplicate"])

In [45]:
# Inclusion status vs. RCT flag, with row and column totals.
pd.crosstab(df_ris["is_included"], df_ris["is_rct_api"], margins=True)

is_rct_api,False,True,All
is_included,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2640,296,2936
True,52,20,72
All,2692,316,3008


In [46]:
# Included RCTs that have no PubMed ID.
df_ris.query("is_included & is_rct_api & pm_id.isnull()")

Unnamed: 0,ris_id,issn,volume,ip,date_published,title,end_page,abstract,author_address,authors,...,is_in_db_pm,is_in_db_ct,is_in_db_title,is_in_db,publication_types,is_rct_api,article_date_api,is_in_ggponc,is_relevant,is_not_relevant


### Excluding documents outside the search time frame / marking as relevant

In [47]:
# Records excluded at full-text stage with the reason "Falscher Publikationszeitraum"
# (wrong publication period).
df_ris.query("exclusion_reason == 'FULLTEXT_exclude_Falscher Publikationszeitraum'")[
    ["pm_id", "publication_date", "is_rct_api", "is_in_ggponc", "exclusion_reason"]
]

Unnamed: 0,pm_id,publication_date,is_rct_api,is_in_ggponc,exclusion_reason
12,30470690,2019-04-01,True,False,FULLTEXT_exclude_Falscher Publikationszeitraum
34,30877177,2019-03-15,False,False,FULLTEXT_exclude_Falscher Publikationszeitraum
113,31278574,2019-07-05,False,False,FULLTEXT_exclude_Falscher Publikationszeitraum
148,31397195,2019-08-09,False,False,FULLTEXT_exclude_Falscher Publikationszeitraum
1902,30625052,2019-01-01,True,True,FULLTEXT_exclude_Falscher Publikationszeitraum
1912,30137281,2019-01-01,True,False,FULLTEXT_exclude_Falscher Publikationszeitraum
1921,33198003,2019-01-01,False,False,FULLTEXT_exclude_Falscher Publikationszeitraum
1943,30308612,2019-01-01,True,True,FULLTEXT_exclude_Falscher Publikationszeitraum
1950,30732125,2019-01-01,False,False,FULLTEXT_exclude_Falscher Publikationszeitraum
1967,30913304,2019-01-01,True,False,FULLTEXT_exclude_Falscher Publikationszeitraum


For the evaluation, we are interested in those records that are either marked as included or already contained in GGPONC AND fall within the screening time frame. This means that records that occur in GGPONC, but were excluded due to having been published outside the screening time frame, have to be marked as not relevant:

In [65]:
# Records that were rejected because they fall outside the publication time frame.
mask_excluded_wrong_tf = (
    df_ris["exclusion_reason"] == "FULLTEXT_exclude_Falscher Publikationszeitraum"
)

# Included by the screening ...
mask_is_included = df_ris["is_included"]
# ... or cited in GGPONC without having been rejected for the time frame.
mask_is_in_ggponc_and_in_tf = df_ris["is_in_ggponc"] & ~mask_excluded_wrong_tf

In [66]:
# Relevant = included by the screening OR (cited in GGPONC and not rejected for the time frame).
df_ris["is_relevant"] = mask_is_included | mask_is_in_ggponc_and_in_tf
# Explicit complement column.
df_ris["is_not_relevant"] = ~df_ris["is_relevant"]

In [67]:
# Relevance vs. RCT flag, with row and column totals.
pd.crosstab(df_ris["is_relevant"], df_ris["is_rct_api"], margins=True)

is_rct_api,False,True,All
is_relevant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2637,290,2927
True,55,26,81
All,2692,316,3008


In [68]:
# Shape of the subset that is cited in GGPONC but was not included by the screening.
df_ris[~df_ris.is_included & df_ris.is_in_ggponc].shape

(11, 39)

In [69]:
# Records with a screening step minus the included ones. The number of included records
# is computed from the data instead of the former hard-coded 72, so the check stays
# correct when the input files change.
df_ris.exclusion_reason.value_counts().sum() - df_ris["is_included"].sum()

2936

### Manual Screening Results

We also have the results of the manual screening available. Although these were added after the original screening, we parse them at this point to reduce code duplication:

In [70]:
# PubMed IDs from the manual review that are actually reviews and not RCTs.
MANUAL_NON_RCT_PM_IDS = [34860570, 34980003, 33640423]

# Keep only the rows from the 'Pubmed' / 'Civic' sources and the columns needed downstream.
# `.copy()` makes the result an independent frame so the assignments below do not
# operate on a slice of the CSV frame (avoids SettingWithCopyWarning).
df_manual = (
    pd.read_csv(PATH_MANUAL_REVIEW)
    .query("source in ['Pubmed', 'Civic']")
    .astype({"pm_id": "Int64"})
    .rename(columns={"Ein- / Ausschlussgrund": "reason"})[
        ["pm_id", "publication_date", "reason", "is_included"]
    ]
    .copy()
)
df_manual["publication_date"] = pd.to_datetime(df_manual["publication_date"])
df_manual["is_excluded"] = ~df_manual["is_included"]
# For manually reviewed records, relevance is simply the inclusion decision.
df_manual["is_relevant"] = df_manual["is_included"]
df_manual["is_not_relevant"] = df_manual["is_excluded"]
# The free-text reason is kept as exclusion reason only for excluded records.
# Vectorised `where` replaces the row-wise apply (which also breaks on an empty frame).
df_manual["exclusion_reason"] = df_manual["reason"].where(
    df_manual["is_excluded"], pd.NA
)
df_manual = df_manual.drop(columns=["reason"])

# Since these records have been retrieved from the system before, we know that they are in the DB and that they are RCTs:
df_manual["publication_types"] = [[] for _ in range(len(df_manual))]
df_manual["is_rct_api"] = True

# But these 3 are actually reviews and not RCTs:
df_manual.loc[df_manual["pm_id"].isin(MANUAL_NON_RCT_PM_IDS), "is_rct_api"] = False

df_manual["is_in_db"] = True
df_manual["is_in_ggponc"] = False
df_manual["screening_origin"] = "manual"

In [71]:
# Tag the rows parsed from the RIS exports with their origin.
df_ris["screening_origin"] = "original"

In [72]:
# Append the manual-review rows. Index labels of both frames are kept as they are
# (no ignore_index), so labels may repeat in the combined frame.
df_ris = pd.concat([df_ris, df_manual])

### Saving the output

In [73]:
# Number of rows per combination of origin, relevance and RCT flag.
df_ris.groupby(['screening_origin', 'is_relevant', 'is_rct_api']).size()

screening_origin  is_relevant  is_rct_api
manual            False        False            3
                               True             6
                  True         True             9
original          False        False         2637
                               True           290
                  True         False           55
                               True            26
dtype: int64

In [74]:
from evaluation.parsing import save_parsed_ris_content

In [75]:
# Write the combined screening data to PATH_OUTPUT. The guideline id comes from the
# GUIDELINE_ID constant (same value as the former hard-coded literal) so that the
# GGPONC filter and the saved output cannot drift apart.
save_parsed_ris_content(
    df=df_ris,
    guideline_id=GUIDELINE_ID,
    guideline_name="Esophageal Carcinoma",
    cols_additional=["rq", "rqs_and_reasons"],
    csv_path=PATH_OUTPUT,
)

#### Sankey Diagram for thesis

In [None]:
# Total number of rows in df_ris at this point.
len(df_ris)

In [None]:
# Second name for the current frame (an alias, not a copy) before df_ris is rebound below.
df_ris_with_manual = df_ris

In [None]:
# From here on, df_ris holds only the rows tagged with screening_origin == 'original'.
df_ris = df_ris.query("screening_origin == 'original'")

This is an attempt to reconstruct the screening process from the available exclusion reason tags. The overall flow is not perfectly documented by the files, so this is a best effort.

In [None]:
# Records cited in GGPONC (and not rejected for the publication time frame) that the
# screening did not include. Computed directly from df_ris: the former expression used
# `in_ggponc`, which is only defined in the next cell, and `is_excluded`, which is only
# filled for manually reviewed rows.
df_ris.query(
    "is_in_ggponc & (exclusion_reason != 'FULLTEXT_exclude_Falscher Publikationszeitraum') & ~is_included"
).shape

In [None]:
# Query fragments shared by several subsets; they are composed so that the resulting
# query strings are exactly the ones used before.
Q_GGPONC_IN_TF = "is_in_ggponc & (exclusion_reason != 'FULLTEXT_exclude_Falscher Publikationszeitraum')"
Q_RELEVANT = f"(is_included | ({Q_GGPONC_IN_TF}))"

# Final outcome, split by publication type.
final_other = df_ris.query(f"{Q_RELEVANT} & ~is_rct_api")
final_rct = df_ris.query(f"{Q_RELEVANT} & is_rct_api")

# Already cited in GGPONC (and not rejected for the time frame).
in_ggponc = df_ris.query(Q_GGPONC_IN_TF)
in_ggponc_rct = in_ggponc.query("is_rct_api")
in_ggponc_other = in_ggponc.query("~is_rct_api")

# Included by the screening but not cited in GGPONC.
not_in_ggponc = df_ris.query("~is_in_ggponc & is_included")
not_in_ggponc_rct = df_ris.query("~is_in_ggponc & is_included & is_rct_api")
not_in_ggponc_other = df_ris.query("~is_in_ggponc & is_included & ~is_rct_api")

# Full-text stage: included ...
ft_inc = df_ris.query("is_included")
ft_inc_ggponc = ft_inc.query("is_in_ggponc")
ft_inc_no_ggponc = ft_inc.query("~is_in_ggponc")

# ... and excluded.
ft_exc = df_ris.query("exclusion_reason.str.startswith('FULLTEXT_exclude')")
ft_exc_ggponc = ft_exc.query(Q_GGPONC_IN_TF)
ft_exc_no_ggponc = ft_exc.query("~is_in_ggponc")

# Second title/abstract round: everything that reached full text counts as included.
tiab2_exc = df_ris.query("exclusion_reason == 'TIAB2_exclude'")
tiab2_inc = pd.concat([ft_inc, ft_exc])
tiab2_exc_ggponc = tiab2_exc.query("is_in_ggponc")
tiab2_exc_no_ggponc = tiab2_exc.query("~is_in_ggponc")

# First title/abstract round: records whose final state is TIAB1_include plus
# everything that went further.
tiab1_inc_final = df_ris.query("exclusion_reason == 'TIAB1_include'")
tiab1_inc = pd.concat([tiab2_inc, tiab2_exc, tiab1_inc_final])
tiab1_exc = df_ris.query("exclusion_reason == 'TIAB1_exclude'")
tiab1_exc_ggponc = tiab1_exc.query("is_in_ggponc")
tiab1_exc_no_ggponc = tiab1_exc.query("~is_in_ggponc")

In [None]:
def _node(label, count, label_pos):
    """Build one Sankey node tuple: (label, value, options) with the given label position."""
    return (label, count, {"label_pos": label_pos})


# Some records have the final state TIAB1_include; they are shown together with the
# records excluded in the second title/abstract round.
n_tiab2_excluded = len(tiab2_exc) + len(tiab1_inc_final)

# One inner list per column of the diagram, from left to right.
nodes = [
    [("All Found", len(df_ris))],
    [
        _node("1. T/A\nExcluded", len(tiab1_exc), "right"),
        _node("1. T/A\nIncluded", len(tiab1_inc), "bottom"),
    ],
    [
        _node("2. T/A\nExcluded", n_tiab2_excluded, "top"),
        _node("2. T/A\nIncluded", len(tiab2_inc), "bottom"),
    ],
    [
        _node("Full Text\nExcluded", len(ft_exc), "top"),
        _node("Full Text\nIncluded", len(ft_inc), "bottom"),
    ],
    [
        _node("Already cited\nin CPG", len(in_ggponc), "top"),
        _node("Not\nin CPG", len(not_in_ggponc), "top"),
    ],
    [
        _node("RCTs", len(final_rct), "right"),
        _node("Other", len(final_other), "right"),
    ],
]

In [None]:
# Node labels (must match the labels used in `nodes` exactly).
lbl_all = "All Found"
lbl_t1_exc, lbl_t1_inc = "1. T/A\nExcluded", "1. T/A\nIncluded"
lbl_t2_exc, lbl_t2_inc = "2. T/A\nExcluded", "2. T/A\nIncluded"
lbl_ft_exc, lbl_ft_inc = "Full Text\nExcluded", "Full Text\nIncluded"
lbl_in_cpg, lbl_not_in_cpg = "Already cited\nin CPG", "Not\nin CPG"
lbl_rct, lbl_other = "RCTs", "Other"

# Each flow is (source label, target label, number of records).
flows = [
    # search result -> first title/abstract round
    (lbl_all, lbl_t1_exc, len(tiab1_exc)),
    (lbl_all, lbl_t1_inc, len(tiab1_inc)),
    (lbl_t1_exc, lbl_in_cpg, len(tiab1_exc_ggponc)),
    # first -> second title/abstract round
    (lbl_t1_inc, lbl_t2_exc, len(tiab2_exc) + len(tiab1_inc_final)),
    (lbl_t1_inc, lbl_t2_inc, len(tiab2_inc)),
    (lbl_t2_exc, lbl_in_cpg, len(tiab2_exc_ggponc)),
    # second title/abstract round -> full text
    (lbl_t2_inc, lbl_ft_exc, len(ft_exc)),
    (lbl_t2_inc, lbl_ft_inc, len(ft_inc)),
    (lbl_ft_exc, lbl_in_cpg, len(ft_exc_ggponc)),
    (lbl_ft_inc, lbl_in_cpg, len(ft_inc_ggponc)),
    (lbl_ft_inc, lbl_not_in_cpg, len(ft_inc_no_ggponc)),
    # split by publication type
    (lbl_in_cpg, lbl_rct, len(in_ggponc_rct)),
    (lbl_in_cpg, lbl_other, len(in_ggponc_other)),
    (lbl_not_in_cpg, lbl_rct, len(not_in_ggponc_rct)),
    (lbl_not_in_cpg, lbl_other, len(not_in_ggponc_other)),
]

In [None]:
from sankeyflow import Sankey
from evaluation.plotting import set_size
import matplotlib.pyplot as plt

# Optional thesis style sheet (disabled):
# plt.style.use("../thesis.mplstyle")

# Diagram configuration, kept separate from the drawing calls.
sankey_options = {
    "flow_color_mode": "source",
    "nodes": nodes,
    "flows": flows,
    "node_pad_y_min": 0.3,
    "align_y": "tree",
    "flow_color_mode_alpha": 0.8,
}
sankey_pdf_path = PATH_LATEX_PLOT_OUTPUT / "sankey_unique_per_step_reconstructed.pdf"

fig, ax = plt.subplots(figsize=set_size("thesis"), layout="constrained")

s = Sankey(**sankey_options)
s.draw(ax=ax)

# Written into the figure directory of the thesis.
fig.savefig(sankey_pdf_path)