Skip to content

Commit

Permalink
script: generalize compare_positives to use other lineage columns
Browse files: browse the repository at this point in the history
  • Loading branch information
Katherine Eaton committed Oct 3, 2022
1 parent ffd4f15 commit 1d5406f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 7 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,13 @@ my_profiles/
compare/

# Ignore datasets
data/public*
data/sars-cov-2*
data/canada*
data/requests*
data/custom*
data/X*
data/proposed*
data/gauntlet
data/*gisaid*

# all-contributors and yarn
yarn.lock
26 changes: 21 additions & 5 deletions scripts/compare_positives.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
UNKNOWN_COLOR = "dimgrey"
UNKNOWN_RGB = colors.to_rgb(UNKNOWN_COLOR)

LINEAGE_COLS = ["recombinant_lineage_curated", "lineage", "pango_lineage"]


def create_sankey_data(df):

Expand Down Expand Up @@ -210,13 +212,20 @@ def create_sankey_plot(sankey_data):
@click.option("--ver-2", help="Second version for title", required=True)
@click.option("--outdir", help="Output directory", required=True)
@click.option("--log", help="Logfile", required=False)
@click.option(
"--node-sort",
type=click.Choice(["size", "alphabetical"], case_sensitive=True),
required=False,
default="alphabetical",
)
def main(
positives_1,
positives_2,
ver_1,
ver_2,
outdir,
log,
node_sort,
):
"""Compare positive recombinants between two tables."""

Expand All @@ -231,11 +240,18 @@ def main(
logger.info("Parsing table: {}".format(positives_2))
positives_2_df = pd.read_csv(positives_2, sep="\t")

lineages_1 = positives_1_df[["strain", "recombinant_lineage_curated"]]
lineages_2 = positives_2_df[["strain", "recombinant_lineage_curated"]]

lineages_1 = lineages_1.rename(columns={"recombinant_lineage_curated": "source"})
lineages_2 = lineages_2.rename(columns={"recombinant_lineage_curated": "target"})
# For each table, use the first column listed in LINEAGE_COLS that is present as its lineage column
for col in LINEAGE_COLS:
if col in positives_1_df.columns:
lineages_1 = positives_1_df[["strain", col]]
lineages_1 = lineages_1.rename(columns={col: "source"})
break

for col in LINEAGE_COLS:
if col in positives_2_df.columns:
lineages_2 = positives_2_df[["strain", col]]
lineages_2 = lineages_2.rename(columns={col: "target"})
break

logger.info("Merging tables.")
lineages_df = pd.merge(lineages_1, lineages_2, how="outer", on="strain")
Expand Down

0 comments on commit 1d5406f

Please sign in to comment.