Skip to content

Commit

Permalink
consolidate lineage call in report
Browse files Browse the repository at this point in the history
  • Loading branch information
Katherine Eaton authored and ktmeaton committed Apr 25, 2022
1 parent 5f3e363 commit 93a3020
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 16 deletions.
1 change: 1 addition & 0 deletions docs/notes/Notes_development.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
1. Tidy up all the x_to_x.tsv files in `data/controls`.

- `issue_to_lineage.tsv` used by rule `usher`.
- `lineage_to_issue.tsv` used by rule `report`.

1. Investigate why X* lineages are excluded from nextclade
1. Add citations to report.
Expand Down
59 changes: 48 additions & 11 deletions scripts/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

@click.command()
@click.option("--summary", help="Summary (tsv).", required=True)
@click.option("--cols", help="Columns to rename (csv)", required=True)
@click.option("--cols-rename", help="Renamed columns (csv)", required=True)
@click.option(
"--years", help="Reporting years (csv)", required=True, default="2022,2021"
)
Expand Down Expand Up @@ -49,6 +47,7 @@ def main(

# Import Dataframes
summary_df = pd.read_csv(summary, sep="\t")
summary_df.fillna(NO_DATA_CHAR, inplace=True)

lineage_issue_df = pd.read_csv(lineage_to_issue, sep="\t", header=None)
lineage_issue_df.columns = ["lineage", "issue"]
Expand All @@ -57,15 +56,49 @@ def main(
breakpoint_issue_df.columns = ["breakpoint", "issue"]

# Select and rename columns from summary
cols = "strain,sc2rf_lineage,usher_pango_lineage_map,Nextclade_pango,sc2rf_clades_filter,date,country,sc2rf_breakpoints_regions_filter,usher_subtree"
cols_rename = "strain,lineage_sc2rf,lineage_usher,lineage_nextclade,parents,date,country,breakpoints,subtree"
cols_list = cols.split(",")
cols_rename_list = cols_rename.split(",")
rename_dict = {c: r for c, r in zip(cols_list, cols_rename_list)}
cols_dict = {
"strain": "strain",
"sc2rf_lineage": "lineage_sc2rf",
"usher_pango_lineage_map": "lineage_usher",
"Nextclade_pango": "lineage_nextclade",
"sc2rf_clades_filter": "parents",
"date": "date",
"country": "country",
"sc2rf_breakpoints_regions_filter": "breakpoints",
"usher_subtree": "subtree",
}
cols_list = list(cols_dict.keys())

report_df = summary_df[cols_list]
report_df.rename(columns=rename_dict, inplace=True)
report_df.rename(columns=cols_dict, inplace=True)
outpath = os.path.join(outdir, "linelist.tsv")
report_df.to_csv(outpath, sep="\t", index=False)
report_df.insert(1, "lineage", [NO_DATA_CHAR] * len(report_df))
report_df.insert(2, "classifier", [NO_DATA_CHAR] * len(report_df))

# Lineage classification
print(report_df)
for rec in report_df.iterrows():
lineage = ""
classifier = ""
lineages_sc2rf = rec[1]["lineage_sc2rf"].split(",")
lineage_usher = rec[1]["lineage_usher"]

# If sc2rf was unambiguous
if len(lineages_sc2rf) == 1 and lineages_sc2rf[0] != NO_DATA_CHAR:
lineage = lineages_sc2rf[0]
classifier = "sc2rf"
# Otherwise, use UShER
else:
lineage = lineage_usher
classifier = "UShER"

report_df.at[rec[0], "lineage"] = lineage
report_df.at[rec[0], "classifier"] = classifier

# Drop other lineage cols
drop_cols = ["lineage_sc2rf", "lineage_usher", "lineage_nextclade"]
report_df.drop(columns=drop_cols, inplace=True)

# Check if previous report was specified
if prev_report:
Expand Down Expand Up @@ -111,6 +144,7 @@ def main(
"earliest_date",
"latest_date",
"lineage:issue",
"classifier",
"subtree",
"year",
]
Expand All @@ -127,9 +161,10 @@ def main(
sequences = len(bp_df)
earliest_date = min(bp_df["datetime"])
latest_date = max(bp_df["datetime"])
subtree = list(bp_df["subtree"])[0]
lineage = list(bp_df["lineage_usher"])[0]
lineage = list(bp_df["lineage"])[0]
issue = NO_DATA_CHAR
classifier = list(bp_df["classifier"])[0]
subtree = list(bp_df["subtree"])[0]
year = earliest_date.year

# Try to find the issue based on lineages (designated)
Expand All @@ -150,7 +185,8 @@ def main(

for clade, rename in CLADES_RENAME.items():
parents = parents.replace(clade, rename)
parents = parents.replace(",", ", ")
parents = list(set(parents.split(",")))
parents = ", ".join(parents)

# Calculate growth from previous report
growth = 0
Expand All @@ -174,6 +210,7 @@ def main(
data["earliest_date"].append(earliest_date)
data["latest_date"].append(latest_date)
data["lineage:issue"].append(lineage_issue)
data["classifier"].append(classifier)
data["subtree"].append(subtree)
data["year"].append(year)

Expand Down
13 changes: 8 additions & 5 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -695,11 +695,14 @@ rule report:
"logs/{rule}/{{build}}_{today}.log".format(today=today, rule=rule_name),
shell:
"""
python3 scripts/report.py \
--summary {input.summary} \
--prev-report {params.prev_report} \
2> {log};
# Check if we're specifying a previous report to compare to
prev_report_flag=""
if [[ "{params.prev_report}" ]]; then
prev_report_flag="--prev-report {params.prev_report}"
fi
# Generate report
python3 scripts/report.py --summary {input.summary} $prev_report_flag 2> {log};
"""

# -----------------------------------------------------------------------------#
Expand Down

0 comments on commit 93a3020

Please sign in to comment.