Skip to content

Commit

Permalink
add new rule usher_metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Katherine Eaton authored and ktmeaton committed Apr 14, 2022
1 parent c500345 commit fdee0da
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 76 deletions.
10 changes: 10 additions & 0 deletions data/controls/issue_to_lineage.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
444 XD
454 XE
422 XF
439 XF
441 XF
447 XG
448 XH
467 XJ
471 XS
468 XQ
File renamed without changes.
10 changes: 0 additions & 10 deletions data/controls/usher_to_pango.tsv

This file was deleted.

11 changes: 10 additions & 1 deletion defaults/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ rule_params:
# k : The number of contextual samples to select when extracting subtrees.
- name: usher_subtree
k: 100
auspice_color_by: "lineage_usher"
auspice_tip_labels: ""

# extra_cols : Extra columns from the metadata to output (csv)
- name: usher_metadata
extra_cols: "date,country"

# extra_cols : Extra columns from the metadata to output (csv)
- name: summary
Expand All @@ -101,7 +107,7 @@ rule_params:
# cols : Columns to output in the report (csv)
- name: report
cols: "strain,usher_pango_lineage_map,Nextclade_pango,sc2rf_clades_filter,date,country,sc2rf_breakpoints_regions_filter,usher_subtree"
cols_rename: "strain,usher_lineage,nextclade_lineage,clades,date,country,breakpoints,subtree"
cols_rename: "strain,lineage_usher,lineage_nextclade,parents,date,country,breakpoints,subtree"



Expand All @@ -123,6 +129,9 @@ builds:
- name: controls
base_input: public-latest

usher_metadata:
extra_cols: "date,country,gisaid_epi_isl,genbank_accession"

# Universal rule parameters can (optionally) be overridden for a specific build.
sc2rf:
sc2rf_args: "--ansi --parents 2-4 --breakpoints 1-4 --unique 1 --max-intermission-length 3 --max-intermission-count 3 --mutation-threshold 0.5"
5 changes: 4 additions & 1 deletion docs/notes/Notes_development.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
1. Add new rule `report`.
1. Filter non-recombinants from `sc2rf` ansi output.
1. Fix `subtrees_collapse` failing if only 1 tree specified
1. Add new rule `usher_metadata` for merge metadata for subtrees.

## To Do

1. Consolidate `summary` and `usher_metadata`.
1. Edit Auspice json to change default colorings,filters,and panels.
1. Automate unit test update.
1. Move `report` output to `reporting` directory.
1. Test the `exclude_clades` param of Nextclade to make sure isn't removing.
1. Test the `exclude_clades` param of Nextclade to make sure it isn't removing.
2 changes: 1 addition & 1 deletion scripts/slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ cpus="${cpus:-1}"
mem="${mem:-4GB}"

today=$(date +"%Y-%m-%d")
log_dir="logs/slurm"
log_dir="logs/ncov-recombinant"

# -----------------------------------------------------------------------------
# Run the Workflow
Expand Down
54 changes: 0 additions & 54 deletions scripts/usher_collapse_metadata.sh

This file was deleted.

48 changes: 48 additions & 0 deletions scripts/usher_metadata.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash

# -----------------------------------------------------------------------------
# Argument Parsing

while [[ $# -gt 0 ]]; do
case $1 in
--base-input)
base_input=$2
shift # past argument
shift # past value
;;
--build)
build=$2
shift # past argument
shift # past value
;;
--extra-cols)
extra_cols=$2
shift # past argument
shift # past value
;;
-*|--*)
echo "Unknown option: $1"
exit 1
;;
esac
done

base_metadata="data/${base_input}/metadata.tsv"
nextclade="results/${build}/nextclade.metadata.tsv"
sc2rf="results/${build}/sc2rf.recombinants.tsv"
usher_clades="results/${build}/usher.clades.tsv"
usher_placement="results/${build}/usher.placement_stats.tsv"
issue_to_lineage="data/controls/issue_to_lineage.tsv"

default_cols="strain"
extract_cols="clade,usher_clade,Nextclade_pango,usher_pango_lineage,dataset,sc2rf_clades_filter,sc2rf_clades_regions_filter,sc2rf_breakpoints_regions_filter,usher_num_best"
rename_cols="Nextstrain_clade,Nextstrain_clade_usher,pangolin_lineage,pango_lineage_usher,dataset,parents,parents_regions,breakpoints,usher_placements"
rename_cols_final="clade_nextclade,clade_usher,lineage_nextclade,lineage_usher,dataset,parents,parents_regions,breakpoints,usher_placements"

csvtk merge -t -f "strain" ${nextclade} ${sc2rf} ${usher_clades} ${usher_placement} \
| csvtk cut -t -f "${default_cols},${extra_cols},${extract_cols}" \
| csvtk rename -t -f "${extract_cols}" -n "${rename_cols}" \
| csvtk concat -t -u "?" - ${base_metadata} \
| csvtk cut -t -f "${default_cols},${extra_cols},${rename_cols}" \
| csvtk rename -t -f "${rename_cols}" -n "${rename_cols_final}" \
| csvtk replace -t -f "lineage_usher" -p "proposed([0-9]+)" -k ${issue_to_lineage} -r "{kv}"
54 changes: 45 additions & 9 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ rule usher:
pb = lambda wildcards: "data/{base_input}/usher.pb.gz".format(
base_input=config["builds"][wildcards.build]["base_input"]
),
usher_to_pango = "data/controls/usher_to_pango.tsv",
issue_to_lineage = "data/controls/issue_to_lineage.tsv",
output:
pb = "results/{build}/usher.pb.gz",
samples = "results/{build}/usher.samples.txt",
Expand Down Expand Up @@ -516,7 +516,7 @@ rule usher:
echo -e "strain\\tusher_clade\\tusher_pango_lineage" > {output.clades};
cat {params.outdir}/clades.txt >> {output.clades};
csvtk mutate -t -f "usher_pango_lineage" -n "usher_pango_lineage_map" {output.clades} \
| csvtk replace -t -f "usher_pango_lineage_map" -p "(proposed[0-9]+)" -k {input.usher_to_pango} -r "{{kv}}" \
| csvtk replace -t -f "usher_pango_lineage_map" -p "proposed([0-9]+)" -k {input.issue_to_lineage} -r "{{kv}}" \
> {output.clades}.tmp;
mv {output.clades}.tmp {output.clades}
Expand All @@ -529,6 +529,41 @@ rule usher:
rm -f {params.outdir}/placement_stats.tsv
"""

# -----------------------------------------------------------------------------
rule_name = "usher_metadata"
rule usher_metadata:
"""
Merge metadata for UShER protobuf.
"""
message: "{wildcards.build} | Merging metadata for UShER protobuf."
wildcard_constraints:
mode = "|".join(LOCAL_INPUTS),
input:
base_metadata = lambda wildcards: "data/{base_input}/metadata.tsv".format(
base_input=config["builds"][wildcards.build]["base_input"]
),
clades = "results/{build}/usher.clades.tsv",
output:
metadata = "results/{build}/usher.metadata.tsv",
params:
base_input = lambda wildcards: config["builds"][wildcards.build]["base_input"],
extra_cols = lambda wildcards: config["builds"][wildcards.build]["usher_metadata"]["extra_cols"],
threads: 1
resources:
cpus = 1,
benchmark:
"benchmarks/{rule}/{{build}}_{today}.tsv".format(today=today, rule=rule_name),
log:
"logs/{rule}/{{build}}_{today}.log".format(today=today, rule=rule_name),
shell:
"""
bash scripts/usher_metadata.sh \
--base-input {params.base_input} \
--build {wildcards.build} \
--extra-cols {params.extra_cols} \
1> {output.metadata} 2> {log};
"""

# -----------------------------------------------------------------------------
# The outdir param doesn't work correctly, hence the cd and relative paths

Expand All @@ -539,8 +574,9 @@ rule usher_subtree:
"""
message: "{wildcards.build} | Creating subtrees from UShER protobuf."
input:
pb = "results/{build}/usher.pb.gz",
samples = "results/{build}/sc2rf.recombinants.txt",
pb = "results/{build}/usher.pb.gz",
metadata = "results/{build}/usher.metadata.tsv",
samples = "results/{build}/sc2rf.recombinants.txt",
output:
subtrees_dir = directory("results/{build}/subtrees"),
metadata = "results/{build}/subtrees_collapse/metadata.tsv",
Expand All @@ -560,15 +596,16 @@ rule usher_subtree:
# Create subtrees
mkdir -p {output.subtrees_dir}
cd {output.subtrees_dir}
matUtils extract -i ../../../{input.pb} --nearest-k-batch ../../../{input.samples}:{params.k} > ../../../{log} 2>&1;
matUtils extract \
-i ../../../{input.pb} \
--nearest-k-batch ../../../{input.samples}:{params.k} \
-M ../../../{input.metadata} \
> ../../../{log} 2>&1;
cd - >> ../../../{log}
# Collapse subtrees
mkdir -p {output.collapse_dir};
python3 scripts/usher_collapse.py --indir {output.subtrees_dir} --outdir {output.collapse_dir} >> {log} 2>&1;
# Collapse metadata
bash scripts/usher_collapse_metadata.sh --base-input {params.base_input} --build {wildcards.build} >> {log} 2>&1;
"""

# -----------------------------------------------------------------------------#
Expand Down Expand Up @@ -625,7 +662,6 @@ rule report:
message: "{wildcards.build} | Generating a report."
input:
summary = "results/{build}/summary.tsv",
pango_to_issue = "data/controls/pango_to_issue.tsv",
output:
report = "results/{build}/report.tsv"
threads: 1
Expand Down

0 comments on commit fdee0da

Please sign in to comment.