Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:hbc/projects
- Loading branch information
Showing
4 changed files
with
271 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env python | ||
"""Prepare bcbio-nextgen configuration files for calling structural variation pilot. | ||
""" | ||
import csv | ||
import glob | ||
import os | ||
import subprocess | ||
import sys | ||
|
||
from bcbio import utils | ||
|
||
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv" | ||
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled" | ||
name = "alz-sv-pilot" | ||
|
||
def main(family_file, region_file): | ||
families = get_families(family_file) | ||
samples = add_bams(get_samples(families, priority_file), bam_dir) | ||
config_file = write_config_file(samples, name, region_file) | ||
|
||
def write_config_file(samples, name, region_file): | ||
meta_file = os.path.join(utils.safe_makedir(os.path.join(name, "config")), | ||
"%s.csv" % name) | ||
bams = [] | ||
with open(meta_file, "w") as out_handle: | ||
writer = csv.writer(out_handle) | ||
writer.writerow(["samplename", "description", "batch", "sex", | ||
"aligner", "mark_duplicates", "variantcaller", "svcaller", "variant_regions"]) | ||
for sample in sorted(samples, key=lambda x: x["family"]): | ||
bams.append(sample["bam"]) | ||
writer.writerow([os.path.basename(sample["bam"]), sample["sample"], sample["family"], | ||
sample["gender"], "false", "false", "false", "lumpy;cn.mops", region_file]) | ||
subprocess.check_call(["bcbio_nextgen.py", "-w", "template", "freebayes-variant", | ||
meta_file] + bams) | ||
|
||
def add_bams(samples, bam_dir): | ||
out = [] | ||
for sample in samples: | ||
sample["bam"] = get_bam(sample["sample"], bam_dir) | ||
out.append(sample) | ||
return out | ||
|
||
def get_bam(sample, bam_dir): | ||
bam_files = glob.glob(os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)) | ||
assert len(bam_files) > 0, "Did not find BAM files for %s: %s" % (sample, bam_files) | ||
if len(bam_files) > 1: | ||
bam_files = [x for x in bam_files if x.endswith(".bam")] | ||
return bam_files[0] | ||
|
||
def get_samples(families, fname): | ||
samples = [] | ||
with open(fname) as in_handle: | ||
reader = csv.reader(in_handle) | ||
reader.next() # header | ||
for parts in reader: | ||
family_id, sample_id, priority = parts[1:4] | ||
status_flag = parts[16] | ||
if status_flag != "Exclude" and family_id in families: | ||
samples.append({"sample": sample_id, "family": family_id, "gender": _get_gender(parts[12])}) | ||
return samples | ||
|
||
def _get_gender(gender): | ||
if gender.lower() in ["m", "male", "1"]: | ||
return "male" | ||
elif gender.lower() in ["f", "female", "2"]: | ||
return "female" | ||
else: | ||
return "" | ||
|
||
def get_families(in_file): | ||
families = set([]) | ||
with open(in_file) as in_handle: | ||
for line in in_handle: | ||
families.add(line.strip()) | ||
return families | ||
|
||
if __name__ == "__main__": | ||
main(*sys.argv[1:]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python | ||
"""Run squaring off batch scripts for priority 1 and 2 families. | ||
""" | ||
import csv | ||
import glob | ||
import os | ||
import shutil | ||
import subprocess | ||
import sys | ||
|
||
import joblib | ||
|
||
from bcbio import utils | ||
from bcbio.distributed.transaction import file_transaction | ||
from bcbio.variation import vcfutils | ||
|
||
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv" | ||
input_vcf_dir = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/recall_variants/freebayes" | ||
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled" | ||
name = "tanzi_ad_p1and2-square" | ||
ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa" | ||
|
||
def main(cores=1): | ||
start_dir = os.getcwd() | ||
work_dir = utils.safe_makedir("/scratch/square") | ||
priorities = set(["1", "2"]) | ||
list_file = get_input_list(start_dir, priorities) | ||
# Ensure input CRAMs are indexed; gets IO bound quickly so limit cores | ||
cram_cores = min(int(cores), 6) | ||
for cindex in joblib.Parallel(cram_cores)(joblib.delayed(index_cram)(x) for x in find_crams(list_file)): | ||
print cindex | ||
with utils.chdir(work_dir): | ||
out_file = run_squaring(list_file, name, ref_file, cores) | ||
for ext in ["", ".tbi"]: | ||
new_file = os.path.join(start_dir, os.path.basename(out_file) + ext) | ||
if not utils.file_exists(new_file): | ||
shutil.copy(out_file + ext, new_file) | ||
|
||
def run_squaring(list_file, name, ref_file, cores): | ||
mem = 3 * int(cores) | ||
mem_opts = ["-Xms%sG" % (mem // 2), "-Xmx%sG" % mem] | ||
out_file = os.path.join(os.getcwd(), "%s.vcf.gz" % name) | ||
if not utils.file_exists(out_file): | ||
subprocess.check_call(["bcbio-variation-recall"] + mem_opts + | ||
["square", out_file, ref_file, list_file, | ||
"--caller", "freebayes", "--cores", str(cores)]) | ||
return out_file | ||
|
||
def index_cram(fname): | ||
out_file = "%s.crai" % fname | ||
if not utils.file_exists(out_file): | ||
print "Indexing", fname | ||
with file_transaction(out_file) as tx_out_file: | ||
tx_in_file = os.path.splitext(tx_out_file)[0] | ||
utils.symlink_plus(fname, tx_in_file) | ||
subprocess.check_call(["cram_index", tx_in_file]) | ||
return out_file | ||
|
||
def find_crams(in_file): | ||
with open(in_file) as in_handle: | ||
for line in (l.strip() for l in in_handle): | ||
if line.endswith(".cram"): | ||
yield line | ||
|
||
def get_input_list(work_dir, priorities): | ||
list_file = os.path.join(work_dir, "p1and2-input-files.txt") | ||
if not utils.file_exists(list_file): | ||
families = read_families_by_priority(priority_file, priorities) | ||
vcf_files = [get_vcf(input_vcf_dir, fam) for fam in families] | ||
bam_files = [] | ||
for vcf_file in vcf_files: | ||
bam_files.extend(get_bams(vcf_file, bam_dir)) | ||
|
||
with open(list_file, "w") as out_handle: | ||
for fname in vcf_files + bam_files: | ||
out_handle.write(fname + "\n") | ||
return list_file | ||
|
||
def get_vcf(vcf_dir, fam): | ||
vcfs = sorted(glob.glob(os.path.join(vcf_dir, "%s-*vcf*" % fam))) | ||
for ending in [".vcf.gz", ".vcf"]: | ||
for f in vcfs: | ||
if f.endswith(ending): | ||
return f | ||
raise ValueError("Did not find VCF for %s in %s" % (fam, vcf_dir)) | ||
|
||
def get_bams(vcf_file, bam_dir): | ||
out = [] | ||
for sample in vcfutils.get_samples(vcf_file): | ||
bam_files = glob.glob(os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)) | ||
assert len(bam_files) > 0, "Did not find BAM files for %s: %s" % (sample, bam_files) | ||
if len(bam_files) > 1: | ||
bam_files = [x for x in bam_files if x.endswith(".bam")] | ||
out.append(bam_files[0]) | ||
return out | ||
|
||
def read_families_by_priority(fname, priorities): | ||
families = set([]) | ||
with open(fname) as in_handle: | ||
reader = csv.reader(in_handle) | ||
reader.next() # header | ||
for parts in reader: | ||
_, family_id, _, priority = parts[:4] | ||
status_flag = parts[16] | ||
if status_flag != "Exclude" and priority.strip() in priorities: | ||
families.add(family_id) | ||
return sorted(list(families)) | ||
|
||
|
||
if __name__ == "__main__": | ||
main(*sys.argv[1:]) |