Skip to content

Commit

Permalink
Merge branch 'master' of github.com:hbc/projects
Browse files Browse the repository at this point in the history
  • Loading branch information
roryk committed Apr 15, 2014
2 parents d86f963 + 82a883f commit 0fc20bd
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 27 deletions.
107 changes: 81 additions & 26 deletions tanzi_ad/config/runprep_config.yaml
Expand Up @@ -2,33 +2,88 @@ idmapping: gwas/NIMH_Patient.Rutgers.ID_Translation.txt
priority: gwas/AD-Master-v2.csv
coverage: inputs/alz-priority-regions.bed
fam: gwas/WGS_NIMH.fam
# -- Priority 2 full re-run

# -- Priority 1/2 re-run missing VCFs/samples
# p1 '51186'
# p2f-g7 '50296', '50948', '51184', '52267'
# 50948 has some confusing labels
# extra illumina sample, missing from AD-Master-v2: 5010533 actually 5010553 in 50296
# -- appears to be a typo
# Not in illumina ['5010623', '5010452']
# 3 families also missing single samples:
# ['5110084', '5010175', '5010500']
# 51105 50391 50722
#
# params:
# name: alz-p12_redo
# max_samples: 75
# families: ['51186', '50296', '50948', '51184', '52267', '51105', '50391', '50722']

# Total = 1476 + 1
# -- Priority 3 final 400
params:
name: alz-p3f
max_samples: 65
# finished p1 and p2
excludefamilies: ['50115', '50116', '50127', '50152', '50270',
'50288', '50291', '50302', '50307', '50323', '50328', '50329',
'50332', '50336', '50354', '50361', '50362', '50452', '50455',
'50456', '50523', '50589', '50590', '50593', '50638', '50646',
'50670', '50841', '50981', '50986', '50993', '51114', '51126',
'51136', '51146', '51147', '51154', '51161', '51181', '51185',
'51186', '51192', '51193', '51196', '51200', '51223', '51225',
'51227', '51241', '51243', '51253', '52104', '52106', '52111',
'52112', '52122', '52123', '52126', '52127', '52143', '52147',
'52149', '52153', '52154', '52163', '52164', '52179', '52180',
'52185', '52196', '52199', '52203', '52218', '52228', '52230',
'52231', '52250', '52251', '52260', 'gomez', 'shmaman', '88xxxx',
'50319', '50322', '50330', '50334', '50368', '50521', '50653', '50722',
'50723', '50840', '50854', '50942', '50980', '50985', '50994', '50995',
'51106', '51123', '51174', '51257', '51277', '51278', '52103', '52109',
'52110', '52125', '52140', '52142', '52174', '52259', '50143', '50296',
'50391', '50395', '50397', '50446', '50586', '50610', '50663', '50948',
'51105', '51109', '51157', '51159', '51180', '51184', '51214', '52115',
'52256', '52267',
# BAM files need transfer to sanger
'exapoe', '51160', '51188', '52248', '50445', '51206', '50996',
'51240', '50948', '52134', '52168']
name: alz-p3f_2
max_samples: 75
excludefamilies: ['50105', '50115', '50116', '50122', '50123', '50127', '50130', '50133',
'50143', '50152', '50260', '50261', '50270', '50279', '50288', '50291', '50296',
'50302', '50307', '50319', '50322', '50323', '50328', '50329', '50330', '50331',
'50332', '50334', '50336', '50343', '50354', '50361', '50362', '50367', '50368',
'50369', '50372', '50385', '50388', '50391', '50395', '50397', '50444', '50446',
'50452', '50455', '50456', '50459', '50461', '50500', '50506', '50521', '50523',
'50534', '50563', '50572', '50578', '50581', '50586', '50588', '50589', '50590',
'50593', '50610', '50637', '50638', '50641', '50646', '50648', '50649', '50653',
'50660', '50663', '50668', '50669', '50670', '50676', '50679', '50681', '50722',
'50723', '50732', '50735', '50748', '50771', '50798', '50803', '50808', '50814',
'50840', '50841', '50854', '50857', '50942', '50948', '50962', '50979', '50980',
'50981', '50985', '50986', '50990', '50991', '50993', '50994', '50995', '50997',
'50998', '50999', '51105', '51106', '51107', '51109', '51110', '51111', '51113',
'51114', '51115', '51119', '51123', '51126', '51129', '51134', '51136', '51137',
'51138', '51139', '51146', '51147', '51151', '51153', '51154', '51157', '51158',
'51159', '51161', '51167', '51174', '51175', '51176', '51177', '51180', '51181',
'51184', '51185', '51186', '51192', '51193', '51196', '51200', '51204', '51208',
'51211', '51213', '51214', '51216', '51219', '51223', '51225', '51226', '51227',
'51230', '51238', '51241', '51243', '51250', '51253', '51255', '51257', '51259',
'51263', '51264', '51266', '51267', '51268', '51277', '51278', '52102', '52103',
'52104', '52106', '52107', '52108', '52109', '52110', '52111', '52112', '52113',
'52115', '52121', '52122', '52123', '52125', '52126', '52127', '52133', '52136',
'52137', '52139', '52140', '52142', '52143', '52147', '52148', '52149', '52153',
'52154', '52156', '52157', '52158', '52159', '52161', '52163', '52164', '52165',
'52167', '52170', '52172', '52173', '52174', '52175', '52176', '52177', '52179',
'52180', '52181', '52183', '52184', '52185', '52186', '52187', '52189', '52190',
'52196', '52199', '52200', '52201', '52203', '52207', '52208', '52209', '52215',
'52216', '52218', '52223', '52225', '52226', '52228', '52230', '52231', '52233',
'52234', '52235', '52236', '52240', '52245', '52246', '52249', '52250', '52251',
'52252', '52253', '52254', '52256', '52258', '52259', '52260', '52261', '52262',
'52264', '52267', 'gomez', 'shmaman', '88xxxx']

# -- Priority 3 initial set
# params:
# name: alz-p3f
# max_samples: 65
# # finished p1 and p2
# excludefamilies: ['50115', '50116', '50127', '50152', '50270',
# '50288', '50291', '50302', '50307', '50323', '50328', '50329',
# '50332', '50336', '50354', '50361', '50362', '50452', '50455',
# '50456', '50523', '50589', '50590', '50593', '50638', '50646',
# '50670', '50841', '50981', '50986', '50993', '51114', '51126',
# '51136', '51146', '51147', '51154', '51161', '51181', '51185',
# '51186', '51192', '51193', '51196', '51200', '51223', '51225',
# '51227', '51241', '51243', '51253', '52104', '52106', '52111',
# '52112', '52122', '52123', '52126', '52127', '52143', '52147',
# '52149', '52153', '52154', '52163', '52164', '52179', '52180',
# '52185', '52196', '52199', '52203', '52218', '52228', '52230',
# '52231', '52250', '52251', '52260', 'gomez', 'shmaman', '88xxxx',
# '50319', '50322', '50330', '50334', '50368', '50521', '50653', '50722',
# '50723', '50840', '50854', '50942', '50980', '50985', '50994', '50995',
# '51106', '51123', '51174', '51257', '51277', '51278', '52103', '52109',
# '52110', '52125', '52140', '52142', '52174', '52259', '50143', '50296',
# '50391', '50395', '50397', '50446', '50586', '50610', '50663', '50948',
# '51105', '51109', '51157', '51159', '51180', '51184', '51214', '52115',
# '52256', '52267',
# # BAM files need transfer to sanger
# 'exapoe', '51160', '51188', '52248', '50445', '51206', '50996',
# '51240', '50948', '52134', '52168']
# -- Priority 2 full re-run
#params:
# name: alz-p2f
# priority: 2
Expand Down
2 changes: 1 addition & 1 deletion tanzi_ad/scripts/prep_run_configs_p1.py
Expand Up @@ -48,7 +48,7 @@ def write_config(g, baminfo, name, config):
"20000000", "high", "genome", config["coverage"]])
bam_files.append(bamfile)
else:
print("BAM file missing for %s: %s" % (info["sample"], family))
raise ValueError("BAM file missing for %s: %s" % (info["sample"], family))
subprocess.check_call(["bcbio_nextgen.py", "-w", "template", "freebayes-variant",
meta_file] + bam_files)

Expand Down
78 changes: 78 additions & 0 deletions tanzi_ad/scripts/prep_sv_pilot.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python
"""Prepare bcbio-nextgen configuration files for calling structural variation pilot.
"""
import csv
import glob
import os
import subprocess
import sys

from bcbio import utils

# Master sample sheet (AD-Master-v2): per-row family ID, sample ID,
# priority, gender code and a status flag (see get_samples below).
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv"
# Root of re-called alignments, laid out as <bam_dir>/*/final/<sample>/
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled"
# Run name; also names the output config directory and CSV sample sheet.
name = "alz-sv-pilot"

def main(family_file, region_file):
    """Build a bcbio-nextgen SV pilot configuration for the listed families.

    family_file -- text file with one family ID per line.
    region_file -- BED file of regions passed through to the sample sheet.
    """
    wanted = get_families(family_file)
    pilot = add_bams(get_samples(wanted, priority_file), bam_dir)
    config_file = write_config_file(pilot, name, region_file)

def write_config_file(samples, name, region_file):
    """Write the bcbio sample-sheet CSV and invoke the template generator.

    samples -- dicts with "sample", "family", "gender" and "bam" keys
               (see add_bams).
    name -- run name; the CSV is written to <name>/config/<name>.csv.
    region_file -- BED file recorded in the variant_regions column.

    Returns the path to the CSV metadata file (previously returned None,
    although main assigned the result to config_file).
    """
    meta_file = os.path.join(utils.safe_makedir(os.path.join(name, "config")),
                             "%s.csv" % name)
    bams = []
    with open(meta_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(["samplename", "description", "batch", "sex",
                         "aligner", "mark_duplicates", "variantcaller", "svcaller", "variant_regions"])
        # Sort by family so each family's samples share a batch for SV calling.
        for sample in sorted(samples, key=lambda x: x["family"]):
            bams.append(sample["bam"])
            writer.writerow([os.path.basename(sample["bam"]), sample["sample"], sample["family"],
                             sample["gender"], "false", "false", "false", "lumpy;cn.mops", region_file])
    subprocess.check_call(["bcbio_nextgen.py", "-w", "template", "freebayes-variant",
                           meta_file] + bams)
    return meta_file

def add_bams(samples, bam_dir):
    """Attach the located alignment file to each sample dict under "bam"."""
    for entry in samples:
        entry["bam"] = get_bam(entry["sample"], bam_dir)
    # Return a fresh list holding the (mutated) sample dicts.
    return list(samples)

def get_bam(sample, bam_dir):
    """Locate the alignment file for *sample* under *bam_dir*.

    Searches <bam_dir>/*/final/<sample>/<sample>-*am, which matches both
    .bam and .cram files.  When more than one file matches, a .bam is
    preferred over a .cram.

    Raises ValueError when no alignment file is found (was an assert,
    which is stripped under `python -O`).
    """
    pattern = os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)
    # Sort so the choice is deterministic across filesystems.
    bam_files = sorted(glob.glob(pattern))
    if not bam_files:
        raise ValueError("Did not find BAM files for %s: %s" % (sample, bam_files))
    if len(bam_files) > 1:
        # Prefer BAM over CRAM, but keep the originals if nothing ends
        # in .bam (the old filter would have left an empty list).
        only_bams = [x for x in bam_files if x.endswith(".bam")]
        if only_bams:
            bam_files = only_bams
    return bam_files[0]

def get_samples(families, fname):
    """Read sample records for the wanted families from the master CSV.

    families -- set of family IDs to keep.
    fname -- AD-Master CSV; column 1 is the family ID, column 2 the
             sample ID, column 12 a gender code and column 16 a status flag.

    Rows flagged "Exclude" or belonging to other families are skipped.
    Returns dicts with "sample", "family" and "gender" keys.
    """
    samples = []
    with open(fname) as in_handle:
        reader = csv.reader(in_handle)
        # next() works on both Python 2 and 3; reader.next() is 2-only.
        next(reader)  # header
        for parts in reader:
            family_id, sample_id, priority = parts[1:4]
            status_flag = parts[16]
            if status_flag != "Exclude" and family_id in families:
                samples.append({"sample": sample_id, "family": family_id, "gender": _get_gender(parts[12])})
    return samples

def _get_gender(gender):
if gender.lower() in ["m", "male", "1"]:
return "male"
elif gender.lower() in ["f", "female", "2"]:
return "female"
else:
return ""

def get_families(in_file):
    """Return the set of (stripped) family IDs listed one per line in *in_file*."""
    with open(in_file) as in_handle:
        return set(line.strip() for line in in_handle)

if __name__ == "__main__":
    # Usage: prep_sv_pilot.py <family_file> <region_file>
    main(*sys.argv[1:])
111 changes: 111 additions & 0 deletions tanzi_ad/scripts/run_p1and2_squaring.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
"""Run squaring off batch scripts for priority 1 and 2 families.
"""
import csv
import glob
import os
import shutil
import subprocess
import sys

import joblib

from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.variation import vcfutils

# Master sample sheet (AD-Master-v2): family ID, priority and status columns.
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv"
# Per-family freebayes VCFs from the re-calling runs (globbed as <family>-*vcf*).
input_vcf_dir = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/recall_variants/freebayes"
# Root of re-called alignments, laid out as <bam_dir>/*/final/<sample>/
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled"
# Base name for the squared-off output VCF.
name = "tanzi_ad_p1and2-square"
ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"

def main(cores=1):
    """Square off priority 1/2 family VCFs into one multi-sample VCF.

    cores -- core count; may arrive as a string from sys.argv, so it is
             converted with int() wherever a number is needed.
    """
    start_dir = os.getcwd()
    # Squaring runs in scratch space; results are copied back at the end.
    work_dir = utils.safe_makedir("/scratch/square")
    priorities = set(["1", "2"])
    list_file = get_input_list(start_dir, priorities)
    # Ensure input CRAMs are indexed; gets IO bound quickly so limit cores
    cram_cores = min(int(cores), 6)
    for cindex in joblib.Parallel(cram_cores)(joblib.delayed(index_cram)(x) for x in find_crams(list_file)):
        print cindex
    with utils.chdir(work_dir):
        out_file = run_squaring(list_file, name, ref_file, cores)
    # Copy the squared VCF and its tabix index back beside the start dir,
    # skipping files that already exist from a previous run.
    for ext in ["", ".tbi"]:
        new_file = os.path.join(start_dir, os.path.basename(out_file) + ext)
        if not utils.file_exists(new_file):
            shutil.copy(out_file + ext, new_file)

def run_squaring(list_file, name, ref_file, cores):
    """Run bcbio-variation-recall to square off samples; return the merged VCF.

    Skips the call entirely when the output VCF already exists.
    """
    out_file = os.path.join(os.getcwd(), "%s.vcf.gz" % name)
    if utils.file_exists(out_file):
        return out_file
    # Scale JVM heap with core count: 3 GB per core, starting at half.
    total_mem = 3 * int(cores)
    jvm_opts = ["-Xms%sG" % (total_mem // 2), "-Xmx%sG" % total_mem]
    cmd = (["bcbio-variation-recall"] + jvm_opts +
           ["square", out_file, ref_file, list_file,
            "--caller", "freebayes", "--cores", str(cores)])
    subprocess.check_call(cmd)
    return out_file

def index_cram(fname):
    """Create a .crai index for *fname* if missing; return the index path."""
    out_file = "%s.crai" % fname
    if not utils.file_exists(out_file):
        print "Indexing", fname
        # Run cram_index against a symlink placed in the transaction
        # directory: cram_index appears to write the .crai next to its
        # input, so a failed run never leaves a partial index beside the
        # real CRAM -- TODO confirm cram_index output location.
        with file_transaction(out_file) as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(fname, tx_in_file)
            subprocess.check_call(["cram_index", tx_in_file])
    return out_file

def find_crams(in_file):
    """Yield each whitespace-stripped line of *in_file* naming a CRAM file."""
    with open(in_file) as in_handle:
        for raw_line in in_handle:
            fname = raw_line.strip()
            if fname.endswith(".cram"):
                yield fname

def get_input_list(work_dir, priorities):
    """Write (once) and return a text file listing input VCFs and BAMs.

    The file holds one path per line: all family VCFs first, then every
    alignment file for the samples they contain.
    """
    list_file = os.path.join(work_dir, "p1and2-input-files.txt")
    if utils.file_exists(list_file):
        return list_file
    families = read_families_by_priority(priority_file, priorities)
    vcf_files = [get_vcf(input_vcf_dir, fam) for fam in families]
    bam_files = []
    for vcf_file in vcf_files:
        bam_files.extend(get_bams(vcf_file, bam_dir))
    with open(list_file, "w") as out_handle:
        for fname in vcf_files + bam_files:
            out_handle.write("%s\n" % fname)
    return list_file

def get_vcf(vcf_dir, fam):
    """Return the family VCF in *vcf_dir*, preferring bgzipped .vcf.gz over .vcf.

    Raises ValueError when no VCF exists for the family.
    """
    candidates = sorted(glob.glob(os.path.join(vcf_dir, "%s-*vcf*" % fam)))
    for suffix in (".vcf.gz", ".vcf"):
        matches = [f for f in candidates if f.endswith(suffix)]
        if matches:
            return matches[0]
    raise ValueError("Did not find VCF for %s in %s" % (fam, vcf_dir))

def get_bams(vcf_file, bam_dir):
    """Find one alignment file per sample named in *vcf_file*.

    Searches <bam_dir>/*/final/<sample>/<sample>-*am (matches .bam and
    .cram); a .bam is preferred when several files match.

    Raises ValueError when a sample has no alignment file (was an
    assert, which is stripped under `python -O`).
    """
    out = []
    for sample in vcfutils.get_samples(vcf_file):
        pattern = os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)
        # Sort so the choice is deterministic across filesystems.
        bam_files = sorted(glob.glob(pattern))
        if not bam_files:
            raise ValueError("Did not find BAM files for %s: %s" % (sample, bam_files))
        if len(bam_files) > 1:
            # Prefer BAM over CRAM, keeping the originals if nothing
            # ends in .bam (the old filter could leave an empty list).
            only_bams = [x for x in bam_files if x.endswith(".bam")]
            if only_bams:
                bam_files = only_bams
        out.append(bam_files[0])
    return out

def read_families_by_priority(fname, priorities):
    """Return sorted family IDs from the master CSV matching *priorities*.

    fname -- AD-Master CSV; column 1 is the family ID, column 3 the
             priority and column 16 a status flag.
    priorities -- set of priority strings (e.g. {"1", "2"}) to keep.

    Rows flagged "Exclude" are skipped; priorities are whitespace-stripped
    before comparison.
    """
    families = set()
    with open(fname) as in_handle:
        reader = csv.reader(in_handle)
        # next() works on both Python 2 and 3; reader.next() is 2-only.
        next(reader)  # header
        for parts in reader:
            _, family_id, _, priority = parts[:4]
            status_flag = parts[16]
            if status_flag != "Exclude" and priority.strip() in priorities:
                families.add(family_id)
    return sorted(families)


if __name__ == "__main__":
    # Usage: run_p1and2_squaring.py [cores]
    main(*sys.argv[1:])

0 comments on commit 0fc20bd

Please sign in to comment.