Skip to content

Commit

Permalink
Merge branch 'master' of github.com:hbc/projects
Browse files Browse the repository at this point in the history
  • Loading branch information
roryk committed Apr 15, 2014
2 parents d86f963 + 82a883f commit 0fc20bd
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 27 deletions.
107 changes: 81 additions & 26 deletions tanzi_ad/config/runprep_config.yaml
Expand Up @@ -2,33 +2,88 @@ idmapping: gwas/NIMH_Patient.Rutgers.ID_Translation.txt
priority: gwas/AD-Master-v2.csv
coverage: inputs/alz-priority-regions.bed
fam: gwas/WGS_NIMH.fam
# -- Priority 2 full re-run

# -- Priority 1/2 re-run missing VCFs/samples
# p1 '51186'
# p2f-g7 '50296', '50948', '51184', '52267'
# 50948 has some confusing labels
# extra illumina sample, missing from AD-Master-v2: 5010533 actually 5010553 in 50296
# -- appears to be a typo
# Not in illumina ['5010623', '5010452']
# 3 families also missing single samples:
# ['5110084', '5010175', '5010500']
# 51105 50391 50722
#
# params:
# name: alz-p12_redo
# max_samples: 75
# families: ['51186', '50296', '50948', '51184', '52267', '51105', '50391', '50722']

# Total = 1476 + 1
# -- Priority 3 final 400
params:
name: alz-p3f
max_samples: 65
# finished p1 and p2
excludefamilies: ['50115', '50116', '50127', '50152', '50270',
'50288', '50291', '50302', '50307', '50323', '50328', '50329',
'50332', '50336', '50354', '50361', '50362', '50452', '50455',
'50456', '50523', '50589', '50590', '50593', '50638', '50646',
'50670', '50841', '50981', '50986', '50993', '51114', '51126',
'51136', '51146', '51147', '51154', '51161', '51181', '51185',
'51186', '51192', '51193', '51196', '51200', '51223', '51225',
'51227', '51241', '51243', '51253', '52104', '52106', '52111',
'52112', '52122', '52123', '52126', '52127', '52143', '52147',
'52149', '52153', '52154', '52163', '52164', '52179', '52180',
'52185', '52196', '52199', '52203', '52218', '52228', '52230',
'52231', '52250', '52251', '52260', 'gomez', 'shmaman', '88xxxx',
'50319', '50322', '50330', '50334', '50368', '50521', '50653', '50722',
'50723', '50840', '50854', '50942', '50980', '50985', '50994', '50995',
'51106', '51123', '51174', '51257', '51277', '51278', '52103', '52109',
'52110', '52125', '52140', '52142', '52174', '52259', '50143', '50296',
'50391', '50395', '50397', '50446', '50586', '50610', '50663', '50948',
'51105', '51109', '51157', '51159', '51180', '51184', '51214', '52115',
'52256', '52267',
# BAM files need transfer to sanger
'exapoe', '51160', '51188', '52248', '50445', '51206', '50996',
'51240', '50948', '52134', '52168']
name: alz-p3f_2
max_samples: 75
excludefamilies: ['50105', '50115', '50116', '50122', '50123', '50127', '50130', '50133',
'50143', '50152', '50260', '50261', '50270', '50279', '50288', '50291', '50296',
'50302', '50307', '50319', '50322', '50323', '50328', '50329', '50330', '50331',
'50332', '50334', '50336', '50343', '50354', '50361', '50362', '50367', '50368',
'50369', '50372', '50385', '50388', '50391', '50395', '50397', '50444', '50446',
'50452', '50455', '50456', '50459', '50461', '50500', '50506', '50521', '50523',
'50534', '50563', '50572', '50578', '50581', '50586', '50588', '50589', '50590',
'50593', '50610', '50637', '50638', '50641', '50646', '50648', '50649', '50653',
'50660', '50663', '50668', '50669', '50670', '50676', '50679', '50681', '50722',
'50723', '50732', '50735', '50748', '50771', '50798', '50803', '50808', '50814',
'50840', '50841', '50854', '50857', '50942', '50948', '50962', '50979', '50980',
'50981', '50985', '50986', '50990', '50991', '50993', '50994', '50995', '50997',
'50998', '50999', '51105', '51106', '51107', '51109', '51110', '51111', '51113',
'51114', '51115', '51119', '51123', '51126', '51129', '51134', '51136', '51137',
'51138', '51139', '51146', '51147', '51151', '51153', '51154', '51157', '51158',
'51159', '51161', '51167', '51174', '51175', '51176', '51177', '51180', '51181',
'51184', '51185', '51186', '51192', '51193', '51196', '51200', '51204', '51208',
'51211', '51213', '51214', '51216', '51219', '51223', '51225', '51226', '51227',
'51230', '51238', '51241', '51243', '51250', '51253', '51255', '51257', '51259',
'51263', '51264', '51266', '51267', '51268', '51277', '51278', '52102', '52103',
'52104', '52106', '52107', '52108', '52109', '52110', '52111', '52112', '52113',
'52115', '52121', '52122', '52123', '52125', '52126', '52127', '52133', '52136',
'52137', '52139', '52140', '52142', '52143', '52147', '52148', '52149', '52153',
'52154', '52156', '52157', '52158', '52159', '52161', '52163', '52164', '52165',
'52167', '52170', '52172', '52173', '52174', '52175', '52176', '52177', '52179',
'52180', '52181', '52183', '52184', '52185', '52186', '52187', '52189', '52190',
'52196', '52199', '52200', '52201', '52203', '52207', '52208', '52209', '52215',
'52216', '52218', '52223', '52225', '52226', '52228', '52230', '52231', '52233',
'52234', '52235', '52236', '52240', '52245', '52246', '52249', '52250', '52251',
'52252', '52253', '52254', '52256', '52258', '52259', '52260', '52261', '52262',
'52264', '52267', 'gomez', 'shmaman', '88xxxx']

# -- Priority 3 initial set
# params:
# name: alz-p3f
# max_samples: 65
# # finished p1 and p2
# excludefamilies: ['50115', '50116', '50127', '50152', '50270',
# '50288', '50291', '50302', '50307', '50323', '50328', '50329',
# '50332', '50336', '50354', '50361', '50362', '50452', '50455',
# '50456', '50523', '50589', '50590', '50593', '50638', '50646',
# '50670', '50841', '50981', '50986', '50993', '51114', '51126',
# '51136', '51146', '51147', '51154', '51161', '51181', '51185',
# '51186', '51192', '51193', '51196', '51200', '51223', '51225',
# '51227', '51241', '51243', '51253', '52104', '52106', '52111',
# '52112', '52122', '52123', '52126', '52127', '52143', '52147',
# '52149', '52153', '52154', '52163', '52164', '52179', '52180',
# '52185', '52196', '52199', '52203', '52218', '52228', '52230',
# '52231', '52250', '52251', '52260', 'gomez', 'shmaman', '88xxxx',
# '50319', '50322', '50330', '50334', '50368', '50521', '50653', '50722',
# '50723', '50840', '50854', '50942', '50980', '50985', '50994', '50995',
# '51106', '51123', '51174', '51257', '51277', '51278', '52103', '52109',
# '52110', '52125', '52140', '52142', '52174', '52259', '50143', '50296',
# '50391', '50395', '50397', '50446', '50586', '50610', '50663', '50948',
# '51105', '51109', '51157', '51159', '51180', '51184', '51214', '52115',
# '52256', '52267',
# # BAM files need transfer to sanger
# 'exapoe', '51160', '51188', '52248', '50445', '51206', '50996',
# '51240', '50948', '52134', '52168']
# -- Priority 2 full re-run
#params:
# name: alz-p2f
# priority: 2
Expand Down
2 changes: 1 addition & 1 deletion tanzi_ad/scripts/prep_run_configs_p1.py
Expand Up @@ -48,7 +48,7 @@ def write_config(g, baminfo, name, config):
"20000000", "high", "genome", config["coverage"]])
bam_files.append(bamfile)
else:
print("BAM file missing for %s: %s" % (info["sample"], family))
raise ValueError("BAM file missing for %s: %s" % (info["sample"], family))
subprocess.check_call(["bcbio_nextgen.py", "-w", "template", "freebayes-variant",
meta_file] + bam_files)

Expand Down
78 changes: 78 additions & 0 deletions tanzi_ad/scripts/prep_sv_pilot.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python
"""Prepare bcbio-nextgen configuration files for calling structural variation pilot.
"""
import csv
import glob
import os
import subprocess
import sys

from bcbio import utils

# Master sample sheet (AD-Master-v2): per-row family ID, sample ID,
# priority, gender code and a status flag (see get_samples below).
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv"
# Root of re-called alignments, laid out as <bam_dir>/*/final/<sample>/
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled"
# Run name; also names the output config directory and CSV sample sheet.
name = "alz-sv-pilot"

def main(family_file, region_file):
    """Build a bcbio-nextgen SV pilot configuration for the listed families.

    family_file -- text file with one family ID per line.
    region_file -- BED file of regions passed through to the sample sheet.
    """
    wanted = get_families(family_file)
    pilot = add_bams(get_samples(wanted, priority_file), bam_dir)
    config_file = write_config_file(pilot, name, region_file)

def write_config_file(samples, name, region_file):
    """Write the bcbio sample-sheet CSV and invoke the template generator.

    samples -- dicts with "sample", "family", "gender" and "bam" keys
               (see add_bams).
    name -- run name; the CSV is written to <name>/config/<name>.csv.
    region_file -- BED file recorded in the variant_regions column.

    Returns the path to the CSV metadata file (previously returned None,
    although main assigned the result to config_file).
    """
    meta_file = os.path.join(utils.safe_makedir(os.path.join(name, "config")),
                             "%s.csv" % name)
    bams = []
    with open(meta_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(["samplename", "description", "batch", "sex",
                         "aligner", "mark_duplicates", "variantcaller", "svcaller", "variant_regions"])
        # Sort by family so each family's samples share a batch for SV calling.
        for sample in sorted(samples, key=lambda x: x["family"]):
            bams.append(sample["bam"])
            writer.writerow([os.path.basename(sample["bam"]), sample["sample"], sample["family"],
                             sample["gender"], "false", "false", "false", "lumpy;cn.mops", region_file])
    subprocess.check_call(["bcbio_nextgen.py", "-w", "template", "freebayes-variant",
                           meta_file] + bams)
    return meta_file

def add_bams(samples, bam_dir):
    """Attach the located alignment file to each sample dict under "bam"."""
    for entry in samples:
        entry["bam"] = get_bam(entry["sample"], bam_dir)
    # Return a fresh list holding the (mutated) sample dicts.
    return list(samples)

def get_bam(sample, bam_dir):
    """Locate the alignment file for *sample* under *bam_dir*.

    Searches <bam_dir>/*/final/<sample>/<sample>-*am, which matches both
    .bam and .cram files.  When more than one file matches, a .bam is
    preferred over a .cram.

    Raises ValueError when no alignment file is found (was an assert,
    which is stripped under `python -O`).
    """
    pattern = os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)
    # Sort so the choice is deterministic across filesystems.
    bam_files = sorted(glob.glob(pattern))
    if not bam_files:
        raise ValueError("Did not find BAM files for %s: %s" % (sample, bam_files))
    if len(bam_files) > 1:
        # Prefer BAM over CRAM, but keep the originals if nothing ends
        # in .bam (the old filter would have left an empty list).
        only_bams = [x for x in bam_files if x.endswith(".bam")]
        if only_bams:
            bam_files = only_bams
    return bam_files[0]

def get_samples(families, fname):
    """Read sample records for the wanted families from the master CSV.

    families -- set of family IDs to keep.
    fname -- AD-Master CSV; column 1 is the family ID, column 2 the
             sample ID, column 12 a gender code and column 16 a status flag.

    Rows flagged "Exclude" or belonging to other families are skipped.
    Returns dicts with "sample", "family" and "gender" keys.
    """
    samples = []
    with open(fname) as in_handle:
        reader = csv.reader(in_handle)
        # next() works on both Python 2 and 3; reader.next() is 2-only.
        next(reader)  # header
        for parts in reader:
            family_id, sample_id, priority = parts[1:4]
            status_flag = parts[16]
            if status_flag != "Exclude" and family_id in families:
                samples.append({"sample": sample_id, "family": family_id, "gender": _get_gender(parts[12])})
    return samples

def _get_gender(gender):
if gender.lower() in ["m", "male", "1"]:
return "male"
elif gender.lower() in ["f", "female", "2"]:
return "female"
else:
return ""

def get_families(in_file):
    """Return the set of (stripped) family IDs listed one per line in *in_file*."""
    with open(in_file) as in_handle:
        return set(line.strip() for line in in_handle)

if __name__ == "__main__":
    # Usage: prep_sv_pilot.py <family_file> <region_file>
    main(*sys.argv[1:])
111 changes: 111 additions & 0 deletions tanzi_ad/scripts/run_p1and2_squaring.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
"""Run squaring off batch scripts for priority 1 and 2 families.
"""
import csv
import glob
import os
import shutil
import subprocess
import sys

import joblib

from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.variation import vcfutils

# Master sample sheet (AD-Master-v2): family ID, priority and status columns.
priority_file = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/gwas/AD-Master-v2.csv"
# Per-family freebayes VCFs from the re-calling runs (globbed as <family>-*vcf*).
input_vcf_dir = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/data/recall_variants/freebayes"
# Root of re-called alignments, laid out as <bam_dir>/*/final/<sample>/
bam_dir = "/n/hsphS10/hsphfs2/tanzi_recalled"
# Base name for the squared-off output VCF.
name = "tanzi_ad_p1and2-square"
ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"

def main(cores=1):
    """Square off priority 1/2 family VCFs into one multi-sample VCF.

    cores -- core count; may arrive as a string from sys.argv, so it is
             converted with int() wherever a number is needed.
    """
    start_dir = os.getcwd()
    # Squaring runs in scratch space; results are copied back at the end.
    work_dir = utils.safe_makedir("/scratch/square")
    priorities = set(["1", "2"])
    list_file = get_input_list(start_dir, priorities)
    # Ensure input CRAMs are indexed; gets IO bound quickly so limit cores
    cram_cores = min(int(cores), 6)
    for cindex in joblib.Parallel(cram_cores)(joblib.delayed(index_cram)(x) for x in find_crams(list_file)):
        print cindex
    with utils.chdir(work_dir):
        out_file = run_squaring(list_file, name, ref_file, cores)
    # Copy the squared VCF and its tabix index back beside the start dir,
    # skipping files that already exist from a previous run.
    for ext in ["", ".tbi"]:
        new_file = os.path.join(start_dir, os.path.basename(out_file) + ext)
        if not utils.file_exists(new_file):
            shutil.copy(out_file + ext, new_file)

def run_squaring(list_file, name, ref_file, cores):
    """Run bcbio-variation-recall to square off samples; return the merged VCF.

    Skips the call entirely when the output VCF already exists.
    """
    out_file = os.path.join(os.getcwd(), "%s.vcf.gz" % name)
    if utils.file_exists(out_file):
        return out_file
    # Scale JVM heap with core count: 3 GB per core, starting at half.
    total_mem = 3 * int(cores)
    jvm_opts = ["-Xms%sG" % (total_mem // 2), "-Xmx%sG" % total_mem]
    cmd = (["bcbio-variation-recall"] + jvm_opts +
           ["square", out_file, ref_file, list_file,
            "--caller", "freebayes", "--cores", str(cores)])
    subprocess.check_call(cmd)
    return out_file

def index_cram(fname):
    """Create a .crai index for *fname* if missing; return the index path."""
    out_file = "%s.crai" % fname
    if not utils.file_exists(out_file):
        print "Indexing", fname
        # Run cram_index against a symlink placed in the transaction
        # directory: cram_index appears to write the .crai next to its
        # input, so a failed run never leaves a partial index beside the
        # real CRAM -- TODO confirm cram_index output location.
        with file_transaction(out_file) as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(fname, tx_in_file)
            subprocess.check_call(["cram_index", tx_in_file])
    return out_file

def find_crams(in_file):
    """Yield each whitespace-stripped line of *in_file* naming a CRAM file."""
    with open(in_file) as in_handle:
        for raw_line in in_handle:
            fname = raw_line.strip()
            if fname.endswith(".cram"):
                yield fname

def get_input_list(work_dir, priorities):
    """Write (once) and return a text file listing input VCFs and BAMs.

    The file holds one path per line: all family VCFs first, then every
    alignment file for the samples they contain.
    """
    list_file = os.path.join(work_dir, "p1and2-input-files.txt")
    if utils.file_exists(list_file):
        return list_file
    families = read_families_by_priority(priority_file, priorities)
    vcf_files = [get_vcf(input_vcf_dir, fam) for fam in families]
    bam_files = []
    for vcf_file in vcf_files:
        bam_files.extend(get_bams(vcf_file, bam_dir))
    with open(list_file, "w") as out_handle:
        for fname in vcf_files + bam_files:
            out_handle.write("%s\n" % fname)
    return list_file

def get_vcf(vcf_dir, fam):
    """Return the family VCF in *vcf_dir*, preferring bgzipped .vcf.gz over .vcf.

    Raises ValueError when no VCF exists for the family.
    """
    candidates = sorted(glob.glob(os.path.join(vcf_dir, "%s-*vcf*" % fam)))
    for suffix in (".vcf.gz", ".vcf"):
        matches = [f for f in candidates if f.endswith(suffix)]
        if matches:
            return matches[0]
    raise ValueError("Did not find VCF for %s in %s" % (fam, vcf_dir))

def get_bams(vcf_file, bam_dir):
    """Find one alignment file per sample named in *vcf_file*.

    Searches <bam_dir>/*/final/<sample>/<sample>-*am (matches .bam and
    .cram); a .bam is preferred when several files match.

    Raises ValueError when a sample has no alignment file (was an
    assert, which is stripped under `python -O`).
    """
    out = []
    for sample in vcfutils.get_samples(vcf_file):
        pattern = os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)
        # Sort so the choice is deterministic across filesystems.
        bam_files = sorted(glob.glob(pattern))
        if not bam_files:
            raise ValueError("Did not find BAM files for %s: %s" % (sample, bam_files))
        if len(bam_files) > 1:
            # Prefer BAM over CRAM, keeping the originals if nothing
            # ends in .bam (the old filter could leave an empty list).
            only_bams = [x for x in bam_files if x.endswith(".bam")]
            if only_bams:
                bam_files = only_bams
        out.append(bam_files[0])
    return out

def read_families_by_priority(fname, priorities):
    """Return sorted family IDs from the master CSV matching *priorities*.

    fname -- AD-Master CSV; column 1 is the family ID, column 3 the
             priority and column 16 a status flag.
    priorities -- set of priority strings (e.g. {"1", "2"}) to keep.

    Rows flagged "Exclude" are skipped; priorities are whitespace-stripped
    before comparison.
    """
    families = set()
    with open(fname) as in_handle:
        reader = csv.reader(in_handle)
        # next() works on both Python 2 and 3; reader.next() is 2-only.
        next(reader)  # header
        for parts in reader:
            _, family_id, _, priority = parts[:4]
            status_flag = parts[16]
            if status_flag != "Exclude" and priority.strip() in priorities:
                families.add(family_id)
    return sorted(families)


if __name__ == "__main__":
    # Usage: run_p1and2_squaring.py [cores]
    main(*sys.argv[1:])

0 comments on commit 0fc20bd

Please sign in to comment.