Skip to content

Commit

Permalink
adding remap-phase-set command line tool
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Aug 22, 2017
1 parent 8ec7c11 commit 3adda62
Show file tree
Hide file tree
Showing 7 changed files with 426 additions and 32 deletions.
6 changes: 5 additions & 1 deletion tools/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<!--
dsh-bio-tools Command line tools.
Copyright (c) 2013-2016 held jointly by the individual authors.
Copyright (c) 2013-2017 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
Expand Down Expand Up @@ -214,6 +214,10 @@
<id>dsh-intersect-bed</id>
<mainClass>org.dishevelled.bio.tools.IntersectBed</mainClass>
</program>
<program>
<id>dsh-remap-phase-set</id>
<mainClass>org.dishevelled.bio.tools.RemapPhaseSet</mainClass>
</program>
<program>
<id>dsh-split-fastq</id>
<mainClass>org.dishevelled.bio.tools.SplitFastq</mainClass>
Expand Down
230 changes: 230 additions & 0 deletions tools/src/main/java/org/dishevelled/bio/tools/RemapPhaseSet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
/*
dsh-bio-tools Command line tools.
Copyright (c) 2013-2017 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> http://www.fsf.org/licensing/licenses/lgpl.html
> http://www.opensource.org/licenses/lgpl-license.php
*/
package org.dishevelled.bio.tools;

import static com.google.common.base.Preconditions.checkNotNull;

import static org.dishevelled.compress.Readers.reader;
import static org.dishevelled.compress.Writers.writer;

import java.io.File;
import java.io.PrintWriter;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import java.util.concurrent.Callable;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;

import org.dishevelled.bio.variant.vcf.VcfGenotype;
import org.dishevelled.bio.variant.vcf.VcfHeader;
import org.dishevelled.bio.variant.vcf.VcfReader;
import org.dishevelled.bio.variant.vcf.VcfRecord;
import org.dishevelled.bio.variant.vcf.VcfSample;
import org.dishevelled.bio.variant.vcf.VcfWriter;
import org.dishevelled.bio.variant.vcf.VcfStreamAdapter;

import org.dishevelled.bio.variant.vcf.header.VcfHeaderLineType;
import org.dishevelled.bio.variant.vcf.header.VcfHeaderLines;

import org.dishevelled.commandline.ArgumentList;
import org.dishevelled.commandline.CommandLine;
import org.dishevelled.commandline.CommandLineParseException;
import org.dishevelled.commandline.CommandLineParser;
import org.dishevelled.commandline.Switch;
import org.dishevelled.commandline.Usage;

import org.dishevelled.commandline.argument.FileArgument;

/**
* Remap Type=String PS phase set ids in VCF format to Type=Integer.
*
* @author Michael Heuer
*/
public final class RemapPhaseSet implements Callable<Integer> {
private final File inputVcfFile;
private final File outputVcfFile;
private static final String USAGE = "dsh-remap-phase-set -i input.vcf.gz -o output.vcf.gz";


/**
* Remap Type=String PS phase set ids in VCF format to Type=Integer.
*
* @param inputVcfFile input VCF file, if any
* @param outputVcfFile output VCF file, if any
*/
public RemapPhaseSet(final File inputVcfFile, final File outputVcfFile) {
this.inputVcfFile = inputVcfFile;
this.outputVcfFile = outputVcfFile;
}


@Override
public Integer call() throws Exception {
PrintWriter writer = null;
try {
writer = writer(outputVcfFile);

final PrintWriter w = writer;
VcfReader.stream(reader(inputVcfFile), new VcfStreamAdapter() {
private boolean wroteSamples = false;
private boolean remapPhaseSet = false;
private List<VcfSample> samples = new ArrayList<VcfSample>();
private int phaseSetId = 0;
private Map<String, Integer> phaseSetIds = new HashMap<String, Integer>();

@Override
public void header(final VcfHeader header) {
VcfHeaderLines headerLines = VcfHeaderLines.fromHeader(header);
if (headerLines.getFormatHeaderLines().containsKey("PS") &&
VcfHeaderLineType.String.equals(headerLines.getFormatHeaderLines().get("PS").getType())) {

remapPhaseSet = true;
VcfHeader.Builder builder = VcfHeader.builder().withFileFormat(header.getFileFormat());
for (String meta : header.getMeta()) {
if (meta.startsWith("##FORMAT=<ID=PS")) {
String description = headerLines.getFormatHeaderLines().get("PS").getDescription() + " (converted from Type=String to Type=Integer)";
builder.withMeta("##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"" + description + "\">");
}
else {
builder.withMeta(meta);
}
}
VcfWriter.writeHeader(builder.build(), w);
}
else {
VcfWriter.writeHeader(header, w);
}
}

@Override
public void sample(final VcfSample sample) {
samples.add(sample);
}

@Override
public void record(final VcfRecord record) {
// write out samples
if (!wroteSamples) {
VcfWriter.writeColumnHeader(samples, w);
wroteSamples = true;
}

// write out record, remapping phase set if necessary
if (remapPhaseSet) {
Map<String, VcfGenotype> genotypes = new HashMap<String, VcfGenotype>(record.getGenotypes().size());
for (Map.Entry<String, VcfGenotype> entry : record.getGenotypes().entrySet()) {
String sampleId = entry.getKey();
VcfGenotype genotype = entry.getValue();

if (genotype.containsFieldKey("PS")) {
String oldPhaseSetId = genotype.getFieldString("PS");
if (!phaseSetIds.containsKey(oldPhaseSetId)) {
phaseSetIds.put(oldPhaseSetId, phaseSetId);
phaseSetId++;
}
Integer newPhaseSetId = phaseSetIds.get(oldPhaseSetId);
genotypes.put(sampleId, VcfGenotype.builder(genotype).replaceField("PS", newPhaseSetId.toString()).build());
}
else {
genotypes.put(sampleId, genotype);
}
}
VcfWriter.writeRecord(samples, VcfRecord.builder(record).replaceGenotypes(genotypes).build(), w);
}
else {
VcfWriter.writeRecord(samples, record, w);
}
}
});

return 0;
}
finally {
try {
writer.close();
}
catch (Exception e) {
// empty
}
}
}

/**
* Main.
*
* @param args command line args
*/
public static void main(final String[] args) {
Switch about = new Switch("a", "about", "display about message");
Switch help = new Switch("h", "help", "display help message");
FileArgument inputVcfFile = new FileArgument("i", "input-vcf-file", "input VCF file, default stdin", false);
FileArgument outputVcfFile = new FileArgument("o", "output-vcf-file", "output VCF file, default stdout", false);

ArgumentList arguments = new ArgumentList(about, help, inputVcfFile, outputVcfFile);
CommandLine commandLine = new CommandLine(args);

RemapPhaseSet remapPhaseSet = null;
try {
CommandLineParser.parse(commandLine, arguments);
if (about.wasFound()) {
About.about(System.out);
System.exit(0);
}
if (help.wasFound()) {
Usage.usage(USAGE, null, commandLine, arguments, System.out);
System.exit(0);
}
remapPhaseSet = new RemapPhaseSet(inputVcfFile.getValue(), outputVcfFile.getValue());
}
catch (CommandLineParseException e) {
if (about.wasFound()) {
About.about(System.out);
System.exit(0);
}
if (help.wasFound()) {
Usage.usage(USAGE, null, commandLine, arguments, System.out);
System.exit(0);
}
Usage.usage(USAGE, e, commandLine, arguments, System.err);
System.exit(-1);
}
catch (NullPointerException | IllegalArgumentException e) {
Usage.usage(USAGE, e, commandLine, arguments, System.err);
System.exit(-1);
}
try {
System.exit(remapPhaseSet.call());
}
catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
}
3 changes: 2 additions & 1 deletion tools/src/main/java/org/dishevelled/bio/tools/Tools.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
dsh-bio-tools Command line tools.
Copyright (c) 2013-2016 held jointly by the individual authors.
Copyright (c) 2013-2017 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
Expand Down Expand Up @@ -114,6 +114,7 @@ static String[] dropFirst(final String[] args) {
.put("filter-vcf", new Command("filter-vcf", "filter variants in VCF format", FilterVcf.class))
.put("interleave-fastq", new Command("interleave-fastq", "convert first and second sequence files in FASTQ format to interleaved FASTQ format", InterleaveFastq.class))
//.put("intersect-bed", new Command("intersect-bed", "similar to bedtools2 intersect -v", IntersectBed.class))
.put("remap-phase-set", new Command("remap-phase-set", "remap Type=String PS phase set ids in VCF format to Type=Integer", RemapPhaseSet.class))
.put("split-fastq", new Command("split-fastq", "convert interleaved FASTQ format into first and second sequence files in FASTQ format", SplitFastq.class))
.put("vcf-pedigree", new Command("vcf-pedigree", "extract a pedigree from VCF format", VcfPedigree.class))
.put("vcf-samples", new Command("vcf-samples", "extract samples from VCF format", VcfSamples.class))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
dsh-bio-tools Command line tools.
Copyright (c) 2013-2017 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> http://www.fsf.org/licensing/licenses/lgpl.html
> http://www.opensource.org/licenses/lgpl-license.php
*/
package org.dishevelled.bio.tools;

import static org.junit.Assert.assertNotNull;

import java.io.File;

import org.junit.Before;
import org.junit.Test;

/**
* Unit test for RemapPhaseSet.
*
* @author Michael Heuer
*/
public final class RemapPhaseSetTest {
private File inputVcfFile;
private File outputVcfFile;
private RemapPhaseSet remapPhaseSet;

@Before
public void setUp() throws Exception {
remapPhaseSet = new RemapPhaseSet(inputVcfFile, outputVcfFile);
}

@Test
public void testConstructor() {
assertNotNull(remapPhaseSet);
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
dsh-bio-variant Variants.
Copyright (c) 2013-2016 held jointly by the individual authors.
Copyright (c) 2013-2017 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
Expand Down Expand Up @@ -509,6 +509,20 @@ public static Builder builder() {
return new Builder();
}

/**
* Create and return a new VCF genotype builder populated from the fields in the specified VCF genotype.
*
* @param genotype VCF genotype, must not be null
* @return a new VCF genotype builder populated from the fields in the specified VCF genotype
*/
public static Builder builder(final VcfGenotype genotype) {
checkNotNull(genotype, "genotype must not be null");
return new Builder()
.withRef(genotype.getRef())
.withAlt(genotype.getAlt())
.withFields(genotype.getFields());
}

/**
* VCF genotype builder.
*/
Expand Down Expand Up @@ -579,6 +593,47 @@ public Builder withFields(final ListMultimap<String, String> fields) {
return this;
}

/**
* Return this VCF genotype builder configured with the specified genotype field
* replacing the previously configured value(s). Use sparingly, more expensive than
* <code>withFields</code>.
*
* @param id genotype field id, must not be null
* @param values genotype field values, must not be null
* @return this VCF genotype builder configured with the specified genotype field
* replacing the previously configured value(s)
*/
public Builder replaceField(final String id, final String... values) {
checkNotNull(values);

// copy old field values except id
ListMultimap<String, String> oldFields = this.fields.build();
this.fields = ImmutableListMultimap.builder();
for (String key : oldFields.keys()) {
if (!key.equals(id)) {
this.fields.putAll(key, oldFields.get(key));
}
}
// add new value(s)
for (String value : values) {
fields.put(id, value);
}
return this;
}

/**
* Return this VCF genotype builder configured with the specified genotype fields.
* Use sparingly, more expensive than <code>withFields</code>.
*
* @param fields genotype fields, must not be null
* @return this VCF genotype builder configured with the specified genotype fields
*/
public Builder replaceFields(final ListMultimap<String, String> fields) {
this.fields = ImmutableListMultimap.builder();
this.fields.putAll(fields);
return this;
}

/**
* Reset this VCF genotype builder.
*
Expand Down
Loading

0 comments on commit 3adda62

Please sign in to comment.