Skip to content

Commit

Permalink
moved panVCF.VCF to top level import
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffhsu3 committed Jan 18, 2016
1 parent ebb7f99 commit 138d72f
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 27 deletions.
7 changes: 5 additions & 2 deletions doc/source/vcf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ Working With VCFs
The VCF Class
=============

panVCF.VCF(file)
genda.formats.panVCF.VCF(vcf_file)
parameters:
file - The VCF file which you want to load into panVCF
vcf_file - The VCF file which you want to load into panVCF
chunksize - pandas.read_csv chunksize DOESN'T WORK ATM as parsing info
expects a pd.DataFrame

Data
----
Expand All @@ -26,6 +28,7 @@ Hardy-Weinberg

Changing Reference Genome
-------------------------
:TODO switch to how NCBI does the first|second pass
VCF.change_base(old_reference_file, new_reference_file, chrom)
Used when the reference genome of a vcf file must be changed. Must be done one chromosome at a time (vcf can have more than one, but the fasta files can only have one).

Expand Down
1 change: 1 addition & 0 deletions genda/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
"""

from .gene_utils import *
from .panVCF import VCF
39 changes: 16 additions & 23 deletions genda/formats/panVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ def parse_chr(entry):


def parse_geno(entry, GT=0):
""" Need to somehow keep phase, maybe use multi_index, but only
"""
:TODO
Need to somehow keep phase, maybe use multi_index, but only
works if haplotype is contigous across whole section. Maybe
groups to denote haplotype regions?
"""
try:
g = entry.split(":")[GT]
Expand Down Expand Up @@ -56,11 +57,7 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
self._parse_meta()
self.samples = samples
self.info_dict = []

sex_chromosomes = ['X', 'Y']



convertor = {}
#convertor = dict((k + 9 , parse_geno) for k in range(len(samples)))
#convertor = dict((k + 9 , allele_ratio) for k in range(len(samples)))
Expand All @@ -72,12 +69,9 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
converters = convertor,
chunksize = chunksize,
)

pg=functools.partial(parse_geno,GT = self.vcf.ix[0,8].split(":").index("GT"))
self.geno = self.vcf.ix[:,9:].applymap(pg)

#self.vcf.rename(columns = {'#CHROM': 'CHROM'}, inplace=True)

rsID = self.vcf["ID"]
novel = [str(i) + "_" + str(j) + "_" + str(k) for (i, j, k)\
in zip(list(self.vcf["#CHROM"][rsID=="."]),
Expand All @@ -86,11 +80,10 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
)
]
self.novel = novel
rsID[rsID == "."] = np.asarray(novel)
rsID.loc[rsID == "."] = np.asarray(novel)
self.vcf.index = pd.Index(rsID)
self.geno.index = self.vcf.index
info_dict = self._split_info(self.vcf.INFO)

# Parsing the INFO
def flag_parse(info_dict, key):
try:
Expand Down Expand Up @@ -146,8 +139,6 @@ def string_field(info_dict, key):
del self.vcf['INFO']
del self.vcf['ID']
# Parse GENO


def _parse_haplotype(entry):
g = entry.split(":")[0]
try:
Expand Down Expand Up @@ -230,7 +221,9 @@ def _parse_meta(self):
""" Parses meta information and returns a list of the INFO and FORMAT
fields.
Okay so all VCF files are slightly different. Why is the number sometimes a letter? Jesus
Okay so all VCF files are slightly different.
Why is the number sometimes a letter?
:TODO work on edge cases
"""
convert = {
'Integer': np.int,
Expand Down Expand Up @@ -261,7 +254,7 @@ def _parse_meta(self):

def _parse_info(self, entry):
""" Split the INFO field into a dictionary. Creates an instance
of the class varaible ans saves the dictionary to it.
of the class varaible and saves the dictionary to it.
"""
def parse_field(field):
try:
Expand Down Expand Up @@ -311,7 +304,6 @@ def list_samples_with_alternate_allele(self, rsID, show_na=False):
""" Returns a list of samples that have the alternate non-ref allele
at the variant identified by rsID
"""

non_ref = np.logical_not(self.geno.ix[rsID, :] == 0)
nans = np.isnan(self.geno.ix[rsID, :])
return(self.geno.columns[np.logical_or(non_ref, nans)])
Expand Down Expand Up @@ -376,13 +368,13 @@ def __getitem__(self, val):
def change_base(self, old_ref_file, new_ref_file, chrom):
from copy import deepcopy
from Bio import SeqIO
newvcf=deepcopy(self.vcf)
newvcf = deepcopy(self.vcf)
old = SeqIO.read(open(old_ref_file), format="fasta")
new = SeqIO.read(open(new_ref_file), format="fasta")
from Bio import pairwise2
pos={}
pos = {}
diff = {}
shifts={}
shifts = {}
shift=0
old_margin = 0
new_margin = 0
Expand Down Expand Up @@ -462,7 +454,7 @@ def convert_genotypes(samples, multiple_alleles = False):
:TODO The multiple conversion isn't quite right
"""
for n,p in enumerate(samples):
for n, p in enumerate(samples):
p = p.split(':')
freqs = p[1].split(',')[1] + ',' + p[1].split(',')[0]
if multiple_alleles:
Expand All @@ -489,7 +481,7 @@ def convert_genotypes(samples, multiple_alleles = False):
d[c[0]] = [c[1], c[2], c[3]]

for row in range(self.vcf.shape[0]):
if newvcf.ix[row,0].endswith(chrom):
if newvcf.ix[row, 0].endswith(chrom):
pos = self.vcf.ix[row,1]
newpos=pos + change_coord(pos)
if pos in d.keys():
Expand Down Expand Up @@ -570,10 +562,11 @@ def switch_indexes(n, s):
newobj = deepcopy(self)
newobj.vcf = newvcf
#pg = functools.partial(parse_geno,GT = self.vcf.ix[0,6].split(":").index("GT"))
newobj.geno = newobj.vcf.ix[:,7:].applymap(parse_geno, GT=self.vcf.ix[0,6].split(":").index("GT"))

newobj.geno = newobj.vcf.ix[:,7:].applymap(parse_geno,
GT=self.vcf.ix[0,6].split(":").index("GT"))
return newobj


def pos_line_convert(line):
""" Convert chr:pos into a integer
"""
Expand Down
3 changes: 1 addition & 2 deletions genda/plotting/plotting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def plot_transcript(transcript_id, paths, ax, y=0, height=2.,
for patch in ps:
ax.add_patch(patch)
except AttributeError:
for i, exon in enumerate(paths[transcript_id]):
for i in paths[transcript_id]:
try:
path = make_rectangle(i[0], i[1], y, y+height)
except IndexError:
Expand All @@ -133,7 +133,6 @@ def add_gene_bounderies(ax, gene_annot, gene_name, x_scale):
return(patch)



def add_snp_arrow(adj_pv, x, snp, ax):
"""Adds an arrow with snp text to plot
:TODO The best snp should be some sort of object
Expand Down
25 changes: 25 additions & 0 deletions tests/plotting_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest

from genda.plotting.plotting_utils import plot_transcript


class TestPlotTranscript(unittest.TestCase):
def setUp(self):
self.transcript_paths = False
self.transcript = 'TRANSCRIPT1'
pass


def test_plot_transcript_paths():
"""Testcase where the input
to plot_transcript are already paths
"""
#plot_transcript
pass

def test_plot_transcript_dict():
"""Testcase where the input
to plot_transcript are dict
of transcripts
"""
pass

0 comments on commit 138d72f

Please sign in to comment.