moved panVCF.VCF to top level import

jeffhsu3 · Jan 18, 2016 · 138d72f · 138d72f
1 parent ebb7f99
commit 138d72f
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 27 deletions.
diff --git a/doc/source/vcf.rst b/doc/source/vcf.rst
@@ -5,9 +5,11 @@ Working With VCFs
 The VCF Class
 =============
 
-panVCF.VCF(file)
+genda.formats.panVCF.VCF(vcf_file)
     parameters:
-        file - The VCF file which you want to load into panVCF
+        vcf_file - The VCF file which you want to load into panVCF
+        chunksize - pandas.read_csv chunksize DOESN'T WORK ATM as parsing info
+        expects a pd.DataFrame
 
 Data
 ----
@@ -26,6 +28,7 @@ Hardy-Weinberg
 
 Changing Reference Genome
 -------------------------
+    :TODO switch to how NCBI does the first|second pass
     VCF.change_base(old_reference_file, new_reference_file, chrom)
         Used when the reference genome of a vcf file must be changed. Must be done one chromosome at a time (vcf can have more than one, but the fasta files can only have one).
 

diff --git a/genda/formats/__init__.py b/genda/formats/__init__.py
@@ -4,3 +4,4 @@
 """
 
 from .gene_utils import *
+from .panVCF import VCF
diff --git a/genda/formats/panVCF.py b/genda/formats/panVCF.py
@@ -21,10 +21,11 @@ def parse_chr(entry):
 
 
 def parse_geno(entry, GT=0):
-    """ Need to somehow keep phase, maybe use multi_index, but only
+    """ 
+    :TODO 
+    Need to somehow keep phase, maybe use multi_index, but only
     works if haplotype is contigous across whole section.  Maybe
     groups to denote haplotype regions?
-
     """
     try:
         g = entry.split(":")[GT]
@@ -56,11 +57,7 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
         self._parse_meta()
         self.samples = samples
         self.info_dict = []
-
         sex_chromosomes = ['X', 'Y']
-
-
-
         convertor = {}
         #convertor = dict((k + 9 , parse_geno) for k in range(len(samples)))
         #convertor = dict((k + 9 , allele_ratio) for k in range(len(samples)))
@@ -72,12 +69,9 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
                                  converters = convertor,
                                  chunksize = chunksize,
                                 )
-
         pg=functools.partial(parse_geno,GT = self.vcf.ix[0,8].split(":").index("GT"))
         self.geno = self.vcf.ix[:,9:].applymap(pg)
-
         #self.vcf.rename(columns = {'#CHROM': 'CHROM'}, inplace=True)
-
         rsID = self.vcf["ID"]
         novel = [str(i) + "_" + str(j) + "_" + str(k) for (i, j, k)\
                  in zip(list(self.vcf["#CHROM"][rsID=="."]),
@@ -86,11 +80,10 @@ def __init__(self, vcf_file, chunksize = None, allelic_ratio=False,
                        )
                 ]
         self.novel = novel
-        rsID[rsID == "."] = np.asarray(novel)
+        rsID.loc[rsID == "."] = np.asarray(novel)
         self.vcf.index = pd.Index(rsID)
         self.geno.index = self.vcf.index
         info_dict = self._split_info(self.vcf.INFO)
-
         # Parsing the INFO
         def flag_parse(info_dict, key):
             try:
@@ -146,8 +139,6 @@ def string_field(info_dict, key):
         del self.vcf['INFO']
         del self.vcf['ID']
         # Parse GENO
-
-
         def _parse_haplotype(entry):
             g = entry.split(":")[0]
             try:
@@ -230,7 +221,9 @@ def _parse_meta(self):
         """ Parses meta information and returns a list of the INFO and FORMAT
         fields.
 
-        Okay so all VCF files are slightly different.  Why is the number sometimes a letter?  Jesus
+        Okay so all VCF files are slightly different.  
+        Why is the number sometimes a letter? 
+        :TODO work on edge cases
         """
         convert = {
             'Integer': np.int,
@@ -261,7 +254,7 @@ def _parse_meta(self):
 
     def _parse_info(self, entry):
         """ Split the INFO field into a dictionary.  Creates an instance
-        of the class varaible ans saves the dictionary to it.
+        of the class varaible and saves the dictionary to it.
         """
         def parse_field(field):
             try:
@@ -311,7 +304,6 @@ def list_samples_with_alternate_allele(self, rsID, show_na=False):
         """ Returns a list of samples that have the alternate non-ref allele 
         at the variant identified by rsID
         """
-
         non_ref = np.logical_not(self.geno.ix[rsID, :] == 0)
         nans = np.isnan(self.geno.ix[rsID, :])
         return(self.geno.columns[np.logical_or(non_ref, nans)])
@@ -376,13 +368,13 @@ def __getitem__(self, val):
     def change_base(self, old_ref_file, new_ref_file, chrom):
         from copy import deepcopy
         from Bio import SeqIO
-        newvcf=deepcopy(self.vcf)
+        newvcf = deepcopy(self.vcf)
         old = SeqIO.read(open(old_ref_file), format="fasta")
         new = SeqIO.read(open(new_ref_file), format="fasta")
         from Bio import pairwise2
-        pos={}
+        pos = {}
         diff = {}
-        shifts={}
+        shifts = {}
         shift=0
         old_margin = 0
         new_margin = 0
@@ -462,7 +454,7 @@ def convert_genotypes(samples, multiple_alleles = False):
 
             :TODO The multiple conversion isn't quite right
             """
-            for n,p in enumerate(samples):
+            for n, p in enumerate(samples):
                 p = p.split(':')
                 freqs = p[1].split(',')[1] + ',' + p[1].split(',')[0]
                 if multiple_alleles:
@@ -489,7 +481,7 @@ def convert_genotypes(samples, multiple_alleles = False):
             d[c[0]] = [c[1], c[2], c[3]]
 
         for row in range(self.vcf.shape[0]):
-            if newvcf.ix[row,0].endswith(chrom):
+            if newvcf.ix[row, 0].endswith(chrom):
                 pos = self.vcf.ix[row,1]
                 newpos=pos + change_coord(pos)
                 if pos in d.keys():
@@ -570,10 +562,11 @@ def switch_indexes(n, s):
         newobj = deepcopy(self)
         newobj.vcf = newvcf
         #pg = functools.partial(parse_geno,GT = self.vcf.ix[0,6].split(":").index("GT"))
-        newobj.geno = newobj.vcf.ix[:,7:].applymap(parse_geno, GT=self.vcf.ix[0,6].split(":").index("GT"))
-
+        newobj.geno = newobj.vcf.ix[:,7:].applymap(parse_geno, 
+                GT=self.vcf.ix[0,6].split(":").index("GT"))
         return newobj
 
+
 def pos_line_convert(line):
     """ Convert chr:pos into a integer
     """

diff --git a/genda/plotting/plotting_utils.py b/genda/plotting/plotting_utils.py
@@ -106,7 +106,7 @@ def plot_transcript(transcript_id, paths, ax, y=0, height=2.,
         for patch in ps:
             ax.add_patch(patch)
     except AttributeError:
-        for i, exon in enumerate(paths[transcript_id]):
+        for i in paths[transcript_id]:
             try:
                 path = make_rectangle(i[0], i[1], y, y+height)
             except IndexError:
@@ -133,7 +133,6 @@ def add_gene_bounderies(ax, gene_annot, gene_name, x_scale):
     return(patch)
 
 
-
 def add_snp_arrow(adj_pv, x, snp, ax):
     """Adds an arrow with snp text to plot 
     :TODO The best snp should be some sort of object

diff --git a/tests/plotting_tests.py b/tests/plotting_tests.py
@@ -0,0 +1,25 @@
+import unittest
+
+from genda.plotting.plotting_utils import plot_transcript
+
+
+class TestPlotTranscript(unittest.TestCase):
+    def setUp(self):
+        self.transcript_paths = False
+        self.transcript = 'TRANSCRIPT1'
+    pass
+
+
+    def test_plot_transcript_paths():
+        """Testcase where the input 
+        to plot_transcript are already paths
+        """
+        #plot_transcript
+        pass
+
+    def test_plot_transcript_dict():
+        """Testcase where the input 
+        to plot_transcript are dict
+        of transcripts
+        """
+        pass