From 828b6854a0c199f0b33249261bd35acdeb87854a Mon Sep 17 00:00:00 2001
From: Jared Knoblauch <jaredknoblauch@gmail.com>
Date: Tue, 16 Mar 2021 18:16:12 -0400
Subject: [PATCH] adding docs for genome_region and cleaning up tests

---
 pgpipe/genome_region.py                   | 107 +++++++++++++++++++---
 tests/input/chr11.snp.sr.vcf.gz.tbi       | Bin 0 -> 107 bytes
 tests/input/macaca_missingdata.vcf.gz.tbi | Bin 0 -> 1328 bytes
 tests/tests_jared.py                      |   5 +-
 4 files changed, 98 insertions(+), 14 deletions(-)
 create mode 100644 tests/input/chr11.snp.sr.vcf.gz.tbi
 create mode 100644 tests/input/macaca_missingdata.vcf.gz.tbi

diff --git a/pgpipe/genome_region.py b/pgpipe/genome_region.py
index a09cc9b..4cce95e 100644
--- a/pgpipe/genome_region.py
+++ b/pgpipe/genome_region.py
@@ -1,3 +1,22 @@
+'''
+    The genome_reigon module provides two classes: Region, which represents 
+    a region in a genome, and RegionList, which contains a list of Regions. 
+    Coordinates are assumed to be in zero-based, half-open format (include
+    start coordinate, exclude end), so the first hundred bases of a genome 
+    would be represented as 0:100. Can read regions from a file, string, or
+    list. If columns are not formatted following usual BED convention, 
+    `collist` can be provided, which will indicate which columns correspond
+    to the start, end, and chromosome columns in a data input. (A string 
+    can be specified with `colstr` for command-line use) By default, the 
+    list will be sorted in 'version' order, so numerical components of 
+    a string will be sorted numerically, everything else by string. Lists
+    can be also be randomized, and if using an extended BED file, the full
+    input line can be read in and later printed out. 
+
+
+'''
+
+
 import sys
 import re
 import logging
@@ -53,6 +72,8 @@ def __init__(self, start, end, chrom, fullline = None):
         """Zero-based, half open coordinates and chromosome info for
         a region in a genome. Coords will be formatted according to
         flags passed to RegionList, then stored in a single format.
+        If fullline is provided, when region is written out this 
+        string will be written instead of just the region info. 
         """
         self.start = start
         self.end = end
@@ -116,6 +137,10 @@ def __len__(self):
         return self.end-self.start
 
     def containsRecord(self, rec):
+        '''
+        Returns the position of the input VCF record in
+        relation to this region. 
+        '''
         k1 = self.getChromKey()
         kr = getChromKey(rec.chrom)
         if k1 != kr:
@@ -150,7 +175,7 @@ class RegionList:
 
     def __init__(self, filename=None, genestr=None, reglist=None,
                  zeroclosed=False, zeroho=True,
-                 colstr=None, sortlist=True, checkoverlap=None,
+                 colstr=None, collist=None, sortlist=True, checkoverlap=None,
                  sortmethod=None, sortorder=None, chromfilter=None,
                  list_template=None, randomize=False,keep_full_line=False):
         """Class for storing gene region information
@@ -176,10 +201,14 @@ def __init__(self, filename=None, genestr=None, reglist=None,
             If true, uses zero-based, closed coordinates, which adds 1
             to the end value for a region. Rarely if ever used option.
         colstr : str (None)
-            Three-element string separated by semicolons, with
+            Three-element string separated by commas, with
             each element being an integer corresponding to the 0-based
             index of the column of the start, end, and chromosome values
             for input region(s).
+        collist : list (int)
+            Three-element list with integer indices corresponding to the
+            column index for start, end and chromosome values (in that
+            order) from the input source. 
         sortlist : bool (True)
             Will sort list once initialized using assigned sorting scheme.
         checkoverlap : str (None, 'error','fix')
@@ -208,6 +237,9 @@ def __init__(self, filename=None, genestr=None, reglist=None,
             checkoverlap, and sortorder.
         randomize : bool (False)
             If set, will randomly shuffle regions in list.
+        keep_full_line : bool (False)
+           If set, will store the original unmodified string corresponding 
+           to a region for later output
 
         Exceptions
         ----------
@@ -222,12 +254,21 @@ def __init__(self, filename=None, genestr=None, reglist=None,
             sortlist = list_template.sortlist
             checkoverlap = list_template.checkoverlap
             sortorder = list_template.sortorder
-
-        if self.collist is None:
-            if colstr is None:
-                self.collist = [1, 2, 0]
-            else:
-                self.parseCols(colstr)
+        if collist is not None and colstr is not None:
+            raise Exception(("Only one of collist and colstr can be specified"))
+        #if self.collist is None:
+            #if colstr is None:
+            #    self.collist = [1, 2, 0]
+            #else:
+            #    self.parseCols(colstr)
+        if collist is not None: 
+            self.collist = collist
+        elif colstr is not None:
+            self.parseCols(colstr)
+        else:
+            self.collist = [1,2,0]
+        if len(self.collist) != 3:
+            raise Exception(("Column list must have exactly three values"))
         if filename is None and genestr is None and reglist is None:
             raise Exception(("A filename, region string, or list of "
                              "region strings must be provided for " "creating a gene region list"))
@@ -270,7 +311,8 @@ def __init__(self, filename=None, genestr=None, reglist=None,
             shuffle(self.regions)
 
     def initFile(self,filename):
-        """Initialize RegionList with a region file
+        """Initialize RegionList with a region file. Will skip
+        commented lines. 
         """
         past_header = False
         with open(filename, 'r') as regionfile:
@@ -291,6 +333,9 @@ def initFile(self,filename):
 
 
     def initStr(self,genestr):
+        '''
+        Initialize region from string. Assumes start/end/chrom are colon-delimited. 
+        '''
         la = genestr.split(':')
         self.initRegion(la)
 
@@ -302,6 +347,12 @@ def initList(self,reglist):
 
 
     def initRegion(self,la):
+        '''
+        Initialize region from list and append to RegionList's list of regions.
+        Uses collist to specify which element is the start, end, and 
+        chromosome. Exceptions if either start or end coordinate is less than 
+        0.
+        '''
         start = int(la[self.collist[0]])
         end = int(la[self.collist[1]])
         chrom = la[self.collist[2]]
@@ -333,6 +384,11 @@ def toStr(self, idx, sep=':'):
                                        sep=sep)
 
     def hasOverlap(self):
+        '''
+        Checks to see if any two regions in a list overlap. 
+        Sorts list into separate var, then compares overlap between adjacent
+        regions. 
+        '''
         region_hold = sorted(self.regions)
         for i in range(len(region_hold)-1):
             if region_hold[i].chrom == region_hold[i+1].chrom:
@@ -344,6 +400,10 @@ def hasOverlap(self):
         return False
 
     def fixOverlap(self):
+        '''
+        Merged regions that overlap into a single region. Will sort the list,
+        and is not capable of returning regions to their original position. 
+        '''
         region_hold = []
         self.regions.sort()
         i = 0
@@ -364,6 +424,12 @@ def fixOverlap(self):
     def printList(self, file_handle=None, file_name=None,
                   return_str=False, delim="\t", add_chr=False,
                   remove_chr=False):
+        '''
+        Prints regions from RegionList, to a given file handle or filename
+        unless neither is specified, in which case output is written to
+        stdout. If full line was read in during list creation, that is what
+        will be written out. 
+        '''
         if file_handle is None and file_name is None:
             file_handle = sys.stdout
         if file_name is not None:
@@ -392,6 +458,8 @@ def printList(self, file_handle=None, file_name=None,
                 out_str += reg_str
             else:
                 file_handle.write(reg_str)
+        if file_name is not None:
+            file_handle.close()
         if return_str:
             return out_str
 
@@ -405,8 +473,10 @@ def filterOutXY(self):
         self.filterByChrom(['X','Y','chrX','chrY'])
 
     def expandRegions(self,expand_size):
-        #Expands regions, but does not merge overlaps 
-        #that may occur
+        '''
+        Expands regions from start and end by specified value. 
+        Does not merge regions, which can be done separately with fixOverlap. 
+        '''
         for i in range(len(self.regions)):
             self.regions[i].start = max(self.regions[i].start-expand_size,0)
             self.regions[i].end += expand_size
@@ -415,6 +485,14 @@ def expandRegions(self,expand_size):
     #Add sort method for strictly text based sorting
 
 def getIntervalsBetween(region_list, padding=0, firstline=True):
+    '''
+    Returns a new RegionList with regions that represent positions not
+    contained in the input list. Padding can be provided to exclude 
+    regions within a given distance of the input list. Note that 
+    a region will not be created between the last region on a chromosome
+    and the end of the chromosome, as the position of the end of the
+    chromosome is unknown. 
+    '''
     region_hold = []
     if len(region_list.regions) < 2:
         raise Exception("Region list for complement requires at least two regions")
@@ -437,8 +515,11 @@ def getIntervalsBetween(region_list, padding=0, firstline=True):
     return out_list
 
 def subtractBed(stat_list, filter_list):
-    #Sorts regions and removes regions from stat list 
-    #that overlap with any region in filter_list
+    '''
+    Modifies stat_list so that any regions that overlaps with any region
+    in filter_list is dropped. Regions are not trimmed to exclude the 
+    region in the filter list, just entirely dropped. 
+    '''
     stat_idx = 0
     filter_idx = 0
     stat_list.regions.sort()
diff --git a/tests/input/chr11.snp.sr.vcf.gz.tbi b/tests/input/chr11.snp.sr.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..15d2e71e00dde42111e936265e1812f00015b6bc
GIT binary patch
literal 107
zcmb2|=3rp}f&Xj_PR>jWehl1&pHfm%5)u-ak|cPUP6f=8?BL}+cu3JnK-D`TOTx<`
qhHbs4K<<l2Jv=KK*gbjIW=JqFa65}mx(+l&9?dvu24=A7AOZlujTkNf

literal 0
HcmV?d00001

diff --git a/tests/input/macaca_missingdata.vcf.gz.tbi b/tests/input/macaca_missingdata.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..b233ac267924cf91ec856cb1c042bccbbc99e862
GIT binary patch
literal 1328
zcmV-01<(2)iwFb&00000{{{d;LjnL31!b0dOw|P#hJPqXNFJxC>1J7gYsIv5X8e<F
zG)mBfl<-oR;t_9nArnzkQ9*(@ASj5SW+n<jUXa8KVwb6e$#R<7#j?UoqbXiuZrVDp
zzVFt)*!0=+c;*i0@_k=iM3j$@<2W6>e?<I*knlmyLhs*E-5)$+hVO%aIeg%9A26C-
z{#sb@NN@zZ{8UAe8yv|l&mR|F48F=PZ=TU~3>?KSpL8ShXE26co_nLZD?FB6KDVUx
zh5OHgUA|*}<7{v=yS(q%rJKMp?DEcC@=kzp?DF&9ME?f9&Mx;nwW0@Jy?A!H<0M6Z
zW7*{c0}@lfaqRM(>*un-@$B-T%lqrV3GDLS*VBIo-(Z(VUG((q=;fR2^1SmmUjh@@
z<-MHrY;YpGyynE^E#O=1^1wR_4uF%`<u|jA`~gm8m#?cD(HkG7M0R<4s_R*B3cGyp
z&CRcXZgzRY(O=WRsqFHY70*<HN$m2KzEi#jr?Jap$M0(cr?boBCl&<w;d9R}zjeIZ
z6W~mC`HrB!W58MZT+#2jBd3AM?DD=zOP7PQ^?9Q&IheX0oWm|3&{(|_oXajBJN(X%
z;5>HurTq!Fz!Y}*(-Ct*J9+swyL@!$mC@jQc6m&ZYZmy9UQc?Z6IBW>V3%)mC)a?f
z?DDJi)4l>1vdiNKmbHUvdY$Q^tA@G!y<EgDU(s}_Ke(7(Uf+Fa9JqvC-Z-pFGWagL
z{PUvbmEcl#d2oBbQg9i&yy;8-z2I_o`M%_{Enqsk{Fg%|cke%6cKNA6<9a-V&!6TY
zy~87OBf(Yda!=I{31Eii2|cW5tq06xmp_{NQ#rVrT^<<jd<$mjdrHqNp3w@v$1aaN
zd7@*0m)Y#{t7Fy<26Oa2(z~~%M1k+K%j1{rSqSFp{iWZz{d_*?VV5U7zV`^2$1d+Z
zGyNQx&n}N|>T?ZTqyL|N)aCnFXD<ub<$+ZfhJuB99q75K?j&%n_6WVM?Vn}fI(B*Z
z$Zos9B6fL0aAYG`thq$ruz2f5a6P-+Jz;e_Si&xkSX1W;#N*iIEyuI^gQe{9>CVe>
zU>UnSv`^P$aHGCQ^w=q<Gr&!nd-S?lFKz-i>-VJ}$<D3?KVX-KSNMMoZqe&Nzn%YF
z3-}?s{OHM@_rR@sU+AZMR}6UA%a7RQqw2~c!ENmFnt8DaV7c}YePzEZx!`uqdAhT)
zXF0e-`<LD;WaAmILi>)M@#NN4u##Qw@)Y`Z@$zGKd4Ric2w24~KhytoB3R8XPrFdE
z2>gUyK5Dq90^F(Jmmbp6=`(N_yL@V*?=^5YyF6!iv!Bb$J-Q$0f8MR{3)Zm9gGv*k
z!M(aG=zg~^Oa^P&<=@3_F91Jfmmh0rs{r@0%X5MjoCWLjeCXx9+W!Lgv&#!JYJIzU
zd4OFW7Ce0jcu;eXURZW38a$-O)17Yj(!hGX?{xo3DFxtRcKHS$zc0WBeNXA$P1+1b
z7e2U-bq}j&uJ&>M`;+SY|7Nx~vAuciO>1vfdz0Fm)83T!X0$h<z4`1-XKyxpli8cg
z-c<HxvNw^vdF)MNZx(x#*qg)N6!vDYH-X*xcBk8&ZFjQWxpt@8ooRQX-FbGW*_~x~
zlHECWr`VlgcY>YycBb2zZD+Ebxpt=7nQ3RDoq2Yq*_mZ$lASqrrr4QbXM+9C>*rs+
mQ6`@r4*&okiwFb&00000{{{d;LjnLB00RI3000000002a(8l8c

literal 0
HcmV?d00001

diff --git a/tests/tests_jared.py b/tests/tests_jared.py
index dbc419e..48b3c2d 100644
--- a/tests/tests_jared.py
+++ b/tests/tests_jared.py
@@ -29,7 +29,11 @@ def compareVcfsNoComments(va,vb):
             while l2[0] == '#':
                 l2 = vfb.readline()
         if line != l2:
+            vfa.close()
+            vfb.close()
             return False
+    vfa.close()
+    vfb.close()
     return True
 
 def compareVcfsZipped(va,vb):
@@ -148,7 +152,6 @@ def test_CR_before(self):
         rec = self.getARecord()
         rl = RegionList(genestr="11:142531:142600", zeroho = False)
         val = rl.regions[0].containsRecord(rec)
-        sys.stderr.write(str(val)+'\n')
         self.assertTrue(rl.regions[0].containsRecord(rec) == 'before')
 
     def test_CR_after(self):