Skip to content

Commit

Permalink
adding docs for genome_region and cleaning up tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jaredgk committed Mar 16, 2021
1 parent 9b75c0c commit 828b685
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 14 deletions.
107 changes: 94 additions & 13 deletions pgpipe/genome_region.py
@@ -1,3 +1,22 @@
'''
The genome_reigon module provides two classes: Region, which represents
a region in a genome, and RegionList, which contains a list of Regions.
Coordinates are assumed to be in zero-based, half-open format (include
start coordinate, exclude end), so the first hundred bases of a genome
would be represented as 0:100. Can read regions from a file, string, or
list. If columns are not formatted following usual BED convention,
`collist` can be provided, which will indicate which columns correspond
to the start, end, and chromosome columns in a data input. (A string
can be specified with `colstr` for command-line use) By default, the
list will be sorted in 'version' order, so numerical components of
a string will be sorted numerically, everything else by string. Lists
can be also be randomized, and if using an extended BED file, the full
input line can be read in and later printed out.
'''


import sys
import re
import logging
Expand Down Expand Up @@ -53,6 +72,8 @@ def __init__(self, start, end, chrom, fullline = None):
"""Zero-based, half open coordinates and chromosome info for
a region in a genome. Coords will be formatted according to
flags passed to RegionList, then stored in a single format.
If fullline is provided, when region is written out this
string will be written instead of just the region info.
"""
self.start = start
self.end = end
Expand Down Expand Up @@ -116,6 +137,10 @@ def __len__(self):
return self.end-self.start

def containsRecord(self, rec):
'''
Returns the position of the input VCF record in
relation to this region.
'''
k1 = self.getChromKey()
kr = getChromKey(rec.chrom)
if k1 != kr:
Expand Down Expand Up @@ -150,7 +175,7 @@ class RegionList:

def __init__(self, filename=None, genestr=None, reglist=None,
zeroclosed=False, zeroho=True,
colstr=None, sortlist=True, checkoverlap=None,
colstr=None, collist=None, sortlist=True, checkoverlap=None,
sortmethod=None, sortorder=None, chromfilter=None,
list_template=None, randomize=False,keep_full_line=False):
"""Class for storing gene region information
Expand All @@ -176,10 +201,14 @@ def __init__(self, filename=None, genestr=None, reglist=None,
If true, uses zero-based, closed coordinates, which adds 1
to the end value for a region. Rarely if ever used option.
colstr : str (None)
Three-element string separated by semicolons, with
Three-element string separated by commas, with
each element being an integer corresponding to the 0-based
index of the column of the start, end, and chromosome values
for input region(s).
collist : list (int)
Three-element list with integer indices corresponding to the
column index for start, end and chromosome values (in that
order) from the input source.
sortlist : bool (True)
Will sort list once initialized using assigned sorting scheme.
checkoverlap : str (None, 'error','fix')
Expand Down Expand Up @@ -208,6 +237,9 @@ def __init__(self, filename=None, genestr=None, reglist=None,
checkoverlap, and sortorder.
randomize : bool (False)
If set, will randomly shuffle regions in list.
keep_full_line : bool (False)
If set, will store the original unmodified string corresponding
to a region for later output
Exceptions
----------
Expand All @@ -222,12 +254,21 @@ def __init__(self, filename=None, genestr=None, reglist=None,
sortlist = list_template.sortlist
checkoverlap = list_template.checkoverlap
sortorder = list_template.sortorder

if self.collist is None:
if colstr is None:
self.collist = [1, 2, 0]
else:
self.parseCols(colstr)
if collist is not None and colstr is not None:
raise Exception(("Only one of collist and colstr can be specified"))
#if self.collist is None:
#if colstr is None:
# self.collist = [1, 2, 0]
#else:
# self.parseCols(colstr)
if collist is not None:
self.collist = collist
elif colstr is not None:
self.parseCols(colstr)
else:
self.collist = [1,2,0]
if len(self.collist) != 3:
raise Exception(("Column list must have exactly three values"))
if filename is None and genestr is None and reglist is None:
raise Exception(("A filename, region string, or list of "
"region strings must be provided for " "creating a gene region list"))
Expand Down Expand Up @@ -270,7 +311,8 @@ def __init__(self, filename=None, genestr=None, reglist=None,
shuffle(self.regions)

def initFile(self,filename):
"""Initialize RegionList with a region file
"""Initialize RegionList with a region file. Will skip
commented lines.
"""
past_header = False
with open(filename, 'r') as regionfile:
Expand All @@ -291,6 +333,9 @@ def initFile(self,filename):


def initStr(self,genestr):
'''
Initialize region from string. Assumes start/end/chrom are colon-delimited.
'''
la = genestr.split(':')
self.initRegion(la)

Expand All @@ -302,6 +347,12 @@ def initList(self,reglist):


def initRegion(self,la):
'''
Initialize region from list and append to RegionList's list of regions.
Uses collist to specify which element is the start, end, and
chromosome. Exceptions if either start or end coordinate is less than
0.
'''
start = int(la[self.collist[0]])
end = int(la[self.collist[1]])
chrom = la[self.collist[2]]
Expand Down Expand Up @@ -333,6 +384,11 @@ def toStr(self, idx, sep=':'):
sep=sep)

def hasOverlap(self):
'''
Checks to see if any two regions in a list overlap.
Sorts list into separate var, then compares overlap between adjacent
regions.
'''
region_hold = sorted(self.regions)
for i in range(len(region_hold)-1):
if region_hold[i].chrom == region_hold[i+1].chrom:
Expand All @@ -344,6 +400,10 @@ def hasOverlap(self):
return False

def fixOverlap(self):
'''
Merged regions that overlap into a single region. Will sort the list,
and is not capable of returning regions to their original position.
'''
region_hold = []
self.regions.sort()
i = 0
Expand All @@ -364,6 +424,12 @@ def fixOverlap(self):
def printList(self, file_handle=None, file_name=None,
return_str=False, delim="\t", add_chr=False,
remove_chr=False):
'''
Prints regions from RegionList, to a given file handle or filename
unless neither is specified, in which case output is written to
stdout. If full line was read in during list creation, that is what
will be written out.
'''
if file_handle is None and file_name is None:
file_handle = sys.stdout
if file_name is not None:
Expand Down Expand Up @@ -392,6 +458,8 @@ def printList(self, file_handle=None, file_name=None,
out_str += reg_str
else:
file_handle.write(reg_str)
if file_name is not None:
file_handle.close()
if return_str:
return out_str

Expand All @@ -405,8 +473,10 @@ def filterOutXY(self):
self.filterByChrom(['X','Y','chrX','chrY'])

def expandRegions(self,expand_size):
#Expands regions, but does not merge overlaps
#that may occur
'''
Expands regions from start and end by specified value.
Does not merge regions, which can be done separately with fixOverlap.
'''
for i in range(len(self.regions)):
self.regions[i].start = max(self.regions[i].start-expand_size,0)
self.regions[i].end += expand_size
Expand All @@ -415,6 +485,14 @@ def expandRegions(self,expand_size):
#Add sort method for strictly text based sorting

def getIntervalsBetween(region_list, padding=0, firstline=True):
'''
Returns a new RegionList with regions that represent positions not
contained in the input list. Padding can be provided to exclude
regions within a given distance of the input list. Note that
a region will not be created between the last region on a chromosome
and the end of the chromosome, as the position of the end of the
chromosome is unknown.
'''
region_hold = []
if len(region_list.regions) < 2:
raise Exception("Region list for complement requires at least two regions")
Expand All @@ -437,8 +515,11 @@ def getIntervalsBetween(region_list, padding=0, firstline=True):
return out_list

def subtractBed(stat_list, filter_list):
#Sorts regions and removes regions from stat list
#that overlap with any region in filter_list
'''
Modifies stat_list so that any regions that overlaps with any region
in filter_list is dropped. Regions are not trimmed to exclude the
region in the filter list, just entirely dropped.
'''
stat_idx = 0
filter_idx = 0
stat_list.regions.sort()
Expand Down
Binary file added tests/input/chr11.snp.sr.vcf.gz.tbi
Binary file not shown.
Binary file added tests/input/macaca_missingdata.vcf.gz.tbi
Binary file not shown.
5 changes: 4 additions & 1 deletion tests/tests_jared.py
Expand Up @@ -29,7 +29,11 @@ def compareVcfsNoComments(va,vb):
while l2[0] == '#':
l2 = vfb.readline()
if line != l2:
vfa.close()
vfb.close()
return False
vfa.close()
vfb.close()
return True

def compareVcfsZipped(va,vb):
Expand Down Expand Up @@ -148,7 +152,6 @@ def test_CR_before(self):
rec = self.getARecord()
rl = RegionList(genestr="11:142531:142600", zeroho = False)
val = rl.regions[0].containsRecord(rec)
sys.stderr.write(str(val)+'\n')
self.assertTrue(rl.regions[0].containsRecord(rec) == 'before')

def test_CR_after(self):
Expand Down

0 comments on commit 828b685

Please sign in to comment.