Skip to content

Commit

Permalink
made improvements to gene region chromosome handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jaredgk committed Sep 13, 2017
1 parent faf933f commit 8345556
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 56 deletions.
4 changes: 4 additions & 0 deletions jared/example/overlap_regions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
14 10000 20000
15 100 200
14 100000 110000
14 11000 19000
103 changes: 56 additions & 47 deletions jared/gene_region.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
import sys
import re
import logging
from functools import total_ordering

def matchChr(ref_chr,list_chr):
"""Checks ref_chr for whether "chr" is first chars, then modifies
list_chr to match"""
if ref_chr is None:
return list_chr
has_chr = (ref_chr[0:3]=="chr")
list_has_chr = (list_chr[0:3]=="chr")
if has_chr == list_has_chr:
return list_chr
elif has_chr:
return "chr"+list_chr
return list_chr[3:]


def getChromKey(chrom):
"""Get key from chromosome string so that it can be naturally sorted
Expand All @@ -41,40 +29,30 @@ def keyComp(k1,k2):

@total_ordering
class Region:
def __init__(self, start, end, chrom):
def __init__(self, start, end, chrom, natsort=True):
"""Zero-based, half open coordinates and chromosome info for
a region in a genome. Coords will be formatted according to
flags passed to RegionList, then stored in a single format.
"""
self.start = start
self.end = end
self.chrom = chrom

def chromMod(self):
"""Removes 'chr' from front of chromosome. Used for sorting"""
if self.chrom[0:3] == 'chr':
c = self.chrom[3:]
if chrom[0:3] == 'chr':
self.chrom = chrom[3:]
else:
c = self.chrom
return c
self.chrom = chrom

def getChromKey(self):
"""Splits chromosone name for natural sort. Ints and strs are
grouped with adjacent elements of the same type
Example: 'front88place' returns ['front',88,'place']
"""
c = self.chromMod()
convert = lambda text: int(text) if text.isdigit() else text.lower()
k = [convert(i) for i in re.split('([0-9]+)',c) if len(i) != 0]
return k
return getChromKey(self.chrom)


def __eq__(self, other):
c = self.chromMod()
oc = other.chromMod()
return ((self.start == other.start) and (self.end == other.end)
and (c == oc))
and (self.chrom == other.chrom))

def __lt__(self, other):
"""Sort key order: chrom-key, start position, end position
Expand All @@ -91,7 +69,7 @@ def containsRecord(self, rec):
k1 = self.getChromKey()
kr = getChromKey(rec.chrom)
if k1 != kr:
if k1 < kr:
if keyComp(k1,kr):
return 'before'
return 'after'
if rec.pos < self.start:
Expand All @@ -100,12 +78,22 @@ def containsRecord(self, rec):
return 'after'
return 'in'

def toStr(self, halfopen=True, oneidx=False, sep=':'):
start = self.start
end = self.end
if not halfopen:
end -= 1
if oneidx:
start += 1
end += 1
return str(start)+sep+str(end)+sep+str(self.chrom)


class RegionList:

def __init__(self, filename=None, genestr=None, oneidx=False,
colstr=None, defaultchrom=None, halfopen=True,
sortlist=True):
sortlist=True, checkoverlap=True, natsort=True):
"""Class for storing gene region information
Will either read a file or take a single gene region in a string
Expand Down Expand Up @@ -153,29 +141,27 @@ def __init__(self, filename=None, genestr=None, oneidx=False,
self.halfopen = halfopen
if filename is not None:
self.initList(filename, oneidx, colstr, defaultchrom,
halfopen, sortlist)
halfopen, sortlist, checkoverlap)
else:
self.initStr(genestr, oneidx, colstr, defaultchrom, halfopen)

def initList(self, filename, oneidx, colstr, defaultchrom, halfopen,
sortlist):
sortlist, checkoverlap):
"""Initialize RegionList with a region file
"""
with open(filename, 'r') as regionfile:
for line in regionfile:
if line[0] == '#':
continue
la = line.strip().split()
try:
start = int(la[self.collist[0]])
end = int(la[self.collist[1]])
if len(self.collist) == 3:
c = la[self.collist[2]]
chrom = matchChr(defaultchrom,c)
else:
chrom = defaultchrom
except:
sys.stderr.write("Column is missing")
start = int(la[self.collist[0]])
end = int(la[self.collist[1]])
if len(self.collist) == 3:
chrom = la[self.collist[2]]
elif defaultchrom is not None:
chrom = defaultchrom
else:
raise Exception("Chromosome for region is not specified")
if oneidx:
start -= 1
end -= 1
Expand All @@ -184,17 +170,23 @@ def initList(self, filename, oneidx, colstr, defaultchrom, halfopen,
self.regions.append(Region(start, end, chrom))
if sortlist:
self.regions.sort()
if checkoverlap:
if self.hasOverlap():
raise Exception("Region overlap detected")

def initStr(self, genestr, oneidx, colstr, defaultchrom, halfopen):
la = genestr.split(':')
start = int(la[0])
end = int(la[1])
if len(la) == 3:
c = la[2]
chrom = matchChr(defaultchrom,c)
chrom = la[2]
elif defaultchrom is not None:
chrom = defaultchrom
else:
raise Exception("Region has no chromosome provided")
if oneidx:
start += 1
end += 1
start -= 1
end -= 1
if not halfopen:
end += 1
self.regions.append(Region(start,end,chrom))
Expand All @@ -206,5 +198,22 @@ def parseCols(self,cols):
"too long" % (cols)))
self.collist = col_list

def toStr(self, idx, sep=':'):
return self.regions[idx].toStr(halfopen=self.halfopen,
oneidx=self.oneidx,
sep=sep)

def hasOverlap(self):
region_hold = sorted(self.regions)
for i in range(len(region_hold)-1):
if region_hold[i].chrom == region_hold[i+1].chrom:
if (region_hold[i].start <= region_hold[i+1].start
< region_hold[i].end):
logging.warning("Regions %s and %s overlap" %
(self.toStr(i),self.toStr(i+1)))
return True
return False


#TO do: add check for intersecting regions
#Add sort method for strictly text based sorting
8 changes: 6 additions & 2 deletions jared/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ def test_RL_sort(self):
rl.regions.sort()
self.assertTrue(rl.regions == sl.regions)

def test_RL_overlap(self):
rl = RegionList('example/overlap_regions.txt', checkoverlap=False)
self.assertTrue(rl.hasOverlap())


class snpTest(unittest.TestCase):

Expand Down Expand Up @@ -206,5 +210,5 @@ def test_vcf_to_seq_config_required(self):


if __name__ == "__main__":
initLogger()
unittest.main()
initLogger(filelevel="ERROR",streamlevel="ERROR")
unittest.main(verbosity=2)
6 changes: 1 addition & 5 deletions jared/vcf_reader_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def getRecordList(vcf_reader, region):
return lst


def getRecordListUnzipped(vcf_reader, region, chrom, prev_last_rec):
def getRecordListUnzipped(vcf_reader, region, prev_last_rec):
"""Method for getting record list from unzipped VCF file.
This method will sequentially look through a VCF file until it finds
Expand All @@ -61,10 +61,6 @@ def getRecordListUnzipped(vcf_reader, region, chrom, prev_last_rec):
VCF reader initialized from other function
region : region object
Region object with start and end coordinates of region of interest.
Chromosome may be included but will be overwritten by `chrom`
parameter.
chrom : str
Chromosome name of region of interest.
prev_last_rec : VariantRecord object
Variable with last record read from VcfReader. Stored here so that
if two adjacent regions are called, the overflow record from the
Expand Down
2 changes: 1 addition & 1 deletion jared/vcf_ref_to_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def vcf_to_seq(sys_args):
rec_list = vf.getRecordList(vcf_reader, region)
else:
rec_list, prev_last_rec = vf.getRecordListUnzipped(vcf_reader,
region, chrom, prev_last_rec)
region, prev_last_rec)
if len(rec_list) == 0:
logging.warning(("Region from %d to %d has no variants "
"in VCF file") % (region.start,region.end))
Expand Down
2 changes: 1 addition & 1 deletion jared/vcf_to_ima.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def vcf_to_ima(sys_args):
rec_list = vf.getRecordList(vcf_reader, region)
else:
rec_list, prev_last_rec = vf.getRecordListUnzipped(vcf_reader,
region, chrom, prev_last_rec)
region, prev_last_rec)
if len(rec_list) == 0:
logging.warning(("Region from %d to %d has no variants "
"in VCF file") % (region.start,region.end))
Expand Down

0 comments on commit 8345556

Please sign in to comment.