In [9]:
# searches for a tile path given its location
# note: requires a Unix-like system; the helpers below shell out to external tools (grep, bgzip).

import numpy as np
import subprocess
import re

In [11]:
# Read the coefficient paths from the PGP data, then unpack every entry into
# its tile path, tile step and tile phase.
coefPaths = np.load("./tiling-server/tiling_files/hiq-pgp-info")

# The path is everything above the low five hex digits; the remainder holds
# the step (remainder / 2) and the phase (what is left after removing 2 * step).
_PATH_SPAN = 16**5
tile_path = np.trunc(coefPaths / _PATH_SPAN)
_within_path = coefPaths - tile_path * _PATH_SPAN
tile_step = np.trunc(_within_path / 2)
tile_phase = np.trunc(_within_path - 2 * tile_step)

# Hex-string versions of the three components, element for element.
vhex = np.vectorize(hex)
vectorizedPath, vectorizedStep, vectorizedPhase = (
    vhex(component.astype('int'))
    for component in (tile_path, tile_step, tile_phase)
)

In [12]:
# search for a tile
def tileSearch(arg):
    """Return the assembly index record(s) for the tile path at position ``arg``.

    Parameters
    ----------
    arg : int or int-like
        Position into ``vectorizedPath``; it is passed through ``int()``.

    Returns
    -------
    str
        Every line of ``assembly.00.hg19.fw.fwi`` that contains ``:<path>``,
        where ``<path>`` is the tile path as four zero-padded hex digits.
        The text is exactly what grep printed, trailing newline included.

    Raises
    ------
    subprocess.CalledProcessError
        If grep exits non-zero (no matching line, or unreadable index file).
    """
    # hex() yields e.g. '0x288'; drop the '0x' prefix and left-pad to four
    # digits so it matches the form used in the index names (':0288').
    vecpath = str(vectorizedPath[int(arg)])[2:].zfill(4)
    # grep reads the file itself: no shell and no `cat |` pipeline needed.
    # universal_newlines=True makes the result text on Python 3 as well, so
    # it can be split on '\t' by the caller.
    return subprocess.check_output(
        ["grep", ":" + vecpath, "./tiling-server/tiling_files/assembly.00.hg19.fw.fwi"],
        universal_newlines=True
    )

In [13]:
# get the location of a tile
def getTileLocation(raw_tile_data):
    """Extract a tile's entry from the compressed assembly file.

    Parameters
    ----------
    raw_tile_data : str (bytes are also accepted)
        A tab-separated index record such as the one returned by
        ``tileSearch``. Field 0 is a colon-separated name whose third part
        is the tile path in hex; field 1 is handed to ``bgzip -s`` and
        field 2 to ``bgzip -b``.

    Returns
    -------
    str
        Output of ``grep -B1``: every line of the extracted region that
        contains the hex path followed by whitespace, plus the line before it.

    Raises
    ------
    subprocess.CalledProcessError
        If the shell pipeline exits with a non-zero status (for example when
        grep matches nothing).
    """
    if not isinstance(raw_tile_data, str):
        # On Python 3 subprocess output may arrive as bytes; parse it as text.
        raw_tile_data = raw_tile_data.decode()
    fields = raw_tile_data.split('\t')
    begin = int(fields[2])
    sequence = int(fields[1])
    hexVal = fields[0].split(':')[2]
    # The backslash in the grep pattern is written as '\\s' so Python does not
    # treat '\s' as an (invalid) string escape; the shell command text is
    # unchanged.
    cmdToRun = (
        "bgzip -c -b %d -s %d -d ./tiling-server/tiling_files/assembly.00.hg19.fw.gz"
        " | grep -B1 \"%s\\s\"" % (begin, sequence, hexVal)
    )
    # The pipe needs a shell; universal_newlines=True returns text on Python 3.
    return subprocess.check_output(cmdToRun, shell=True, universal_newlines=True)

In [14]:
def getTileInfo(index):
    """Print a summary of the tile at position ``index``.

    Prints the hex tile path, step and phase taken from ``vectorizedPath``,
    ``vectorizedStep`` and ``vectorizedPhase``, then a blank line, then the
    index record from ``tileSearch(index)`` and the result of
    ``getTileLocation`` for that record. Returns ``None``.
    """
    # Each print(...) takes a single pre-formatted argument, which emits the
    # same text under the Python 2 print statement and the Python 3 print
    # function.
    print("Tile Path: %s" % (vectorizedPath[index],))
    print("Tile Step %s" % (vectorizedStep[index],))
    print("Tile Phase: %s" % (vectorizedPhase[index],))
    print("")
    tile = tileSearch(index)
    print(tile)
    print(getTileLocation(tile))

In [15]:
# load generated coefficients (sorted by weight)
# Each entry is later passed to tileSearch() as a position into vectorizedPath.
# NOTE(review): the ".pkl" name suggests a pickled object -- TODO confirm the
# on-disk format. Unpickling can execute arbitrary code, so only load this
# file when it comes from a trusted source.
coefs = np.load("coefs.pkl")

In [16]:
# Show the tile details for two consecutive positions.
for position in (1792420, 1792421):
    getTileInfo(position)

Tile Path: 0x288
Tile Step 0x415
Tile Phase: 0x0

hg19:chr15:0288	99840	132519571	15	16

0287	  28264893
0288	  28265118

Tile Path: 0x288
Tile Step 0x415
Tile Phase: 0x1

hg19:chr15:0288	99840	132519571	15	16

0287	  28264893
0288	  28265118



In [16]:
# search for the specific tile location from the coefficients
tileLocations = []
for item in coefs:
    tile = tileSearch(item)
    # Assumes `tile` ends with a newline (it is grep output). The record is
    # then followed by "<item> " on its own line and a blank line -- the same
    # text the Python 2 statement `print tile, item, '\n'` produced, written
    # in a form that also runs on Python 3.
    print("%s%s \n" % (tile, item))
    tileLocations.append(tile)

hg19:chr15:0288	99840	132519571	15	16
1792420 

hg19:chr12:0221	500208	115007292	15	16
1570048 



In [17]:
# get the location of the tile with the highest coefficient
# (tileLocations follows the order of coefs, so index 0 is the first coefficient's tile).
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(getTileLocation(tileLocations[0]))

0287	  28264893
0288	  28265118



According to the NIH Genetics Home Reference (https://ghr.nlm.nih.gov/gene/HERC2#location), the HERC2 gene, responsible for coding the protein that produces the blue eye color, is located from base pairs 28,111,037 to 28,322,173 on chromosome 15. The classifier's top-ranked tile produced the following output:

```
0287	  28264893
0288	  28265118

```
`getTileLocation(tileLocations[0])` returns the chromosome location of the tile with the highest coefficient generated by the Support Vector Classifier. As the results show, the classifier correctly identified a region tied to eye color: base pairs 28,264,893 to 28,265,118 lie within the HERC2 range (28,111,037 to 28,322,173) given above.