In [7]:
# searches for a tile path given its location
# note: requires a Unix-like system; the helpers below call external command-line tools (e.g. grep, bgzip).

import numpy as np
import subprocess
import re

In [8]:
# Read the packed coefficient positions from the pgp data and unpack them.
# The arithmetic below undoes value = path*16**5 + 2*step + phase.
coefPaths = np.load("../tile-searcher/tiling-files/hiq-pgp-info")

# Path is the truncated quotient by 16**5; the remainder inside that path
# carries the step (truncated half) and the phase (what is left after the step).
tile_path = np.trunc(coefPaths/(16**5))
_within_path = coefPaths - tile_path*16**5
tile_step = np.trunc(_within_path/2)
tile_phase = np.trunc(_within_path - 2*tile_step)

# Element-wise hex() of each component after casting to int; every entry is a
# '0x…' string, which the lookup helpers strip and zero-pad before use.
vhex = np.vectorize(hex)
vectorizedPath, vectorizedStep, vectorizedPhase = (
    vhex(component.astype('int')) for component in (tile_path, tile_step, tile_phase)
)

In [9]:
# search for a tile
def tileSearch(arg):
    """Return the raw index line(s) for the tile path of entry ``arg``.

    ``arg`` is converted with ``int()`` and used as an index into the
    module-level ``vectorizedPath`` array. The hex string stored there has its
    ``0x`` prefix dropped and is left-padded with zeros to at least four
    characters; ``grep`` then selects every line of the ``.fwi`` index file
    that contains ``:<path>``.

    Returns grep's standard output unchanged.
    Raises ``subprocess.CalledProcessError`` when grep exits non-zero, which
    includes the case where no line matches.
    """
    hex_path = str(vectorizedPath[int(arg)])[2:].zfill(4)
    # grep reads the file directly: no `cat` pipe and no shell, so the pattern
    # travels as a single argv entry instead of being spliced into a command line.
    return subprocess.check_output(
        ["grep", ":" + hex_path, "../tile-searcher/tiling-files/assembly.00.hg19.fw.fwi"]
    )

In [15]:
# get the location of a tile
def getTileLocation(arg, raw_tile_data):
    """Extract one slice of the compressed assembly and grep it for a tile step.

    arg            -- converted with ``int()`` and used as an index into the
                      module-level ``vectorizedStep`` array.
    raw_tile_data  -- tab-separated text; field 2 is passed to ``bgzip -b``
                      and field 1 to ``bgzip -s``. Presumably one index line
                      as returned by ``tileSearch`` -- confirm with the caller.

    Returns the output of the ``bgzip | grep -B1`` pipeline: each line that
    contains the zero-padded hex step followed by whitespace, plus the line
    before it.
    Raises ``subprocess.CalledProcessError`` when the pipeline exits non-zero.
    """
    fields = raw_tile_data.split('\t')
    offset = int(fields[2])   # handed to bgzip -b
    size = int(fields[1])     # handed to bgzip -s
    hex_step = str(vectorizedStep[int(arg)])[2:].zfill(4)
    # Raw string: the \s must reach grep verbatim. In a normal string literal
    # "\s" is an unrecognised escape, which newer Python versions warn about.
    # NOTE(review): the pattern is not anchored to the start of the line -- confirm
    # that the step can never appear as the tail of a longer field.
    pipeline = r'bgzip -c -b %d -s %d -d ../tile-searcher/tiling-files/assembly.00.hg19.fw.gz | grep -B1 "%s\s"' % (offset, size, hex_step)
    return subprocess.check_output(pipeline, shell=True)

In [16]:
def getTileInfo(index):
    """Print a summary of tile ``index``.

    Prints the hex path, step and phase stored for ``index`` in the
    module-level ``vectorizedPath`` / ``vectorizedStep`` / ``vectorizedPhase``
    arrays, then the raw index line returned by ``tileSearch`` and the
    location text returned by ``getTileLocation``.

    Returns None. Errors raised by the two helpers propagate to the caller.
    """
    # Each print takes one pre-formatted string, so the text is the same
    # whether print is a statement or a function.
    print("Tile Path: %s" % (vectorizedPath[index],))
    print("Tile Step %s" % (vectorizedStep[index],))
    print("Tile Phase: %s \n" % (vectorizedPhase[index],))
    tile = tileSearch(index)
    print(tile)
    # getTileLocation takes the index as well as the raw line; calling it with
    # the line alone raised TypeError.
    print(getTileLocation(index, tile))

In [17]:
# load generated coefficients (sorted by weight)
# The entries are passed to tileSearch further down, i.e. they are used as
# indices into the tile path/step arrays.
# NOTE(review): the .pkl extension suggests a pickle file. Unpickling can run
# arbitrary code, so only load this from a trusted source; newer NumPy releases
# also refuse pickled input unless allow_pickle=True is passed -- confirm
# against the NumPy version in use.
coefs = np.load("coefs.pkl")

In [22]:
# search for the specific tile location from the coefficients
tileLocations = []
for item in coefs:
    tile = tileSearch(item)
    print tile, item, '\n'
    tileLocations.append((item, tile))

hg19:chr15:0288	99840	132519571	15	16
1792420 

hg19:chr12:0221	500208	115007292	15	16
1570048 

hg19:chr11:0212	188832	112450397	15	16
1531647 

hg19:chr5:0111	185056	60522352	15	16
813895 

hg19:chrX:0341	479632	165407285	15	16
2334997 

hg19:chr9:01a2	354480	91636736	15	16
1229091 

hg19:chr2:005a	86128	19509248	15	16
293783 

hg19:chr11:01f3	219008	105713678	15	16
1433155 

hg19:chr15:0287	154720	132364834	15	16
1792082 

hg19:chr13:024d	218464	122570568	15	16
1673845 

hg19:chrX:032e	468528	161810261	15	16
2221351 

hg19:chrX:0335	297104	163539765	15	16
2274388 

hg19:chrX:033e	223680	164983781	15	16
2319627 

hg19:chrX:033c	82384	164726261	15	16
2308447 

hg19:chr5:0108	128144	58075936	15	16
775746 

hg19:chr19:02ee	242160	151921673	15	16
2063747 

hg19:chr1:003e	313504	13078864	15	16
200986 

hg19:chr3:00ac	201424	36904272	15	16
534975 

hg19:chr6:0124	248992	64466512	15	16
878875 

hg19:chr16:02b2	126704	140220157	15	16
1911970 

hg19:chrX:0351	296400	169072325	15	16
2453399 



In [23]:
# get the location of the tile with the highest coefficient
# tileLocations[0] is the (coefficient entry, raw index line) pair built from the
# first element of coefs, which is the top-weighted one if coefs is sorted by
# weight as noted where it is loaded. The printed text is grep -B1 output: the
# line matching the tile step and the line immediately before it.
print getTileLocation(tileLocations[0][0], tileLocations[0][1])

0414	  28365579
0415	  28365804



According to the NIH's NCBI Gene database (https://www.ncbi.nlm.nih.gov/gene/8924), the HERC2 gene, responsible for coding the protein that produces the blue eye color, is located on:
`GRCh37.p13 (GCF_000001405.25)	15	NC_000015.9 (28356183..28567313, complement)`

The classifier created the following output:

```
0414	  28365579
0415	  28365804

```
`getTileLocation(tileLocations[0][0], tileLocations[0][1])` returns the chromosome location of the tile with the highest coefficient generated by the Support Vector Classifier. As the results show, the classifier correctly predicted that eye color depends on base pairs 28,365,579 to 28,365,804, which lie within the HERC2 region spanning 28,356,183 to 28,567,313.