In [7]:
# searches for a tile path given its location
# note: requires unix for system 'cat' command.

import numpy as np
import subprocess
import re

In [8]:
# Decode every packed coefficient position from the pgp data into its tile
# path, step and phase:  position = path * 16**5 + 2 * step + phase.
coefPaths = np.load("./tiling/hiq-pgp-info")

pathWeight = 16**5
tile_path = np.trunc(coefPaths/pathWeight)
withinPath = coefPaths - tile_path*pathWeight  # what is left once the path part is removed
tile_step = np.trunc(withinPath/2)
tile_phase = np.trunc(withinPath - 2*tile_step)

# hex-string versions of path and step, one string per coefficient position
vhex = np.vectorize(hex)
vectorizedPath = vhex(tile_path.astype('int'))
vectorizedStep = vhex(tile_step.astype('int'))

In [9]:
# search for a tile
def tileSearch(arg):
    """Return the line(s) of ./tiling/assembly.00.hg19.fw.fwi matching one tile path.

    arg is converted with int() and used as a position in the module-level
    vectorizedPath array. The hex string stored there is reduced to a
    zero-padded, four-character path, and the index file is grepped for
    ":<path>".

    Returns the raw grep output (every matching line, newlines included).
    Raises subprocess.CalledProcessError when grep exits non-zero, which
    includes the case where no line matches.
    """
    vecpath = str(vectorizedPath[int(arg)])
    vecpath = vecpath[2:].zfill(4)  # drop the "0x" prefix produced by hex(), pad to 4 digits
    # grep reads the index file directly: no shell and no intermediate `cat`,
    # so the pattern reaches grep as one argument that nothing re-interprets.
    return subprocess.check_output(
        ["grep", ":" + vecpath, "./tiling/assembly.00.hg19.fw.fwi"])

In [10]:
# get the location of a tile
def getTileLocation(raw_tile_data):
    """Return the lines around a tile's entry in ./tiling/assembly.00.hg19.fw.gz.

    raw_tile_data is a tab-separated index line as returned by tileSearch.
    Only its first three fields are used:
      field 0 - colon-separated name; its third component is the text grepped for
      field 1 - integer handed to bgzip as -s
      field 2 - integer handed to bgzip as -b

    The selected part of the archive is decompressed by bgzip and piped into
    `grep -B1 "<component>\\s"`; grep's raw output (each match preceded by the
    line before it) is returned.

    Raises IndexError / ValueError when raw_tile_data does not have the
    expected fields, and subprocess.CalledProcessError when grep exits
    non-zero (for example when nothing matches).
    """
    fields = raw_tile_data.split('\t')
    begin = int(fields[2])
    sequence = int(fields[1])
    hexVal = fields[0].split(':')[2]
    # NOTE(review): hexVal is the same component tileSearch matches on in the
    # index file - confirm it is also the intended key inside the decompressed
    # block.

    # Build the `bgzip ... | grep ...` pipeline explicitly instead of handing a
    # formatted string to a shell, so text taken from the index file is never
    # interpreted by the shell.
    bgzip = subprocess.Popen(
        ["bgzip", "-c", "-b", str(begin), "-s", str(sequence), "-d",
         "./tiling/assembly.00.hg19.fw.gz"],
        stdout=subprocess.PIPE)
    try:
        # "\\s" keeps the backslash literal so grep receives `<hexVal>\s`.
        return subprocess.check_output(["grep", "-B1", hexVal + "\\s"],
                                       stdin=bgzip.stdout)
    finally:
        # Always release the pipe and reap bgzip, even when grep fails.
        bgzip.stdout.close()
        bgzip.wait()

In [5]:
# load generated coefficients (sorted by weight)
# NOTE(review): the ".pkl" name suggests a pickled object - if so, recent numpy
# releases refuse to load it unless allow_pickle=True is passed, and pickles
# should only ever be loaded from a trusted source. TODO confirm the file format.
coefs = np.load("coefs.pkl")

In [12]:
val = 1792420

print tileSearch(val)
print getTileLocation(tileSearch(val))

hg19:chr15:0288	99840	132519571	15	16

0287	  28264893
0288	  28265118



In [29]:
# search for the specific tile location from the coefficients
# Each entry of coefs is handed to tileSearch; the index line that comes back
# is printed together with the entry and collected in tileLocations, so
# tileLocations keeps the same order as coefs.
tileLocations = []
for item in coefs:
    tile = tileSearch(item)
    print tile, item
    tileLocations.append(tile)

hg19:chr15:0288	99840	132519571	15	16
1792420
hg19:chr9:01bc	222800	95268704	15	16
1280301
hg19:chr13:0244	142304	121208015	15	16
1654420
hg19:chr4:00e3	342576	49787616	15	16
678939
hg19:chr6:0145	221344	71918544	15	16
969853
hg19:chr5:00ff	271856	55839008	15	16
751247
hg19:chr2:0058	332112	18964000	15	16
286179
hg19:chr9:01bb	192096	95076592	15	16
1276707
hg19:chr3:00a7	266528	35696448	15	16
516464
hg19:chr6:0125	355424	64715520	15	16
884661
hg19:chr10:01cd	236128	98240648	15	16
1322343
hg19:chr3:00a7	266528	35696448	15	16
516454
hg19:chr17:02ce	242560	145038457	15	16
1983840
hg19:chr22:0321	73632	159663260	15	16
2170593
hg19:chr1:0029	86656	7899136	15	16
119177
hg19:chr16:02a3	124704	137549390	15	16
1870557
hg19:chr14:0270	160272	128985595	15	16
1741051
hg19:chr8:019a	383408	90025344	15	16
1208426
hg19:chr8:019a	383408	90025344	15	16
1208428
hg19:chr5:010b	199728	58956464	15	16
786173
hg19:chr2:0045	235248	14909296	15	16
228774
hg19:chr6:0146	394592	72139904	15	16
975213
hg19:chr4:00

In [27]:
# get the location of the tile with the highest coefficient
# (tileLocations[0] is the index line found for the first entry of coefs)
print getTileLocation(tileLocations[0])

0287	  28264893
0288	  28265118



According to the NIH Genetics Home Reference (https://ghr.nlm.nih.gov/gene/HERC2#location), the HERC2 gene, responsible for coding the protein that produces the blue eye color, is located from base pairs 28,111,037 to 28,322,173 on chromosome 15. The classifier created the following output:

```
0287	  28264893
0288	  28265118

```
`getTileLocation(tileLocations[0])` returns the chromosome location of the tile with the highest coefficient generated by the Support Vector Classifier. As you can see, the tile the classifier weighted most heavily spans base pairs 28,264,893 to 28,265,118 on chromosome 15, which falls inside the HERC2 range quoted above, so the classifier correctly identified a region that eye color depends on.