In [49]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import os
import seaborn
seaborn.set()

In [50]:
# Run configuration.
# excludeHazel is forwarded to getData(): when True, participants whose eye
# description mentions hazel are left out of the labelled data.
excludeHazel = True
# NOTE(review): fileName is not used in the cells visible here; presumably it is
# the pickle the trained classifier is written to -- confirm.
fileName = "svc_no_hazel.pkl"

In [51]:
# Column labels for the survey CSV; they are supplied through `names=` rather
# than being taken from the file.
columns = [
    'name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments',
    'left', 'right', 'left_desc', 'right_desc', 'eye_comments',
    'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness',
]

# PGP survey answers, including the eye colour columns; the strings 'nan', ''
# and 'NaN' are read as missing values.
surveyData = pd.read_csv(
    "./eye_color_data/PGP-Survey.csv",
    names=columns,
    na_values=['nan', '', 'NaN'],
)

# The survey's `name` column as a NumPy array of participant names.
surveyNames = np.array(surveyData['name'].values.tolist())

In [52]:
# Directory holding the tiled genome arrays and their companion files.
tiled_data_dir = "/data-sdd/home/kfang/keep/by_id/su92l-4zz18-b8rs5x7t6gry16k/"


def get_file(name, np_file = True):
    """Fetch `name` from `tiled_data_dir`.

    Parameters
    ----------
    name : str
        File name, joined onto `tiled_data_dir` with os.path.join.
    np_file : bool, default True
        True  -> the file is loaded with np.load and the loaded object returned.
        False -> the file is opened in text read mode and the open handle is
                 returned; closing it is the caller's responsibility.
    """
    full_path = os.path.join(tiled_data_dir, name)
    if not np_file:
        return open(full_path, 'r')
    return np.load(full_path)

In [53]:
# Despite its extension, names.npy is read here as plain text (np_file=False),
# one entry per line.
# The `with` block closes the handle as soon as the lines are read. Only the
# trailing newline is stripped, so a final line that lacks a newline keeps its
# last character (slicing with [:-1] would have cut it off).
with get_file("names.npy", np_file=False) as names_file:
    names = [line.rstrip("\n") for line in names_file]

In [54]:
def get_name(full_name):
    """Return the slice full_name[45:53] (at most eight characters).

    NOTE(review): presumably this is the participant ID sitting at a fixed
    offset inside each entry of names.npy -- confirm against the file.
    """
    return full_name[45:53]
# Build a real list rather than a lazy `map` object: on Python 3 a map can be
# iterated only once, so a second pass over `names` (or re-running a later
# cell that loops over it) would silently see no entries.
names = [get_name(full_name) for full_name in names]

In [55]:
def isstr(val):
    """Return True when `val` is a `str` (missing survey cells arrive as float NaN)."""
    return isinstance(val, str)


# (left, right) eye colour values for one participant
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])


def getData(name, surveyData, excludeHazel=False):
    """Look up `name` in the survey and return its eye colours.

    Parameters
    ----------
    name : str
        Participant name, compared against the survey's 'name' column.
    surveyData : pandas.DataFrame
        Survey table with 'name', 'left', 'right', 'left_desc' and
        'right_desc' columns.
    excludeHazel : bool, default False
        When True, a participant is dropped if either eye description is a
        string containing 'azel' (matches both "Hazel" and "hazel").

    Returns
    -------
    EyeColor or None
        EyeColor(left, right) taken from the first row whose name matches.
        None when no row matches, or when the participant is excluded as hazel.
    """
    matches = surveyData.loc[surveyData['name'] == name]
    if matches.empty:
        return None

    # only the first matching row is used
    row = matches.iloc[0]
    if excludeHazel:
        # Test each description on its own: one side may be missing (NaN)
        # while the other still says hazel.
        for desc in (row['left_desc'], row['right_desc']):
            if isstr(desc) and 'azel' in desc:
                return None
    return eye_color(row['left'], row['right'])

In [56]:
# (index, name) pairs for every name that ends up with usable eye colour data;
# `index` is the position of the name within `names`
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# left and right eye colours keyed by name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

# names already looked up, in first-seen order; the set mirrors the list so the
# duplicate check does not rescan the list on every iteration
existingNames = []
seenNames = set()

# loop through names and add eye colours to the maps, handling each name only once
for i, name in enumerate(names):
    if name in seenNames or name not in surveyNames:
        continue
    seenNames.add(name)
    existingNames.append(name)
    # set `excludeHazel = False` to keep hazel participants in the training/testing data
    eyeData = getData(name, surveyData, excludeHazel=excludeHazel)
    if eyeData is None:
        # no survey row, or dropped by the hazel filter
        continue
    # keep only participants whose left and right answers are both strings
    if isstr(eyeData.left) and isstr(eyeData.right):
        nameEyeMap.append(namePair(i, name))
        leftEyeMap[name] = eyeData.left
        rightEyeMap[name] = eyeData.right

# parallel lists: positions in `names` and the matching names of the kept participants
nameIndices = [pair.index for pair in nameEyeMap]
correspondingNames = [pair.name for pair in nameEyeMap]

In [57]:
# Flatten the per-name dictionaries into lists ordered like nameEyeMap.
# Each nameEyeMap entry unpacks as (index, name); only string values are kept.
leftEyeList = [
    leftEyeMap[name] for _, name in nameEyeMap if isstr(leftEyeMap[name])
]
rightEyeList = [
    rightEyeMap[name] for _, name in nameEyeMap if isstr(rightEyeMap[name])
]

def blueOrNot(color):
    """Binarise an eye colour code: 1 when int(color) <= 13, otherwise 0.

    NOTE(review): presumably codes up to 13 are the blue end of the survey's
    colour scale -- confirm against the survey definition.
    """
    if int(color) > 13:
        return 0
    return 1
# Build a concrete list of 0/1 labels. On Python 3 a bare `map` object is lazy,
# and passing it to np.save would store a pickled 0-d object array instead of
# the labels themselves.
leftEyeList = [blueOrNot(color) for color in leftEyeList]

In [58]:
# Persist the binary left-eye labels.
np.save(file="./npy_data/leftEyeList.npy", arr=leftEyeList)

In [59]:
# Genome matrix, loaded through np.load; it is indexed by row further down.
all_data = get_file('all.npy', np_file=True)

In [66]:
# Companion array to all.npy; it is later indexed with the same column mask
# (idxKeep) as the genome matrix.
path_data = get_file('all-info.npy', np_file=True)
print(path_data.shape)

(21310012,)


In [61]:
# Split the genome rows by whether the eye colour is known from the survey:
# unknownData is every row not listed in nameIndices, knownData the listed rows.
unknownData = np.delete(arr=all_data, obj=nameIndices, axis=0)
knownData = all_data[nameIndices]

In [62]:
# Keep only the columns in which at least 90% of the known genomes hold a
# non-zero value.
# NOTE(review): zero presumably encodes a missing call -- confirm.
nnz = np.count_nonzero(knownData, axis=0)
fracnnz = nnz.astype(float) / knownData.shape[0]

# Column positions in the unfiltered matrix, taken before knownData is narrowed.
idxOP = np.arange(knownData.shape[1])

idxKeep = fracnnz >= 0.90
knownData = knownData[:, idxKeep]

In [63]:
#path_data = path_data[nnz]

In [67]:
# Record which distinct values occur in each retained column.
# varlist[j] is the sorted array of unique values in column j of knownData.
varlist = [np.unique(knownData[:, j]) for j in range(knownData.shape[1])]

# total number of per-column unique values
nx = sum(u.size for u in varlist)

# varvals is every varlist entry laid end to end as floats. Concatenating sizes
# the result exactly; a fixed "50 per column" scratch buffer would fail with a
# shape-mismatch error as soon as the total exceeded it.
if varlist:
    varvals = np.concatenate(varlist).astype(float)
else:
    # np.concatenate rejects an empty sequence (no columns survived the filter)
    varvals = np.empty(0, dtype=float)

# drop NaN entries
varvals = varvals[~np.isnan(varvals)]

print(varvals.shape)
np.save("./npy_data/varvals.npy", varvals)

(4737575,)


In [68]:
def foo(col):
    """Return the shape of np.unique(col): a 1-tuple with the number of distinct values in `col`."""
    return np.unique(col).shape

# foo yields a 1-tuple per column, so apply_along_axis returns an array of
# shape (1, n_columns); row 0 holds the distinct-value count of each column.
invals = np.apply_along_axis(foo, 0, knownData)[0]

In [69]:
# Repeat each kept column's metadata once per distinct value of that column
# (counts taken from invals).
# pathdataOH: path_data entries of the kept columns -- used later to find coefPaths
pathdataOH = path_data[idxKeep].repeat(invals)
# oldpath: positions of the kept columns in the unfiltered matrix -- used later
# to map back from the one-hot layout
oldpath = idxOP[idxKeep].repeat(invals)

In [70]:
# Write the column mask and the two repeated lookup arrays to disk, in this order.
for target, payload in (
    ("./npy_data/idx_keep.npy", idxKeep),
    ("./npy_data/path_data_oh.npy", pathdataOH),
    ("./npy_data/old_path.npy", oldpath),
):
    np.save(target, payload)

In [71]:
# Persist the column-filtered matrix of genomes with known eye colour.
np.save(file='./npy_data/all_data_exc.npy', arr=knownData)

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell, and keeping them together makes dependencies easier to see.
from sklearn.preprocessing import OneHotEncoder
# Encoder built with the library defaults; the next cell calls .toarray() on
# its output, i.e. it expects a sparse result from transform().
enc = OneHotEncoder()

In [None]:
# One-hot encode the filtered matrix: fit, transform, densify, then save for
# future processing.
# Takes a lot of RAM -> push it off to an Arvados cluster.
# NOTE(review): `transformed` holds whatever enc.fit() returns (the fitted
# encoder, per the scikit-learn convention), not transformed data.
transformed = enc.fit(knownData)
data = enc.transform(knownData)

# dense copy of the encoder output
encoded = data.toarray()
np.save(file="./npy_data/data_encoded.npy", arr=encoded)