In [1]:
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib

In [2]:
# Column labels applied to the survey CSV (handed to read_csv through `names`).
columns = ['name', 'left', 'right', 'left_desc', 'right_desc']

# Eye color survey table; the strings 'nan' and '' are parsed as missing values.
surveyData = pd.read_csv(
    "PGP-eyecolor.csv",
    names=columns,
    na_values=['nan', ''],
)

# Names of the participants that appear in the survey, as a numpy array.
surveyNames = np.asarray(surveyData['name'].tolist())

In [3]:
# Names that accompany the tiled data. Each entry is cut down to its first
# 8 characters (the huID, per the original author's note). The assignment is
# done element by element so the loaded array is updated in place.
pgpNames = np.load("names.npy")
for idx, fullName in enumerate(pgpNames):
    pgpNames[idx] = fullName[:8]

# Tiled PGP data, loaded as a numpy array.
pgp = np.load("hiq-pgp.npy")

In [4]:
# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData):
    """Look up a participant's eye colors in the survey table.

    Parameters
    ----------
    name : str
        Value to match against the 'name' column.
    surveyData : pandas.DataFrame
        Table with at least the columns 'name', 'left' and 'right'.

    Returns
    -------
    tuple or None
        ``(left, right)`` taken from the first row whose 'name' equals
        ``name``, or ``None`` when no row matches.
    """
    # Vectorized comparison instead of a Python-level iterrows() scan.
    matches = surveyData[surveyData['name'] == name]
    if matches.empty:
        # Made explicit: the original fell off the end of the loop and
        # returned None implicitly when the name was absent.
        return None
    # Keep the first match, mirroring the original early return.
    first = matches.iloc[0]
    return (first['left'], first['right'])

In [5]:
# (row index into pgpNames/pgp, name) for each participant with survey data
namePairIndices = []

# name -> left / right eye color value from the survey, i.e., {"huID": 12}
nameLeftEyeMap = {}
nameRightEyeMap = {}

# names that have already been recorded, so each one is added only once
existingNames = []

# walk pgpNames and record the eye colors of every name found in the survey
for i, name in enumerate(pgpNames):
    # skip names without survey data and names that were already handled
    if name not in surveyNames or name in existingNames:
        continue
    existingNames.append(name)
    eyeData = getData(name, surveyData)
    namePairIndices.append((i, name))
    nameLeftEyeMap[name], nameRightEyeMap[name] = eyeData

# split the tiled data into rows with known eye color and all remaining rows
nameIndices = [pairIndex for pairIndex, _ in namePairIndices]
knownData = pgp[nameIndices]
# NOTE(review): only the first row of each known name is in nameIndices, so a
# later row with a repeated name stays in unknownData - confirm this is intended.
unknownData = np.delete(pgp, nameIndices, axis=0)

In [24]:
# Flatten the name -> color dictionaries into lists that follow the order of
# namePairIndices.
leftEyeNameList = [nameLeftEyeMap[huID] for _, huID in namePairIndices]
rightEyeNameList = [nameRightEyeMap[huID] for _, huID in namePairIndices]

# Classify only blue/not blue: left-eye values above 12 become 1 (not blue),
# everything else becomes 0 (blue). The right-eye list is left unchanged.
# NOTE(review): a missing (NaN) survey value compares False against "> 12" and
# therefore ends up labelled 0 (blue) - confirm that is acceptable.
leftEyeNameList = [1 if color > 12 else 0 for color in leftEyeNameList]

In [34]:
# Load a previously serialized object from "svc.pkl" with joblib. Later cells
# call .predict() and read .coef_ on it, so it is presumably a fitted linear
# classifier (LinearSVC is imported above) - TODO confirm how it was trained.
# NOTE(review): joblib files are pickle-based and can execute arbitrary code
# when loaded; only load files from a trusted source.
svc_test = joblib.load("svc.pkl")

In [35]:
# Show the dimensions of the known-eye-color matrix. The parenthesised form
# prints the same text on Python 2 and is also valid on Python 3.
print(knownData.shape)

(78, 2469062)


In [36]:
# Predict a label for every row of knownData with the loaded model.
eye_pred = svc_test.predict(knownData)
# Fraction of predictions that agree with the binarized left-eye labels.
# NOTE(review): if svc_test was fitted on these same rows this is a training
# accuracy, not a held-out estimate - confirm against the training notebook.
accuracy_score(y_true=leftEyeNameList, y_pred=eye_pred)

0.96153846153846156

In [43]:
# Positions of the non-zero entries in the first row of the model's coef_.
nonzeroes = np.nonzero(svc_test.coef_[0])[0]
# (position, coefficient) pairs ordered from the largest coefficient to the
# smallest. sorted() replaces zip(...).sort(): on Python 3 zip returns an
# iterator without a .sort method, while on Python 2 the result is the same
# list (both sorts are stable).
coefs = sorted(
    zip(nonzeroes, svc_test.coef_[0][nonzeroes]),
    key=lambda x: x[1],
    reverse=True,
)
print(coefs)

[(1257816, 0.073470813698001028), (342019, 0.072255171659541415), (1321008, 0.054244558436764949), (903627, 0.044503739771904909), (200986, 0.038440259945633166), (71357, 0.037266173236257985), (481212, 0.036143179847734748), (137247, 0.033629689903357719), (768874, 0.030771588419761825), (2393717, 0.028110278182580602), (1170439, 0.023029095721417838), (12624, 0.022702893256663269), (402803, 0.017654211616017861), (1594144, 0.01602715821042271), (1081477, 0.015125042922764006), (2189921, 0.01439709409320934), (2088511, 0.014212737290436723), (2043901, 0.013033377057775714), (2043646, 0.012529949843646243), (674644, 0.010970582597430089), (2461525, 0.006872291074819019), (1021400, 0.0064551184068559779), (596168, 0.006083773790519633), (1435117, 0.0056687916769290409), (2166165, 0.0036715755740112199), (579760, 0.0032887504867792649), (1903788, 0.0030532207139391779), (2088513, -0.00016439543626351803), (890474, -0.00074713963241949423), (2461523, -0.001497068963951909), (438621, -0.00

In [47]:
# Positions only, in the same descending-coefficient order as coefs.
# NOTE(review): despite the "pos" prefix this keeps every non-zero coefficient,
# including the negative ones at the tail of coefs - confirm that is intended.
posLocs = [coef[0] for coef in coefs]
# print() with a single argument behaves the same on Python 2 and Python 3.
print(posLocs)
pos = np.asarray(posLocs)
# ndarray.dump writes the array to the given file as a pickle.
pos.dump("coefs.pkl")

[1257816, 342019, 1321008, 903627, 200986, 71357, 481212, 137247, 768874, 2393717, 1170439, 12624, 402803, 1594144, 1081477, 2189921, 2088511, 2043901, 2043646, 674644, 2461525, 1021400, 596168, 1435117, 2166165, 579760, 1903788, 2088513, 890474, 2461523, 438621, 1736283, 1801942, 1335343, 1580503, 2256145, 809281, 279543, 1843740, 834607, 2086601, 1794813, 2143615, 1370239, 1660670, 238309, 174453, 579768, 489736, 865338, 444129, 1007310, 1472632, 981082, 1792420]


In [45]:
# Display the saved array; the parenthesised form works on Python 2 and 3.
# NOTE(review): the recorded output shows 2-D (position, coefficient) rows and
# this cell's execution count (45) precedes the cell that builds the 1-D `pos`
# (47), so the output appears to come from an earlier definition - re-run the
# notebook top to bottom to refresh it.
print(pos)

[[  1.25781600e+06   7.34708137e-02]
 [  3.42019000e+05   7.22551717e-02]
 [  1.32100800e+06   5.42445584e-02]
 [  9.03627000e+05   4.45037398e-02]
 [  2.00986000e+05   3.84402599e-02]
 [  7.13570000e+04   3.72661732e-02]
 [  4.81212000e+05   3.61431798e-02]
 [  1.37247000e+05   3.36296899e-02]
 [  7.68874000e+05   3.07715884e-02]
 [  2.39371700e+06   2.81102782e-02]
 [  1.17043900e+06   2.30290957e-02]
 [  1.26240000e+04   2.27028933e-02]
 [  4.02803000e+05   1.76542116e-02]
 [  1.59414400e+06   1.60271582e-02]
 [  1.08147700e+06   1.51250429e-02]
 [  2.18992100e+06   1.43970941e-02]
 [  2.08851100e+06   1.42127373e-02]
 [  2.04390100e+06   1.30333771e-02]
 [  2.04364600e+06   1.25299498e-02]
 [  6.74644000e+05   1.09705826e-02]
 [  2.46152500e+06   6.87229107e-03]
 [  1.02140000e+06   6.45511841e-03]
 [  5.96168000e+05   6.08377379e-03]
 [  1.43511700e+06   5.66879168e-03]
 [  2.16616500e+06   3.67157557e-03]
 [  5.79760000e+05   3.28875049e-03]
 [  1.90378800e+06   3.05322071e-03]
 