In [60]:
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib

In [61]:
# Column names applied to the survey CSV. Because `names=` is supplied and no
# header row is requested, pandas treats every line of the file as data.
columns = ['name', 'left', 'right', 'left_desc', 'right_desc']

# Self-reported eye-colour survey for PGP participants; the literal string
# 'nan' and empty fields are both parsed as missing values.
surveyData = pd.read_csv(
    "PGP-eyecolor.csv",
    names=columns,
    na_values=['nan', ''],
)

# Identifiers of every participant that appears in the survey, as a numpy array.
surveyNames = np.array(surveyData['name'].tolist())

In [62]:
# Participant identifiers for the tiled data. Only the first eight characters
# (the huID) of each entry are kept, so they can be compared against the
# `name` column of the survey.
pgpNames = np.load("names.npy")
for idx, fullName in enumerate(pgpNames):
    pgpNames[idx] = fullName[:8]

# Tiled PGP data; later cells select rows of it using positions in pgpNames,
# so the two arrays are assumed to be row-aligned.
pgp = np.load("hiq-pgp.npy")

In [63]:
def getData(name, surveyData):
    """Look up a participant in the survey data and return their eye colours.

    Parameters
    ----------
    name : str
        Participant identifier to match against the ``'name'`` column.
    surveyData : pandas.DataFrame
        Survey table with at least ``'name'``, ``'left'`` and ``'right'``
        columns.

    Returns
    -------
    tuple or None
        ``(left, right)`` taken from the first row whose ``'name'`` equals
        ``name``, or ``None`` when no row matches.
    """
    # One vectorised comparison instead of a Python-level iterrows() scan.
    matches = (surveyData['name'] == name).values
    if not matches.any():
        return None

    # argmax on a boolean array gives the position of the first True, which
    # keeps the original "first matching row wins" behaviour. Positional
    # access (.iloc) makes this independent of the frame's index labels.
    first = int(matches.argmax())
    return (surveyData['left'].iloc[first], surveyData['right'].iloc[first])

In [64]:
# (position in pgpNames, huID) for every participant that has survey data
namePairIndices = []

# huID -> survey eye-colour value for the left / right eye, e.g. {"huID": 12}
nameLeftEyeMap = {}
nameRightEyeMap = {}

# huIDs already recorded, in the order they were first encountered
existingNames = []

# Hash-based lookups: testing membership against the numpy array and the
# growing list on every iteration made this loop quadratic in the number of
# names. existingNames is still maintained as a list for any later use.
_surveyNameSet = set(surveyNames)
_seenNames = set()

# Walk pgpNames once; keep only the first occurrence of each surveyed huID.
for i, name in enumerate(pgpNames):
    if name not in _surveyNameSet or name in _seenNames:
        continue
    _seenNames.add(name)
    existingNames.append(name)
    eyeData = getData(name, surveyData)
    namePairIndices.append((i, name))
    nameLeftEyeMap[name] = eyeData[0]
    nameRightEyeMap[name] = eyeData[1]

# Split the tiled data: rows whose participant has survey eye-colour data
# ("known") versus all remaining rows ("unknown").
nameIndices = [nameIndex[0] for nameIndex in namePairIndices]
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [65]:
# Flatten the per-name dictionaries into lists that follow the order of
# namePairIndices (and therefore the row order of knownData).
leftEyeNameList = [nameLeftEyeMap[huID] for (rowIdx, huID) in namePairIndices]
rightEyeNameList = [nameRightEyeMap[huID] for (rowIdx, huID) in namePairIndices]

# Binary target for the left eye only: values above 12 become 0 (not blue),
# everything else becomes 1 (blue). rightEyeNameList keeps its raw values.
# NOTE(review): a NaN left-eye value compares False against `> 12` and would
# therefore be labelled 1 (blue) -- confirm no missing values reach this point.
leftEyeNameList = [0 if color > 12 else 1 for color in leftEyeNameList]

In [66]:
# Load the previously saved classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load svc.pkl if it comes from a trusted source.
svc_test = joblib.load("svc.pkl")

In [None]:
# Sanity check on the dimensions of the labelled subset before prediction.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(knownData.shape)

In [67]:
# Predict a label for every row of knownData with the loaded classifier, then
# report the fraction that agrees with the survey-derived left-eye labels.
# NOTE(review): if svc.pkl was fitted on these same rows this is a training
# accuracy, not a held-out estimate -- confirm how the model was trained.
eye_pred = svc_test.predict(knownData)
accuracy_score(y_true=leftEyeNameList, y_pred=eye_pred)

0.96153846153846156

In [68]:
# Positions of the features with a non-zero weight in the first row of coef_.
nonzeroes = np.nonzero(svc_test.coef_[0])[0]

# (feature index, weight) pairs ordered from largest to smallest weight.
# sorted() is used instead of zip(...).sort(): on Python 3 zip returns an
# iterator with no .sort method. sorted() is stable, like list.sort, so the
# resulting order is unchanged.
coefs = sorted(
    zip(nonzeroes, svc_test.coef_[0][nonzeroes]),
    key=lambda pair: pair[1],
    reverse=True,
)
print(coefs)

[(1792420, 0.32372423698020208), (981082, 0.10071041931276269), (1472632, 0.10004396832934007), (1007310, 0.073566122923813926), (444129, 0.044629981320942716), (865338, 0.039916922273430237), (489736, 0.032746360432997959), (174453, 0.03031268322706512), (579768, 0.029431511532881996), (238309, 0.027389850869883967), (1660670, 0.018410151623680211), (1370239, 0.016924862588293038), (2143615, 0.016114548858249506), (1794813, 0.014777673487784682), (2086601, 0.014334325702934676), (834607, 0.012982954140380689), (1843740, 0.0093281579129839368), (2461521, 0.0084142172400514127), (279543, 0.0082597949683368543), (2256145, 0.0081954169725747348), (809281, 0.0081823669457118213), (1580503, 0.0072350212088080875), (1335343, 0.005520470020886065), (1801942, 0.0050030821177857385), (1736283, 0.0036793361727867828), (438621, 0.0019042156548594329), (890474, 0.00071180190238712691), (2088513, 3.3726492822508257e-07), (2461523, 2.4804420457025974e-07), (1903788, -0.0029959753099066161), (2166165

In [69]:
# Feature indices only, in the same weight-descending order as coefs.
posLocs = [coef[0] for coef in coefs]
print(posLocs)

# Persist the indices as a pickled numpy array for later use.
pos = np.asarray(posLocs)
pos.dump("coefs.pkl")

[1792420, 981082, 1472632, 1007310, 444129, 865338, 489736, 174453, 579768, 238309, 1660670, 1370239, 2143615, 1794813, 2086601, 834607, 1843740, 2461521, 279543, 2256145, 809281, 1580503, 1335343, 1801942, 1736283, 438621, 890474, 2088513, 2461523, 1903788, 2166165, 1435117, 596168, 579760, 1021400, 674644, 2043646, 2043901, 2189921, 2088511, 1081477, 1594144, 402803, 12624, 1170439, 2393717, 768874, 137247, 481212, 71357, 200986, 903627, 1321008, 342019, 1257816]


In [None]:
# Show the array of selected feature indices that was written to coefs.pkl.
print(pos)