In [1]:
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import pickle

In [2]:
# Column labels for the survey CSV; they are supplied to read_csv via `names=`.
columns = ['name', 'left', 'right', 'left_desc', 'right_desc']
# pgp eye color data from survey; the strings 'nan' and '' are parsed as missing values
surveyData = pd.read_csv("PGP-eyecolor.csv", names=columns, na_values=['nan', ''])
# names of the pgp participants that have provided survey eye color data
surveyNames = np.asarray(surveyData['name'].values.tolist())
# survey values for each eye, one entry per survey row
leftEyeColors = surveyData['left']
rightEyeColors = surveyData['right']

In [5]:
# Participant names stored as a numpy array.
pgpNames = np.load("names.npy")
# Tiled data stored as a numpy array.
# NOTE(review): presumably row i of pgp belongs to pgpNames[i] - confirm.
pgp = np.load("hiq-pgp.npy")
# Strip the extra information from every name by keeping only its first 8 characters.
for idx, fullName in enumerate(pgpNames):
    pgpNames[idx] = fullName[:8]

In [7]:
def getData(name, surveyData):
    """Return the ``(left, right)`` survey values recorded for ``name``.

    Parameters
    ----------
    name
        Participant identifier compared against the ``'name'`` column.
    surveyData : pandas.DataFrame
        Survey table with at least ``'name'``, ``'left'`` and ``'right'``
        columns.

    Returns
    -------
    tuple or None
        ``(left, right)`` taken from the first row whose ``'name'`` equals
        ``name``, or ``None`` when no row matches.
    """
    # A single vectorised comparison replaces a Python-level row-by-row scan.
    matches = surveyData[surveyData['name'] == name]
    if matches.empty:
        # Callers are expected to check membership first; a miss yields None.
        return None
    # Keep first-match semantics when a name appears more than once.
    firstMatch = matches.iloc[0]
    return (firstMatch['left'], firstMatch['right'])

In [8]:
# For every pgp participant that also appears in the survey, remember its
# position in pgpNames and store its left/right eye values keyed by name.
namePairIndices = []
nameLeftEyeMap = {}
nameRightEyeMap = {}
for position, participant in enumerate(pgpNames):
    if participant not in surveyNames:
        continue
    eyePair = getData(participant, surveyData)
    namePairIndices.append((position, participant))
    nameLeftEyeMap[participant], nameRightEyeMap[participant] = eyePair[0], eyePair[1]

# Split the tiled data: rows with known eye color vs. all remaining rows.
nameIndices = [position for position, _name in namePairIndices]
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [9]:
# Turn the name -> eye value dictionaries into lists that follow the order of
# namePairIndices (the same order used to build knownData).

# NOTE(review): despite its name this holds the row indices wrapped in a
# one-element list, not participant names - confirm this is intended.
knownDataNames = [nameIndices]
leftEyeNameList = [nameLeftEyeMap[name] for _idx, name in namePairIndices]
rightEyeNameList = [nameRightEyeMap[name] for _idx, name in namePairIndices]

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVM penalty parameter C: 0.001, 0.101, ... (stop 10 excluded).
# The built-in range() only accepts integers, so the float grid uses np.arange.
# NOTE(review): presumably meant as the C grid for GridSearchCV - it is not used yet.
crange = np.arange(.001, 10, .1)

# Fit a support vector classifier on the known samples using the left-eye labels.
# C is passed by keyword and set explicitly to 1.0, the library default.
clf = SVC(C=1.0)
clf.fit(knownData, leftEyeNameList)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# accuracy_score is imported here but not called in this cell.
from sklearn.metrics import accuracy_score
# Predict labels for knownData, i.e. the same samples the classifier was
# fitted on, so these are training-set predictions.
pred = clf.predict(knownData)

In [17]:
pred

array([ 13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,
        13.,  13.])