In [1]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Column labels applied to the eye color survey CSV (handed to read_csv via `names=`).
columns = ['name', 'left', 'right', 'left_desc', 'right_desc']

# Load the PGP eye color survey; the strings 'nan' and '' are parsed as missing values.
surveyData = pd.read_csv(
    "./eye_color_data/PGP-eyecolor.csv",
    names=columns,
    na_values=['nan', ''],
)

# Names of the PGP participants that appear in the survey, as a numpy array.
surveyNames = np.asarray(surveyData.loc[:, 'name'].values.tolist())

In [3]:
# Tiled PGP data matrix; later cells index its rows by position in pgpNames.
pgp = np.load("hiq-pgp.npy")

# Participant names; each entry is cut down in place to its first 8 characters (the huID).
pgpNames = np.load("names.npy")
for idx, fullName in enumerate(pgpNames):
    pgpNames[idx] = fullName[:8]

In [4]:
# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData):
    """Return the (left, right) eye color values recorded for `name`.

    Parameters
    ----------
    name : str
        Participant identifier to look up in the 'name' column.
    surveyData : pandas.DataFrame
        Survey table with at least the columns 'name', 'left' and 'right'.

    Returns
    -------
    tuple or None
        (left, right) taken from the first row whose 'name' equals `name`,
        or None when no row matches (including an empty table).
    """
    # One vectorised comparison instead of a Python-level iterrows() scan.
    matchMask = (surveyData['name'] == name).values
    if not matchMask.any():
        return None
    # argmax on a boolean array gives the position of the first True,
    # which preserves the original "first matching row wins" behaviour.
    firstPos = int(matchMask.argmax())
    # Positional access so the lookup does not depend on the frame's index labels.
    return (surveyData['left'].iloc[firstPos], surveyData['right'].iloc[firstPos])

In [5]:
# list of (row index into pgp, huID) tuples for participants that have survey eye color data
namePairIndices = []

# dictionaries mapping huID -> left / right eye color value from the survey, i.e., {"huID": 12}
nameLeftEyeMap = {}
nameRightEyeMap = {}

# huIDs already matched, in order of first appearance
existingNames = []

# Hash-based membership: `in` on a numpy array or a list rescans every element on each
# test, which made the loop below quadratic for no benefit.
surveyNameSet = set(surveyNames.tolist())
seenNames = set()

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name not in surveyNameSet or name in seenNames:
        continue
    seenNames.add(name)
    existingNames.append(name)
    eyeData = getData(name, surveyData)
    namePairIndices.append((i, name))
    nameLeftEyeMap[name] = eyeData[0]
    nameRightEyeMap[name] = eyeData[1]

# Split pgp rows into those with known eye color and everything else.
# NOTE(review): nameIndices holds only the FIRST row of each surveyed huID, so any later
# duplicate row of a surveyed participant stays in unknownData - confirm this is intended.
nameIndices = [pair[0] for pair in namePairIndices]
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [6]:
# Restore a previously saved classifier from disk (presumably the LinearSVC imported above - confirm).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load svc.pkl
# when it comes from a trusted source.
clf = joblib.load("svc.pkl")

In [7]:
# Cast to double precision and standardise with sklearn's preprocessing.scale.
# The centering/scaling statistics are computed from unknownData itself.
# NOTE(review): the preprocessing used when clf was trained is not visible here - confirm
# it matches, otherwise the features fed to clf.predict are on a different scale.
unknownData = preprocessing.scale(unknownData.astype('double'))
# Bare last expression so the notebook displays the (abbreviated) scaled array.
unknownData

array([[  8.30662386,  11.78982612,   0.        , ...,   0.        ,
          0.        ,  -0.08481889],
       [ -0.12038585,  -0.08481889,   0.        , ...,   0.        ,
          0.        ,  -0.08481889],
       [ -0.12038585,  -0.08481889,   0.        , ...,   0.        ,
          0.        ,  -0.08481889],
       ..., 
       [ -0.12038585,  -0.08481889,   0.        , ...,   0.        ,
          0.        ,  -0.08481889],
       [ -0.12038585,  -0.08481889,   0.        , ...,   0.        ,
          0.        ,  -0.08481889],
       [ -0.12038585,  -0.08481889,   0.        , ...,   0.        ,
          0.        ,  -0.08481889]])

In [11]:
# Predict a label for every row of unknownData with the loaded classifier.
vals = clf.predict(unknownData)
# Parenthesised single-argument print: identical output on Python 2, and valid on Python 3
# (the bare `print vals` statement is a SyntaxError there).
print(vals)

[0 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0
 1 0 0 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1
 1 1 1 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0
 1 0 0 0 0 1 1 1 0 0 1 0 1 1 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1]


In [9]:
import collections
# Tally how many rows were assigned to each predicted label.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(collections.Counter(vals))

Counter({0: 74, 1: 66})


In [19]:
# Print the scaled value of a single feature column for every row of unknownData,
# one value per line.
# NOTE(review): why column 1792420 was chosen is not documented in this notebook - confirm.
featureIndex = 1792420
for data in unknownData:
    # Parenthesised single-argument print works on both Python 2 and Python 3.
    print(data[featureIndex])

-0.816639532516
-0.816639532516
2.49726001972
-0.816639532516
0.840310243604
0.840310243604
-0.816639532516
-0.816639532516
-0.816639532516
0.840310243604
-0.816639532516
-0.816639532516
0.840310243604
0.840310243604
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
0.840310243604
0.840310243604
0.840310243604
-0.816639532516
-0.816639532516
-0.816639532516
0.840310243604
-0.816639532516
0.840310243604
-0.816639532516
0.840310243604
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
0.840310243604
-0.816639532516
-0.816639532516
-0.816639532516
-0.816639532516
0.840310243604
0.840310243604
-0.816639532516
0.840310243604
0.840310243604
0.840310243604
-0.816639532516
0.840310243604
-0.816639532516
0.840310243604
-0.816639532516
0.840310243604
0.840310243604
-0.816639532516
0.840310243604
-0.816639532516
5.81115957196
-0.816639532516
0.840310243604
0.840310243604
-0.816639532516
0.840310243604
-0.81