In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn import preprocessing

In [5]:
# read names that have provided survey eye color data
columns = ['name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments', 'left', 'right', 'left_desc', 'right_desc', 'eye_comments', 'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness']

# pgp eye color data from survey
surveyData = pd.read_csv("../eye_color_data/PGP-Survey.csv", names=columns, na_values=['nan', '', 'NaN'])

# names of the pgp participants
surveyNames = np.asarray(surveyData['name'].values.tolist())

# load numpy array of tiled PGP data 
#pgp = preprocessing.scale(np.load("../hiq-pgp").astype('double'))
pgp = np.load('../hiq-pgp')

In [6]:
# load numpy array of names and keep only the huID
pgpNames = np.load("../names")
pgpNames = map(lambda name: name[:8], pgpNames)

# simple lambda function to return if the input is a string
isstr = lambda val: isinstance(val, str)

In [7]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])

# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData, excludeHazel=False):
    for index, row in surveyData.iterrows():
        if row['name'] == name:
            if not excludeHazel:
                return eye_color(row['left'], row['right'])
            else:
                if isstr(row['left_desc']) and isstr(row['right_desc']):
                    if 'azel' in row['left_desc'] or 'azel' in row['right_desc']:
                        return None
                return eye_color(row['left'], row['right'])

In [8]:
# list of tuples for index and name with eye color data (idx, name)
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# dictionary of left and right eye colors with respective name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

existingNames = []

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name in surveyNames and name not in existingNames:
        existingNames.append(name)
        eyeData = getData(name, surveyData, excludeHazel=False)
        if eyeData == None:
            pass
        elif isstr(eyeData.left) and isstr(eyeData.right):
            nameEyeMap.append(namePair(i, name))
            leftEyeMap[name] = eyeData.left
            rightEyeMap[name] = eyeData.right

# create lists containing the known eye color names and the unknown eye colors.
nameIndices, correspondingNames = [], []
for pair in nameEyeMap:
    nameIndices.append(pair.index)
    correspondingNames.append(pair.name)
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [9]:
# convert dictionaries to lists 
leftEyeNameList = []
rightEyeNameList = []
# nametuple looks like (index, name)
for _, name in nameEyeMap:
    if isstr(leftEyeMap[name]):
        leftEyeNameList.append(leftEyeMap[name])
    if isstr(rightEyeMap[name]):
        rightEyeNameList.append(rightEyeMap[name])

blueOrNot = lambda color: 0 if int(color) > 13 else 1
leftEyeNameList = map(blueOrNot, leftEyeNameList)

In [7]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

X = sel.fit_transform(knownData)

In [10]:
X = knownData

In [11]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, leftEyeNameList, test_size=.2)

In [13]:
clf = LinearSVC(penalty='l1', class_weight='balanced', C=.06, dual=False, verbose=1, max_iter=10000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print accuracy_score(y_test, y_pred)

[LibLinear]0.625


In [14]:
nonzeroes = np.nonzero(clf.coef_)

In [16]:
nonzeroes

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]),
 array([ 101347,  168609,  178117,  202426,  347649,  367531,  407409,
         519909,  644769,  694437,  843591,  849755,  871416,  871417,
         872655,  874407,  975029,  999219, 1046955, 1083805, 1089423,
        1089713, 1090083, 1091013, 1097077, 1207239, 1214713, 1218339,
        1219022, 1307183, 1411609, 1518505, 1548016, 1548017, 1562605,
        1680291, 1706439, 1709485, 1709936, 1866329, 1866609, 1872371,
        1872439, 1949505, 2077541, 2144473, 2179025, 2200081, 2459513]))

In [15]:
clf.coef_[nonzeroes]

array([ 0.00719407, -0.00975006,  0.03518786,  0.00747615,  0.01553719,
        0.00205468, -0.00651249, -0.04742712,  0.00889458, -0.00703326,
        0.01275795,  0.01571272, -0.0153985 , -0.01441888,  0.00371866,
        0.01445346, -0.00173451,  0.01315061,  0.03113534, -0.00446064,
       -0.01304336, -0.00383793, -0.00329613, -0.02512534, -0.01195998,
       -0.01411035, -0.02666117,  0.0124683 ,  0.02372019, -0.00888826,
       -0.01843789,  0.04209346, -0.06357569, -0.01897876,  0.02190467,
       -0.0164766 ,  0.03439163, -0.00580005, -0.05477452, -0.00115066,
        0.0138529 ,  0.02009274,  0.06891974,  0.00675809,  0.01106549,
       -0.00010362,  0.00309612,  0.00140014,  0.00031186])

In [17]:
X.dump('inputs.npy')

In [18]:
oneHot = lambda item: [1, 0] if item == 0 else [0, 1]

In [19]:
leftEyeNameList = np.asarray(map(oneHot, leftEyeNameList))

In [20]:
leftEyeNameList.dump('eyes.npy')

In [21]:
leftEyeNameList.shape

(78, 2)

In [22]:
X.shape

(78, 2469062)