In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn import preprocessing

In [2]:
# read names that have provided survey eye color data
columns = ['name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments', 'left', 'right', 'left_desc', 'right_desc', 'eye_comments', 'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness']

# pgp eye color data from survey
surveyData = pd.read_csv("../eye_color_data/PGP-Survey.csv", names=columns, na_values=['nan', '', 'NaN'])

# names of the pgp participants
surveyNames = np.asarray(surveyData['name'].values.tolist())

# load numpy array of tiled PGP data 
pgp = preprocessing.scale(np.load("../hiq-pgp").astype('double'))

In [3]:
# load numpy array of names and keep only the huID
pgpNames = np.load("../names")
pgpNames = map(lambda name: name[:8], pgpNames)

# simple lambda function to return if the input is a string
isstr = lambda val: isinstance(val, str)

In [4]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])

# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData, excludeHazel=False):
    for index, row in surveyData.iterrows():
        if row['name'] == name:
            if not excludeHazel:
                return eye_color(row['left'], row['right'])
            else:
                if isstr(row['left_desc']) and isstr(row['right_desc']):
                    if 'azel' in row['left_desc'] or 'azel' in row['right_desc']:
                        return None
                return eye_color(row['left'], row['right'])

In [5]:
# list of tuples for index and name with eye color data (idx, name)
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# dictionary of left and right eye colors with respective name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

existingNames = []

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name in surveyNames and name not in existingNames:
        existingNames.append(name)
        eyeData = getData(name, surveyData, excludeHazel=False)
        if eyeData == None:
            pass
        elif isstr(eyeData.left) and isstr(eyeData.right):
            nameEyeMap.append(namePair(i, name))
            leftEyeMap[name] = eyeData.left
            rightEyeMap[name] = eyeData.right

# create lists containing the known eye color names and the unknown eye colors.
nameIndices, correspondingNames = [], []
for pair in nameEyeMap:
    nameIndices.append(pair.index)
    correspondingNames.append(pair.name)
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [6]:
# convert dictionaries to lists 
leftEyeNameList = []
rightEyeNameList = []
# nametuple looks like (index, name)
for _, name in nameEyeMap:
    if isstr(leftEyeMap[name]):
        leftEyeNameList.append(leftEyeMap[name])
    if isstr(rightEyeMap[name]):
        rightEyeNameList.append(rightEyeMap[name])

blueOrNot = lambda color: 0 if int(color) > 13 else 1
leftEyeNameList = map(blueOrNot, leftEyeNameList)

In [7]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

X = sel.fit_transform(knownData)

In [8]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, leftEyeNameList, test_size=.2)

In [10]:
clf = LinearSVC(penalty='l1', class_weight='balanced', C=.06, dual=False, verbose=1, max_iter=10000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print accuracy_score(y_test, y_pred)

[LibLinear]0.9375


In [11]:
nonzeroes = np.nonzero(clf.coef_)

In [12]:
clf.coef_[nonzeroes]

array([  1.02940771e-02,   2.65810629e-03,   2.58060223e-03,
        -5.47508037e-03,  -1.47724029e-03,  -9.98973426e-03,
         2.53068572e-02,   4.96767012e-04,  -8.69678182e-04,
         6.63580997e-04,  -1.35586871e-02,  -2.81555514e-02,
         2.57001833e-02,  -3.37442287e-03,   3.28227586e-02,
         9.44019917e-02,   8.63024503e-02,  -6.13392775e-03,
        -1.34358938e-02,  -4.20239981e-02,   5.95976652e-01,
        -7.47853011e-03,  -2.49183840e-03,   4.42441341e-02,
        -6.71084759e-03,  -3.18799326e-02,   1.07359853e-02,
         5.62233142e-02,  -1.16960377e-02,   3.57927648e-02,
         1.87705618e-03,  -2.75752396e-02,  -2.99359322e-02,
         3.96274934e-02])

In [13]:
X.dump('inputs.npy')

In [14]:
oneHot = lambda item: [1, 0] if item == 0 else [0, 1]

In [15]:
leftEyeNameList = np.asarray(map(oneHot, leftEyeNameList))

In [16]:
leftEyeNameList.dump('eyes.npy')

In [17]:
leftEyeNameList.shape

(78, 2)

In [18]:
X.shape

(78, 1198777)