In [21]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn.linear_model import SGDClassifier

In [22]:
# read names that have provided survey eye color data
columns = ['name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments', 'left', 'right', 'left_desc', 'right_desc', 'eye_comments', 'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness']

# pgp eye color data from survey
surveyData = pd.read_csv("./eye_color_data/PGP-Survey.csv", names=columns, na_values=['nan', '', 'NaN'])

# names of the pgp participants
surveyNames = np.asarray(surveyData['name'].values.tolist())

# load numpy array of tiled PGP data 
pgp = preprocessing.scale(np.load("hiq-pgp").astype('double'))
pgp_unscaled = np.load("hiq-pgp")

In [23]:
# load numpy array of names and keep only the huID
pgpNames = np.load("names")
pgpNames = map(lambda name: name[:8], pgpNames)

# simple lambda function to return if the input is a string
isstr = lambda val: isinstance(val, str)

In [24]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])

# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData, excludeHazel=False):
    for index, row in surveyData.iterrows():
        if row['name'] == name:
            if not excludeHazel:
                return eye_color(row['left'], row['right'])
            else:
                if isstr(row['left_desc']) and isstr(row['right_desc']):
                    if 'azel' in row['left_desc'] or 'azel' in row['right_desc']:
                        return None
                return eye_color(row['left'], row['right'])

In [25]:
# list of tuples for index and name with eye color data (idx, name)
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# dictionary of left and right eye colors with respective name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

existingNames = []

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name in surveyNames and name not in existingNames:
        existingNames.append(name)
        eyeData = getData(name, surveyData, excludeHazel=True)
        if eyeData == None:
            pass
        elif isstr(eyeData.left) and isstr(eyeData.right):
            nameEyeMap.append(namePair(i, name))
            leftEyeMap[name] = eyeData.left
            rightEyeMap[name] = eyeData.right

# create lists containing the known eye color names and the unknown eye colors.
nameIndices, correspondingNames = [], []
for pair in nameEyeMap:
    nameIndices.append(pair.index)
    correspondingNames.append(pair.name)
knownData = pgp[nameIndices]
knownData_unscaled = pgp_unscaled[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [26]:
# convert dictionaries to lists 
leftEyeNameList = []
rightEyeNameList = []
# nametuple looks like (index, name)
for _, name in nameEyeMap:
    if isstr(leftEyeMap[name]):
        leftEyeNameList.append(leftEyeMap[name])
    if isstr(rightEyeMap[name]):
        rightEyeNameList.append(rightEyeMap[name])

blueOrNot = lambda color: 0 if int(color) > 13 else 1
leftEyeNameList = map(blueOrNot, leftEyeNameList)

In [41]:
print knownData.shape

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
knownData = sel.fit_transform(knownData)
print "after", knownData.shape

(78, 2469062)
after (78, 1198777)


In [27]:
X_train, X_test, y_train, y_test = train_test_split(knownData, leftEyeNameList, test_size=.2, random_state=2)


In [42]:
svc = LinearSVC(penalty='l1', class_weight='balanced', C=.06, max_iter=1000, verbose=True, dual=False)
svc.fit(X_train, y_train)
print accuracy_score(svc.predict(X_test), y_test)

[LibLinear]1.0


In [51]:
from sklearn.svm import LinearSVC

model = SGDClassifier(penalty='l1', 
                      class_weight='balanced',
                      n_iter=1, 
                      alpha=0.273,
                      verbose=True, 
                      loss='hinge',
                      warm_start=False,
                      random_state=2,
                      n_jobs=-1,
                      shuffle=True)
                      
model.fit(X_train, y_train, coef_init=svc.coef_, intercept_init=svc.intercept_)
y_pred = model.predict(X_test)
print "Accuracy: ", accuracy_score(y_test, y_pred)

-- Epoch 1
Norm: 51897.22, NNZs: 1517913, Bias: -155.609733, T: 48, Avg. loss: 104682.394010
Total training time: 1.73 seconds.
Accuracy:  0.461538461538


In [None]:
randomArray = np.random.randint(10, size=(500, 2469062))
results = []

for index, arr in enumerate(randomArray):
    if arr[1792420] % 2 == 0:
        randomArray[index][1792420] = 0
        results.append(0)
    else:
        randomArray[index][1792420] = 1
        results.append(1)

In [29]:
print cv_scores.mean(), cv_scores.std()

0.54380952381 0.0860785908895


In [29]:
nonzeroes = np.nonzero(model.coef_)[1]
coefs = zip(nonzeroes, abs(model.coef_[0][nonzeroes]))
coefs.sort(key = lambda x: x[1], reverse=True)

In [30]:
coefs

[(2386364, 7.441622731249633),
 (2337764, 7.3935125486476787),
 (2337765, 7.3935125486476787),
 (1937403, 7.124263254730705),
 (946501, 6.8653090147339304),
 (621409, 6.6819260362721611),
 (1776401, 6.6795449024968736),
 (603153, 6.6504918319263728),
 (170718, 6.6230700349812075),
 (170724, 6.6230700349812075),
 (923854, 6.6230700349812075),
 (1069426, 6.6230700349812075),
 (1177137, 6.5325547301549474),
 (362283, 6.428266974002212),
 (835769, 6.3188942811264583),
 (1403333, 6.2795478513664564),
 (1822747, 6.2525774976035251),
 (22919, 6.1895035575049855),
 (30489, 6.1895035575049855),
 (37541, 6.1895035575049855),
 (66113, 6.1895035575049855),
 (68271, 6.1895035575049855),
 (68721, 6.1895035575049855),
 (72843, 6.1895035575049855),
 (79373, 6.1895035575049855),
 (85001, 6.1895035575049855),
 (119811, 6.1895035575049855),
 (127437, 6.1895035575049855),
 (199155, 6.1895035575049855),
 (205427, 6.1895035575049855),
 (225079, 6.1895035575049855),
 (284501, 6.1895035575049855),
 (298035, 6

In [None]:
arange = np.logspace(-2, 4, 10).tolist()
means = []
stds = []
for Aval in arange:
    svc_test = SGDClassifier(penalty='l1', class_weight='balanced', alpha=Aval, n_iter=1000, shuffle=True)
    cv_score = cross_val_score(svc_test, knownData, leftEyeNameList, cv=10, scoring='accuracy')
    means.append(cv_score.mean())
    stds.append(cv_score.std())
    print "alpha:", Aval, "mean:", cv_score.mean(), "std:", cv_score.std()