In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn.linear_model import SGDClassifier

In [2]:
# Column labels for the PGP survey export; read_csv applies them positionally.
columns = [
    'name', 'timestamp', 'id', 'blood_type',
    'height', 'weight', 'hw_comments',
    'left', 'right', 'left_desc', 'right_desc', 'eye_comments',
    'hair', 'hair_desc', 'hair_comments',
    'misc', 'handedness',
]

# Survey responses, including the self-reported eye color columns.
# The listed markers are parsed as missing values.
surveyData = pd.read_csv(
    "./eye_color_data/PGP-Survey.csv",
    names=columns,
    na_values=['nan', '', 'NaN'],
)

# Names of the PGP participants that appear in the survey, as a numpy array.
surveyNameList = surveyData['name'].values.tolist()
surveyNames = np.asarray(surveyNameList)

# Tiled PGP data: loaded from disk, cast to double, then passed through
# sklearn's preprocessing.scale.
# (unscaled alternative kept for reference: pgp = np.load("hiq-pgp"))
pgp = preprocessing.scale(
    np.load("hiq-pgp").astype('double')
)

In [3]:
# Load the numpy array of names and keep only the huID (first 8 characters).
# Built as a real list rather than a `map` object so it can be iterated more
# than once and indexed on Python 3 as well as Python 2.
pgpNames = [name[:8] for name in np.load("names")]


def isstr(val):
    """Return True when `val` is a `str` instance, False otherwise.

    Used to tell real survey answers apart from missing values, which pandas
    represents as float NaN.
    """
    return isinstance(val, str)

In [4]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])


def getData(name, surveyData, excludeHazel=False):
    """Look up a participant in the survey data and return their eye colors.

    Parameters
    ----------
    name : str
        Value to match against the 'name' column.
    surveyData : pandas.DataFrame
        Survey table with 'name', 'left', 'right', 'left_desc' and
        'right_desc' columns.
    excludeHazel : bool, optional
        When True, participants whose left or right eye description
        contains 'azel' (matches both "hazel" and "Hazel") are rejected.

    Returns
    -------
    EyeColor or None
        ``eye_color(left, right)`` taken from the first row whose name
        matches, or None when the name is absent or the participant was
        rejected by the hazel filter.
    """
    matches = surveyData[surveyData['name'] == name]
    if matches.empty:
        return None

    # Only the first matching row is used.
    row = matches.iloc[0]

    if excludeHazel:
        # Check each description independently: a missing description
        # (NaN, i.e. not a str) on one eye must not hide a hazel
        # description on the other eye.
        for desc in (row['left_desc'], row['right_desc']):
            if isinstance(desc, str) and 'azel' in desc:
                return None

    return eye_color(row['left'], row['right'])

In [5]:
# list of tuples for index and name with eye color data (idx, name)
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# dictionary of left and right eye colors with respective name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

# names already processed, in first-seen order
existingNames = []

# Set views used only for constant-time membership tests; testing against the
# numpy array / growing list directly is a linear scan for every name.
surveyNameSet = set(surveyNames)
seenNames = set()

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name not in surveyNameSet or name in seenNames:
        continue
    seenNames.add(name)
    existingNames.append(name)

    eyeData = getData(name, surveyData, excludeHazel=True)
    if eyeData is None:
        # not found in the survey, or rejected by the hazel filter
        continue
    # keep only participants with a usable (string) answer for both eyes
    if isstr(eyeData.left) and isstr(eyeData.right):
        nameEyeMap.append(namePair(i, name))
        leftEyeMap[name] = eyeData.left
        rightEyeMap[name] = eyeData.right

# Row indices into pgp (and the matching names) for participants with known eye color.
nameIndices = [pair.index for pair in nameEyeMap]
correspondingNames = [pair.name for pair in nameEyeMap]

# Split the tiled data into rows with known eye color and all remaining rows.
knownData = pgp[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [6]:
# convert dictionaries to lists, following the order of nameEyeMap
# nametuple looks like (index, name)
leftEyeNameList = [leftEyeMap[name] for _, name in nameEyeMap if isstr(leftEyeMap[name])]
rightEyeNameList = [rightEyeMap[name] for _, name in nameEyeMap if isstr(rightEyeMap[name])]

# Eye color codes strictly above this value become class 0, all others class 1.
# NOTE(review): presumably codes <= 13 are the "blue" end of the survey scale - confirm.
BLUE_CODE_MAX = 13


def blueOrNot(color):
    """Return 0 when int(color) is greater than BLUE_CODE_MAX, otherwise 1."""
    return 0 if int(color) > BLUE_CODE_MAX else 1


# Binary labels for the left eye. Built as a concrete list (not a `map`
# object) so it can be passed as `y` to scikit-learn on Python 3 too.
leftEyeNameList = [blueOrNot(color) for color in leftEyeNameList]

In [7]:
# Scratch check that the kernel is responsive; the call form prints the same
# text on Python 2 and Python 3.
print('test')

test


In [2]:
# Synthetic sanity check: random digits everywhere, with one column turned
# into the label itself, so a sparse model should single that column out.
N_SAMPLES = 500
# NOTE(review): presumably the width of the tiled PGP matrix - confirm.
N_FEATURES = 2469062
SIGNAL_COLUMN = 1792420
DATA_SEED = 0  # fixed seed so the experiment is reproducible across runs

# NOTE: 500 x 2469062 integers is roughly 10 GB with 8-byte ints.
rng = np.random.RandomState(DATA_SEED)
randomArray = rng.randint(10, size=(N_SAMPLES, N_FEATURES))

# Replace the signal column by its parity (even -> 0, odd -> 1) in one
# vectorized step, and use that same parity as the target.
randomArray[:, SIGNAL_COLUMN] %= 2
results = randomArray[:, SIGNAL_COLUMN].tolist()

In [5]:
# Hold out 20% of the synthetic rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(randomArray, results, test_size=.2, random_state=2)

# L1-penalized linear model trained with SGD; the L1 penalty is what should
# drive most coefficients to zero.
# NOTE(review): `n_iter` was replaced by `max_iter`/`tol` in later
# scikit-learn releases - switch when upgrading the environment.
model = SGDClassifier(penalty='l1',
                      class_weight=None,
                      n_iter=100,
                      verbose=True,
                      learning_rate='optimal',
                      random_state=1,
                      n_jobs=-1,
                      shuffle=True)
#solution = np.zeros(2469062)
#solution[1792420] = 100
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Formatted through %s so the line prints identically on Python 2 and 3.
print("%s %s" % ("Accuracy: ", accuracy_score(y_test, y_pred)))

-- Epoch 1
Norm: 541494.51, NNZs: 2467108, Bias: 7.169658, T: 400, Avg. loss: 104784101.061249
Total training time: 17.26 seconds.
-- Epoch 2
Norm: 672206.60, NNZs: 2466324, Bias: 0.027750, T: 800, Avg. loss: 68876559.149821
Total training time: 34.39 seconds.
-- Epoch 3
Norm: 709034.24, NNZs: 2465656, Bias: 0.024873, T: 1200, Avg. loss: 49750822.384142
Total training time: 51.16 seconds.
-- Epoch 4
Norm: 716145.36, NNZs: 2464982, Bias: 0.040422, T: 1600, Avg. loss: 37669616.921068
Total training time: 67.67 seconds.
-- Epoch 5
Norm: 716145.36, NNZs: 2464378, Bias: 0.040422, T: 2000, Avg. loss: 30135693.536855
Total training time: 84.12 seconds.
-- Epoch 6
Norm: 716145.36, NNZs: 2463832, Bias: 0.040422, T: 2400, Avg. loss: 25113077.947379
Total training time: 100.56 seconds.
-- Epoch 7
Norm: 716145.36, NNZs: 2463360, Bias: 0.040422, T: 2800, Avg. loss: 21525495.383468
Total training time: 116.99 seconds.
-- Epoch 8
Norm: 716145.36, NNZs: 2462917, Bias: 0.040422, T: 3200, Avg. loss: 188

KeyboardInterrupt: 

In [10]:
# Column indices of the features whose coefficient is non-zero.
nonzeroes = np.nonzero(model.coef_)[1]
# (feature index, |coefficient|) pairs, largest magnitude first.
# sorted() is used because zip() returns an iterator without .sort() on Python 3.
coefs = sorted(zip(nonzeroes, abs(model.coef_[0][nonzeroes])),
               key=lambda pair: pair[1],
               reverse=True)

In [11]:
# Show only the strongest coefficients; the full list has one entry per
# non-zero feature and is far too long to display.
coefs[:30]

[(1792420, 1138.2151672085795),
 (1632484, 207.51496137765454),
 (1750937, 206.2283838877471),
 (122201, 198.65025940535958),
 (1976675, 195.55344395390418),
 (1316699, 193.73935780618996),
 (668185, 191.21161752796786),
 (1664883, 189.27661113095388),
 (1517125, 188.73434141118776),
 (774699, 188.06256994610931),
 (79113, 187.73735983954538),
 (2000887, 187.47617297719378),
 (656644, 186.89316839799031),
 (532033, 186.71638481468179),
 (690416, 184.55296985872647),
 (1007890, 184.5295055384303),
 (1101627, 184.38679049593514),
 (1319461, 184.23092853963595),
 (573869, 184.15627693756505),
 (1338405, 183.83345767405982),
 (550550, 183.67190368119324),
 (408880, 183.20934187540522),
 (251346, 183.12551553150078),
 (1665352, 182.99335893547001),
 (1665356, 182.99335893547001),
 (1665362, 182.99335893547001),
 (1665386, 182.99335893547001),
 (1665388, 182.99335893547001),
 (190899, 182.81052000472863),
 (137073, 182.73348973588352),
 (1408683, 182.23850604599468),
 (2254945, 182.081727119

In [None]:
# Sweep the regularization strength `alpha` over 10 log-spaced values in
# [1e-2, 1e4] and record the 10-fold cross-validated accuracy for each.
arange = np.logspace(-2, 4, 10).tolist()
means = []
stds = []
for Aval in arange:
    # random_state fixes the shuffling so repeated runs give the same scores
    # (same seed as the synthetic-data model).
    # NOTE(review): `n_iter` was replaced by `max_iter`/`tol` in later
    # scikit-learn releases - switch when upgrading the environment.
    svc_test = SGDClassifier(penalty='l1', class_weight='balanced', alpha=Aval,
                             n_iter=1000, shuffle=True, random_state=1)
    cv_score = cross_val_score(svc_test, knownData, leftEyeNameList, cv=10, scoring='accuracy')
    mean_score = cv_score.mean()
    std_score = cv_score.std()
    means.append(mean_score)
    stds.append(std_score)
    # Single formatted string so the line prints identically on Python 2 and 3.
    print("alpha: %s mean: %s std: %s" % (Aval, mean_score, std_score))