In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections

In [14]:
def get_shape_partial(filename):
    with open(filename, 'rb') as fhandle:
        major, minor = np.lib.format.read_magic(fhandle)
        shape, _, _ = np.lib.format.read_array_header_1_0(fhandle)
        return shape

In [15]:
def read_npy_chunk(filename, start_row, num_rows): # from https://gist.github.com/dwf/1766222
    assert start_row >= 0 and num_rows > 0
    with open(filename, 'rb') as fhandle:
        major, minor = np.lib.format.read_magic(fhandle)
        shape, fortran, dtype = np.lib.format.read_array_header_1_0(fhandle)
        assert not fortran, "Fortran order arrays not supported"
        # Make sure the offsets aren't invalid.
        assert start_row < shape[0], (
            'start_row is beyond end of file'
        )
        assert start_row + num_rows <= shape[0], (
            'start_row + num_rows > shape[0]'
        )
        # Get the number of elements in one 'row' by taking
        # a product over all other dimensions.
        row_size = np.prod(shape[1:])
        start_byte = start_row * row_size * dtype.itemsize
        fhandle.seek(start_byte, 1)
        n_items = row_size * num_rows
        flat = np.fromfile(fhandle, count=n_items, dtype=dtype)
        return flat.reshape((-1,) + shape[1:])

In [16]:
# read names that have provided survey eye color data
columns = ['name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments', 'left', 'right', 'left_desc', 'right_desc', 'eye_comments', 'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness']

# pgp eye color data from survey
surveyData = pd.read_csv("./eye_color_data/PGP-Survey.csv", names=columns, na_values=['nan', '', 'NaN'])

# names of the pgp participants
surveyNames = np.asarray(surveyData['name'].values.tolist())

In [17]:
# load numpy array of names and keep only the huID
pgpNames = np.load("names")
pgpNames = map(lambda name: name[:8], pgpNames)

# simple lambda function to return if the input is a string
isstr = lambda val: isinstance(val, str)

In [18]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])

# lookup a name in the survey data and return a tuple of the eye colors
def getData(name, surveyData, excludeHazel=False):
    for index, row in surveyData.iterrows():
        if row['name'] == name:
            if not excludeHazel:
                return eye_color(row['left'], row['right'])
            else:
                if isstr(row['left_desc']) and isstr(row['right_desc']):
                    if 'azel' in row['left_desc'] or 'azel' in row['right_desc']:
                        return None
                return eye_color(row['left'], row['right'])

In [19]:
# list of tuples for index and name with eye color data (idx, name)
nameEyeMap = []
namePair = collections.namedtuple("NamePair", ['index', 'name'])

# dictionary of left and right eye colors with respective name, i.e., {"huID": 12}
leftEyeMap = {}
rightEyeMap = {}

existingNames = []

# loop through pgpNames and add eye color to maps, making sure not to add the same name twice
for i, name in enumerate(pgpNames):
    if name in surveyNames and name not in existingNames:
        existingNames.append(name)
        eyeData = getData(name, surveyData, excludeHazel=True)
        if eyeData == None:
            pass
        elif isstr(eyeData.left) and isstr(eyeData.right):
            nameEyeMap.append(namePair(i, name))
            leftEyeMap[name] = eyeData.left
            rightEyeMap[name] = eyeData.right

# create lists containing the known eye color names and the unknown eye colors.
nameIndices, correspondingNames = [], []
for pair in nameEyeMap:
    nameIndices.append(pair.index)
    correspondingNames.append(pair.name)

In [20]:
# convert dictionaries to lists 
leftEyeNameList = []
rightEyeNameList = []
# nametuple looks like (index, name)
for _, name in nameEyeMap:
    if isstr(leftEyeMap[name]):
        leftEyeNameList.append(leftEyeMap[name])
    if isstr(rightEyeMap[name]):
        rightEyeNameList.append(rightEyeMap[name])

blueOrNot = lambda color: 0 if int(color) > 13 else 1
leftEyeNameList = np.asarray(map(blueOrNot, leftEyeNameList))

In [21]:
model = SGDClassifier(loss='hinge', penalty='l1')

In [22]:
total_length = get_shape_partial('hiq-pgp')[0] # total length of array
iter_size = 40
iter_loc = 0

while iter_loc < total_length:
    if (iter_loc + iter_size > total_length):
        iter_size = total_length - iter_loc
        
    chunk = read_npy_chunk('hiq-pgp', iter_loc, iter_size)
    validNameIndices = []
    
    for index in nameIndices:
        if index >= iter_loc and index < iter_loc + iter_size:
            validNameIndices.append(index)
    
    validNameIndices = np.asarray(validNameIndices) - iter_loc
    print validNameIndices
    knownData = chunk[validNameIndices]
    
    # find range from the iterlocation divided by the size to the iter location plus the shape
    knownResults = leftEyeNameList[range(iter_loc / iter_size, iter_loc / iter_size + knownData.shape[0])]
    #print knownResults.shape
    model.partial_fit(knownData, knownResults, classes=[0, 1])

    print "Finished iteration: ", iter_loc
    iter_loc += iter_size

[ 2  4  7 13 16 21 24 26 33 36 38]
Finished iteration:  0
[ 2  4 15 16 18 23 24 29 34 35]
Finished iteration:  40
[ 1  4  5  9 10 15 19 23 27 30 35 36]
Finished iteration:  80
[ 5 15 20 21 23 26 27 34 37 38]
Finished iteration:  120
[ 0  8 12 13 14 18 26 29 31 32 33 38]
Finished iteration:  160
[ 1  2  4  6  9 10]
Finished iteration:  200


In [23]:
newchunk = read_npy_chunk('hiq-pgp', 193, 18)

validNameIndices = []
for index in nameIndices:
    if index >= 193 and index < 218:
        validNameIndices.append(index)
    
validNameIndices = np.asarray(validNameIndices) - 193

knownData = newchunk[validNameIndices]
print knownData.shape
knownResults = leftEyeNameList[53:]
print knownResults.shape
   # model.partial_fit(knownData, knownResults, classes=[0, 1])

(8, 2469062)
(8,)


In [24]:
model.predict(knownData.astype('double'))

array([0, 0, 1, 1, 1, 0, 0, 1])

In [25]:
knownResults

array([1, 1, 0, 0, 0, 0, 0, 1])

In [26]:
nonzeroes = np.nonzero(model.coef_)[1]

In [27]:
len(nonzeroes)

1008819