In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
import sklearn

In [2]:
# Column labels applied to the PGP survey export (passed to read_csv as `names`).
columns = ['name', 'timestamp', 'id', 'blood_type', 'height', 'weight', 'hw_comments', 'left', 'right', 'left_desc', 'right_desc', 'eye_comments', 'hair', 'hair_desc', 'hair_comments', 'misc', 'handedness']

# PGP eye color data from the survey; empty cells and 'nan'/'NaN' are read as missing.
surveyData = pd.read_csv("../eye_color_data/PGP-Survey.csv", names=columns, na_values=['nan', '', 'NaN'])

# Names of the PGP participants that appear in the survey.
surveyNames = np.asarray(surveyData['name'].values.tolist())

# Load the tiled PGP matrix from disk once and keep the raw values around.
pgp_unscaled = np.load("../hiq-pgp")

# Standardised copy used for modelling. astype() returns a new array, so
# pgp_unscaled is left untouched by the scaling step.
pgp = preprocessing.scale(pgp_unscaled.astype('double'))

In [3]:
# Read the stored sample names and truncate each one to its first
# 8 characters (the huID).
pgpNames = [fullName[:8] for fullName in np.load("../names")]


def isstr(val):
    """Return True when `val` is an instance of `str`."""
    return isinstance(val, str)

In [4]:
eye_color = collections.namedtuple("EyeColor", ['left', 'right'])


def _mentions_hazel(desc):
    """Return True when `desc` is a string containing 'azel' ('Hazel' or 'hazel')."""
    return isinstance(desc, str) and 'azel' in desc


def getData(name, surveyData, excludeHazel=False):
    """Look up a participant in the survey data and return their eye colors.

    Parameters
    ----------
    name : str
        Participant identifier, compared against the 'name' column.
    surveyData : pandas.DataFrame
        Survey table with 'name', 'left', 'right', 'left_desc' and
        'right_desc' columns.
    excludeHazel : bool, optional
        When True, a participant whose left OR right free-text description
        mentions hazel is rejected.

    Returns
    -------
    EyeColor or None
        `(left, right)` taken from the first row whose name matches. None
        when no row matches, or when the participant is rejected by
        `excludeHazel`.
    """
    matches = surveyData.loc[surveyData['name'] == name]
    if matches.empty:
        return None

    # Only the first matching row is used when a name occurs more than once.
    row = matches.iloc[0]

    # Test each description on its own: a missing (non-string) description on
    # one eye must not hide a hazel description on the other eye.
    if excludeHazel and (_mentions_hazel(row['left_desc']) or _mentions_hazel(row['right_desc'])):
        return None

    return eye_color(row['left'], row['right'])

In [5]:
# (index, name) record for every participant with usable eye color data;
# `index` is the participant's position in pgpNames.
namePair = collections.namedtuple("NamePair", ['index', 'name'])
nameEyeMap = []

# per-eye lookup from name to the surveyed eye color value, i.e., {"huID": 12}
leftEyeMap, rightEyeMap = {}, {}

# names that have already been looked up, so repeats in pgpNames are skipped
existingNames = []

for rowIdx, huID in enumerate(pgpNames):
    # only names present in the survey, and only the first time they appear
    if huID not in surveyNames or huID in existingNames:
        continue
    existingNames.append(huID)

    colors = getData(huID, surveyData, excludeHazel=True)
    if colors is None:
        continue

    # keep the participant only when both eyes carry a string value
    if isstr(colors.left) and isstr(colors.right):
        nameEyeMap.append(namePair(rowIdx, huID))
        leftEyeMap[huID] = colors.left
        rightEyeMap[huID] = colors.right

# split the genome matrix into rows with known eye colors and the remainder
nameIndices = [pair.index for pair in nameEyeMap]
correspondingNames = [pair.name for pair in nameEyeMap]
knownData = pgp[nameIndices]
knownData_unscaled = pgp_unscaled[nameIndices]
unknownData = np.delete(pgp, nameIndices, axis=0)

In [6]:
# Flatten the per-name dictionaries into lists that follow nameEyeMap order,
# keeping only string-valued entries.
leftEyeNameList = [leftEyeMap[pair.name] for pair in nameEyeMap
                   if isstr(leftEyeMap[pair.name])]
rightEyeNameList = [rightEyeMap[pair.name] for pair in nameEyeMap
                    if isstr(rightEyeMap[pair.name])]


def blueOrNot(color):
    """Binary label: 1 when int(color) is at most 13, otherwise 0.

    NOTE(review): presumably codes up to 13 are the blue end of the survey's
    color scale - confirm against the survey key.
    """
    return 1 if int(color) <= 13 else 0


# left-eye colors become the 0/1 classification target
leftEyeNameList = [blueOrNot(color) for color in leftEyeNameList]

In [7]:
# shape of the labelled matrix before any feature selection
print(knownData.shape)

from sklearn.feature_selection import VarianceThreshold
# variance-threshold feature selection is currently disabled, so the shape
# printed below is the same as the one printed above
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
#knownData = sel.fit_transform(knownData)
print("after %s" % (knownData.shape,))

(61, 2469062)
after (61, 2469062)


In [29]:
# Hold out 20% of the labelled samples for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    knownData,
    leftEyeNameList,
    test_size=.2,
    random_state=2,
)


In [30]:
# L1-penalised linear SVM with balanced class weights, fitted on the training split.
svc = LinearSVC(penalty='l1', class_weight='balanced', C=.06, max_iter=1000, verbose=True, dual=False)
svc.fit(X_train, y_train)

# Held-out accuracy. accuracy_score takes (y_true, y_pred); the value is the
# same in either order, but the documented order keeps the call unambiguous.
print(accuracy_score(y_test, svc.predict(X_test)))

[LibLinear]1.0


In [8]:
# Hinge-loss linear model with an L1 penalty, trained by SGD on ALL labelled
# samples (no held-out split in this cell).
svc_test = SGDClassifier(
    loss='hinge',
    penalty='l1',
    alpha=.25,
    class_weight='balanced',
    n_iter=100,
    verbose=1,
)
svc_test.fit(knownData, leftEyeNameList)

# Accuracy measured on the very data the model was fitted on.
score = svc_test.score(knownData, leftEyeNameList)
print(score)

-- Epoch 1
Norm: 2302.45, NNZs: 74857, Bias: -5.857419, T: 61, Avg. loss: 3371.462726
Total training time: 1.76 seconds.
-- Epoch 2
Norm: 2310.35, NNZs: 48611, Bias: -6.347053, T: 122, Avg. loss: 1853.982797
Total training time: 2.98 seconds.
-- Epoch 3
Norm: 2312.58, NNZs: 39181, Bias: -6.616030, T: 183, Avg. loss: 1291.718454
Total training time: 4.19 seconds.
-- Epoch 4
Norm: 2313.84, NNZs: 33870, Bias: -6.814588, T: 244, Avg. loss: 993.813642
Total training time: 5.40 seconds.
-- Epoch 5
Norm: 2315.05, NNZs: 30028, Bias: -7.046356, T: 305, Avg. loss: 808.566679
Total training time: 6.75 seconds.
-- Epoch 6
Norm: 2315.75, NNZs: 27862, Bias: -7.216432, T: 366, Avg. loss: 680.795708
Total training time: 8.07 seconds.
-- Epoch 7
Norm: 2316.24, NNZs: 25994, Bias: -7.351297, T: 427, Avg. loss: 587.708350
Total training time: 9.27 seconds.
-- Epoch 8
Norm: 2316.54, NNZs: 24607, Bias: -7.445773, T: 488, Avg. loss: 516.479431
Total training time: 10.41 seconds.
-- Epoch 9
Norm: 2316.84, NNZ

Norm: 2319.05, NNZs: 10737, Bias: -9.566613, T: 4148, Avg. loss: 64.407878
Total training time: 85.18 seconds.
-- Epoch 69
Norm: 2319.05, NNZs: 10709, Bias: -9.579415, T: 4209, Avg. loss: 63.491949
Total training time: 86.37 seconds.
-- Epoch 70
Norm: 2319.06, NNZs: 10670, Bias: -9.596629, T: 4270, Avg. loss: 62.608357
Total training time: 87.67 seconds.
-- Epoch 71
Norm: 2319.06, NNZs: 10629, Bias: -9.609077, T: 4331, Avg. loss: 61.742014
Total training time: 88.86 seconds.
-- Epoch 72
Norm: 2319.07, NNZs: 10606, Bias: -9.624704, T: 4392, Avg. loss: 60.906174
Total training time: 90.13 seconds.
-- Epoch 73
Norm: 2319.07, NNZs: 10572, Bias: -9.639023, T: 4453, Avg. loss: 60.085863
Total training time: 91.37 seconds.
-- Epoch 74
Norm: 2319.08, NNZs: 10541, Bias: -9.652067, T: 4514, Avg. loss: 59.288242
Total training time: 92.59 seconds.
-- Epoch 75
Norm: 2319.08, NNZs: 10509, Bias: -9.668119, T: 4575, Avg. loss: 58.515807
Total training time: 93.89 seconds.
-- Epoch 76
Norm: 2319.09, N

In [25]:
# indices of the features whose coefficient in the fitted model is non-zero
nonzeroes = np.flatnonzero(svc_test.coef_[0])

In [31]:
# list every selected feature index, one per line
for featureIdx in nonzeroes:
    print(featureIdx)

829
1005
1133
1383
1532
1567
1797
1991
2605
2675
2890
2940
2996
3009
3104
3105
3134
3319
3413
3419
3453
3833
4121
4331
5819
6429
6529
7149
7299
7790
8407
8557
8582
8973
8980
9014
9026
9033
9477
9563
9595
9597
9802
9889
10408
10530
10647
11344
11639
12000
12338
13139
13331
13825
14187
14311
14657
15094
15453
15765
16152
16164
16165
16559
16819
17707
17825
17851
17867
17970
18867
19085
19189
20727
21734
21744
21746
21750
21760
22160
22164
22166
22172
22190
22196
22198
22200
22202
22204
22228
22232
22236
22254
22262
22390
22584
22593
22631
23697
25204
25250
25270
25290
25847
26217
26791
27110
27286
27302
27310
27322
27334
27348
27401
27711
27767
28121
28190
28263
28503
29151
29336
29381
29501
29523
29735
30383
30857
31001
31013
31105
31120
31155
31467
31485
32761
33641
33855
34441
34545
37865
38940
39065
39655
39685
39957
40321
40429
40531
41227
41405
41466
42113
42257
42445
44046
44827
45815
46319
46321
46417
47003
48317
48951
49163
50137
50245
50609
50730
50746
50750
50760
51057
51369
5

727181
728433
729386
729422
729460
729471
729567
729678
729777
729820
729866
730045
730061
730165
730335
730461
731073
731153
732129
732164
732412
732413
732415
732439
733009
733500
733504
733532
733564
733569
733576
733646
733865
734083
734093
734138
734144
734913
734921
735311
735685
735687
736278
736282
736286
736288
736649
736749
736759
736761
736766
736768
737060
737063
737121
737161
737539
737957
738085
738925
739138
739555
739997
740459
740465
740955
741015
741121
741208
741333
741779
741839
741864
742063
742135
742179
742245
743481
743787
743805
743829
744079
745089
746325
747039
748227
748285
748291
748403
748977
749147
750103
750173
750322
750328
750861
750953
751075
751137
751193
751719
751839
752242
752961
753575
753589
753907
753931
754265
754483
754655
754686
754718
754722
754728
754738
754780
754784
754802
754847
755562
756083
756201
756587
756862
757047
757491
757692
757728
757732
757734
757738
757740
757790
757798
757809
757863
758045
758073
758081
758355
758621
758651

1107855
1108333
1108521
1108531
1108549
1109085
1109255
1109595
1110125
1111103
1111145
1111435
1111473
1111481
1111695
1111755
1112793
1113011
1113141
1113214
1113395
1113601
1113651
1113709
1114430
1114432
1114451
1114461
1114471
1114473
1114517
1114713
1115889
1116413
1117079
1117129
1117329
1117688
1117770
1117801
1117803
1117807
1117815
1117817
1117823
1117837
1117958
1117962
1117964
1118137
1118223
1118355
1118403
1118497
1118544
1118567
1118735
1118973
1120142
1120248
1120260
1120276
1120841
1120899
1120985
1121200
1121742
1121874
1122034
1122128
1122144
1122147
1122468
1123201
1123388
1123389
1123475
1124137
1124641
1124793
1125077
1125423
1125664
1125666
1125791
1125987
1125989
1126195
1126319
1126331
1126403
1126417
1126493
1126523
1126549
1126589
1126625
1126827
1126939
1126973
1127077
1127457
1127783
1128153
1128720
1128723
1128733
1128776
1129093
1129141
1129163
1130517
1131433
1132387
1133599
1133857
1134297
1134417
1135373
1135401
1135547
1135638
1135646
1135749
1135759


1622542
1622558
1622566
1622578
1622584
1622588
1622722
1622756
1622760
1622927
1622988
1623054
1623066
1623084
1623497
1623971
1624009
1624705
1625668
1625672
1625738
1625809
1625993
1626115
1626169
1626353
1626365
1626387
1626449
1626509
1626849
1627399
1627847
1629281
1629829
1631707
1631729
1631832
1631892
1631929
1631993
1632057
1632437
1632795
1633000
1633014
1633016
1633022
1633217
1634081
1634287
1634437
1634445
1634447
1634449
1634461
1634471
1634503
1634531
1634654
1634690
1635579
1635662
1635981
1636839
1638860
1639182
1639184
1639185
1639439
1639581
1639615
1639633
1639681
1639694
1639708
1640288
1640452
1640469
1640697
1640771
1640773
1640783
1640800
1640881
1641532
1641534
1641552
1641564
1641745
1642469
1642577
1642817
1642889
1642895
1642911
1642997
1643037
1643353
1643845
1643857
1644058
1644059
1644189
1644431
1644600
1644893
1645129
1645401
1645445
1648251
1648447
1649263
1649550
1649573
1649677
1649717
1649881
1649932
1649946
1649948
1649950
1649954
1649956
1649964


2145239
2145275
2145529
2145729
2145855
2146541
2147403
2147586
2148763
2148979
2149261
2149727
2150491
2150547
2150555
2150573
2150867
2151639
2152391
2152597
2152627
2153589
2153909
2153913
2154159
2154207
2154713
2155239
2155317
2156237
2156326
2156487
2157911
2158429
2158603
2158679
2158681
2158683
2159353
2159953
2160201
2160298
2160873
2161365
2161436
2161438
2163276
2163277
2163443
2164081
2164097
2164306
2165080
2165210
2165628
2166103
2166286
2167332
2167375
2167392
2167407
2167429
2167437
2167447
2167561
2168585
2168589
2168691
2168753
2168798
2168833
2169296
2169304
2169352
2169660
2170082
2170118
2170671
2171654
2171662
2171666
2171668
2172331
2172927
2173043
2173125
2173137
2173145
2173411
2173893
2173945
2174429
2174725
2174903
2174973
2175025
2175179
2175281
2175465
2175551
2175643
2176799
2178057
2178287
2178929
2179879
2180205
2180367
2180429
2180524
2180721
2181477
2181721
2181844
2182095
2182180
2182376
2182575
2183537
2184410
2184557
2185081
2185521
2185569
2185601


In [39]:
# position, within `nonzeroes`, of the largest (most positive) coefficient
svc_test.coef_[0][nonzeroes].argmax()

24

In [40]:
# Feature index holding the largest coefficient. The position is derived from
# the model rather than hand-copied from an earlier output, so this cell
# stays correct when the classifier is re-fitted.
nonzeroes[np.argmax(svc_test.coef_[0][nonzeroes])]

5819

In [21]:
# `nonzeroes` is a 1-D index array as assigned in this notebook, so it is
# indexed directly; the extra [0] only worked while it was still a tuple.
nonzeroes[9788]

2468433

In [39]:
# shape of the training portion produced by train_test_split
X_train.shape

(48, 2469062)

In [None]:
# `model` is not assigned in any cell shown before this one, so bind it to
# the SGD classifier fitted on all labelled samples.
# NOTE(review): use `model = svc` instead to rank the LinearSVC coefficients.
model = svc_test

# column indices of the non-zero coefficients (coef_ is 2-D, hence [1])
nonzeroes = np.nonzero(model.coef_)[1]

# (feature index, |coefficient|) pairs, largest magnitude first. sorted()
# always yields a list, whether or not zip() returns one.
coefs = sorted(
    zip(nonzeroes, abs(model.coef_[0][nonzeroes])),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# display the (feature index, |coefficient|) pairs, largest magnitude first
coefs

In [None]:
# Sweep the L1 regularisation strength over ten log-spaced values between
# 1e-2 and 1e4, recording 10-fold cross-validated accuracy for each.
arange = np.logspace(-2, 4, 10).tolist()
means = []
stds = []
for Aval in arange:
    # A separate name keeps the fitted `svc_test` intact, so the cells that
    # inspect its coefficients still work after this sweep has run.
    sgd_cv = SGDClassifier(penalty='l1', class_weight='balanced', alpha=Aval, n_iter=1000, shuffle=True)
    cv_score = cross_val_score(sgd_cv, knownData, leftEyeNameList, cv=10, scoring='accuracy')

    # compute the fold statistics once and reuse them
    fold_mean, fold_std = cv_score.mean(), cv_score.std()
    means.append(fold_mean)
    stds.append(fold_std)
    print("alpha: %s mean: %s std: %s" % (Aval, fold_mean, fold_std))