In [3]:
import ast, collections, locale, pickle, time

import mhcflurry, seaborn, numpy, pandas, sklearn, scipy
import mhcflurry.dataset
import fancyimpute

import sklearn.metrics
import sklearn.cross_validation

def print_full(x):
    """Print ``x`` with pandas' row-truncation limit raised to ``len(x)``.

    ``display.max_rows`` is changed only for the duration of the ``print``
    call.  ``pandas.option_context`` restores whatever value was active
    beforehand -- also when printing raises -- rather than resetting the
    option to the pandas default and discarding a user-configured limit.
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)

In [4]:
# Relative path prefix for the input data files read by this notebook.
data_dir = "../data/"
# IC50 ceiling passed to mhcflurry.common.ic50_to_regression_target when scoring.
max_ic50 = 50000

In [6]:
# Training measurements; used further down to look up per-allele training-set sizes.
all_train_data = mhcflurry.dataset.Dataset.from_csv(
    "".join([data_dir, "bdata.2009.mhci.public.1.txt"]))

In [7]:
def make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500):
    """Score predicted IC50 values against measured ones.

    Values at or below ``threshold_nm`` are treated as the positive class
    when binarizing measurements (AUC, F1) and predictions (F1).

    Parameters
    ----------
    ic50_y
        Measured IC50 values; must support elementwise ``<=`` (this notebook
        passes pandas Series).
    ic50_y_pred
        Predicted IC50 values, aligned with ``ic50_y``.
    sample_weight
        Optional weights forwarded to the AUC and F1 computations.  Kendall's
        tau is always computed unweighted.
    threshold_nm
        IC50 cutoff used for binarization.

    Returns
    -------
    dict
        Keys ``auc``, ``f1`` and ``tau``.  A metric whose computation raises
        ``ValueError`` is reported as NaN instead of propagating the error.
    """
    def nan_on_value_error(compute):
        # Evaluate one metric lazily so that any ValueError raised while
        # building its inputs is caught as well.
        try:
            return compute()
        except ValueError:
            return numpy.nan

    # AUC ranks by the regression-target transform of the predictions,
    # computed with the module-level ``max_ic50``.
    regression_pred = mhcflurry.common.ic50_to_regression_target(ic50_y_pred, max_ic50)

    auc = nan_on_value_error(lambda: sklearn.metrics.roc_auc_score(
        ic50_y <= threshold_nm, regression_pred, sample_weight=sample_weight))
    f1 = nan_on_value_error(lambda: sklearn.metrics.f1_score(
        ic50_y <= threshold_nm, ic50_y_pred <= threshold_nm, sample_weight=sample_weight))
    tau = nan_on_value_error(lambda: scipy.stats.kendalltau(ic50_y_pred, ic50_y)[0])

    return {"auc": auc, "f1": f1, "tau": tau}

In [11]:
# Load the table of validated model configurations.
# ``layer_sizes`` is stored as a Python list literal (e.g. "[64]").  It is parsed
# with ast.literal_eval instead of eval so that a CSV cell can only ever yield
# a literal value and can never execute arbitrary code.
models = pandas.read_csv(
    data_dir + "validation_models.csv",
    converters={'layer_sizes': ast.literal_eval})
# Keep the first layer's width as a scalar column for easier reading/grouping.
models["layer_size"] = [x[0] for x in models.layer_sizes]
# Drop the activation column from the table.
del models["activation"]
models

Unnamed: 0,dropout_probability,embedding_output_dim,impute,layer_sizes,layer_size
0,0.0,32,False,[64],64
1,0.0,8,False,[4],4
2,0.5,32,False,[64],64
3,0.5,8,False,[4],4
4,0.0,32,True,[64],64
5,0.0,8,True,[4],4
6,0.5,32,True,[64],64
7,0.5,8,True,[4],4
8,0.0,32,False,[64],64
9,0.0,8,False,[4],4


In [12]:
def name_model(row):
    """Build a short human-readable label for one model configuration.

    The label starts with ``"big"`` when ``row.embedding_output_dim`` is 32
    and ``"small"`` otherwise, followed by ``"dropout"`` when
    ``row.dropout_probability`` is positive and ``"impute"`` when
    ``row.impute`` is truthy.  The parts are joined with single spaces.
    """
    parts = ["big" if row.embedding_output_dim == 32 else "small"]
    optional_parts = (
        ("dropout", row.dropout_probability > 0),
        ("impute", row.impute),
    )
    parts.extend(label for (label, enabled) in optional_parts if enabled)
    return " ".join(parts)

# Tag every configuration with its row number and its descriptive label.
models["num"] = models.index
models["name"] = list(map(name_model, (row for (_, row) in models.iterrows())))
models

Unnamed: 0,dropout_probability,embedding_output_dim,impute,layer_sizes,layer_size,num,name
0,0.0,32,False,[64],64,0,big
1,0.0,8,False,[4],4,1,small
2,0.5,32,False,[64],64,2,big dropout
3,0.5,8,False,[4],4,3,small dropout
4,0.0,32,True,[64],64,4,big impute
5,0.0,8,True,[4],4,5,small impute
6,0.5,32,True,[64],64,6,big dropout impute
7,0.5,8,True,[4],4,7,small dropout impute
8,0.0,32,False,[64],64,8,big
9,0.0,8,False,[4],4,9,small


In [10]:
# For each configuration label, the array of model numbers that share it.
model_groups = models.groupby("name")["num"].unique()
model_groups

name
big                      [0, 8, 16, 24, 32, 40, 48, 56, 64, 72]
big dropout             [2, 10, 18, 26, 34, 42, 50, 58, 66, 74]
big dropout impute      [6, 14, 22, 30, 38, 46, 54, 62, 70, 78]
big impute              [4, 12, 20, 28, 36, 44, 52, 60, 68, 76]
small                    [1, 9, 17, 25, 33, 41, 49, 57, 65, 73]
small dropout           [3, 11, 19, 27, 35, 43, 51, 59, 67, 75]
small dropout impute    [7, 15, 23, 31, 39, 47, 55, 63, 71, 79]
small impute            [5, 13, 21, 29, 37, 45, 53, 61, 69, 77]
Name: num, dtype: object

In [13]:
# Validation measurements together with each predictor's predictions.
# The path is built from data_dir, like the training-data path, so that all
# inputs follow a single configurable data directory.
validation_df_with_mhcflurry_results = pandas.read_csv(
    data_dir + "validation_predictions_full2.csv")
validation_df_with_mhcflurry_results

Unnamed: 0,allele,peptide,length,meas,netmhc,netmhcpan,smmpmbec_cpp,mhcflurry 0,mhcflurry 1,mhcflurry 2,...,mhcflurry 70,mhcflurry 71,mhcflurry 72,mhcflurry 73,mhcflurry 74,mhcflurry 75,mhcflurry 76,mhcflurry 77,mhcflurry 78,mhcflurry 79
0,H-2-DB,AAACNVATA,9,657.657837,154.881662,711.213514,438.530698,466.814768,172.641984,343.793510,...,355.844970,825.249008,236.052464,956.955845,407.920138,1335.326524,12.870162,146.240141,387.025293,1175.574895
1,H-2-DB,AAFEFVYV,8,30831.879502,6456.542290,785.235635,10351.421667,15087.574426,10433.579881,10853.152403,...,11752.227891,15845.444839,20566.621149,9283.359745,11241.797195,14002.382696,9334.197863,7675.906356,12913.688599,15777.427472
2,H-2-DB,AAFVNDYSL,9,77.446180,17.458222,7.516229,28.054336,47.820807,16.303794,29.588240,...,39.930163,217.650523,40.095338,46.914299,32.406285,179.564236,26.909081,16.457839,34.553959,218.821497
3,H-2-DB,AAIANQAAV,9,1.999862,9.638290,9.749896,25.703958,2.887869,7.430676,23.204965,...,23.685004,90.313619,5.017098,11.156760,19.407683,70.818218,4.119397,7.337271,19.713879,103.209533
4,H-2-DB,AAIANQAVV,9,1.517050,8.550667,8.336812,28.773984,2.579753,5.215993,17.054924,...,19.165591,74.765437,4.378619,5.170134,16.111638,69.824731,3.507305,6.210591,15.772759,86.157611
5,H-2-DB,AAIENYVRF,9,37.844258,252.348077,114.815362,187.068214,91.632010,77.986067,288.144351,...,302.361030,493.962110,374.752767,217.663016,301.008748,415.441450,402.005916,195.863576,368.765718,533.387993
6,H-2-DB,AAINFITTM,9,3.155005,199.986187,389.045145,200.909281,474.790652,183.215053,43.296033,...,43.854696,288.187373,16.122187,28.382899,41.544168,186.442729,76.292963,156.352985,44.815033,282.989902
7,H-2-DB,AAIPAPPPI,9,3243.396173,1059.253725,493.173804,295.120923,623.878847,7834.652456,178.830163,...,229.813055,285.925339,352.893901,246.850697,184.103734,213.947696,2278.596615,207.562759,219.872392,329.439928
8,H-2-DB,AAKLNRPPL,9,654.636174,66.374307,77.268059,38.459178,88.917004,63.996155,60.123046,...,88.217258,337.759556,199.810086,177.468492,67.696375,149.009658,30.536993,87.145714,72.876322,240.820089
9,H-2-DB,AALDMVDAL,9,229.614865,547.015963,597.035287,225.423921,5459.146719,678.959069,605.313435,...,454.257039,627.492474,3152.181431,1722.232625,438.567638,535.066532,2163.586824,617.108573,843.582091,868.502679


In [12]:
# Extend with ensemble predictions: one geometric-mean column per model group,
# plus three pooled ensembles (all models, imputed only, not imputed only).
named_groups = list(model_groups.iteritems())

all_indices = sorted(set.union(*[
    set(indices) for (name, indices) in named_groups]))
all_indices_impute = sorted(set.union(*[
    set(indices) for (name, indices) in named_groups if 'impute' in name]))
all_indices_not_impute = sorted(set.union(*[
    set(indices) for (name, indices) in named_groups if not ('impute' in name)]))

ensembles = named_groups + [
    ("all", all_indices),
    ("all impute", all_indices_impute),
    ("all not impute", all_indices_not_impute),
]
for (name, indices) in ensembles:
    # Row-wise geometric mean over the member models' prediction columns.
    member_predictions = validation_df_with_mhcflurry_results[
        ["mhcflurry %d" % i for i in indices]]
    validation_df_with_mhcflurry_results["mhcflurry ensemble %s" % name] = (
        scipy.stats.mstats.gmean(member_predictions, axis=1))

validation_df_with_mhcflurry_results

Unnamed: 0,allele,peptide,length,meas,netmhc,netmhcpan,smmpmbec_cpp,mhcflurry 0,mhcflurry 1,mhcflurry 2,...,mhcflurry ensemble big dropout,mhcflurry ensemble big dropout impute,mhcflurry ensemble big impute,mhcflurry ensemble small,mhcflurry ensemble small dropout,mhcflurry ensemble small dropout impute,mhcflurry ensemble small impute,mhcflurry ensemble all,mhcflurry ensemble all impute,mhcflurry ensemble all not impute
0,H-2-DB,AAACNVATA,9,657.657837,154.881662,711.213514,438.530698,466.814768,172.641984,343.793510,...,412.970922,412.076508,127.076246,354.917879,1032.431732,979.070265,316.292139,382.541823,356.850347,410.082958
1,H-2-DB,AAFEFVYV,8,30831.879502,6456.542290,785.235635,10351.421667,15087.574426,10433.579881,10853.152403,...,11546.349645,11523.365476,12119.332257,10582.779265,15236.594111,15530.316894,10563.990149,12481.240163,12303.157490,12661.900504
2,H-2-DB,AAFVNDYSL,9,77.446180,17.458222,7.516229,28.054336,47.820807,16.303794,29.588240,...,35.800397,36.332714,26.661012,14.234040,198.349515,191.311593,36.190375,45.449943,50.889386,40.591910
3,H-2-DB,AAIANQAAV,9,1.999862,9.638290,9.749896,25.703958,2.887869,7.430676,23.204965,...,21.979447,22.149854,5.514130,9.571188,75.476672,73.397048,10.311149,17.935116,17.436463,18.448030
4,H-2-DB,AAIANQAVV,9,1.517050,8.550667,8.336812,28.773984,2.579753,5.215993,17.054924,...,17.009966,17.210544,4.087731,5.404075,66.898285,66.277773,6.168712,13.117819,13.022964,13.213366
5,H-2-DB,AAIENYVRF,9,37.844258,252.348077,114.815362,187.068214,91.632010,77.986067,288.144351,...,308.430443,313.295340,283.004210,138.379134,453.930512,463.507669,126.418515,252.636475,268.474851,237.732466
6,H-2-DB,AAINFITTM,9,3.155005,199.986187,389.045145,200.909281,474.790652,183.215053,43.296033,...,47.644859,46.924741,29.162618,75.234690,201.404054,208.736308,69.728882,71.024356,66.805124,75.510064
7,H-2-DB,AAIPAPPPI,9,3243.396173,1059.253725,493.173804,295.120923,623.878847,7834.652456,178.830163,...,224.408829,201.811546,474.297442,661.731263,203.524213,224.200582,295.562535,298.738170,282.208914,316.235562
8,H-2-DB,AAKLNRPPL,9,654.636174,66.374307,77.268059,38.459178,88.917004,63.996155,60.123046,...,78.328625,76.676509,193.105629,150.854140,187.610007,214.239343,191.850182,152.833062,157.065094,148.715061
9,H-2-DB,AALDMVDAL,9,229.614865,547.015963,597.035287,225.423921,5459.146719,678.959069,605.313435,...,571.537601,693.972088,1713.786138,1086.358455,752.720509,709.941692,1346.778760,1012.509851,1032.652955,992.759662


In [20]:
# Score every predictor on the whole validation set and on each allele.
#
# Predictor columns are selected by name -- everything except the peptide
# metadata and the measured value -- instead of by position, so the selection
# does not silently shift if a column is inserted or reordered upstream.
non_predictor_columns = ["allele", "peptide", "length", "meas"]
result_columns = validation_df_with_mhcflurry_results.columns
predictors = result_columns[~result_columns.isin(non_predictor_columns)]
pairs = [
    ("overall", validation_df_with_mhcflurry_results)
] + list(validation_df_with_mhcflurry_results.groupby("allele"))

# Accumulate one list per output column under its own name, then build the
# frame once; ``scores_df`` is only ever bound to the final DataFrame.
score_columns = collections.defaultdict(list)
for (allele, grouped) in pairs:
    score_columns["allele"].append(allele)
    score_columns["test_size"].append(len(grouped.meas))
    for predictor in predictors:
        scores = make_scores(grouped.meas, grouped[predictor])
        for (key, value) in scores.items():
            score_columns["%s_%s" % (predictor, key)].append(value)

scores_df = pandas.DataFrame(score_columns)
# Number of distinct training peptides per allele; undefined for the pooled row.
scores_df["train_size"] = [
    len(set(all_train_data[a].original_peptides)) if a != 'overall' else numpy.nan
    for a in scores_df.allele
]
scores_df.index = scores_df.allele
scores_df

Unnamed: 0_level_0,allele,mhcflurry 0_auc,mhcflurry 0_f1,mhcflurry 0_tau,mhcflurry 10_auc,mhcflurry 10_f1,mhcflurry 10_tau,mhcflurry 11_auc,mhcflurry 11_f1,mhcflurry 11_tau,...,netmhc_f1,netmhc_tau,netmhcpan_auc,netmhcpan_f1,netmhcpan_tau,smmpmbec_cpp_auc,smmpmbec_cpp_f1,smmpmbec_cpp_tau,test_size,train_size
allele,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
overall,overall,0.916791,0.770198,0.551841,0.924137,0.77942,0.564822,0.907951,0.70091,0.538382,...,0.800909,0.582258,0.932924,0.79317,0.579586,0.919208,0.784295,0.561996,27680,
H-2-DB,H-2-DB,0.891129,0.636704,0.607055,0.894199,0.544643,0.620387,0.889827,0.349727,0.615268,...,0.621212,0.600337,0.874574,0.577236,0.574262,0.884187,0.628571,0.571252,564,3216.0
H-2-KB,H-2-KB,0.88774,0.795053,0.551784,0.910714,0.821239,0.569634,0.903935,0.760456,0.561134,...,0.813675,0.573199,0.825565,0.665354,0.486836,0.915994,0.859967,0.589218,558,3407.0
H-2-KD,H-2-KD,0.775344,0.613139,0.359457,0.787208,0.575758,0.378337,0.785797,0.555556,0.365622,...,0.657718,0.403275,0.819189,0.64557,0.390333,0.753692,0.538462,0.365247,229,452.0
HLA-A0101,HLA-A0101,0.901156,0.605405,0.501124,0.911578,0.587571,0.521736,0.889515,0.299213,0.498039,...,0.619565,0.524866,0.894895,0.594286,0.498767,0.832665,0.437811,0.428064,696,3725.0
HLA-A0201,HLA-A0201,0.901909,0.845684,0.573243,0.925812,0.871728,0.62319,0.913368,0.861688,0.599266,...,0.884336,0.635498,0.930479,0.880963,0.637338,0.927358,0.885121,0.626224,2126,9565.0
HLA-A0202,HLA-A0202,0.83204,0.717391,0.498543,0.905765,0.791209,0.646429,0.912694,0.787234,0.656408,...,0.755556,0.627143,0.898697,0.769231,0.62428,0.882206,0.727273,0.606938,126,3919.0
HLA-A0203,HLA-A0203,0.951758,0.924731,0.550961,0.974648,0.954869,0.594523,0.970475,0.948503,0.577628,...,0.948626,0.586911,0.974158,0.944578,0.591463,0.972885,0.946746,0.583908,651,5542.0
HLA-A0206,HLA-A0206,0.862964,0.831633,0.443633,0.899344,0.875306,0.518406,0.883182,0.868159,0.496525,...,0.872902,0.543184,0.910796,0.866258,0.535067,0.904317,0.878282,0.527571,682,4827.0
HLA-A0301,HLA-A0301,0.891751,0.841542,0.53083,0.922273,0.86551,0.601402,0.918873,0.728411,0.582643,...,0.900621,0.629236,0.927287,0.885106,0.61124,0.933966,0.897275,0.610891,811,6141.0


In [15]:
# Show every row label of the score table, then its (rows, columns) shape.
print(" ".join(scores_df.index.tolist()))
print(scores_df.shape)

overall H-2-DB H-2-KB H-2-KD HLA-A0101 HLA-A0201 HLA-A0202 HLA-A0203 HLA-A0206 HLA-A0301 HLA-A1101 HLA-A2301 HLA-A2402 HLA-A2501 HLA-A2601 HLA-A2602 HLA-A2603 HLA-A2902 HLA-A3001 HLA-A3002 HLA-A3101 HLA-A3201 HLA-A3301 HLA-A6801 HLA-A6802 HLA-A6901 HLA-A8001 HLA-B0702 HLA-B0801 HLA-B0802 HLA-B0803 HLA-B1501 HLA-B1503 HLA-B1509 HLA-B1517 HLA-B1801 HLA-B2703 HLA-B2705 HLA-B3501 HLA-B3801 HLA-B3901 HLA-B4001 HLA-B4002 HLA-B4402 HLA-B4403 HLA-B4501 HLA-B4601 HLA-B5101 HLA-B5301 HLA-B5401 HLA-B5701 HLA-B5801 Mamu-A01 Mamu-A02
(54, 285)


In [16]:
# Rank every metric of the pooled ("overall") row from highest to lowest.
# The row's "allele" entry is the string "overall"; it is dropped so that only
# numbers are compared (ordering str against float is an error on Python 3).
# .loc / sort_values replace .ix / Series.sort, which newer pandas removed.
print_full(scores_df.loc["overall"].drop("allele").sort_values(ascending=False))

allele                                           overall
test_size                                          27680
netmhcpan_auc                                  0.9329235
netmhc_auc                                     0.9299468
mhcflurry 66_auc                               0.9276946
mhcflurry ensemble all not impute_auc          0.9274469
mhcflurry ensemble all_auc                     0.9273025
mhcflurry ensemble all impute_auc              0.9270227
mhcflurry 78_auc                               0.9269599
mhcflurry 30_auc                               0.9267281
mhcflurry 34_auc                                0.926603
mhcflurry ensemble big dropout_auc             0.9265897
mhcflurry 2_auc                                0.9265458
mhcflurry 6_auc                                 0.926512
mhcflurry 58_auc                               0.9265101
mhcflurry 26_auc                               0.9264645
mhcflurry 70_auc                                0.926379
mhcflurry ensemble big dropout 

In [19]:
# Mean of every metric over the alleles with at least 1000 distinct training
# peptides, ranked from highest to lowest.
# numeric_only=True makes the exclusion of the string "allele" column explicit;
# .loc / sort_values replace .ix / Series.sort, which newer pandas removed.
print_full(
    scores_df.loc[(scores_df.index != "overall") & (scores_df.train_size >= 1000)]
    .mean(axis=0, numeric_only=True)
    .sort_values(ascending=False))

train_size                                     3131.285714
test_size                                       605.371429
mhcflurry ensemble all_auc                        0.904481
mhcflurry ensemble all impute_auc                 0.904463
mhcflurry ensemble all not impute_auc             0.904237
mhcflurry ensemble small impute_auc               0.903239
mhcflurry ensemble small_auc                      0.902855
mhcflurry 70_auc                                  0.901710
mhcflurry 61_auc                                  0.901698
mhcflurry 2_auc                                   0.901526
mhcflurry 25_auc                                  0.901514
mhcflurry 10_auc                                  0.901496
mhcflurry 30_auc                                  0.901488
mhcflurry 54_auc                                  0.901413
mhcflurry 42_auc                                  0.901378
mhcflurry 38_auc                                  0.901373
mhcflurry ensemble big dropout impute_auc         0.9013

In [18]:
# Mean of every metric over the alleles with fewer than 1000 distinct training
# peptides, ranked from highest to lowest.
# numeric_only=True makes the exclusion of the string "allele" column explicit;
# .loc / sort_values replace .ix / Series.sort, which newer pandas removed.
print_full(
    scores_df.loc[(scores_df.index != "overall") & (scores_df.train_size < 1000)]
    .mean(axis=0, numeric_only=True)
    .sort_values(ascending=False))

train_size                                     565.888889
test_size                                      360.666667
netmhcpan_auc                                    0.936087
netmhc_auc                                       0.926065
mhcflurry 30_auc                                 0.918959
mhcflurry ensemble all not impute_auc            0.918559
mhcflurry ensemble big dropout impute_auc        0.918455
mhcflurry ensemble all_auc                       0.918428
mhcflurry 66_auc                                 0.918324
mhcflurry ensemble big dropout_auc               0.917994
mhcflurry ensemble all impute_auc                0.917694
mhcflurry 70_auc                                 0.917616
mhcflurry 54_auc                                 0.917061
mhcflurry 38_auc                                 0.916187
mhcflurry 2_auc                                  0.916160
mhcflurry 78_auc                                 0.915861
mhcflurry 34_auc                                 0.915812
mhcflurry 10_a

In [21]:
# Mean of every metric over all individual alleles (the pooled "overall" row
# is excluded), ranked from highest to lowest.
# numeric_only=True makes the exclusion of the string "allele" column explicit;
# .loc / sort_values replace .ix / Series.sort, which newer pandas removed.
print_full(
    scores_df.loc[scores_df.index != "overall"]
    .mean(axis=0, numeric_only=True)
    .sort_values(ascending=False))

train_size                                     2260.018868
test_size                                       522.264151
netmhcpan_auc                                     0.912458
mhcflurry ensemble all_auc                        0.909130
mhcflurry ensemble all not impute_auc             0.909011
netmhc_auc                                        0.908938
mhcflurry ensemble all impute_auc                 0.908873
mhcflurry 30_auc                                  0.907312
mhcflurry ensemble big dropout impute_auc         0.907048
mhcflurry 70_auc                                  0.907012
mhcflurry 66_auc                                  0.906921
mhcflurry ensemble big dropout_auc                0.906889
mhcflurry ensemble small_auc                      0.906681
mhcflurry 54_auc                                  0.906629
mhcflurry 2_auc                                   0.906404
mhcflurry 38_auc                                  0.906311
mhcflurry ensemble small impute_auc               0.9061

In [22]:
# Per-row ratio of the "small impute" ensemble's AUC to NetMHC's AUC.
# .loc replaces the .ix indexer, which newer pandas removed.
scores_df.loc[:, "mhcflurry ensemble small impute_auc"] / scores_df.loc[:, "netmhc_auc"]

allele
overall      0.995303
H-2-DB       1.011136
H-2-KB       1.023012
H-2-KD       0.958486
HLA-A0101    1.012477
HLA-A0201    0.997279
HLA-A0202    1.009183
HLA-A0203    1.000951
HLA-A0206    0.995024
HLA-A0301    0.986422
HLA-A1101    0.988892
HLA-A2301    0.988901
HLA-A2402    0.995979
HLA-A2501    1.000491
HLA-A2601    1.005599
HLA-A2602    0.993474
HLA-A2603    0.990339
HLA-A2902    1.028103
HLA-A3001    1.021084
HLA-A3002    0.989223
HLA-A3101    0.995926
HLA-A3201    1.016410
HLA-A3301    1.015176
HLA-A6801    0.996876
HLA-A6802    1.000162
HLA-A6901    1.005053
HLA-A8001    0.989457
HLA-B0702    1.001494
HLA-B0801    1.002733
HLA-B0802    0.984341
HLA-B0803    0.961754
HLA-B1501    1.003644
HLA-B1503    0.966286
HLA-B1509    0.968437
HLA-B1517    1.019622
HLA-B1801    1.012538
HLA-B2703         NaN
HLA-B2705    0.998979
HLA-B3501    1.004012
HLA-B3801    0.962732
HLA-B3901    0.985603
HLA-B4001    0.979640
HLA-B4002    0.998922
HLA-B4402    0.981135
HLA-B4403    0.952167
HLA

In [23]:
def sub_df(name):
    """Return the columns of ``scores_df`` that hold predictor ``name``'s scores.

    Score columns are named ``"<predictor>_<metric>"`` and the metric names
    contain no underscore.  A column is selected only when the text before
    its last underscore equals ``name`` exactly; a plain substring test
    could also pick up columns of another predictor whose name merely
    contains ``name + "_"``.

    Returns a copy whose columns are renamed to the bare metric names.
    """
    selected = [
        c for c in list(scores_df.columns)
        if "_" in c and c.rsplit("_", 1)[0] == name
    ]
    result = scores_df[selected].copy()
    result.columns = [c.rsplit("_", 1)[1] for c in selected]
    return result

# One score table per predictor, stacked into a 3-D panel
# (items = predictors, major axis = score-table rows, minor axis = metrics).
# NOTE(review): pandas.Panel was removed in pandas 0.25, so this cell needs an
# older pandas release.
d = {name: sub_df(name) for name in predictors}
panel = pandas.Panel(d)

panel

<class 'pandas.core.panel.Panel'>
Dimensions: 94 (items) x 54 (major_axis) x 3 (minor_axis)
Items axis: mhcflurry 0 to smmpmbec_cpp
Major_axis axis: overall to Mamu-A02
Minor_axis axis: auc to tau