In [3]:
import mhcflurry, seaborn, numpy, pandas, pickle, sklearn, collections, scipy, time
import mhcflurry.data
import mhcflurry.imputation
import fancyimpute, locale

import sklearn.metrics
import sklearn.cross_validation



In [4]:
# Minimum number of entries an allele's `Y` array must have for the allele
# to be kept / counted by the cells below.
min_peptides_to_consider_allele = 10
# NOTE(review): not referenced in the cells visible here -- presumably an
# upper cap applied to IC50 values; confirm against the rest of the notebook.
max_ic50 = 50000

In [5]:
# Load the per-allele datasets. The returned objects are indexed by allele
# name (e.g. "HLA-A0201") and are used below as mappings of allele -> AlleleData.
# The 2009 file is used as training data, the 2013 "blind" file as validation data.
all_train_data = mhcflurry.data.load_allele_datasets("../data/bdata.2009.mhci.public.1.txt")
all_validation_data = mhcflurry.data.load_allele_datasets("../data/bdata.2013.mhci.public.blind.1.txt")


In [6]:
# Keep only the training alleles that have at least the configured minimum
# number of measurements.
alleles = [
    allele_name
    for allele_name in all_train_data
    if min_peptides_to_consider_allele <= len(all_train_data[allele_name].Y)
]

In [7]:
# Inspect the validation entry for a single allele; the cell output shows the
# fields of the AlleleData record (X_index, X_binary, Y, peptides, ic50, ...).
all_validation_data["HLA-A0201"]

AlleleData(X_index=array([[ 0,  0,  0, ..., 13,  6,  9],
       [ 0,  0,  0, ...,  0, 12,  9],
       [ 0,  0,  2, ..., 16, 15, 20],
       ..., 
       [20, 18,  7, ...,  8,  7,  7],
       [20, 18, 10, ..., 16, 20,  4],
       [20, 20,  4, ...,  9,  4, 17]]), X_binary=array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), Y=array([ 0.21249889,  0.08468665,  0.08468665, ...,  0.        ,
        0.35084077,  0.55012623]), peptides=array(['AAAFVNQHL', 'AAAQGQAPL', 'AADSFATSY', ..., 'YWIREGKII',
       'YWMGGTTYF', 'YYFSYPLFV'], 
      dtype='<U9'), ic50=array([  5017.,  20000.,  20000., ...,  78125.,   1123.,    130.]), original_peptides=array(['AAAFVNQHL', 'AAAQGQAPL', 'AADSFATSY', ..., 'YWIREGKII',
       'YWMGGTTYF', 'YYFSYPLFV'], 
      dtype='<U10'), original_l

In [14]:
def expanded_measurements(allele_data_dict):
    """Return the total number of entries in `Y` summed over all alleles.

    Parameters
    ----------
    allele_data_dict : mapping
        Mapping whose values expose a sized `Y` attribute.

    Returns
    -------
    int
        Sum of ``len(entry.Y)`` over every value in the mapping (0 if empty).
    """
    total = 0
    for allele_entry in allele_data_dict.values():
        total += len(allele_entry.Y)
    return total

def measurements(allele_data_dict):
    """Return the number of distinct original peptides, summed over all alleles.

    Duplicates are removed within each allele (via a set), not across alleles.

    Parameters
    ----------
    allele_data_dict : mapping
        Mapping whose values expose an iterable `original_peptides` attribute.

    Returns
    -------
    int
        Sum over alleles of the count of unique `original_peptides` (0 if empty).
    """
    total = 0
    for allele_entry in allele_data_dict.values():
        unique_peptides = set(allele_entry.original_peptides)
        total += len(unique_peptides)
    return total

def alleles_with_enough_data(allele_data_dict, min_peptides=None):
    """Count the alleles that have at least `min_peptides` entries in `Y`.

    Parameters
    ----------
    allele_data_dict : mapping
        Mapping whose values expose a sized `Y` attribute.
    min_peptides : int, optional
        Minimum ``len(entry.Y)`` for an allele to be counted. When omitted,
        the notebook-level ``min_peptides_to_consider_allele`` setting is used,
        which is the behavior existing one-argument callers rely on.

    Returns
    -------
    int
        Number of values in the mapping meeting the threshold (0 if empty).
    """
    if min_peptides is None:
        # Resolved at call time so later changes to the notebook setting are honored.
        min_peptides = min_peptides_to_consider_allele
    # Count directly instead of building a list only to take its length.
    return sum(
        1 for allele_entry in allele_data_dict.values()
        if len(allele_entry.Y) >= min_peptides
    )

def thousands(num):
    """Format a number as an integer string with comma thousands separators.

    The value is truncated to an integer (as the previous ``"%d"`` format did)
    and grouped with commas, e.g. ``137654 -> "137,654"``.

    This deliberately avoids the `locale` module: the previous implementation
    called ``locale.setlocale(locale.LC_ALL, 'en_US')`` on every call, which
    changes process-wide state and raises ``locale.Error`` on systems where
    that locale is not installed, and it used ``locale.format``, which has
    been removed from recent Python versions.

    Parameters
    ----------
    num : int or float
        Value to format.

    Returns
    -------
    str
        The comma-grouped integer representation of `num`.
    """
    return "{:,d}".format(int(num))

def format_pair(lst1, lst2):
    """Pair up two sequences of numbers as ``"a (b)"`` strings.

    Each number is rendered with `thousands`. Pairing uses `zip`, so the
    result has as many entries as the shorter of the two inputs.

    Parameters
    ----------
    lst1, lst2 : iterable of numbers
        Values for the leading and the parenthesized position, respectively.

    Returns
    -------
    list of str
    """
    formatted = []
    for first, second in zip(lst1, lst2):
        formatted.append("%s (%s)" % (thousands(first), thousands(second)))
    return formatted

# Summary table of the two datasets, one row per dataset, emitted as LaTeX.
description_df = pandas.DataFrame(index=["BD2009", "BLIND"])

# Row order matches the index above: training data first, then validation data.
description_datasets = (all_train_data, all_validation_data)

# (column label, counting function) pairs, in the order the columns appear.
description_columns = (
    ("Alleles", len),
    ("Alleles w/ %d+ measurements" % min_peptides_to_consider_allele, alleles_with_enough_data),
    ("IC50 Measurements", measurements),
    ("Expanded 9mers", expanded_measurements),
)
for column_label, count_fn in description_columns:
    description_df[column_label] = [
        thousands(count_fn(dataset)) for dataset in description_datasets
    ]

print(description_df.to_latex(index_names=False))

\begin{tabular}{lllll}
\toprule
{} & Alleles & Alleles w/ 10+ measurements & IC50 Measurements & Expanded 9mers \\
\midrule
BD2009 &     106 &                          98 &           137,654 &        470,170 \\
BLIND  &      53 &                          53 &            27,680 &         83,752 \\
\bottomrule
\end{tabular}



In [22]:
# NOTE(review): leftover IPython help lookup (`obj?` shows the docstring);
# it produces no analysis result -- remove before sharing the notebook.
description_df.to_latex?