Skip to content

Commit

Permalink
Merge b1dbb36 into bbcfc2b
Browse files Browse the repository at this point in the history
  • Loading branch information
timodonnell committed Nov 28, 2017
2 parents bbcfc2b + b1dbb36 commit 086a92e
Show file tree
Hide file tree
Showing 40 changed files with 1,571 additions and 428 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,24 @@ Run `mhcflurry-predict -h` for details.
See the [class1_allele_specific_models.ipynb](https://github.com/hammerlab/mhcflurry/blob/master/examples/class1_allele_specific_models.ipynb)
notebook for an overview of the Python API, including fitting your own predictors.

## Scanning protein sequences for predicted epitopes

The [mhctools](https://github.com/hammerlab/mhctools) package provides support
for scanning protein sequences to find predicted epitopes. It supports MHCflurry
as well as other binding predictors. Here is an example:

```
# First install mhctools if needed:
pip install mhctools
# Now generate predictions for protein sequences in FASTA format:
mhctools \
--mhc-predictor mhcflurry \
--input-fasta-file INPUT.fasta \
--mhc-alleles A02:01,A03:01 \
--extract-subsequences \
--out RESULT.csv
```

## Details on the downloadable models

Expand Down
83 changes: 83 additions & 0 deletions downloads-generation/cross_validation_class1/GENERATE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/bin/bash

set -e
set -x

DOWNLOAD_NAME=cross_validation_class1
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")

NFOLDS=5

mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info
date
pip freeze
git status

cd $SCRATCH_DIR/$DOWNLOAD_NAME

cp $SCRIPT_DIR/hyperparameters.yaml .
cp $SCRIPT_DIR/split_folds.py .
cp $SCRIPT_DIR/score.py .

time python split_folds.py \
"$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--min-measurements-per-allele 100 \
--folds $NFOLDS \
--random-state 1 \
--output-pattern-test "./test.fold_{}.csv" \
--output-pattern-train "./train.fold_{}.csv"

# Kill child processes if parent exits:
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

for fold in $(seq 0 $(expr $NFOLDS - 1))
do
mhcflurry-class1-train-allele-specific-models \
--data train.fold_${fold}.csv \
--hyperparameters hyperparameters.yaml \
--out-models-dir models.fold_${fold} \
--min-measurements-per-allele 0 \
--percent-rank-calibration-num-peptides-per-length 0 \
2>&1 | tee -a LOG.train.fold_${fold}.txt &
done
wait

echo "DONE TRAINING. NOW PREDICTING."

for fold in $(seq 0 $(expr $NFOLDS - 1))
do
mhcflurry-predict \
test.fold_${fold}.csv \
--models models.fold_${fold} \
--no-throw \
--include-individual-model-predictions \
--out predictions.fold_${fold}.csv &
done
wait

time python score.py \
predictions.fold_*.csv \
--out-combined predictions.combined.csv \
--out-scores scores.csv \
--out-summary summary.all.csv

grep -v single summary.all.csv > summary.ensemble.csv

cp $SCRIPT_ABSOLUTE_PATH .
for i in $(ls *.txt)
do
bzip2 $i
done
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
7 changes: 7 additions & 0 deletions downloads-generation/cross_validation_class1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Cross validation of standard Class I models

This download contains cross validation results and intermediate data for
class I allele-specific MHCflurry models.

This exists to track the exact steps used to generate cross-validation results.
Users will probably not interact with this directly.
101 changes: 101 additions & 0 deletions downloads-generation/cross_validation_class1/score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""
Scoring script for cross-validation.
"""
import argparse
import sys
import collections

import pandas
from mhcflurry.scoring import make_scores


parser = argparse.ArgumentParser(usage = __doc__)

parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV", nargs="+")

parser.add_argument(
"--out-scores",
metavar="RESULT.csv")

parser.add_argument(
"--out-combined",
metavar="COMBINED.csv")

parser.add_argument(
"--out-summary",
metavar="RESULT.csv")

def run(argv):
args = parser.parse_args(argv)

df = None
for (i, filename) in enumerate(args.input):
input_df = pandas.read_csv(filename)
assert not input_df.mhcflurry_prediction.isnull().any()

cols_to_merge = []
input_df["prediction_%d" % i] = input_df.mhcflurry_prediction
cols_to_merge.append(input_df.columns[-1])
if 'mhcflurry_model_single_0' in input_df.columns:
input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0
cols_to_merge.append(input_df.columns[-1])

if df is None:
df = input_df[
["allele", "peptide", "measurement_value"] + cols_to_merge
].copy()
else:
df = pandas.merge(
df,
input_df[['allele', 'peptide'] + cols_to_merge],
on=['allele', 'peptide'],
how='outer')

print("Loaded data:")
print(df.head(5))

if args.out_combined:
df.to_csv(args.out_combined, index=False)
print("Wrote: %s" % args.out_combined)

prediction_cols = [
c
for c in df.columns
if c.startswith("prediction_")
]

scores_rows = []
for (allele, allele_df) in df.groupby("allele"):
for prediction_col in prediction_cols:
sub_df = allele_df.loc[~allele_df[prediction_col].isnull()]
scores = collections.OrderedDict()
scores['allele'] = allele
scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "")
scores['kind'] = "single" if "single" in prediction_col else "ensemble"
scores['train_size'] = allele_df[prediction_col].isnull().sum()
scores['test_size'] = len(sub_df)
scores.update(
make_scores(
sub_df.measurement_value, sub_df[prediction_col]))
scores_rows.append(scores)
scores_df = pandas.DataFrame(scores_rows)
print(scores_df)

if args.out_scores:
scores_df.to_csv(args.out_scores, index=False)
print("Wrote: %s" % args.out_scores)

summary_df = scores_df.groupby(["allele", "kind"])[
["train_size", "test_size", "auc", "f1", "tau"]
].mean().reset_index()
print("Summary:")
print(summary_df)

if args.out_summary:
summary_df.to_csv(args.out_summary, index=False)
print("Wrote: %s" % args.out_summary)

if __name__ == '__main__':
run(sys.argv[1:])

113 changes: 113 additions & 0 deletions downloads-generation/cross_validation_class1/split_folds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Split training data into CV folds.
"""
import argparse
import sys
from os.path import abspath

import pandas
from sklearn.model_selection import StratifiedKFold

parser = argparse.ArgumentParser(usage = __doc__)

parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV")

parser.add_argument(
"--folds", metavar="N", type=int, default=5)

parser.add_argument(
"--allele",
nargs="+",
help="Include only the specified allele(s)")

parser.add_argument(
"--min-measurements-per-allele",
type=int,
metavar="N",
help="Use only alleles with >=N measurements.")

parser.add_argument(
"--subsample",
type=int,
metavar="N",
help="Subsample to first N rows")

parser.add_argument(
"--random-state",
metavar="N",
type=int,
help="Specify an int for deterministic splitting")

parser.add_argument(
"--output-pattern-train",
default="./train.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")

parser.add_argument(
"--output-pattern-test",
default="./test.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")


def run(argv):
args = parser.parse_args(argv)

df = pandas.read_csv(args.input)
print("Loaded data with shape: %s" % str(df.shape))

df = df.ix[
(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
]
print("Subselected to 8-15mers: %s" % (str(df.shape)))

allele_counts = df.allele.value_counts()

if args.allele:
alleles = args.allele
else:
alleles = list(
allele_counts.ix[
allele_counts > args.min_measurements_per_allele
].index)

df = df.ix[df.allele.isin(alleles)]
print("Potentially subselected by allele to: %s" % str(df.shape))

print("Data has %d alleles: %s" % (
df.allele.nunique(), " ".join(df.allele.unique())))

df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))

if args.subsample:
df = df.head(args.subsample)
print("Subsampled to: %s" % str(df.shape))

kf = StratifiedKFold(
n_splits=args.folds,
shuffle=True,
random_state=args.random_state)

# Stratify by both allele and binder vs. nonbinder.
df["key"] = [
"%s_%s" % (
row.allele,
"binder" if row.measurement_value < 500 else "nonbinder")
for (_, row) in df.iterrows()
]

for i, (train, test) in enumerate(kf.split(df, df.key)):
train_filename = args.output_pattern_train.format(i)
test_filename = args.output_pattern_test.format(i)

df.iloc[train].to_csv(train_filename, index=False)
print("Wrote: %s" % abspath(train_filename))

df.iloc[test].to_csv(test_filename, index=False)
print("Wrote: %s" % abspath(test_filename))


if __name__ == '__main__':
run(sys.argv[1:])

3 changes: 2 additions & 1 deletion downloads-generation/models_class1/GENERATE.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--hyperparameters hyperparameters.yaml \
--out-models-dir models \
--min-measurements-per-allele 200
--percent-rank-calibration-num-peptides-per-length 1000000 \
--min-measurements-per-allele 100

cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
Expand Down
13 changes: 5 additions & 8 deletions downloads-generation/models_class1/hyperparameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 12,
"n_models": 8,

##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"patience": 20,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,

##########################################
# RANDOM NEGATIVE PEPTIDES
Expand All @@ -26,17 +27,13 @@
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"embedding_output_dim": 8, # only used if using embedding
"kmer_size": 15,

##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
Expand All @@ -46,7 +43,7 @@
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
Expand Down
Loading

0 comments on commit 086a92e

Please sign in to comment.