In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import json
from collections import defaultdict

import torch
import gpytorch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from scipy.stats import norm
from rdkit import Chem
from botorch.models.gpytorch import GPyTorchModel
from botorch.fit import fit_gpytorch_model
from map4 import MAP4Calculator

from mobius import SequenceGA, ScaffoldGA, GA
from mobius import Map4Fingerprint, SequenceDescriptors, DMTSimulation
from mobius import expected_improvement, TanimotoSimilarityKernel
from mobius import HELMGeneticOperators
from mobius import MHCIPeptideScorer
from mobius import affinity_binding_to_energy, energy_to_affinity_binding, compute_probability_matrix, plot_results

In [3]:
# Load the binding data and report how many distinct allele names contain "HLA"
mhci = pd.read_csv('../mhc/binding_data_2013/bdata.20130222.mhci.csv')
hla_rows = mhci['mhc_allele'].str.contains("HLA")
hla_alleles = mhci.loc[hla_rows, 'mhc_allele'].unique()
print(hla_alleles.shape)

(119,)


In [4]:
# Affinity values shared by a suspiciously large number of peptides; they look
# like default/placeholder values rather than measurements, so they are dropped.
dirty_values = [1, 2, 3, 5000, 10000, 20000, 43424, 50000, 69444.44444, 78125]

# Keep only the HLA-A*02:01 rows with a length from 8 to 11 (both inclusive)
# and an affinity value that is not one of the dirty values.
is_target_allele = mhci['mhc_allele'] == 'HLA-A*02:01'
has_valid_length = mhci['length'].between(8, 11)
has_clean_affinity = ~mhci['affinity_binding'].isin(dirty_values)

mhci = mhci[is_target_allele & has_valid_length & has_clean_affinity]

In [5]:
# Read the HELM core monomer library (a JSON list of monomer records)
with open('../helm_ga/HELMCoreLibrary.json') as f:
    helm_core_library = json.load(f)

# The 20 one-letter monomer symbols made available to the genetic operators
monomer_names = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

# Collect the SMILES of every 'Backbone' record whose symbol is in monomer_names.
# NOTE(review): the SMILES are gathered in library order, not in monomer_names
# order, and a symbol present in several records would be collected several times.
# Confirm that HELMGeneticOperators does not expect probability_matrix rows to
# line up with monomer_names by position.
smiles = []
for monomer in helm_core_library:
    if monomer['symbol'] in monomer_names and monomer['monomerType'] == 'Backbone':
        smiles.append(monomer['smiles'])

probability_matrix = compute_probability_matrix(smiles)

helmgo = HELMGeneticOperators(monomer_names, probability_matrix)

In [37]:
# Peptide lengths to sample and how many random peptides to draw for each length.
# n_peptides and pssm_files are derived from peptide_lengths so the three lists
# cannot fall out of sync (zip() over them would silently truncate otherwise).
peptide_lengths = [8, 9, 10, 11]
n_peptides = [50] * len(peptide_lengths)

# Energy window passed to generate_random_peptides.
# Alternative windows, kept for reference:
#   [-4.944, -4.531]  about between 250 uM and 500 uM
#   [-4.531, -4.118]  about between 500 uM and 1 mM
#   [-8.649, -8.235]  about between 500 nM and 1 uM
energy_bounds = [-8.235, -4.944]  # about between 1 uM and 250 uM

# Energy cutoff handed to the scorer.
# Alternatives, kept for reference: -4.944 (250 uM), -8.235 (1 uM)
energy_cutoff = -4.11  # 1 mM

# One HLA-A*02:01 matrix file per peptide length
pssm_dir = '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix'
pssm_files = ['%s/HLA-A-02:01-%d.txt' % (pssm_dir, length) for length in peptide_lengths]
mps = MHCIPeptideScorer(pssm_files, mhci, energy_cutoff=energy_cutoff)

# Draw the requested number of random peptides for every length, then pool them.
# NOTE(review): no random seed is set in this cell; if generate_random_peptides
# is stochastic, the starting set will differ between runs. Confirm.
peptide_batches = []
score_batches = []

for n_peptide, peptide_length in zip(n_peptides, peptide_lengths):
    batch_peptides, batch_scores = mps.generate_random_peptides(n_peptide, peptide_length, energy_bounds)
    peptide_batches.append(batch_peptides)
    score_batches.append(batch_scores)

# Flatten the per-length batches into one array of peptides and one of scores
random_peptides = np.array([peptide for batch in peptide_batches for peptide in batch])
random_peptide_scores = np.array([score for batch in score_batches for score in batch])

# Group peptide indices by peptide length (number of monomers).
# The length is read from the monomer list between the braces of the HELM string.
# Counting every '.' in the whole string would only equal the length because the
# trailing 'V2.0' version tag happens to contribute one extra dot.
clusters = defaultdict(list)
for i, sequence in enumerate(random_peptides):
    monomers = sequence.split('{', 1)[1].split('}', 1)[0]
    clusters[monomers.count('.') + 1].append(i)
print('Distribution:', ['%d: %d' % (k, len(clusters[k])) for k in sorted(clusters.keys())])
print('')

print(len(random_peptides))
print(random_peptides)
print(random_peptide_scores)

----- Peptide global -----
N peptide: 8471
R2: 0.620
RMSD : 1.177 kcal/mol

Distribution: ['8: 50', '9: 50', '10: 50', '11: 50']

200
['PEPTIDE1{W.L.Y.R.R.I.P.T}$$$$V2.0' 'PEPTIDE1{C.L.F.H.V.Q.F.G}$$$$V2.0'
 'PEPTIDE1{Y.L.F.L.W.T.A.A}$$$$V2.0' 'PEPTIDE1{C.L.T.I.E.P.E.Y}$$$$V2.0'
 'PEPTIDE1{N.S.A.L.M.W.P.W}$$$$V2.0' 'PEPTIDE1{M.Q.W.W.Y.I.T.S}$$$$V2.0'
 'PEPTIDE1{H.T.G.D.D.C.L.G}$$$$V2.0' 'PEPTIDE1{L.N.T.I.A.G.L.V}$$$$V2.0'
 'PEPTIDE1{R.L.K.F.N.W.G.D}$$$$V2.0' 'PEPTIDE1{Y.V.M.Y.R.M.W.V}$$$$V2.0'
 'PEPTIDE1{I.G.M.W.S.P.V.S}$$$$V2.0' 'PEPTIDE1{N.D.G.N.D.E.I.N}$$$$V2.0'
 'PEPTIDE1{I.I.M.N.A.K.M.F}$$$$V2.0' 'PEPTIDE1{K.I.L.S.I.T.I.I}$$$$V2.0'
 'PEPTIDE1{V.I.T.V.G.F.D.T}$$$$V2.0' 'PEPTIDE1{M.P.G.I.S.Q.H.M}$$$$V2.0'
 'PEPTIDE1{G.G.C.F.W.F.F.E}$$$$V2.0' 'PEPTIDE1{Q.M.W.D.Y.H.M.D}$$$$V2.0'
 'PEPTIDE1{H.M.K.H.W.A.W.N}$$$$V2.0' 'PEPTIDE1{F.D.S.F.G.I.H.P}$$$$V2.0'
 'PEPTIDE1{K.W.F.F.E.H.H.C}$$$$V2.0' 'PEPTIDE1{Y.N.W.S.L.H.R.P}$$$$V2.0'
 'PEPTIDE1{L.H.S.L.A.A.S.M}$$$$V2.0' 'PEPTIDE1{M.E.R.W.F.C.N.Q}

In [None]:
# Fingerprint transformer configured for HELM input
map4 = Map4Fingerprint(input_type='helm')

# Keyword arguments forwarded to DMTSimulation.run, grouped by key prefix
parameters = dict(
    # General settings
    n_candidates=96,
    GA=GA,
    n_gen=1,
    helmgo=helmgo,
    oracle=mps,
    acq_function=expected_improvement,
    kernel=TanimotoSimilarityKernel(),
    seq_transformer=map4,
    # sequence_* settings
    sequence_n_gen=1000,
    sequence_n_children=100,
    sequence_temperature=0.025,
    sequence_elitism=True,
    sequence_total_attempts=20,
    sequence_minimum_mutations=1,
    sequence_maximum_mutations=3,
    sequence_n_process=None,
    # scaffold_* settings
    scaffold_n_gen=1,
    scaffold_n_children=100,
    scaffold_temperature=0.025,
    scaffold_elitism=True,
    scaffold_total_attempts=20,
    scaffold_only_terminus=True,
    scaffold_minimum_size=8,
    scaffold_maximum_size=11,
)

# NOTE(review): the meaning of the two positional arguments (4, 10) is not
# visible in this notebook. Confirm against DMTSimulation and consider
# passing them by keyword.
dmt = DMTSimulation(4, 10)
df = dmt.run(random_peptides, random_peptide_scores, **parameters)

Run: 1
Exp dataset size: (200, 4096)


Init.
N pep:  200
Best peptide: -8.198
N pep under -14 kcal/mol: 000
N pep under -13 kcal/mol: 000
N pep under -12 kcal/mol: 000
N pep under -11 kcal/mol: 000
N pep under -10 kcal/mol: 000
N pep under -9 kcal/mol: 000
N pep under -8 kcal/mol: 004
Non binding pep        : 000


Generation: 1
N 001 - Score: -0.022156 - Seq: 9 - PEPTIDE1{F.L.K.V.M.S.P.E.C}$$$$V2.0 (000/020) - 45
End Scaffold GA - Best score: -0.022156 - Seq: 9 - PEPTIDE1{F.L.K.V.M.S.P.E.C}$$$$V2.0
N 001 - Score: -0.033992 - Seq: 9 - PEPTIDE1{F.L.A.V.M.S.P.P.C}$$$$V2.0 (000/020) - 94
N 001 - Score: -0.019244 - Seq: 8 - PEPTIDE1{F.L.M.V.M.S.P.E}$$$$V2.0 (000/020) - 104
N 001 - Score: -0.012298 - Seq: 10 - PEPTIDE1{P.F.L.K.V.M.S.P.A.C}$$$$V2.0 (000/020) - 115
N 002 - Score: -0.020323 - Seq: 8 - PEPTIDE1{L.K.V.M.S.P.E.V}$$$$V2.0 (000/020) - 38
N 002 - Score: -0.033992 - Seq: 9 - PEPTIDE1{F.L.A.V.M.S.P.P.C}$$$$V2.0 (001/020) - 46
N 002 - Score: -0.015275 - Seq: 10 - PEPTIDE1{F.L.G.V.M.S.P

N 021 - Score: -0.038249 - Seq: 10 - PEPTIDE1{A.A.L.A.A.M.I.P.A.V}$$$$V2.0 (002/020) - 96N 017 - Score: -0.026641 - Seq: 11 - PEPTIDE1{G.A.A.L.V.L.M.V.P.Q.A}$$$$V2.0 (000/020) - 89

N 028 - Score: -0.042527 - Seq: 8 - PEPTIDE1{L.M.A.M.I.P.V.I}$$$$V2.0 (001/020) - 44
N 018 - Score: -0.026641 - Seq: 11 - PEPTIDE1{G.A.A.L.V.L.M.V.P.Q.A}$$$$V2.0 (001/020) - 4
N 022 - Score: -0.038249 - Seq: 10 - PEPTIDE1{A.A.L.A.A.M.I.P.A.V}$$$$V2.0 (003/020) - 40
N 029 - Score: -0.042527 - Seq: 8 - PEPTIDE1{L.M.A.M.I.P.V.I}$$$$V2.0 (002/020) - 60
N 024 - Score: -0.045731 - Seq: 9 - PEPTIDE1{M.L.A.M.M.I.P.P.T}$$$$V2.0 (005/020) - 72
N 030 - Score: -0.042527 - Seq: 8 - PEPTIDE1{L.M.A.M.I.P.V.I}$$$$V2.0 (003/020) - 53
N 025 - Score: -0.046399 - Seq: 9 - PEPTIDE1{A.L.A.A.M.I.P.P.T}$$$$V2.0 (000/020) - 45
N 023 - Score: -0.038249 - Seq: 10 - PEPTIDE1{A.A.L.A.A.M.I.P.A.V}$$$$V2.0 (004/020) - 84
N 019 - Score: -0.027706 - Seq: 11 - PEPTIDE1{L.A.A.L.V.V.M.V.P.T.A}$$$$V2.0 (000/020) - 101
N 024 - Score: -0.038249 

N 046 - Score: -0.054386 - Seq: 9 - PEPTIDE1{M.L.A.S.A.I.P.A.T}$$$$V2.0 (014/020) - 35
N 043 - Score: -0.043428 - Seq: 10 - PEPTIDE1{A.A.L.A.M.V.I.P.A.V}$$$$V2.0 (012/020) - 102N 036 - Score: -0.037496 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.I.V.P.A.A}$$$$V2.0 (007/020) - 14

N 057 - Score: -0.043923 - Seq: 8 - PEPTIDE1{L.M.A.I.Q.P.A.V}$$$$V2.0 (016/020) - 71
N 044 - Score: -0.043428 - Seq: 10 - PEPTIDE1{A.A.L.A.M.V.I.P.A.V}$$$$V2.0 (013/020) - 31
N 047 - Score: -0.058073 - Seq: 9 - PEPTIDE1{M.L.A.A.M.I.P.A.T}$$$$V2.0 (000/020) - 65
N 058 - Score: -0.043923 - Seq: 8 - PEPTIDE1{L.M.A.I.Q.P.A.V}$$$$V2.0 (017/020) - 35
N 048 - Score: -0.058073 - Seq: 9 - PEPTIDE1{M.L.A.A.M.I.P.A.T}$$$$V2.0 (001/020) - 43
N 059 - Score: -0.044819 - Seq: 8 - PEPTIDE1{L.M.A.I.Q.P.V.I}$$$$V2.0 (000/020) - 44
N 045 - Score: -0.043428 - Seq: 10 - PEPTIDE1{A.A.L.A.M.V.I.P.A.V}$$$$V2.0 (014/020) - 77
N 037 - Score: -0.037496 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.I.V.P.A.A}$$$$V2.0 (008/020) - 103
N 038 - Score: -0.037496 - S

N 058 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (001/020) - 33
N 093 - Score: -0.046574 - Seq: 8 - PEPTIDE1{L.M.A.V.L.P.V.I}$$$$V2.0 (020/020) - 58
Reached maximum number of attempts (20), no improvement observed!
End SequenceGA - Best score: -0.046574 - Seq: 8 - PEPTIDE1{L.M.A.V.L.P.V.I}$$$$V2.0
N 059 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (002/020) - 80
N 060 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (003/020) - 19
N 061 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (004/020) - 83
N 062 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (005/020) - 30
N 063 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (006/020) - 89
N 064 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (007/020) - 20
N 065 - Score: -0.038293 - Seq: 11 - PEPTIDE1{M.A.A.L.V.V.L.V.P.A.T}$$$$V2.0 (008/020) - 100
N 066 - Score: -0.038293 - S